In [2]:
import pandas as pd
import numpy as np


In [3]:
# Sample frame for the row-filter example: A counts 1..5, B holds 10..50 in steps of 10.
threshold = 30  # cut-off compared against every cell value by the filter below
df = pd.DataFrame(
    {
        'A': list(range(1, 6)),
        'B': list(range(10, 60, 10)),
    }
)
df

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


In [6]:
# Select the rows of a DataFrame in which at least one value is strictly
# greater than `threshold`.
# The element-wise comparison yields a boolean frame; any(axis='columns')
# collapses it to one flag per row, which is then used as the row mask.
df.loc[(df > threshold).any(axis='columns')]

Unnamed: 0,A,B
3,4,40
4,5,50


In [38]:
# Sample frame for the de-duplication example; rows 0 and 3 hold the same values.
df = pd.DataFrame(
    [
        [2, 2, 1, 2],
        [1, 3, 2, 1],
        [2, 1, 3, 2],
        [2, 2, 1, 2],
    ],
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
0,2,2,1,2
1,1,3,2,1
2,2,1,3,2
3,2,2,1,2


In [None]:
# Remove duplicate rows, judging "duplicate" only by columns 'A', 'B' and 'D':
# two rows count as duplicates when they agree on all three of those columns,
# whatever the remaining columns contain. keep='first' (the default) retains
# the first row of each duplicate group.
df.drop_duplicates(subset=list('ABD'), keep='first')

Unnamed: 0,A,B,C,D
0,2,2,1,2
1,1,3,2,1
2,2,1,3,2


In [42]:
# Build a one-column sample frame of string labels, convert that column to
# the categorical dtype, and show the resulting dtype.
data = {'Category': list('ABACBA')}
df = pd.DataFrame(data)

df['Category'] = df['Category'].astype(dtype='category')
df['Category'].dtype

CategoricalDtype(categories=['A', 'B', 'C'], ordered=False)

In [43]:
# One-hot encode the 'Category' column: each distinct category becomes its own
# indicator column named 'Category_<level>'.
pd.get_dummies(data=df, columns=['Category'])


Unnamed: 0,Category_A,Category_B,Category_C
0,1,0,0
1,0,1,0
2,1,0,0
3,0,0,1
4,0,1,0
5,1,0,0


In [40]:
# Rename column 'B' to 'Beta'.
# The four-column sample frame is rebuilt here because `df` was reassigned by
# the categorical example above; in a top-to-bottom run it no longer has a 'B'
# column, so the rename would silently do nothing.
df = pd.DataFrame({'A': [2, 1, 2, 2], 'B': [2, 3, 1, 2], 'C': [1, 2, 3, 1], 'D': [2, 1, 2, 2]})
# Reassign the result instead of using inplace=True so the cell gives the same
# result every time it is run; errors='raise' turns a missing source column
# into a KeyError instead of a silent no-op.
df = df.rename(columns={'B': 'Beta'}, errors='raise')
df

Unnamed: 0,A,Beta,C,D
0,2,2,1,2
1,1,3,2,1
2,2,1,3,2
3,2,2,1,2


In [9]:
# Sort a DataFrame by column `A`, breaking ties with column `B`.
# The sample frame is defined in this cell because, in a top-to-bottom run,
# `df` has no 'B' column at this point and sort_values would raise KeyError.
# The values reproduce the three-row A/B/C frame shown in this cell's output.
df = pd.DataFrame({'A': [2, 1, 2], 'B': [2, 3, 1], 'C': [1, 2, 3]})
df.sort_values(by=['A', 'B'])

Unnamed: 0,A,B,C
1,1,3,2
2,2,1,3
0,2,2,1


In [11]:
# Two small frames sharing the column layout A/B, used by the concatenation
# example; both are printed as plain text, one after the other.
df1 = pd.DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
df2 = pd.DataFrame([[7, 10], [8, 11], [9, 12]], columns=['A', 'B'])
print(df1, df2, sep='\n')

   A  B
0  1  4
1  2  5
2  3  6
   A   B
0  7  10
1  8  11
2  9  12


In [52]:
# Concatenate the two DataFrames vertically (df1's rows followed by df2's).
# ignore_index=True discards the original row labels and numbers the rows of
# the combined frame 0..n-1.
pd.concat(objs=[df1, df2], axis=0, ignore_index=True)

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6
3,7,10
4,8,11
5,9,12


In [57]:
# Handle outliers by capping: values below threshold_min are raised to
# threshold_min and values above threshold_max are lowered to threshold_max.
# No rows are dropped.
threshold_min = 0
threshold_max = 100
df = pd.DataFrame({'Values': [10, 20, 30, 1000, 50, 60, -200, 80]})
df['Values'] = df['Values'].clip(threshold_min, threshold_max)
df

Unnamed: 0,Values
0,10
1,20
2,30
3,100
4,50
5,60
6,0
7,80


In [53]:
# Single-column sample frame holding the integers 1 through 5, printed as text.
df = pd.DataFrame({'A': list(range(1, 6))})
print(df)


   A
0  1
1  2
2  3
3  4
4  5


In [19]:
# Compute the cumulative (running) sum of column 'A' of the DataFrame.
df.loc[:, 'A'].cumsum()

0     1
1     3
2     6
3    10
4    15
Name: A, dtype: int64

In [None]:
# Sample Series of lowercase fruit names for the string-method example.
series = pd.Series('apple banana cherry'.split())
print(series)

0     apple
1    banana
2    cherry
dtype: object


In [26]:
# Convert a Series of strings to uppercase.
# The `.str` accessor applies the string method to every element and returns
# a new Series; `series` itself is not modified.
series.str.upper()

0     APPLE
1    BANANA
2    CHERRY
dtype: object

In [58]:
# Read ../data/ex4.csv with the parser's default settings and display the raw
# result, before any rows are skipped.
df = pd.read_csv(filepath_or_buffer='../data/ex4.csv')
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,# hey!
a,b,c,d,message
# just wanted to make things more difficult for you,,,,
# who reads CSV files with computers,anyway?,,,
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [None]:
# Re-read the CSV while skipping its first line (skiprows=1) and its last
# line (skipfooter=1).
# skipfooter is not supported by the default C parser, so the Python parsing
# engine is requested explicitly. If `engine` were left out, pandas would fall
# back to the Python engine on its own and emit a ParserWarning.
df = pd.read_csv(
    '../data/ex4.csv',
    engine='python',
    skipfooter=1,
    skiprows=1,
)
df

Unnamed: 0,a,b,c,d,message
0,# just wanted to make things more difficult fo...,,,,
1,# who reads CSV files with computers,anyway?,,,
2,1,2,3.0,4.0,hello
3,5,6,7.0,8.0,world


In [47]:
# Randomly permute the rows of a DataFrame.
# sample(frac=1) draws every row exactly once in random order; a fixed
# random_state makes the permutation identical on every run, so the displayed
# output can be reproduced. reset_index(drop=True) then renumbers the shuffled
# rows 0..n-1 and discards the old labels.
df.sample(frac=1, random_state=42).reset_index(drop=True)


Unnamed: 0,a,b,c,d,message
0,5,6,7.0,8.0,world
1,# just wanted to make things more difficult fo...,,,,
2,1,2,3.0,4.0,hello
3,# who reads CSV files with computers,anyway?,,,


In [62]:
# Output the first 2 columns to a CSV file without the index.
# The slice [:2] selects exactly two column labels; the previous [0:3] slice
# wrote three columns, which contradicted the stated intent.
df.to_csv('../data/output6.csv', columns=df.columns[:2], index=False)