In [8]:
import numpy as np
import pandas as pd

### Handling Duplicate

In [14]:
# Handling Duplicate
data={'Name':['Ali','Abu','Siti','Lee','Ali'],
     'Age':[25,30,35,30,35],
     'City':['KL','SGR','MLK','PHG','KL']}
df=pd.DataFrame(data)
print(df)

# Check duplicate
duplicate=df.duplicated()
print(duplicate)

   Name  Age City
0   Ali   25   KL
1   Abu   30  SGR
2  Siti   35  MLK
3   Lee   30  PHG
4   Ali   35   KL
0    False
1    False
2    False
3    False
4    False
dtype: bool


In [15]:
df_no_dup=df.drop_duplicates()
print(df_no_dup)

   Name  Age City
0   Ali   25   KL
1   Abu   30  SGR
2  Siti   35  MLK
3   Lee   30  PHG
4   Ali   35   KL


In [17]:
dup_name=df.duplicated(subset='Name')
print(dup_name)

0    False
1    False
2    False
3    False
4     True
dtype: bool


In [20]:
df_no_dup_name=df.duplicated(subset='Name',keep='first')
print(df_no_dup_name)

0    False
1    False
2    False
3    False
4     True
dtype: bool


In [21]:
df_no_dup_name=df.duplicated(subset='Name',keep=False)
print(df_no_dup_name)

0     True
1    False
2    False
3    False
4     True
dtype: bool


### Handling Missing Data

In [28]:
# Detecting missing data
data={
    'A':[1,2,None,4], #for string None
    'B':[np.nan,2,3,4], #for numerical np.nan
    'C':['na','valid',None,'missing']
}
df=pd.DataFrame(data)
print(df)
checkmiss=df.notna() # or isna
print(checkmiss)

     A    B        C
0  1.0  NaN       na
1  2.0  2.0    valid
2  NaN  3.0     None
3  4.0  4.0  missing
       A      B     C
0   True  False  True
1   True   True  True
2  False   True  True
3   True   True  True


In [29]:
# Dropping missing data
df_drop=df.dropna(axis=1) #columns
print(df_drop)

         C
0       na
1    valid
2     None
3  missing


In [30]:
# Dropping missing data
df_drop=df.dropna(subset=['A'], axis=0)
print(df_drop)

     A    B        C
0  1.0  NaN       na
1  2.0  2.0    valid
3  4.0  4.0  missing


In [31]:
# Filling missing data 
df_fill=df.fillna(9999) #(Fill missing data with a specified value)
print(df_fill)

        A       B        C
0     1.0  9999.0       na
1     2.0     2.0    valid
2  9999.0     3.0     None
3     4.0     4.0  missing


In [32]:
df_ffill=df.ffill() #Forward fill (use previous value)
print(df_ffill)

     A    B        C
0  1.0  NaN       na
1  2.0  2.0    valid
2  2.0  3.0     None
3  4.0  4.0  missing


In [33]:
df_bfill=df.bfill() #Backward fill (use next value)
print(df_bfill)

     A    B        C
0  1.0  2.0       na
1  2.0  2.0    valid
2  4.0  3.0     None
3  4.0  4.0  missing


In [34]:
df_linearfill=df.interpolate(method='linear') # only on numerical columns
print(df_linearfill)

     A    B        C
0  1.0  NaN       na
1  2.0  2.0    valid
2  3.0  3.0     None
3  4.0  4.0  missing


  df_linearfill=df.interpolate(method='linear') # only on numerical columns


In [35]:
# fill with mean column
data={
    'A':[1,2,None,4], #for string None
    'B':[np.nan,2,3,4], #for numerical np.nan
    'C':['na','valid',None,'missing']
}
df=pd.DataFrame(data)
print(df)
checkmiss=df.notna() # or isna
print(checkmiss)

     A    B        C
0  1.0  NaN       na
1  2.0  2.0    valid
2  NaN  3.0     None
3  4.0  4.0  missing
       A      B      C
0   True  False   True
1   True   True   True
2  False   True  False
3   True   True   True


In [43]:
df['A'].mode()

0    1.000000
1    2.000000
2    2.333333
3    4.000000
Name: A, dtype: float64

In [41]:
df['A']=df['A'].fillna(df['A'].mode()[0]) 
print(df)

          A    B        C
0  1.000000  3.0       na
1  2.000000  2.0    valid
2  2.333333  3.0     None
3  4.000000  4.0  missing


In [37]:
df['A']=df['A'].fillna(df['A'].mean()) 
print(df)

          A    B        C
0  1.000000  NaN       na
1  2.000000  2.0    valid
2  2.333333  3.0     None
3  4.000000  4.0  missing


In [38]:
df['B']=df['B'].fillna(df['B'].median()) 
print(df)

          A    B        C
0  1.000000  3.0       na
1  2.000000  2.0    valid
2  2.333333  3.0     None
3  4.000000  4.0  missing


### Replacing Values

In [45]:
df=pd.DataFrame({
    'A':[1,2,3,4],
    'B':['apple','banana','apple','kiwi']
})
print(df)

   A       B
0  1   apple
1  2  banana
2  3   apple
3  4    kiwi


In [48]:
df['B']=df['B'].replace('apple','orange')
print(df)

   A       B
0  1  orange
1  2  banana
2  3  orange
3  4    kiwi


In [49]:
df['B']=df['B'].replace(['banana','orange'],'fruit')
print(df)

   A      B
0  1  fruit
1  2  fruit
2  3  fruit
3  4   kiwi


In [70]:
df=pd.DataFrame({
    'A':[1,2,3,4],
    'B':['apple','banana','apple','kiwi']
})
print(df)
df['B']=df['B'].replace({'apple':'fruit1','banana':'fruit2'})
print(df)

   A       B
0  1   apple
1  2  banana
2  3   apple
3  4    kiwi
   A       B
0  1  fruit1
1  2  fruit2
2  3  fruit1
3  4    kiwi


### Handling Outliers

In [64]:
df=pd.DataFrame({'Value':[10,1,250,200,15,300,20,25,30,12,5000]})
print(df)
# compute mean and sd
mean=df['Value'].mean()
std=df['Value'].std()

lower_bound=mean-3*std
upper_bound=mean+3*std

# extract outlier with 3 sd of mean
df[(df['Value']<lower_bound)|(df['Value']>upper_bound)]

# or outlier with thres
thres=3*std
df[abs(df['Value']-mean)>thres]

    Value
0      10
1       1
2     250
3     200
4      15
5     300
6      20
7      25
8      30
9      12
10   5000


Unnamed: 0,Value
10,5000


In [62]:
# replace outlier (using .loc)
df.loc[abs(df['Value']-mean)>thres]=np.nan

In [63]:
print(df)

    Value
0    10.0
1     1.0
2   250.0
3   200.0
4    15.0
5   300.0
6    20.0
7    25.0
8    30.0
9    12.0
10    NaN


In [65]:
# (np.where())
df['Value']=np.where(abs(df['Value']-mean)>thres,9999,df['Value'])
print(df)

    Value
0      10
1       1
2     250
3     200
4      15
5     300
6      20
7      25
8      30
9      12
10   9999


In [68]:
# (apply())
df=pd.DataFrame({'Value':[10,1,250,200,15,300,20,25,30,12,5000]})
thres=3*std
mean=df['Value'].mean()
def replace_outlier(val):
    if abs(val-mean)>thres:
        return(9999)
    return(val)

df['Value']=df['Value'].apply(replace_outlier)
print(df)

    Value
0      10
1       1
2     250
3     200
4      15
5     300
6      20
7      25
8      30
9      12
10   9999


### Data Transformation

In [71]:
# using APPLY/MAP

df=pd.DataFrame({
    'Name':['Ali','Abu','Siti','Ahmad'],
    'Weight':[60,85,72,95],
    'Height':[1.65,1.78,1.75,1.80]
})

In [74]:
df['Weight']/(df['Height']**2)

0    22.038567
1    26.827421
2    23.510204
3    29.320988
dtype: float64

In [75]:
# create new column
df['BMI']=df['Weight']/(df['Height']**2)
print(df)

    Name  Weight  Height        BMI
0    Ali      60    1.65  22.038567
1    Abu      85    1.78  26.827421
2   Siti      72    1.75  23.510204
3  Ahmad      95    1.80  29.320988


In [78]:
# (apply())
df=pd.DataFrame({
    'Name':['Ali','Abu','Siti','Ahmad'],
    'Weight':[60,85,72,95],
    'Height':[1.65,1.78,1.75,1.80]
})

def calculate_bmi(row):
    return row['Weight']/(row['Height']**2)

df['BMI']=df.apply(calculate_bmi,axis=1)
print(df)

    Name  Weight  Height        BMI
0    Ali      60    1.65  22.038567
1    Abu      85    1.78  26.827421
2   Siti      72    1.75  23.510204
3  Ahmad      95    1.80  29.320988


In [85]:
# (apply())
df=pd.DataFrame({
    'Name':['Ali','Abu','Siti','Ahmad'],
    'Weight':[60,85,72,95],
    'Height':[1.65,1.78,1.75,1.80]
})

def weight_category(row):
    if row['Weight']<65:
        return 'Normal'
    else:
        return 'Overweight'

df['weight category']=df.apply(weight_category,axis=1)
print(df)

    Name  Weight  Height weight category
0    Ali      60    1.65          Normal
1    Abu      85    1.78      Overweight
2   Siti      72    1.75      Overweight
3  Ahmad      95    1.80      Overweight


In [83]:
df=pd.DataFrame({
    'Category':['A','B','A','C','B','c','A']
})

# use MAP to map category using the following dictionary
dict={'A':'Apple',
      'B':'Banana',
      'C':'Cherry'   
}

# create a new column called fruit
df['Fruit']=df['Category'].map(dict)
print(df)

  Category   Fruit
0        A   Apple
1        B  Banana
2        A   Apple
3        C  Cherry
4        B  Banana
5        c     NaN
6        A   Apple


### Discretization

In [86]:
# pd.cut
df=pd.DataFrame({'Mark':[12,20,35,50,65,75,88,95]})
df['Mark_Bin']=pd.cut(df['Mark'],bins=3) # cut into equal size
print(df)

   Mark          Mark_Bin
0    12  (11.917, 39.667]
1    20  (11.917, 39.667]
2    35  (11.917, 39.667]
3    50  (39.667, 67.333]
4    65  (39.667, 67.333]
5    75    (67.333, 95.0]
6    88    (67.333, 95.0]
7    95    (67.333, 95.0]


In [88]:
# pd.cut(x, bins, labels=None, right=True)

df=pd.DataFrame({'Mark':[12,20,35,50,65,75,88,95]})
df['Mark_Bin']=pd.cut(df['Mark'],labels=['C','B','A'],bins=3) # cut into equal size
print(df)

   Mark Mark_Bin
0    12        C
1    20        C
2    35        C
3    50        B
4    65        B
5    75        A
6    88        A
7    95        A


In [91]:
df=pd.DataFrame({'Mark':[12,20,35,50,65,75,88,95]})
labels=['C','B','A']
bins=[0,70,80,100]
df['Mark_Bin']=pd.cut(df['Mark'],bins=bins,right=False) # right is choosing which interval to be closed
print(df)

   Mark   Mark_Bin
0    12    [0, 70)
1    20    [0, 70)
2    35    [0, 70)
3    50    [0, 70)
4    65    [0, 70)
5    75   [70, 80)
6    88  [80, 100)
7    95  [80, 100)


In [92]:
# pd.qcut(x, q, labels=None)

df=pd.DataFrame({'Mark':[12,20,35,50,65,75,88,95]})
df['Mark_Bin']=pd.qcut(df['Mark'],q=4)
print(df)

   Mark         Mark_Bin
0    12  (11.999, 31.25]
1    20  (11.999, 31.25]
2    35    (31.25, 57.5]
3    50    (31.25, 57.5]
4    65    (57.5, 78.25]
5    75    (57.5, 78.25]
6    88    (78.25, 95.0]
7    95    (78.25, 95.0]


### Permutation and Random Sampling

In [94]:
arr=np.array([1,2,3,4,5])

shuffle_arr=np.random.permutation(arr)

print('original arr:',arr)
print('shuffled arr:',shuffle_arr)

original arr: [1 2 3 4 5]
shuffled arr: [5 4 3 1 2]


In [98]:
df=pd.DataFrame({
    'A':[1,2,3,4,5],
    'B':['a','b','c','d','e']
})

np.random.seed(42)
shuffle_df=df.sample(frac=1)
print('original arr:',arr)
print('shuffled arr:',shuffle_df)

original arr: [1 2 3 4 5]
shuffled arr:    A  B
1  2  b
4  5  e
2  3  c
0  1  a
3  4  d


In [99]:
# numpy array
arr=np.array([1,2,3,4,5])

np.random.seed(42)
arr_rep=np.random.choice(arr,size=3,replace=True)
arr_wrep=np.random.choice(arr,size=3,replace=False)
print('original arr:',arr)
print('sample with replacement:',arr_rep)
print('sample with without replacement:',arr_wrep)

original arr: [1 2 3 4 5]
sample with replacement: [4 5 3]
sample with without replacement: [4 2 3]


In [100]:
## panda 
df=pd.DataFrame({
    'A':[1,2,3,4,5],
    'B':['a','b','c','d','e']
})

np.random.seed(42)
df_rep=df.sample(n=3,replace=True)
df_wrep=df.sample(n=3,replace=False)
print('original arr:',arr)
print('sample with replacement:',df_rep)
print('sample with without replacement:',df_wrep)

original arr: [1 2 3 4 5]
sample with replacement:    A  B
3  4  d
4  5  e
2  3  c
sample with without replacement:    A  B
3  4  d
1  2  b
2  3  c


In [101]:
df=pd.DataFrame({
    'A':[1,2,3,4,5],
    'B':['a','b','c','d','e']
})

np.random.seed(42)
shuffle_df=df.iloc[np.random.permutation(df.index)]
print('original df:',df)
print('shuffled df:',shuffle_arr)

original df:    A  B
0  1  a
1  2  b
2  3  c
3  4  d
4  5  e
shuffled df: [5 4 3 1 2]


### Dummy Variable

In [104]:
df=pd.DataFrame({
    'City':['KL','SGR','TRG','PHG'],
    'Category':['A','B','C','A']
})
df_dummy=pd.get_dummies(df,columns=['Category'],drop_first=False)
print(df_dummy)

  City  Category_A  Category_B  Category_C
0   KL        True       False       False
1  SGR       False        True       False
2  TRG       False       False        True
3  PHG        True       False       False


In [103]:
df

Unnamed: 0,City,Category
0,KL,A
1,SGR,B
2,TRG,C
3,PHG,A


### String Manipulation

In [105]:
text='Python is FUN!'

# convert to lowercase
print(text.upper())

PYTHON IS FUN!


In [111]:
# string validation
text='12345'

print(text.isdigit())

True


In [113]:
text='Hello Worlds!'
print(text.strip()) #remove whitespace

Hello Worlds!


In [115]:
text='Hello Python!'
print(text.find('r')) # -1 meaning it is not found

-1


In [117]:
text='Hello Python!'
print(text.startswith('H'))

True


In [118]:
text='I love Python'
print(text.replace('Python','R'))

I love R


In [121]:
text='apple,banana,cherry'
fruit=text.split(',')
print(fruit)

print('&'.join(fruit))

['apple', 'banana', 'cherry']
apple&banana&cherry


In [122]:
df=pd.DataFrame({
    'Name':['Ali','Abu','Siti','AHMAD'],
    'Comment':['I hate Python','Python is boring','I find R is easier','I just HATE both']    
})
df

Unnamed: 0,Name,Comment
0,Ali,I hate Python
1,Abu,Python is boring
2,Siti,I find R is easier
3,AHMAD,I just HATE both


In [124]:
df['Name']=df['Name'].str.lower()
df

Unnamed: 0,Name,Comment
0,ali,I hate Python
1,abu,Python is boring
2,siti,I find R is easier
3,ahmad,I just HATE both


In [125]:
df['Comment'].str.isdigit()

0    False
1    False
2    False
3    False
Name: Comment, dtype: bool

In [127]:
df=pd.DataFrame({
    'Name':['Ali','Abu','Siti','AHMAD'],
    'Comment':['I hate Python','Python is boring ','I find R is easier  ','I just HATE both']    
})
print(df)
df['Comment']=df['Comment'].str.strip()
print(df)

    Name               Comment
0    Ali         I hate Python
1    Abu     Python is boring 
2   Siti  I find R is easier  
3  AHMAD      I just HATE both
    Name             Comment
0    Ali       I hate Python
1    Abu    Python is boring
2   Siti  I find R is easier
3  AHMAD    I just HATE both


### Regular Expressions

In [132]:
text="John:25, Alice-30, Bob-35"

import re
outfind=re.findall(r'\w+:\d+',text)
print(outfind)

['John:25']


In [134]:
text="Do you like C++ or C#"

import re
outfind=re.findall(r'C\+\+',text)
print(outfind)

['C++']


In [135]:
text="There are 100 apples and 50 mangoes in this basket"

import re
outfind=re.findall(r'\d+',text)
print(outfind)

['100', '50']


In [138]:
# read data from directory

df=pd.read_csv('sample_data_cleaning.csv',skiprows=4)
df

Unnamed: 0,No,Name,sink Table,dishwasher,no Of mugs/plates,Note
0,1,Ali,sink only,no,9,-
1,2,Johan,sink and table,YES!,2-3,all looking GOOD!
2,3,Daus,"sink,table",Yes,8,
3,4,George,,,0,All of us at workshop
4,5,Amelia,just the table,yes,around 10,
5,6,Ahmad,both,NO,6,-
6,6,Ahmad,both,NO,6,-
7,7,Ava,didn't need doing,no.,4,not messy
8,8,Harry,just the sink,no,3,
9,9,Cindy,sink & table,YES,6-9,can't remember them


In [None]:
df=pd.read_csv('sample_data_cleaning.csv',skiprows=4,na_values==['-','NaN'])
df_columns=df.column.str.lower()
df=df.drop_duplicates()
df['dishwasher']=df['dishwasher'].str.lower()
df['dishwasher']=df['dishwasher'].str.replace(r'^\w\s',regex=True)