# Data Cleaning using Pandas
### Data Science Pipeline Workshop 11 Juni 2022
- Author : Randy Galawana
- Email  : randy_galawana1@telkomsel.co.id
&copy; Telkomsel 2022

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd

In [None]:
# Load the employee dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv("data/employee.csv")

In [None]:
## Cast the text columns to `category` to save memory.
df['gender'] = df['gender'].astype('category')
df['employment_status'] = df['employment_status'].astype('category')
## Convert birth dates to datetimes. The unit is spelled out because pandas 2.x
## rejects the unit-less 'datetime64' alias; 'datetime64[ns]' works on old and new versions.
df['birth_date'] = df['birth_date'].astype('datetime64[ns]')


In [None]:
# Preview the first rows to confirm the load and the dtype casts.
df.head()

### Check Statistics

In [None]:
# Column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Summary statistics for every column, with datetime columns summarised numerically.
# pandas 2.0 removed the `datetime_is_numeric` keyword and made that behaviour the
# default, so fall back to the plain call when the keyword is rejected.
try:
    summary = df.describe(include='all', datetime_is_numeric=True)
except TypeError:
    summary = df.describe(include='all')
summary

## Handling Missing Values / NA / Null / None

In [None]:
## Count the null / NA / None entries in each column.
df.isna().sum()

In [None]:
# Per-column missing counts, narrowed to the columns that actually have gaps,
# followed by an info() report restricted to those columns.
count_missing = df.isna().sum()
missing = count_missing.loc[count_missing > 0]
df.loc[:, missing.index].info()

### Drop Features That Have Too Many Missing Values

In [None]:
## Columns whose missing-value count exceeds 60% of the row count.
missing.loc[missing > len(df) * 0.6]

In [None]:
# Remove the `tax_file_no` column, then confirm the new shape and preview the result.
df = df.drop(columns='tax_file_no')
print(df.shape)
df.head()

### Drop Rows That Have Too Many Missing Features

In [None]:
# Keep only the rows that hold at least 2 non-null values; `df` itself is not modified.
df_new = df.dropna(axis=0, thresh=2)
print(df_new.shape)

In [None]:
# Show which rows were removed: those whose index label no longer appears in df_new.
rows_removed = df.loc[~df.index.isin(df_new.index)]
rows_removed

In [None]:
# Apply the same row filter to `df` itself, then re-check the column summary.
df.dropna(axis=0, thresh=2, inplace=True)
df.info()

### Filling Missing Value/ Imputers

#### Simple Imputers : Filling with a New Category

In [None]:
# Simple imputer: give missing genders their own category `U` (Undefined/Unknown).
# A categorical column only accepts values from its category list, so `U` has to be
# registered before filling. The result is assigned back because the `inplace`
# option of `cat.add_categories` was deprecated in pandas 1.3 and removed in 2.0.

df['gender'] = df['gender'].cat.add_categories(['U'])
df.fillna(value={'gender': 'U'}, inplace=True)
# info() prints its own report and returns None, so it is not wrapped in print().
df.info()

#### Simple Imputers (2) : Filling with Constant Value

In [None]:
# Load the movies dataset from an Excel workbook (path is relative to the working directory).
df2 = pd.read_excel("data/movies.xls")

In [None]:
# Per-column missing counts for the movies data, narrowed to columns with gaps.
count_missing = df2.isna().sum()
missing = count_missing.loc[count_missing > 0]
df2.loc[:, missing.index].info()

In [None]:
# Replace missing critic-review counts with the constant 0.
# NOTE(review): the key 'Reviews by Crtiics' looks misspelled. fillna() silently skips
# dict keys that match no column, so confirm it equals the header in movies.xls exactly.
df2.fillna(value={'Reviews by Crtiics': 0}, inplace=True)

#### Imputers with Mean, Median and Mode values

In [None]:
# Summary statistics for the columns that had missing values (per the `missing` index).
df2[missing.index].describe()

In [None]:
### Histograms of Budget, Gross Earnings and Aspect Ratio, side by side
import matplotlib.pyplot as plt

plt.figure(figsize=[10,4])
panels = [('Budget', "red"), ('Gross Earnings', "skyblue"), ('Aspect Ratio', "darkgreen")]
for slot, (column, colour) in enumerate(panels, start=1):
    plt.subplot(1, 3, slot)
    plt.hist(df2[column], color=colour)
    plt.title(column)

plt.tight_layout()
plt.show()


In [None]:
# Budget and Gross Earnings are skewed, so the median is the better fill value.
median = df2.loc[:, ['Budget', 'Gross Earnings']].median()
median

In [None]:
# Fill the gaps in Budget and Gross Earnings with `median`, the Series of
# per-column medians computed in the previous cell (matched by column name).
df2[['Budget','Gross Earnings']] = df2[['Budget','Gross Earnings']].fillna(median,axis=0)

##### Walaupun Aspect Ratio sedikit skew, namun kita bisa coba menggunakan mean values

In [None]:
# Fill missing Aspect Ratio values with the column mean.
df2['Aspect Ratio'] = df2['Aspect Ratio'].fillna(df2['Aspect Ratio'].mean())

In [None]:
# Re-check which columns still contain missing values after the numeric imputations.
count_missing = df2.isna().sum()
missing = count_missing.loc[count_missing > 0]
df2.loc[:, missing.index].info()

#### mode untuk categorical features

In [None]:
# Frequency of each language; value_counts() leaves missing values out by default.
df2['Language'].value_counts()

In [None]:
# Fill missing languages with the most frequent one. mode() returns a Series
# (several entries when there is a tie), so [0] takes its first entry.
df2['Language'] = df2['Language'].fillna(df2['Language'].mode()[0])

In [None]:
# Re-check which columns still contain missing values after the mode imputation.
count_missing = df2.isna().sum()
missing = count_missing.loc[count_missing > 0]
df2.loc[:, missing.index].info()

## De-Duplication
Remove Duplicate Records

In [None]:
# Load the IMDB dataset used for the de-duplication example.
df3 = pd.read_csv('data/imdb_database.csv')

#### Find Duplicated Rows

In [None]:
# Rows whose (movie_ID, movie_title) pair already appeared earlier in the frame.
# With the default keep='first', the first occurrence of each pair is not listed.
df3[df3.duplicated(subset=['movie_ID', 'movie_title'])]

In [None]:
# Drop ALL rows that share a (movie_ID, movie_title) pair with another row.
# keep=False removes every member of a duplicate group, including the first
# occurrence; use keep='first' instead to retain one row per pair.
df3.drop_duplicates(subset = ['movie_ID', 'movie_title'], keep = False, inplace = True)

## Standardization and Normalization

### Outliers Treatment
How to treat outliers in the data so that the distribution becomes closer to normal.

In [None]:
### Box plots to spot outliers in Budget and Gross Earnings
plt.figure(figsize=(10,6))

for slot, column in enumerate(['Budget', 'Gross Earnings'], start=1):
    plt.subplot(1, 2, slot)
    plt.boxplot(df2[column])
    plt.xlabel(column)

plt.show()

In [None]:
## IQR method: place the fences 1.5 * IQR outside the first and third quartiles of Budget.
q1, q3 = df2['Budget'].quantile(0.25), df2['Budget'].quantile(0.75)
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5*iqr, q3 + 1.5*iqr

In [None]:
# Select the rows whose Budget lies outside the IQR fences and report how many there are.
too_high = df2['Budget'] > upper_bound
too_low = df2['Budget'] < lower_bound
outliers = df2.loc[too_high | too_low]
print(len(outliers['Budget']))
outliers['Budget'].head()

In [None]:
## Function that caps the outliers of one column at its IQR fences
def find_outliers(x):
    """Return a copy of column *x* with outliers capped at the IQR fences.

    Values above ``q3 + 1.5 * iqr`` are replaced by that upper fence and values
    below ``q1 - 1.5 * iqr`` by the lower fence, where ``q1`` and ``q3`` are the
    25th and 75th percentiles of *x*. Missing values stay missing.

    Parameters
    ----------
    x : pandas.Series
        One column, e.g. as passed by ``DataFrame.apply(find_outliers, axis=0)``.

    Returns
    -------
    pandas.Series
        *x* itself, unchanged, when it is not a numeric column (text, category,
        datetime, boolean); otherwise a new capped Series. The input is never
        modified.
    """
    # Quantiles are only meaningful for numeric data. Booleans are excluded
    # explicitly because pandas reports them as numeric.
    if not pd.api.types.is_numeric_dtype(x) or pd.api.types.is_bool_dtype(x):
        return x
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # clip() returns a new Series, so the caller's data is not mutated, and it
    # upcasts integer columns when a fence is fractional instead of writing a
    # float into an integer column in place.
    return x.clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Run find_outliers on every column of df2 (axis=0 passes one column at a time)
# and replace df2 with the result.
df2 = df2.apply(find_outliers, axis=0)

In [None]:
### Box plots after the outlier treatment
plt.figure(figsize=(10,6))

for slot, column in enumerate(['Budget', 'Gross Earnings', 'Aspect Ratio'], start=1):
    plt.subplot(1, 3, slot)
    plt.boxplot(df2[column])
    plt.xlabel(column)

plt.show()

### Standard Scaler

In [None]:
# Numeric feature columns that the scaling examples below operate on.
numeric_columns = ['Aspect Ratio', 'Budget', 'Gross Earnings', 'User Votes', 'Reviews by Users', 'IMDB Score']
df_numeric = df2[numeric_columns]

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the numeric columns. The name `scaler` is reused by
# each of the later transformer examples, so this binding is overwritten there.
scaler = StandardScaler()
scaler.fit(df_numeric)

In [None]:
# Apply the fitted scaler and wrap the resulting array back into a labelled DataFrame.
standardized_values = scaler.transform(df_numeric)
df_standardized = pd.DataFrame(standardized_values, columns=df_numeric.columns)
df_standardized.head()

In [None]:
# Compare the User Votes distribution before and after standard scaling.
plt.figure(figsize=(10,6))

comparison = [
    (df2['User Votes'], 'User Votes Before StandartScaler'),
    (df_standardized['User Votes'], 'User Votes After StandartScaler'),
]
for slot, (values, label) in enumerate(comparison, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(values)
    plt.xlabel(label)

plt.show()

### MinMax Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a MinMaxScaler that rescales each numeric column into the range [0, 1].
# The frame is `df_numeric`; the previous `df_numericeric` was an undefined
# name and raised NameError.
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(df_numeric)

In [None]:
# Apply the fitted min-max scaler and wrap the array back into a labelled DataFrame.
minmax_values = scaler.transform(df_numeric)
df_minmax = pd.DataFrame(minmax_values, columns=df_numeric.columns)
df_minmax.head()

In [None]:
# Compare the User Votes distribution before and after min-max scaling.
plt.figure(figsize=(10,6))

comparison = [
    (df2['User Votes'], 'User Votes Before MinMaxScaler'),
    (df_minmax['User Votes'], 'User Votes After MinMaxScaler'),
]
for slot, (values, label) in enumerate(comparison, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(values)
    plt.xlabel(label)

plt.show()

### Polynomial Transformation

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion. Fit on `df_minmax`, the same frame the next
# cell transforms, so the fitted and transformed inputs are consistent
# (the expansion was previously fitted on `df_numeric`).
scaler = PolynomialFeatures(degree=2)
scaler.fit(df_minmax)

In [None]:
# Expand the min-max scaled features and label the generated columns.
# `get_feature_names_out` replaces `get_feature_names`, which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
poly_values = scaler.transform(df_minmax)
df_poly = pd.DataFrame(poly_values, columns=scaler.get_feature_names_out(df_minmax.columns))
df_poly.head()

In [None]:
# Compare User Votes with its squared term from the polynomial expansion.
plt.figure(figsize=(10,6))

comparison = [
    (df_poly['User Votes'], 'User Votes Before PolynomialFeatures'),
    (df_poly['User Votes^2'], 'User Votes After PolynomialFeatures'),
]
for slot, (values, label) in enumerate(comparison, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(values)
    plt.xlabel(label)

plt.show()

### Normalization

In [None]:
from sklearn.preprocessing import Normalizer

# Fit a Normalizer on the numeric columns.
# NOTE(review): per the scikit-learn docs, Normalizer rescales each ROW (sample)
# to unit norm rather than each column; confirm that is the intended operation here.
scaler = Normalizer()
scaler.fit(df_numeric)

In [None]:
# Apply the normalizer and wrap the resulting array back into a labelled DataFrame.
normalized_values = scaler.transform(df_numeric)
df_normalized = pd.DataFrame(normalized_values, columns=df_numeric.columns)
df_normalized.head()

In [None]:
# Compare the Budget distribution before and after normalization.
plt.figure(figsize=(10,6))

comparison = [
    (df_numeric['Budget'], 'Budget Before Normalization'),
    (df_normalized['Budget'], 'Budget After Normalization'),
]
for slot, (values, label) in enumerate(comparison, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(values)
    plt.xlabel(label)

plt.show()

### Encoding (Categorical Encoding to Numerical Features)

#### One Hot Encoder

In [None]:
# Distinct languages and their frequencies; each one becomes a column after one-hot encoding.
df2['Language'].value_counts()

In [None]:
# Double brackets keep the selection as a one-column DataFrame rather than a Series.
df_object = df2[['Language']]

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Fit a one-hot encoder on the Language column (the `scaler` name is reused from earlier cells).
scaler = OneHotEncoder()
scaler.fit(df_object)

In [None]:
# Encode Language into indicator columns. transform() returns a sparse matrix,
# so it is densified with toarray() before building the DataFrame.
# `get_feature_names_out` replaces `get_feature_names`, which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
df_dummy = pd.DataFrame(scaler.transform(df_object).toarray(), columns=scaler.get_feature_names_out())

In [None]:
# Preview the one-hot encoded columns.
df_dummy.head()