# preprocessing
    - imputation
        - missing data imputation
        - numerical data imputation
            - mean value is filled in missing places
        - categorical data imputation
            - the most frequent value (mode) is filled in for missing entries

    - encoding
        - convert categorical (non-numerical) data into numerical data
            - label Encoding
                - if a column has 2 unique categorical values
            - one hot encoding
                - if a column  has more than 2 categorical values
            - vectorization 
                - if you want to convert words to vector(numerical) format

    - scaling / normalization
        - bring all the columns onto the same scale so that ML algorithms can work properly

In [1]:
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [50]:
    os.listdir('dataset')

['sample_data.csv']

In [51]:
# Sanity check: the sample CSV must be present before it is loaded.
os.path.exists("dataset/sample_data.csv")

True

In [52]:
# Load the sample dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset/sample_data.csv')

In [53]:
# Show the raw frame: salary and age each contain one missing value.
df

Unnamed: 0,country,salary,age,happy
0,germany,49000.0,35.0,yes
1,spain,10000.0,30.0,no
2,italy,230000.0,39.0,no
3,spain,200000.0,30.0,yes
4,italy,300000.0,30.0,yes
5,spain,31000.0,23.0,no
6,germany,,34.0,yes
7,spain,400000.0,,yes
8,italy,200000.0,29.0,no
9,italy,340000.0,35.0,yes


IMPUTATION

In [54]:
# IPython help lookup: prints SimpleImputer's signature and docstring.
# Interactive aid only; it can be dropped from the final notebook.
SimpleImputer?

[0;31mInit signature:[0m
[0mSimpleImputer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmissing_values[0m[0;34m=[0m[0mnan[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstrategy[0m[0;34m=[0m[0;34m'mean'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfill_value[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcopy[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0madd_indicator[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Imputation transformer for completing missing values.

Read more in the :ref:`User Guide <impute>`.

.. versionadded:: 0.20
   `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer`
   estimator which is now removed.

Parameters
----------
missing_values : int, float, str, np.nan or None,

In [55]:
# Mean imputation (the SimpleImputer default): each NaN is replaced by the
# mean of its column.
imputer = SimpleImputer(strategy='mean')

In [56]:
# Numeric columns that contain missing values.
cols = ['salary', 'age']
# Fill each NaN with its column mean, then round to the nearest integer
# before the int cast: a bare astype(int) truncates toward zero, which
# would turn an imputed mean such as 31.67 into 31 instead of 32.
df[cols] = np.rint(imputer.fit_transform(df[cols])).astype(int)

In [57]:
# Re-display the frame: the salary/age gaps are filled and both columns hold integers.
df

Unnamed: 0,country,salary,age,happy
0,germany,49000,35,yes
1,spain,10000,30,no
2,italy,230000,39,no
3,spain,200000,30,yes
4,italy,300000,30,yes
5,spain,31000,23,no
6,germany,179000,34,yes
7,spain,400000,31,yes
8,italy,200000,29,no
9,italy,340000,35,yes


ENCODING

In [58]:
# Count the distinct values per column; `happy` has only two, so it is the
# column chosen for label encoding.
df.nunique(axis=0)

country     3
salary     10
age         8
happy       2
dtype: int64

In [59]:
# `happy` is binary, so a LabelEncoder maps its two labels to 0 and 1.
hpyEncoder = LabelEncoder()
hpyEncoder.fit(df['happy'])
df['happy'] = hpyEncoder.transform(df['happy'])

In [60]:
# Show the frame with `happy` encoded as 0 (no) / 1 (yes).
df

Unnamed: 0,country,salary,age,happy
0,germany,49000,35,1
1,spain,10000,30,0
2,italy,230000,39,0
3,spain,200000,30,1
4,italy,300000,30,1
5,spain,31000,23,0
6,germany,179000,34,1
7,spain,400000,31,1
8,italy,200000,29,0
9,italy,340000,35,1


ONE HOT ENCODING

In [61]:
# One-hot encode `country`; drop='first' omits the first category so the
# remaining dummy columns are not redundant.
# df[['country']] (double brackets) passes a one-column 2-D DataFrame, which
# is the input shape the encoder expects; df['country'] would be a 1-D Series.
countryHotEnc = OneHotEncoder(drop='first')
countryHotEnc.fit(df[['country']])
# transform() returns a sparse matrix; convert it to a dense NumPy array.
country_enc = countryHotEnc.transform(df[['country']]).toarray()

In [62]:
# Inspect the encoded array: one row per sample, two columns because the
# first country category is dropped.
# NOTE(review): the saved output below shows 11 rows while df has 10 — it looks
# stale from an earlier run; re-run the notebook top to bottom to confirm.
country_enc

array([[0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 0.]])

In [65]:
# Wrap the encoded array in a DataFrame. Column names come from the fitted
# encoder instead of a hard-coded list: with drop='first' the first category
# is discarded, so the kept ones are categories_[0][1:]. Reusing df.index
# keeps the rows aligned with df when the two frames are concatenated.
hot_enc_countrydf = pd.DataFrame(
    country_enc,
    columns=countryHotEnc.categories_[0][1:],
    index=df.index,
)

In [66]:
# Place the dummy columns in front of the existing columns (side-by-side
# concatenation aligned on the index).
df = pd.concat([hot_enc_countrydf, df], axis='columns')

In [67]:
# The original text column is redundant once its dummy columns exist.
# columns= already names the axis, so the extra axis=1 is dropped, and the
# result is reassigned instead of mutating the frame with inplace=True.
df = df.drop(columns=['country'])

In [68]:
# Show the frame: `country` is replaced by the italy/spain dummy columns.
df

Unnamed: 0,italy,spain,salary,age,happy
0,0.0,0.0,49000,35,1
1,0.0,1.0,10000,30,0
2,1.0,0.0,230000,39,0
3,0.0,1.0,200000,30,1
4,1.0,0.0,300000,30,1
5,0.0,1.0,31000,23,0
6,0.0,0.0,179000,34,1
7,0.0,1.0,400000,31,1
8,1.0,0.0,200000,29,0
9,1.0,0.0,340000,35,1


SCALING & NORMALIZATION

In [69]:
# Columns to scale: every column except the last one (`happy`).
cols = list(df.columns[:-1])

In [70]:
# Check which columns will be scaled.
cols

['italy', 'spain', 'salary', 'age']

In [71]:
# Standardize the selected columns to zero mean and unit variance.
# `cols` includes the italy/spain dummy columns, so they are scaled as well.
scaling = StandardScaler()
scaling.fit(df[cols])
df[cols] = scaling.transform(df[cols])

In [72]:
# Show the final frame: the four feature columns are standardized, `happy` is unchanged.
df

Unnamed: 0,italy,spain,salary,age,happy
0,-0.755929,-0.755929,-1.009405,0.911147,1
1,-0.755929,1.322876,-1.312226,-0.227787,0
2,1.322876,-0.755929,0.395997,1.822294,0
3,-0.755929,1.322876,0.163058,-0.227787,1
4,1.322876,-0.755929,0.939523,-0.227787,1
5,-0.755929,1.322876,-1.149169,-1.822294,0
6,-0.755929,-0.755929,0.0,0.68336,1
7,-0.755929,1.322876,1.715988,0.0,1
8,1.322876,-0.755929,0.163058,-0.455573,0
9,1.322876,-0.755929,1.250109,0.911147,1


In [1]:
pip install seaborn


Collecting seaborn
  Downloading seaborn-0.11.1-py3-none-any.whl (285 kB)
[K     |████████████████████████████████| 285 kB 3.5 MB/s 
Installing collected packages: seaborn
Successfully installed seaborn-0.11.1
Note: you may need to restart the kernel to use updated packages.
