In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("./artifacts/CleanupData.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   State             10 non-null     object 
 1   Age               9 non-null      float64
 2   Pocket Money      9 non-null      float64
 3   Course Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


### Creation of independent matrix

In [3]:
X = df.iloc[:,:-1].values
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, nan],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', nan, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

### Creation of Independent matrix

In [4]:
y = df.iloc[:, 3].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Using scikit-learn to Impute NaN values

In [5]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.NaN, strategy="mean")

In [6]:
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, 6377.777777777777],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', 28.77777777777778, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

### Handling Categorical Data

In [7]:
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()

In [8]:
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
X

array([[1, 34.0, 7200.0],
       [2, 17.0, 4800.0],
       [0, 20.0, 5400.0],
       [2, 28.0, 6100.0],
       [0, 30.0, 6377.777777777777],
       [1, 25.0, 5800.0],
       [2, 28.77777777777778, 5200.0],
       [1, 38.0, 7900.0],
       [0, 40.0, 8300.0],
       [1, 27.0, 6700.0]], dtype=object)

### Numbers as Label can be deceptive for model as 0 < 1 < 2, So Dummy matrix has to be created

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [10]:
column_transformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder = 'passthrough')
X = np.array(column_transformer.fit_transform(X), dtype = np.float64)

In [11]:
np.set_printoptions(suppress=True)
X

array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ]])

In [12]:
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

### Prepare Test and training dataset
### Split data into 2 parts, one for training and other for testing

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
X_train

array([[   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778]])

In [15]:
y_train

array([0, 1, 1, 1, 0, 0, 0, 1])

### Feature Scaling (If required)

In [16]:
from sklearn.preprocessing import StandardScaler
scale_X = StandardScaler()

X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)

In [17]:
X_train

array([[ 1.29099445, -0.77459667, -0.57735027,  1.69258297,  1.74283999],
       [-0.77459667,  1.29099445, -0.57735027,  1.36717316,  1.3614282 ],
       [-0.77459667,  1.29099445, -0.57735027, -0.4225808 ,  0.21719283],
       [-0.77459667,  1.29099445, -0.57735027, -0.74799061, -0.6409837 ],
       [-0.77459667, -0.77459667,  1.73205081, -0.13332763, -1.21310139],
       [-0.77459667, -0.77459667,  1.73205081, -0.25987589, -0.35492486],
       [ 1.29099445, -0.77459667, -0.57735027, -1.56151513, -1.0223955 ],
       [ 1.29099445, -0.77459667, -0.57735027,  0.06553392, -0.09005556]])

In [18]:
X_test

array([[-0.77459667,  1.29099445, -0.57735027,  0.71635354,  0.69395756],
       [-0.77459667, -0.77459667,  1.73205081, -2.04962985, -1.59451318]])