# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Read the raw table; the path is resolved relative to the working directory.
dataset = pd.read_csv("Data.csv")

# Positional split: x holds every column except the last, y holds the last column.
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values


In [None]:
# Show the feature matrix followed by the target vector, one per line.
print(x, y, sep="\n")

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer

# Replace each NaN in columns 1 and 2 of x with the mean of that column.
# NOTE(review): the means are computed over all rows, before the train/test split
# performed later in this notebook, so rows that end up in the test set also
# contribute to the imputed values.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
x[:, 1:3] = imputer.fit(x[:, 1:3]).transform(x[:, 1:3])

In [None]:
# Inspect x after imputation: columns 1 and 2 should no longer contain nan.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 of x and pass every other column through unchanged.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
)

# Wrap the transformer output in a NumPy array and rebind x to it.
x = np.array(ct.fit_transform(x))


In [None]:
# Inspect x after encoding: the one-hot columns come first, then the passthrough columns.
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the string class labels in y to integer codes; `le` keeps the fitted mapping.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Inspect y after label encoding (integer codes instead of strings).
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Show the four pieces of the split in order: train features, test features,
# train targets, test targets.
print(X_train, X_test, y_train, y_test, sep="\n")

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
[0 1 0 0 1 1 0 1]
[0 1]


## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize only the numeric features. After the one-hot encoding step the
# encoded columns occupy positions 0-2 and the passthrough numeric columns
# start at index 3, so the slice must be [:, 3:]. The previous slice [:, 1:3]
# rescaled two of the 0/1 dummy columns and left the numeric columns unscaled.
sc = StandardScaler()

# Learn mean / standard deviation from the training rows only ...
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
# ... and reuse those same statistics for the test rows (no refit on test data).
X_test[:, 3:] = sc.transform(X_test[:, 3:])

print(X_train)
print(X_test)

[[0.0 -0.5773502691896258 1.2909944487358056 38.77777777777778 52000.0]
 [0.0 1.7320508075688774 -0.7745966692414834 40.0 63777.77777777778]
 [1.0 -0.5773502691896258 -0.7745966692414834 44.0 72000.0]
 [0.0 -0.5773502691896258 1.2909944487358056 38.0 61000.0]
 [0.0 -0.5773502691896258 1.2909944487358056 27.0 48000.0]
 [1.0 -0.5773502691896258 -0.7745966692414834 48.0 79000.0]
 [0.0 1.7320508075688774 -0.7745966692414834 50.0 83000.0]
 [1.0 -0.5773502691896258 -0.7745966692414834 35.0 58000.0]]
[[0.0 1.7320508075688774 -0.7745966692414834 30.0 54000.0]
 [1.0 -0.5773502691896258 -0.7745966692414834 37.0 67000.0]]


In [None]:
# Second example: standardize every feature column of winequality-red.csv.
# NOTE: this cell rebinds y, X_train, X_test, y_train, y_test and sc, replacing
# the values produced by the cells above.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The file uses ';' as its field separator.
df = pd.read_csv('winequality-red.csv', delimiter=';')

# Positional split: X is every column except the last, y is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Fit the scaler on the training rows only, then apply the same statistics to
# the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print(X_train, X_test, sep="\n")


[[-0.73307913  0.6648928  -1.25704443 ...  0.98846046  0.0630946
  -0.87223395]
 [ 1.06774091 -0.62346154  1.52314768 ... -1.7535127  -0.17390392
  -0.77978452]
 [-1.74604041 -1.07158479 -1.35814232 ...  2.32756363  0.77409018
   3.28799021]
 ...
 [-0.95818164  1.08500835 -0.90320179 ...  0.22325865 -1.00339876
   0.32960859]
 [-0.62052788  0.55286199 -1.35814232 ...  0.35079228 -0.47015208
  -1.33448108]
 [ 0.44870902 -0.73549236  1.16930505 ... -0.6694768   0.18159387
   1.90124882]]
[[ 0.27988214 -0.67947695  1.87699031 ...  1.43482818  0.00384497
   0.05226031]
 [ 0.22360652  0.55286199  0.05722821 ... -0.35064271 -0.17390392
  -0.22508797]
 [ 1.18029216 -1.07158479  1.57369663 ... -0.79701044  0.2408435
   0.79185571]
 ...
 [-0.67680351  0.77692361 -1.35814232 ...  0.79716001 -0.7071506
  -0.96468337]
 [-1.18328414 -0.79150776  0.81546242 ...  0.86092682  1.24808723
   0.69940629]
 [ 1.4616703  -1.18361561  1.27040294 ... -0.22310908  0.65559092
   0.97675456]]
