# Data Preprocessing

## Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the Data

In [2]:
# Load the raw data set from the CSV file and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer='eksikveriler.csv')
dataset.head(n=5)

Unnamed: 0,ulke,boy,kilo,yas,cinsiyet
0,tr,130,30,10.0,e
1,tr,125,36,11.0,e
2,tr,135,34,10.0,k
3,tr,133,30,9.0,k
4,tr,129,38,12.0,e


#### Checking if there are missing values in the dataset

In [3]:
# Count the missing (NaN) entries in every column of the raw data.
dataset.isna().sum()

ulke        0
boy         0
kilo        0
yas         2
cinsiyet    0
dtype: int64

In [4]:
# Numeric block used for missing-value handling. Despite its name, `yas` holds
# three columns (positions 1-3 of `dataset`): boy, kilo and yas.
# An explicit copy is taken because a later cell writes into this array in place.
yas = dataset.iloc[:, 1:4].to_numpy(copy=True)
# Country column (position 0), kept two-dimensional for the label and one-hot
# encoders. It is copied as well, so that encoding it in place can neither write
# through to `dataset` nor fail on a read-only view of the frame.
ulke = dataset.iloc[:, 0:1].to_numpy(copy=True)     # numpy.ndarray, shape (n_rows, 1)

## Handling Missing Data

In [5]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of the column it belongs to.
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean")

# `yas` already contains exactly the three numeric columns (boy, kilo, yas), so
# the whole array is imputed. The previous slice `1:4` pointed past the last
# column and silently left column 0 (boy) out of the imputation.
yas[:, :] = imputer.fit_transform(yas)
print(yas)

[[130.    30.    10.  ]
 [125.    36.    11.  ]
 [135.    34.    10.  ]
 [133.    30.     9.  ]
 [129.    38.    12.  ]
 [180.    90.    30.  ]
 [190.    80.    25.  ]
 [175.    90.    35.  ]
 [177.    60.    22.  ]
 [185.   105.    33.  ]
 [165.    55.    27.  ]
 [155.    50.    44.  ]
 [160.    58.    28.45]
 [162.    59.    41.  ]
 [167.    62.    55.  ]
 [174.    70.    47.  ]
 [193.    90.    28.45]
 [187.    80.    27.  ]
 [183.    88.    28.  ]
 [159.    40.    29.  ]
 [164.    66.    32.  ]
 [166.    56.    42.  ]]


## Label Encoding the Categorical Data

In [6]:
from sklearn.preprocessing import LabelEncoder

# Map every country name in the first column to an integer code.
le = LabelEncoder()
# Fit once and reuse the codes. Fitting a second time was redundant and, when
# `ulke` shares memory with `dataset`, refitted the encoder on the already
# encoded integers, so `le.classes_` no longer held the original labels.
ulke_codes = le.fit_transform(dataset.iloc[:, 0])
ulke[:, 0] = ulke_codes
# NOTE(review): this overwrites the raw country column of `dataset` in place;
# it is kept because cells further down may rely on the encoded column.
dataset.iloc[:, 0] = ulke_codes
print(ulke)

[[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]


## One-Hot Encoding the Label-Encoded Data (Creating Dummy Variables)

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Expand the single column of integer country codes into one indicator column
# per country and convert the encoder's sparse result to a dense array.
# NOTE: `ulke` is rebound to the encoded matrix, so re-running this cell without
# re-running the cells above would encode the already encoded matrix.
ohe = OneHotEncoder()
ohe.fit(ulke)
ulke = ohe.transform(ulke).toarray()
print(ulke)

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


## Merging the Data into DataFrames

In [8]:
# Wrap the one-hot country matrix in a DataFrame. The row index is derived from
# the array itself instead of a hard-coded 22, so the cell keeps working when
# the data set changes size.
# NOTE(review): the column labels are assumed to match the encoder's category
# order (fr, tr, us) -- confirm if the set of countries ever changes.
ulkeDataframe = pd.DataFrame(data = ulke, index = range(len(ulke)), columns = ['fr', 'tr', 'us'])
ulkeDataframe.head()

Unnamed: 0,fr,tr,us
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0


In [9]:
# Wrap the imputed numeric array (boy, kilo, yas) in a DataFrame. The row index
# is derived from the array length instead of a hard-coded 22.
boykiloyasDataframe = pd.DataFrame(data = yas, index = range(len(yas)), columns = ['boy', 'kilo', 'yas'])
boykiloyasDataframe.head()

Unnamed: 0,boy,kilo,yas
0,130.0,30.0,10.0
1,125.0,36.0,11.0
2,135.0,34.0,10.0
3,133.0,30.0,9.0
4,129.0,38.0,12.0


In [10]:
# Dependent variable: the last column of the data set (cinsiyet).
cinsiyet = dataset.iloc[:, -1].values
# The row index is derived from the array length instead of a hard-coded 22.
cinsiyetDataframe = pd.DataFrame(data = cinsiyet, index = range(len(cinsiyet)), columns = ['cinsiyet'])
cinsiyetDataframe.head()

Unnamed: 0,cinsiyet
0,e
1,e
2,k
3,k
4,e


In [11]:
# Independent variables: the one-hot country columns placed side by side with
# the numeric boy / kilo / yas columns.
s = pd.concat(objs = [ulkeDataframe, boykiloyasDataframe], axis = "columns")
s.head()

Unnamed: 0,fr,tr,us,boy,kilo,yas
0,0.0,1.0,0.0,130.0,30.0,10.0
1,0.0,1.0,0.0,125.0,36.0,11.0
2,0.0,1.0,0.0,135.0,34.0,10.0
3,0.0,1.0,0.0,133.0,30.0,9.0
4,0.0,1.0,0.0,129.0,38.0,12.0


In [12]:
# Full table: the independent variables followed by the dependent `cinsiyet` column.
s2 = pd.concat(objs = [s, cinsiyetDataframe], axis = "columns")
s2.head()

Unnamed: 0,fr,tr,us,boy,kilo,yas,cinsiyet
0,0.0,1.0,0.0,130.0,30.0,10.0,e
1,0.0,1.0,0.0,125.0,36.0,11.0,e
2,0.0,1.0,0.0,135.0,34.0,10.0,k
3,0.0,1.0,0.0,133.0,30.0,9.0,k
4,0.0,1.0,0.0,129.0,38.0,12.0,e


## Splitting the Dataset into Training Set and Test Set

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 33 % of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    s,
    cinsiyetDataframe,
    test_size = 0.33,
    random_state = 0,
)

## Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Learn the per-column mean and standard deviation from the training rows only.
X_train = sc.fit_transform(x_train)
# Apply those same training statistics to the test rows. Calling fit_transform
# here would refit the scaler on the test set, putting train and test on
# different scales and leaking test-set statistics into the preprocessing.
X_test = sc.transform(x_test)

In [15]:
# Show the scaled training features produced by the StandardScaler above
# (one row per training sample, one column per feature of `s`).
print(X_train)

[[-0.63245553  0.8660254  -0.40824829  0.45049444 -0.29657884 -0.24717129]
 [-0.63245553  0.8660254  -0.40824829  1.00824945  0.5096549   0.03416189]
 [ 1.58113883 -1.15470054 -0.40824829  1.13696215  0.91277178  0.35769504]
 [-0.63245553  0.8660254  -0.40824829 -1.6089087  -1.18343596 -1.18494855]
 [-0.63245553  0.8660254  -0.40824829 -1.35148331 -1.34468271 -1.372504  ]
 [-0.63245553  0.8660254  -0.40824829  0.57920713  0.91277178  0.50305051]
 [ 1.58113883 -1.15470054 -0.40824829  0.87953676  0.5096549   0.22171734]
 [-0.63245553 -1.15470054  2.44948974  0.79372829  1.51744708  0.78438369]
 [-0.63245553  0.8660254  -0.40824829  0.36468597  0.91277178  0.97193914]
 [ 1.58113883 -1.15470054 -0.40824829  0.70791983  0.8321484   0.31549506]
 [-0.63245553  0.8660254  -0.40824829 -1.43729177 -1.50592946 -1.46628173]
 [-0.63245553  0.8660254  -0.40824829 -1.56600447 -1.50592946 -1.372504  ]
 [ 1.58113883 -1.15470054 -0.40824829  0.32178174  0.10653803  2.09727185]
 [-0.63245553 -1.15470054