In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
data = pd.read_csv('datas.csv',sep=',')

In [3]:
data.head()

Unnamed: 0,country,height,weight,age,gender
0,tr,130,30.0,10,e
1,tr,125,36.0,11,e
2,tr,135,34.0,10,k
3,tr,133,30.0,9,k
4,tr,129,38.0,12,e


In [4]:
height = data[['height']]
height.head()

Unnamed: 0,height
0,130
1,125
2,135
3,133
4,129


## Data Preprocessing

### Missing Values

In [5]:
#https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

##### strategy 1 : mean

In [6]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')   

In [7]:
weight = data.iloc[:,1:4].values
weight

array([[130.,  30.,  10.],
       [125.,  36.,  11.],
       [135.,  34.,  10.],
       [133.,  30.,   9.],
       [129.,  38.,  12.],
       [180.,  90.,  30.],
       [190.,  80.,  25.],
       [175.,  90.,  35.],
       [177.,  nan,  22.],
       [185., 105.,  33.],
       [165.,  55.,  27.],
       [155.,  50.,  44.],
       [160.,  58.,  39.],
       [162.,  59.,  41.],
       [167.,  62.,  55.],
       [174.,  70.,  47.],
       [193.,  90.,  23.],
       [187.,  80.,  27.],
       [183.,  nan,  28.],
       [159.,  40.,  29.],
       [164.,  66.,  32.],
       [166.,  56.,  42.]])

In [8]:
imp_mean = imp_mean.fit(weight)
imp_mean

SimpleImputer()

In [9]:
weight_mean = imp_mean.transform(weight)
weight_mean

array([[130.  ,  30.  ,  10.  ],
       [125.  ,  36.  ,  11.  ],
       [135.  ,  34.  ,  10.  ],
       [133.  ,  30.  ,   9.  ],
       [129.  ,  38.  ,  12.  ],
       [180.  ,  90.  ,  30.  ],
       [190.  ,  80.  ,  25.  ],
       [175.  ,  90.  ,  35.  ],
       [177.  ,  60.95,  22.  ],
       [185.  , 105.  ,  33.  ],
       [165.  ,  55.  ,  27.  ],
       [155.  ,  50.  ,  44.  ],
       [160.  ,  58.  ,  39.  ],
       [162.  ,  59.  ,  41.  ],
       [167.  ,  62.  ,  55.  ],
       [174.  ,  70.  ,  47.  ],
       [193.  ,  90.  ,  23.  ],
       [187.  ,  80.  ,  27.  ],
       [183.  ,  60.95,  28.  ],
       [159.  ,  40.  ,  29.  ],
       [164.  ,  66.  ,  32.  ],
       [166.  ,  56.  ,  42.  ]])

#### strategy 2 : most_frequent

In [10]:
imp_freq = SimpleImputer(strategy="most_frequent")

In [11]:
imp_freq = imp_freq.fit(weight)
weight_feq = imp_freq.transform(weight)
weight_feq

array([[130.,  30.,  10.],
       [125.,  36.,  11.],
       [135.,  34.,  10.],
       [133.,  30.,   9.],
       [129.,  38.,  12.],
       [180.,  90.,  30.],
       [190.,  80.,  25.],
       [175.,  90.,  35.],
       [177.,  90.,  22.],
       [185., 105.,  33.],
       [165.,  55.,  27.],
       [155.,  50.,  44.],
       [160.,  58.,  39.],
       [162.,  59.,  41.],
       [167.,  62.,  55.],
       [174.,  70.,  47.],
       [193.,  90.,  23.],
       [187.,  80.,  27.],
       [183.,  90.,  28.],
       [159.,  40.,  29.],
       [164.,  66.,  32.],
       [166.,  56.,  42.]])

#### strategy 3 : knn

In [12]:
from sklearn.impute import KNNImputer

In [13]:
imp_knn = KNNImputer(n_neighbors=2, weights="uniform")

In [14]:
imp_knn = imp_knn.fit(weight)
weight_knn = imp_knn.transform(weight)
weight_knn

array([[130.,  30.,  10.],
       [125.,  36.,  11.],
       [135.,  34.,  10.],
       [133.,  30.,   9.],
       [129.,  38.,  12.],
       [180.,  90.,  30.],
       [190.,  80.,  25.],
       [175.,  90.,  35.],
       [177.,  85.,  22.],
       [185., 105.,  33.],
       [165.,  55.,  27.],
       [155.,  50.,  44.],
       [160.,  58.,  39.],
       [162.,  59.,  41.],
       [167.,  62.,  55.],
       [174.,  70.,  47.],
       [193.,  90.,  23.],
       [187.,  80.,  27.],
       [183.,  85.,  28.],
       [159.,  40.,  29.],
       [164.,  66.,  32.],
       [166.,  56.,  42.]])

In [15]:
#https://scikit-learn.org/stable/modules/impute.html#impute

#### Types of Data


![Image of Yaktocat](https://d1m75rqqgidzqn.cloudfront.net/wp-data/2022/06/01113127/types-of-data--1024x555.png)


##### Qualitative or Categorical Data
Nominal Data
* plate codes
* gender

Ordinal Data
* low, medium and high.


##### Quantitative or numerical Data
* shoe size
* exam


### Encoder

* Because some models don't work with categorical variable
https://towardsdatascience.com/all-about-categorical-variable-encoding-305f3361fd02


In [16]:
country = data.iloc[:,0:1].values
country

array([['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['us'],
       ['us'],
       ['us'],
       ['us'],
       ['us'],
       ['us'],
       ['fr'],
       ['fr'],
       ['fr'],
       ['fr'],
       ['fr'],
       ['fr'],
       ['fr']], dtype=object)

#### Label Encoding

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [19]:
country[:,0] = le.fit_transform(country[:,0])
country

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0]], dtype=object)

#### One Hot Encoding

In [20]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
country = ohe.fit_transform(country).toarray()
country

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

** dummy variable

### TRAIN-TEST SPLIT

In [21]:
#country_df = pd.DataFrame(data = country , index=range(country.shape[0]), columns=data.country.unique())
#country_df

In [22]:
x_data = data.iloc[:,1:-1]
country_df = pd.DataFrame(data = country , index=range(country.shape[0]), columns=data.country.unique())
x_data = pd.concat([country_df,x_data],axis=1)
x_data.head()

Unnamed: 0,tr,us,fr,height,weight,age
0,0.0,1.0,0.0,130,30.0,10
1,0.0,1.0,0.0,125,36.0,11
2,0.0,1.0,0.0,135,34.0,10
3,0.0,1.0,0.0,133,30.0,9
4,0.0,1.0,0.0,129,38.0,12


In [23]:
y_data = data.iloc[:,-1]
data2.head()

0    e
1    e
2    k
3    k
4    e
Name: gender, dtype: object

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=0.33, random_state=0)


In [None]:
x_train,y_train

### Standart Scaler

In [None]:
from sklearn.preprocessing import StandardScaler


#### standart scaler

In [None]:
sc = StandardScaler()
X_train = sc.fit_transform(x_train)
X_test = sc.fit_transform(x_test)


In [None]:
X_test

#### min-max scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X_train_m = sc.fit_transform(x_train)
X_test_m = sc.fit_transform(x_test)
X_test_m

### Normalization

In [None]:
from sklearn import preprocessing

In [None]:
normalized_arr = preprocessing.normalize(weight_knn)
print(normalized_arr)

#### Dummy Variable
The Dummy Variable Trap occurs when two or more dummy variables created by one-hot encoding are highly correlated (multi-collinear). This means that one variable can be predicted from the others, making it difficult to interpret predicted coefficient variables in regression models. In other words, the individual effect of the dummy variables on the prediction model can not be interpreted well because of multicollinearity.

Using the one-hot encoding method, a new dummy variable is created for each categorical variable to represent the presence (1) or absence (0) of the categorical variable.

In [24]:
import pandas as pd
c1 = ['pine', 'oak', 'oak', 'pine', 'pine' ]
pd.get_dummies(c1)

Unnamed: 0,oak,pine
0,0,1
1,1,0
2,1,0
3,0,1
4,0,1


In [25]:
pd.get_dummies(c1, drop_first=True)

Unnamed: 0,pine
0,1
1,0
2,0
3,1
4,1


### p-value

In [None]:
if p < 0.05 h0 h is false