# Machine Learning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [10]:
# Column names of the iris data set. The list is not passed to read_csv
# (the header row of the file is used); it is kept as a plain Python list
# so the feature and target column names can be derived from it.
names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]

# Path of the raw CSV file, relative to the notebook.
filename = '../datasets/iris.csv'
df = pd.read_csv(filepath_or_buffer=filename)

# Display the loaded frame as the cell output.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## Exploration

In [11]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [12]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(150, 5)

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [14]:
# List the distinct labels that occur in the 'species' column.
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

## Data cleaning / data munging

In [15]:
# Store the species labels as a pandas categorical instead of plain
# Python strings; all other columns keep their dtype.
df = df.astype({'species': 'category'})

# Show the converted column as the cell output.
df['species']

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [16]:
df.dtypes

sepal_length     float64
sepal_width      float64
petal_length     float64
petal_width      float64
species         category
dtype: object

## Split the data into a training set and a test set

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set and keep the remaining 70% for
# training. The split is random, so random_state is pinned: without it
# every "Restart & Run All" produces different train/test sets (and
# different exported CSV files), which makes results impossible to
# reproduce or compare between runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [25]:
# Display the training part of the split (pandas truncates long frames).
df_train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
84,5.4,3.0,4.5,1.5,versicolor
61,5.9,3.0,4.2,1.5,versicolor
81,5.5,2.4,3.7,1.0,versicolor
19,5.1,3.8,1.5,0.3,setosa
29,4.7,3.2,1.6,0.2,setosa
...,...,...,...,...,...
77,6.7,3.0,5.0,1.7,versicolor
134,6.1,2.6,5.6,1.4,virginica
70,5.9,3.2,4.8,1.8,versicolor
78,6.0,2.9,4.5,1.5,versicolor


In [26]:
# Display the test part of the split (pandas truncates long frames).
df_test

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
30,4.8,3.1,1.6,0.2,setosa
40,5.0,3.5,1.3,0.3,setosa
71,6.1,2.8,4.0,1.3,versicolor
5,5.4,3.9,1.7,0.4,setosa
83,6.0,2.7,5.1,1.6,versicolor
18,5.7,3.8,1.7,0.3,setosa
53,5.5,2.3,4.0,1.3,versicolor
12,4.8,3.0,1.4,0.1,setosa
103,6.3,2.9,5.6,1.8,virginica
67,5.8,2.7,4.1,1.0,versicolor


## Preparation

In [27]:
# The last entry of `names` is the target column; every entry before it
# is a feature column. Star-unpacking yields the features as a list.
*feature_names, target_name = names

In [30]:
# Print which columns are used as features and which one is the target.
# The second label used to be misspelled as 'Targert: '.
print('Features: ', feature_names)
print('Target: ', target_name)

Features:  ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
Targert:  species


In [None]:
# Separate each split into its feature columns (a DataFrame, selected with
# the list of feature names) and its target column (a Series, selected
# with the single target name).
df_train_features, df_train_target = df_train[feature_names], df_train[target_name]
df_test_features, df_test_target = df_test[feature_names], df_test[target_name]

## Scaling numeric features

In [36]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only. fit() hands back the
# estimator itself, so construction and fitting can be chained.
scaler = StandardScaler().fit(df_train_features)

# Show the fitted estimator as the cell output.
scaler

StandardScaler()

In [42]:
# Apply the same fitted scaler to both splits, training features first,
# then test features.
train_features_scaled, test_features_scaled = (
    scaler.transform(features)
    for features in (df_train_features, df_test_features)
)

In [43]:
# Wrap the scaled training values in a DataFrame again: each column gets
# the original feature name plus a '_scaled' suffix, and the row index of
# the unscaled training features is reused so rows stay aligned.
df_train_features_scaled = pd.DataFrame(
    data=train_features_scaled,
    index=df_train_features.index,
    columns=[f'{name}_scaled' for name in feature_names],
)

In [44]:
# Wrap the scaled test values in a DataFrame again: each column gets the
# original feature name plus a '_scaled' suffix, and the row index of the
# unscaled test features is reused so rows stay aligned.
df_test_features_scaled = pd.DataFrame(
    data=test_features_scaled,
    index=df_test_features.index,
    columns=[f'{name}_scaled' for name in feature_names],
)

In [49]:
# Rebuild the training frame column-wise: raw features, then their scaled
# counterparts, then the target column. Note that this rebinds `df_train`.
df_train = pd.concat(
    objs=[df_train_features, df_train_features_scaled, df_train_target],
    axis='columns',
)

In [50]:
# Rebuild the test frame column-wise: raw features, then their scaled
# counterparts, then the target column. Note that this rebinds `df_test`.
df_test = pd.concat(
    objs=[df_test_features, df_test_features_scaled, df_test_target],
    axis='columns',
)

In [51]:
# Persist both prepared splits next to the raw data. to_csv is called
# with its default options, so existing files at these paths are
# overwritten.
df_train.to_csv(path_or_buf='../datasets/iris_training_dataset.csv')
df_test.to_csv(path_or_buf='../datasets/iris_test_dataset.csv')

In [52]:
# Shell escape: list the datasets directory to check that the two CSV files were written.
!ls ../datasets

[34mWijkBuurtkaart_2019_v2up[m[m     iris.csv
WijkBuurtkaart_2019_v2up.zip iris_test_dataset.csv
ca-500.csv                   iris_training_dataset.csv
etmgeg_260.txt               test_data.csv
etmgeg_279.txt               train_data.csv
