<a href="https://colab.research.google.com/github/dmachlanski/ncrm-causality-2021/blob/main/Day_1_Intro_to_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Day 1 - Introduction to Machine Learning

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix

## Regression

In [23]:
arr_housing = np.loadtxt("https://raw.githubusercontent.com/dmachlanski/ncrm-causality-2021/main/data/housing.data")

In [24]:
arr_housing.shape

(506, 14)

In [25]:
pd_housing = pd.DataFrame(arr_housing)
pd_housing.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [27]:
housing_x = housing[:, :-1]
housing_y = housing[:, -1]

housing_x_train, housing_x_test, housing_y_train, housing_y_test = train_test_split(housing_x, housing_y, test_size=0.3)

print(housing_x_train.shape)
print(housing_y_train.shape)
print(housing_x_test.shape)
print(housing_y_test.shape)

(354, 13)
(354,)
(152, 13)
(152,)


In [29]:
lr = LinearRegression()
lr.fit(housing_x_train, housing_y_train)

y_pred_train = lr.predict(housing_x_train)
y_pred_test = lr.predict(housing_x_test)

lr_mse_train = mean_squared_error(housing_y_train, y_pred_train)
lr_mse_test = mean_squared_error(housing_y_test, y_pred_test)

print(lr_mse_train)
print(lr_mse_test)

22.55645492042014
21.442038036541465


In [30]:
dt = DecisionTreeRegressor()
dt.fit(housing_x_train, housing_y_train)

y_pred_train = dt.predict(housing_x_train)
y_pred_test = dt.predict(housing_x_test)

dt_mse_train = mean_squared_error(housing_y_train, y_pred_train)
dt_mse_test = mean_squared_error(housing_y_test, y_pred_test)

print(dt_mse_train)
print(dt_mse_test)

0.0
21.88059210526316


In [31]:
rf = RandomForestRegressor()
rf.fit(housing_x_train, housing_y_train)

y_pred_train = rf.predict(housing_x_train)
y_pred_test = rf.predict(housing_x_test)

rf_mse_train = mean_squared_error(housing_y_train, y_pred_train)
rf_mse_test = mean_squared_error(housing_y_test, y_pred_test)

print(rf_mse_train)
print(rf_mse_test)

1.1777602768361577
13.959


In [37]:
params = {"max_leaf_nodes": [2, 5, 10, 20, 30], "max_depth": [2, 5, 10, 20, None], 'n_estimators': [10, 100, 200]}
rf = RandomForestRegressor()
rf_cv = GridSearchCV(rf, param_grid=params, cv=5, n_jobs=-1)

rf_cv.fit(housing_x_train, housing_y_train)

print(rf_cv.best_params_)

y_pred_train = rf_cv.predict(housing_x_train)
y_pred_test = rf_cv.predict(housing_x_test)

rf_cv_mse_train = mean_squared_error(housing_y_train, y_pred_train)
rf_cv_mse_test = mean_squared_error(housing_y_test, y_pred_test)

print(rf_cv_mse_train)
print(rf_cv_mse_test)

{'max_depth': None, 'max_leaf_nodes': 30, 'n_estimators': 200}
3.046443497455265
14.734961572716294


## Classification

In [45]:
df_diab = pd.read_csv("https://raw.githubusercontent.com/dmachlanski/ncrm-causality-2021/main/data/pima-indians-diabetes.csv", delimiter=',', header=None)

In [46]:
df_diab.shape

(768, 9)

In [48]:
df_diab.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [49]:
np.unique(df_diab[8])

array([0, 1])

In [None]:
# TODO:
# - check class balance (plot)
# - train models
# - use classification_report (and maybe confusion_matrix? - plot)