In [79]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix 

# Data import en opschonen

In [51]:
# Load the raw training data; the CSV is semicolon-separated.
train = pd.read_csv("../data/train.csv", sep=";")

In [52]:
# Summarise the loaded frame: row count, column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7639 entries, 0 to 7638
Data columns (total 92 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   target         7639 non-null   int64  
 1   v2a1           7639 non-null   float64
 2   hacdor         7639 non-null   int64  
 3   rooms          7639 non-null   int64  
 4   hacapo         7639 non-null   int64  
 5   v14a           7639 non-null   int64  
 6   refrig         7639 non-null   int64  
 7   v18q1          7639 non-null   int64  
 8   r4h1           7639 non-null   int64  
 9   r4h2           7639 non-null   int64  
 10  r4m1           7639 non-null   int64  
 11  r4m2           7639 non-null   int64  
 12  escolari       7639 non-null   int64  
 13  pared1         7639 non-null   int64  
 14  pared2         7639 non-null   int64  
 15  pared3         7639 non-null   int64  
 16  pared4         7639 non-null   int64  
 17  pared5         7639 non-null   int64  
 18  pared6  

In [53]:
# Peek at the first five rows to sanity-check the parsed values.
train.head(n=5)

Unnamed: 0,target,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q1,r4h1,r4h2,...,computer,television,qmobilephone,lugar1,lugar2,lugar3,lugar4,lugar5,rural,age
0,0,0.0,0,4,0,1,1,0,0,1,...,0,0,3,1,0,0,0,0,1,43
1,0,0.0,0,8,0,1,1,1,0,1,...,0,0,3,0,0,0,0,0,1,18
2,0,0.0,0,5,0,1,1,0,0,2,...,0,1,2,0,0,1,0,0,0,62
3,1,0.0,0,8,0,1,1,2,0,2,...,1,1,4,1,0,0,0,0,0,20
4,0,350000.0,0,5,0,1,1,3,1,1,...,1,0,3,1,0,0,0,0,0,3


In [54]:
# "overcrowding" and "meaneduc" are handled as text with a decimal comma
# (e.g. "1,5"): swap the comma for a point and cast to float.
# The dtype guard keeps this cell re-runnable -- once a column is already
# numeric the `.str` accessor would raise, so it is simply skipped.
for _col in ("overcrowding", "meaneduc"):
    if not pd.api.types.is_numeric_dtype(train[_col]):
        # regex=False: the comma is a literal character, not a pattern.
        train[_col] = train[_col].str.replace(",", ".", regex=False).astype("float")

In [55]:
# Hold out a random 20% of the rows as the test set; the remaining rows
# stay in `train`. The fixed seed makes the split -- and therefore every
# score computed below -- reproducible across kernel restarts.
# NOTE: this cell overwrites `train`; re-running it without reloading the
# data carves another 20% out of the already reduced training set.
test = train.sample(frac=0.2, random_state=42)
train = train.drop(index=test.index)

# Random Forest trainen

In [56]:
# Separate the label column from the feature columns of the training rows.
Y = train["target"]
X = train.drop(columns=["target"])

In [57]:
# Baseline model: 100 trees, each capped at depth 40, fitted on the
# training features/labels. `random_state` pins bootstrap sampling and
# feature selection so the baseline score is the same on every run.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=40,
    random_state=42,
).fit(X, Y)

In [58]:
# Mean accuracy of the baseline forest on the held-out rows.
clf.score(
    X=test.drop(columns=["target"]),
    y=test["target"],
)

0.9358638743455497

# Hyperparameter tuning

In [88]:
# Preview the coarse n_estimators candidates: 50 up to (not including) 400
# in steps of 10 -- the same range the initial parameter grid uses.
np.arange(50, 400, 10)

array([ 50,  60,  70,  80,  90, 100, 110, 120, 130, 140, 150, 160, 170,
       180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300,
       310, 320, 330, 340, 350, 360, 370, 380, 390])

In [101]:
# Build the parameter grid for the grid search.
#
# First pass (no earlier result available): a coarse grid over
# n_estimators and max_depth.
# Later passes: a fine, step-1 grid in a window around the best parameters
# found by the previous search, which the reporting cell stores in
# `current_best_params`.
#
# The name is looked up via globals() so this cell also works on a fresh
# kernel, where `current_best_params` has not been assigned yet.
_previous_best = globals().get("current_best_params")

if _previous_best:
    _best_n = _previous_best["n_estimators"]
    _best_depth = _previous_best["max_depth"]
    param_grid = [
        {
            # Half-open window [best - 5, best + 5): ten candidates per
            # parameter. The lower bound is clamped to 1 because both
            # parameters must be positive integers.
            "n_estimators": np.arange(max(1, _best_n - 5), _best_n + 5, 1),
            "max_depth": np.arange(max(1, _best_depth - 5), _best_depth + 5, 1),
        },
    ]
else:
    param_grid = [
        {
            "n_estimators": np.arange(50, 400, 10),
            "max_depth": np.arange(5, 80, 5),
        },
    ]

In [102]:
# Cross-validated grid search over `param_grid`.
# - The estimator is seeded so candidates are compared on identical forest
#   randomness and the selected parameters are reproducible between runs.
# - refit=True retrains the best candidate so `grid.predict` can be used
#   directly afterwards.
# - verbose=3 logs progress; n_jobs=-1 runs the fits in parallel on all cores.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    refit=True,
    verbose=3,
    n_jobs=-1,
)

In [103]:
# Features and labels of the training rows, used to fit the grid search.
Y_train = train["target"]
X_train = train.drop(columns=["target"])

In [104]:
# Features and labels of the held-out rows, used only for evaluation.
Y_test = test["target"]
X_test = test.drop(columns=["target"])

In [105]:
# Run the grid search: every candidate in `param_grid` is cross-validated
# on the training rows.
grid.fit(X=X_train, y=Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [106]:
# Keep the winning configuration around: the grid-building cell reads
# `current_best_params` to narrow the search when it is run again.
current_best_params = grid.best_params_
print(current_best_params)

# Score the tuned model on the held-out rows and print the per-class
# precision / recall / F1 report.
grid_predictions = grid.predict(X_test)
print(classification_report(Y_test, grid_predictions))

{'max_depth': 43, 'n_estimators': 320}
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       973
           1       0.96      0.86      0.90       555

    accuracy                           0.93      1528
   macro avg       0.94      0.92      0.93      1528
weighted avg       0.94      0.93      0.93      1528

