In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Import the presplit data
X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40095 entries, 0 to 40094
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     40095 non-null  int64  
 1   amount_tsh             40095 non-null  float64
 2   funder                 40095 non-null  object 
 3   gps_height             40095 non-null  int64  
 4   installer              40095 non-null  object 
 5   longitude              40095 non-null  float64
 6   latitude               40095 non-null  float64
 7   num_private            40095 non-null  int64  
 8   basin                  40095 non-null  object 
 9   region                 40095 non-null  object 
 10  region_code            40095 non-null  int64  
 11  district_code          40095 non-null  int64  
 12  population             40095 non-null  int64  
 13  public_meeting         40095 non-null  object 
 14  scheme_management      40095 non-null  object 
 15  pe

In [4]:
X_train.drop('id', axis = 1, inplace=True)
X_test.drop('id', axis = 1, inplace=True)

In [5]:
# X_train = pd.get_dummies(X_train, drop_first=True)

# X_test = pd.get_dummies(X_test, drop_first=True)

In [6]:
y_train.drop('id', inplace=True, axis = 1)
y_test.drop('id', axis = 1, inplace=True)

In [7]:
y_train = np.array(y_train)
y_train = y_train.reshape(40095, )

In [8]:
y_test = np.array(y_test)
y_test = y_test.reshape(13365, )

In [9]:
rfc = RandomForestClassifier()
ss = StandardScaler()

X_train_nums = X_train.select_dtypes(exclude=object)
X_test_nums = X_test.select_dtypes(exclude=object)

X_train_nums_scaled = ss.fit_transform(X_train_nums)
X_test_nums_scaled = ss.transform(X_test_nums)

In [10]:
rfr.fit(X_train_nums_scaled, y_train)

preds = rfr.predict(X_test_nums_scaled)

In [11]:
train_scores = rfc.score(X_train_nums_scaled, y_train)
test_scores = rfc.score(X_test_nums_scaled, y_test)
print('The Mode\'s accuracy on the training data is', round(train_scores, 4))
print('The Model\'s accuracy on the test data is', round(test_scores, 4))

The Mode's accuracy on the training data is 0.9858
The Model's accuracy on the test data is 0.7166


In [12]:
log_loss_rfc = cross_val_score(rfc, X_train_nums_scaled, y_train, scoring='neg_log_loss', n_jobs = -1)
log_loss_rfc = -log_loss_rfr.mean()
print('Log Loss:', log_loss_rfc)

Log Loss: 1.0160654921418601


This inital model that only contains numerical data is severly overfitting the data. The next model will need to tune some hyperparameters to reduce the gap between the training and testing data. Our log loss has been decreased from the dummy classifer, but there is still room for improvement

In [14]:
pipline_1 = Pipeline([('ss', StandardScaler()), ('rfc', RandomForestClassifier())])

In [15]:
pipline_1.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler()), ('rfc', RandomForestClassifier())],
 'verbose': False,
 'ss': StandardScaler(),
 'rfc': RandomForestClassifier(),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'rfc__bootstrap': True,
 'rfc__ccp_alpha': 0.0,
 'rfc__class_weight': None,
 'rfc__criterion': 'gini',
 'rfc__max_depth': None,
 'rfc__max_features': 'auto',
 'rfc__max_leaf_nodes': None,
 'rfc__max_samples': None,
 'rfc__min_impurity_decrease': 0.0,
 'rfc__min_impurity_split': None,
 'rfc__min_samples_leaf': 1,
 'rfc__min_samples_split': 2,
 'rfc__min_weight_fraction_leaf': 0.0,
 'rfc__n_estimators': 100,
 'rfc__n_jobs': None,
 'rfc__oob_score': False,
 'rfc__random_state': None,
 'rfc__verbose': 0,
 'rfc__warm_start': False}

In [16]:
params_1 = {
    'rfc__max_depth': list(range(5,50,5)),
    'rfc__min_samples_leaf': list(range(5,50,5)),
    'rfc__n_estimators': list(range(25,500,25)),
}

In [22]:
rs_1 = RandomizedSearchCV(pipline_1, params_1, n_jobs=-1, verbose=2, n_iter=50)

rs_1.fit(X_train_nums, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 11.0min finished


RandomizedSearchCV(estimator=Pipeline(steps=[('ss', StandardScaler()),
                                             ('rfc',
                                              RandomForestClassifier())]),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'rfc__max_depth': [5, 10, 15, 20, 25,
                                                           30, 35, 40, 45],
                                        'rfc__min_samples_leaf': [5, 10, 15, 20,
                                                                  25, 30, 35,
                                                                  40, 45],
                                        'rfc__n_estimators': [25, 50, 75, 100,
                                                              125, 150, 175,
                                                              200, 225, 250,
                                                              275, 300, 325,
                                                              

In [23]:
rs_1.score(X_train_nums, y_train), rs_1.score(X_test_nums, y_test)

(0.8270607307644344, 0.7218855218855219)

In [24]:
rs_1.best_params_

{'rfc__n_estimators': 325, 'rfc__min_samples_leaf': 5, 'rfc__max_depth': 45}

In [13]:
x = True
while x == True:
    assert print('Need to fix the imbalance between the training data and the testing data')
    assert print('Fix below')

Need to fix the imbalance between the training data and the testing data


AssertionError: 

In [None]:
X_train_nums = X_train.select_dtypes(exclude = object)

X_train_cat = X_train.select_dtypes('object')

In [None]:
numerical_pipeline = Pipeline(steps=[
    ('ss', StandardScaler())
])
                
categorical_pipeline = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first',
                         sparse=False))
])


In [None]:
ohe = OneHotEncoder(handle_unknown='ignore')

ohe.fit(X_train)

In [None]:
trans = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, X_train_nums.columns),
    ('categorical', categorical_pipeline, X_train_cat.columns)
], verbose=True, n_jobs=-1)

In [None]:
model_pipe = Pipeline(steps=[
    ('trans', trans),
    ('rfr', RandomForestClassifier(verbose=1, n_jobs=-1))
])

In [None]:
model_pipe.fit(X_train, y_train)

In [None]:
model_pipe.score(X_train, y_train)

In [None]:
model_pipe.score(X_test, y_test)

In [None]:
model_pipe.get_params()

In [None]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_train.info()

In [None]:
X_test = pd.get_dummies(X_test, drop_first=True)
X_test.info()