In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

In [2]:
# Load the semicolon-separated dataset, then hold out the rows that are not
# part of the seeded 85% training sample as the test split.
df = pd.read_csv('../data/student-por.csv', sep=';')
df_train = df.sample(frac=0.85, random_state=2020)
df_test = df.drop(index=df_train.index)

df_train.shape[0], df_test.shape[0]

(552, 97)

In [38]:
def encode_features(X, encoder=None):
    """One-hot encode the non-numeric columns of ``X``; keep numeric columns as-is.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns are split with ``select_dtypes`` into numeric
        and non-numeric groups.
    encoder : object, optional
        An already-fitted encoder exposing ``transform`` and either
        ``get_feature_names_out`` or the legacy ``get_feature_names``.
        When ``None``, a new ``OneHotEncoder`` is fitted on the non-numeric
        columns of ``X``.

    Returns
    -------
    pandas.DataFrame
        The numeric columns followed by the one-hot columns, indexed like ``X``.
    """
    categorical_columns = X.select_dtypes(exclude='number')
    numerical_columns = X.select_dtypes(include='number')
    categorical_col_names = categorical_columns.columns.tolist()

    if encoder is None:
        if not categorical_col_names:
            # Nothing to fit or encode: the numeric columns are the result.
            return numerical_columns
        # A fresh encoder must be fitted before transform() can be called.
        encoder = OneHotEncoder().fit(categorical_columns)

    one_hot = encoder.transform(categorical_columns)
    # Encoders may return a sparse matrix; densify it here so the function
    # does not depend on the encoder's sparse/sparse_output setting.
    if hasattr(one_hot, 'toarray'):
        one_hot = one_hot.toarray()

    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn releases; support both.
    if hasattr(encoder, 'get_feature_names_out'):
        one_hot_col_names = encoder.get_feature_names_out(categorical_col_names)
    else:
        one_hot_col_names = encoder.get_feature_names(categorical_col_names)

    one_hot = pd.DataFrame(one_hot, index=X.index, columns=list(one_hot_col_names))

    return pd.concat([numerical_columns, one_hot], axis=1)

In [62]:
# Separate the target column G3 from the predictors for each split.
X_train, y_train = df_train.drop('G3', axis=1), df_train['G3']
# The test target must come from df_test so it lines up row-for-row with X_test.
X_test, y_test = df_test.drop('G3', axis=1), df_test['G3']

assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

In [55]:
# Random forest regressor with a fixed random_state; this estimator is what
# gets passed to cross_validate for each feature subset.
model = RandomForestRegressor(random_state=42)

In [68]:
# For each subset of the columns G1/G2 removed from the predictors, one-hot
# encode the remaining features and report the mean 5-fold cross-validated MSE.
for drop_vars in [['G2', 'G1'], ['G2'], ['G1'], []]:
    X_train_dropped = X_train.drop(columns=drop_vars)

    # Fit the encoder on the non-numeric columns that remain after the drop.
    categorical_part = X_train_dropped.select_dtypes(exclude='number')
    enc = OneHotEncoder(sparse=False)
    enc.fit(categorical_part)
    X_train_encoded = encode_features(X_train_dropped, enc)

    performance = cross_validate(
        model,
        X_train_encoded,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    # The scorer returns negated MSE, so flip the sign of the fold average.
    avg_mse = -performance['test_score'].mean()

    dropped_label = " and ".join(drop_vars) if drop_vars else "nothing"
    print(f'Predicting G3 after dropping {dropped_label}, 5-fold validated MSE average: {avg_mse:.2f}')

Predicting G3 after dropping G2 and G1, 5-fold validated MSE average: 6.83
Predicting G3 after dropping G2, 5-fold validated MSE average: 3.35
Predicting G3 after dropping G1, 5-fold validated MSE average: 1.74
Predicting G3 after dropping nothing, 5-fold validated MSE average: 1.69
