In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import RegressorChain

In [2]:
df = pd.read_csv('../data/student-por.csv', sep=';')
df_train = df.sample(frac=0.85,random_state=2020)
df_test = df.drop(df_train.index)

len(df_train), len(df_test)

(552, 97)

In [3]:
def encode_features(X, encoder=None):
    # One-hot encodes the categorical features 
    categorical_columns = X.select_dtypes(exclude='number')
    numerical_columns = X.select_dtypes(include='number')
    categorical_col_names = categorical_columns.columns.tolist()
    
    if encoder is None:
        encoder = OneHotEncoder(sparse=False)

    one_hot = encoder.transform(categorical_columns)
    categorical_col_names = encoder.get_feature_names(categorical_col_names)
    one_hot = pd.DataFrame(one_hot, index=numerical_columns.index, columns=categorical_col_names)
    
    return pd.concat([numerical_columns, one_hot], axis=1)

In [4]:
X_train, y_train = df_train.drop('G3', axis=1), df_train['G3']
X_test, y_test = df_test.drop('G3', axis=1), df_test['G3']

In [5]:
random_forest = RandomForestRegressor(random_state=42)

In [6]:
for drop_vars in [['G2','G1'], ['G2'], ['G1'], []]:
    X_train_dropped = X_train.drop(drop_vars, axis =1)
    enc = OneHotEncoder(sparse=False).fit(X_train_dropped.select_dtypes(exclude='number'))
    X_train_encoded = encode_features(X_train_dropped, enc)
    
    performance = cross_validate(random_forest, X_train_encoded, 
                                 y_train, cv=5,  
                                 scoring='neg_mean_squared_error')
    avg_mse = -performance['test_score'].mean()
    
    if drop_vars:
        print(f'Predicting G3 after dropping {" and ".join(drop_vars)}, 5-fold validated MSE average: {avg_mse:.2f}')
    else:
        print(f'Predicting G3 after dropping nothing, 5-fold validated MSE average: {avg_mse:.2f}')

Predicting G3 after dropping G2 and G1, 5-fold validated MSE average: 6.83
Predicting G3 after dropping G2, 5-fold validated MSE average: 3.35
Predicting G3 after dropping G1, 5-fold validated MSE average: 1.74
Predicting G3 after dropping nothing, 5-fold validated MSE average: 1.69


In [7]:
# best model(least error) was when G1 and G2 were in model
enc = OneHotEncoder(sparse=False).fit(X_train.select_dtypes(exclude='number'))
X_train_encoded = encode_features(X_train, enc)
X_test_encoded = encode_features(X_test, enc)

In [8]:
random_forest.fit(X_train_encoded, y_train)

RandomForestRegressor(random_state=42)

In [9]:
preds = random_forest.predict(X_test_encoded)
mean_squared_error(y_test, preds)

3.4523762886597935

In [10]:
X_train, Y_train = df_train.drop(['G3','G2','G1'], axis=1), df_train[['G3','G2','G1']]
X_test, Y_test = df_test.drop(['G3','G2','G1'], axis=1), df_test[['G3','G2','G1']]

In [11]:
enc = OneHotEncoder(sparse=False).fit(X_train.select_dtypes(exclude='number'))
X_train_encoded = encode_features(X_train, enc)
X_test_encoded = encode_features(X_test, enc)

In [13]:
random_forest = RandomForestRegressor(random_state=42)
linear_regressor = LinearRegression()
models = [(random_forest, 'Random Forest'), (linear_regressor, 'Linear Regressor')]

for model, model_name in models:
    chain_model = RegressorChain(model, random_state=5, cv=5)
    chain_model.fit(X_train_encoded, Y_train)
    chain_preds = chain_model.predict(X_test_encoded)
    print(f'MSE using a chain of {model_name}s is {mean_squared_error(Y_test, chain_preds)}')

MSE using a chain of Random Forests is 8.054362199312715
MSE using a chain of Linear Regressors is 7.5492734251455405
