In [None]:
# Importation des modules Python
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import set_config
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import r2_score, mean_absolute_error, balanced_accuracy_score, f1_score
from matplotlib import pyplot as plt

In [None]:
# Mount Google Drive (Colab only) and move into the project folder, so that
# the relative CSV path read later in the notebook resolves inside it.
import os
from google.colab import drive
drive.mount('/content/drive')
# IPython line magic: changes the kernel's working directory.
%cd '/content/drive/MyDrive/MOOC'

# Print the current working directory as a sanity check.
path = os.getcwd()
print(f"Le répertoire courant est : {path} \n")

Mounted at /content/drive
/content/drive/MyDrive/MOOC
Le répertoire courant est : /content/drive/MyDrive/MOOC 



In [None]:
# Read the CSV file holding the concatenation of the Users, Messages and Results tables.
df = pd.read_csv('users_messages_results.csv', index_col=False)
# Show column dtypes and non-null counts to see how much data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72153 entries, 0 to 72152
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   username            72153 non-null  object 
 1   country             16092 non-null  object 
 2   gender              72153 non-null  object 
 3   level_of_education  17566 non-null  object 
 4   body                5601 non-null   float64
 5   course_id           72153 non-null  object 
 6   grade               72153 non-null  float64
dtypes: float64(2), object(5)
memory usage: 3.9+ MB


In [None]:
# Keep only the rows in which every column is populated (the frame is modified in place).
df.dropna(inplace=True)
# Re-inspect the frame to see how many rows survive the filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 924 entries, 1169 to 50928
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   username            924 non-null    object 
 1   country             924 non-null    object 
 2   gender              924 non-null    object 
 3   level_of_education  924 non-null    object 
 4   body                924 non-null    float64
 5   course_id           924 non-null    object 
 6   grade               924 non-null    float64
dtypes: float64(2), object(5)
memory usage: 57.8+ KB


In [None]:
# To frame this as a classification problem we use the 0.5 threshold described
# on the FUN MOOC website. Ideally the 'Certificate Eligible' column would be
# used, but it is not available in the postgres dump.
#
# 'username' is removed before modelling. errors='ignore' keeps this cell
# re-runnable: without it, a second execution raises KeyError because the
# column has already been dropped.
df.drop(columns=['username'], inplace=True, errors='ignore')
def transf(grade):
  """Map a grade to a binary label using the 0.5 threshold.

  Returns 0 when ``grade >= 0.5`` and 1 otherwise (including when the
  comparison is false because ``grade`` is NaN).
  """
  return 0 if grade >= 0.5 else 1
# Binarise the target only while it still holds the raw float grades.
# Re-running the unguarded assignment would feed the 0/1 labels back through
# transf and silently invert every label (0 -> 1, 1 -> 0).
if pd.api.types.is_float_dtype(df['grade']):
  df['grade'] = df['grade'].apply(transf)

In [None]:
df.head()

Unnamed: 0,country,gender,level_of_education,body,course_id,grade
1169,FR,m,hs,1.0,MinesTelecom/04017/session01,0
1170,FR,m,hs,1.0,MinesTelecom/04018/session01,1
1180,FR,m,m,-0.44,MinesTelecom/04017/session01,0
1181,FR,m,m,-0.44,MinesTelecom/04018/session01,1
1191,FR,m,m,0.333333,MinesTelecom/04017/session01,1


Etant donné la contrainte de temps et l'impossibilité de tester tous les modèles, on a choisi une méthode ensembliste (GradientBoosting) qui avait donné de bons résultats sur nos projets précédents. La nouvelle version "HistGradientBoosting" est censée être plus rapide.

In [None]:
# Numerical features: the single 'body' column, rescaled with RobustScaler.
column_num = ["body"]
transfo_num = Pipeline(steps=[("scaling", RobustScaler())])

In [None]:
# Categorical features: one-hot encode, then project the one-hot matrix onto a
# small number of components (TruncatedSVD keeps its default n_components).
column_cat = ['country', 'gender', 'level_of_education', 'course_id']
transfo_cat = Pipeline(steps=[
    # Categories not seen during fit are ignored instead of raising an error.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    # TruncatedSVD uses a randomized solver by default; pin its seed so the
    # projected features (and every score computed from them) are reproducible.
    ('dr', TruncatedSVD(random_state=0))
    ])

In [None]:
# ColumnTransformer: route each group of columns through its own sub-pipeline.
preparation = ColumnTransformer(transformers=[
    ("data_cat", transfo_cat, column_cat),
    ("data_num", transfo_num, column_num),
])

In [None]:
# Full pipeline: preprocessing followed by the gradient-boosting classifier.
# random_state is pinned (same seed as the train/test split) so that any
# randomness inside the estimator cannot change results between runs.
pipe = Pipeline(steps=[('preparation', preparation),
                        ('model', HistGradientBoostingClassifier(random_state=0))])

In [None]:
# List every parameter name exposed by the pipeline; nested parameters use the
# step__param form that the grid-search parameter dictionary relies on.
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preparation', 'model', 'preparation__n_jobs', 'preparation__remainder', 'preparation__sparse_threshold', 'preparation__transformer_weights', 'preparation__transformers', 'preparation__verbose', 'preparation__verbose_feature_names_out', 'preparation__data_cat', 'preparation__data_num', 'preparation__data_cat__memory', 'preparation__data_cat__steps', 'preparation__data_cat__verbose', 'preparation__data_cat__onehot', 'preparation__data_cat__dr', 'preparation__data_cat__onehot__categories', 'preparation__data_cat__onehot__drop', 'preparation__data_cat__onehot__dtype', 'preparation__data_cat__onehot__handle_unknown', 'preparation__data_cat__onehot__max_categories', 'preparation__data_cat__onehot__min_frequency', 'preparation__data_cat__onehot__sparse', 'preparation__data_cat__onehot__sparse_output', 'preparation__data_cat__dr__algorithm', 'preparation__data_cat__dr__n_components', 'preparation__data_cat__dr__n_iter', 'preparation__data_cat__dr__n_ov

In [None]:
# Hyper-parameter grid explored by the grid search (all keys target the 'model' step).
parameters = {
    "model__learning_rate": [0.05, 0.1, 0.15],
    "model__max_iter": range(100, 301, 100),        # 100, 200, 300
    "model__max_leaf_nodes": range(30, 100, 30),    # 30, 60, 90
    "model__min_samples_leaf": range(10, 50, 10),   # 10, 20, 30, 40
}

In [None]:
# Grid search over `parameters` with 3-fold cross-validation. Both metrics are
# recorded; the best pipeline according to balanced accuracy is refit at the end.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring=["balanced_accuracy", "f1"],
    refit="balanced_accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1,
    error_score="raise",
)

In [None]:
# Switch scikit-learn's estimator display to diagram mode, then show the
# preprocessing step as the cell output.
set_config(display="diagram")
preparation

In [None]:
# Separate the features from the binary target, hold out 40% of the rows for
# testing, then run the grid search on the training part only.
X = df.drop(columns="grade")
y = df["grade"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [None]:
# Evaluate cross-validation performance: best mean CV score found by the grid
# search (the search refits on 'balanced_accuracy').
print("CV score:", grid.best_score_)

CV score: 0.6468255342296783


In [None]:
# Show the hyper-parameter combination selected by the grid search.
print("CV parameters:", grid.best_params_)

CV parameters: {'model__learning_rate': 0.05, 'model__max_iter': 200, 'model__max_leaf_nodes': 90, 'model__min_samples_leaf': 40}


In [None]:
# Predict the held-out test set with the fitted grid search.
y_pred = grid.predict(X_test)

In [None]:
# Evaluate model performance on the held-out test set.
print("balanced accuracy", balanced_accuracy_score(y_test, y_pred))
# f1_score treats label 1 as the positive class by default; with transf that is
# the class of grades that do not reach the 0.5 threshold.
print("f1 score", f1_score(y_test, y_pred))

balanced accuracy 0.6464013325151223
f1 score 0.6330532212885154


Etant donné les contraintes, on ne peut pas s'attendre à obtenir de bons résultats. Le fichier data_modelling2 représente juste une nouvelle tentative avec une sélection différente de features.