In [15]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from utils import get_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [16]:
engine = create_engine(get_config('mysql'))
engine

Engine(mysql://root:***@127.0.0.1:3306/g3_MOOC)

In [17]:
query = """
    Select body, polarity, subjectivity, eligibility, gender, education_level, country from Message m
    join Result r
    on m.username = r.username
    join User u
    on m.username = u.username;
    """
df = pd.read_sql(query, engine)

In [18]:
df['gender'] = df['gender'].replace("", np.nan)
df['country'] = df['country'].replace("", np.nan)
df['education_level'] = df['education_level'].replace("", np.nan)

In [19]:
df = df.dropna().drop_duplicates()

In [20]:
X = df.drop(columns='eligibility')
y = df['eligibility']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [21]:
X_cat = X.select_dtypes(include=[object])
X_num = X.select_dtypes(exclude=[object])

In [22]:
X_cat.columns

Index(['body', 'gender', 'education_level', 'country'], dtype='object')

In [23]:
col_tg = ColumnTransformer(
    transformers=[
        ('tf_num', RobustScaler(), X_num.columns),
        ('tf_cat', CountVectorizer(), 'body')
    ]
)
pipe = Pipeline(
    steps=[
        ('preparation', col_tg),
        ('model', KNeighborsClassifier())
    ]
)

In [24]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preparation',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('tf_num',
                                                  RobustScaler(copy=True,
                                                               quantile_range=(25.0,
                                                                               75.0),
                                                               with_centering=True,
                                                               with_scaling=True),
                                                  Index(['polarity', 'subjectivity'], dtype='object')),
                                                 ('tf_cat',
                                                  CountVectorizer(analyzer='word',
                                           

In [25]:
y_max_pred = pipe.predict(X_test)
print("RFR:", round(accuracy_score(y_test, y_max_pred), 5))

RFR: 0.21053


In [26]:
from pycaret.classification import *
exp_reg101 = setup(data = df, target = 'eligibility')

Unnamed: 0,Description,Value
0,session_id,5517
1,Target,eligibility
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(95, 7)"
5,Missing Values,False
6,Numeric Features,2
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


In [27]:
best_models = compare_models(sort='Prec.')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.3929,0.3667,0.3667,0.2483,0.2929,-0.229,-0.2422,0.009
nb,Naive Bayes,0.2238,0.2417,0.4333,0.229,0.2966,-0.5078,-0.5563,0.006
svm,SVM - Linear Kernel,0.2262,0.0,0.3667,0.2279,0.2747,-0.5032,-0.5417,0.005
qda,Quadratic Discriminant Analysis,0.2833,0.2917,0.3333,0.2183,0.2607,-0.416,-0.4137,0.005
dt,Decision Tree Classifier,0.281,0.2097,0.1,0.1583,0.1119,-0.4764,-0.4958,0.006
ridge,Ridge Classifier,0.2095,0.0,0.1667,0.1167,0.1349,-0.5901,-0.6297,0.005
lr,Logistic Regression,0.2381,0.1222,0.1333,0.1083,0.119,-0.5528,-0.5846,0.008
ada,Ada Boost Classifier,0.1786,0.0847,0.1,0.0667,0.0778,-0.6445,-0.684,0.028
rf,Random Forest Classifier,0.1786,0.0889,0.0667,0.0583,0.0619,-0.6797,-0.6981,0.076
gbc,Gradient Boosting Classifier,0.1929,0.1333,0.0667,0.0583,0.0619,-0.6421,-0.6722,0.019


In [28]:
best_models

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [29]:
evaluate_model(best_models)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [30]:
predict_model(best_models)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.3448,0.3601,0.25,0.1333,0.1739,-0.2904,-0.3301


Unnamed: 0,polarity,subjectivity,body_ Bonjour \n La partie devoir se décompose de la sorte \n - un QCM de 5 questions (chacune apporte 1 point)\n - un TP à réaliser pour compléter lévaluation (15 points)\n \n Il y a 2 étapes dans la partie TP \n 1=> tout dabord fournir un programme qui répond à la consigne. Le code est à mettre dans la partie Your response (copier/coller de ta production). Le code sera évalué par des pairs.\n 2=> une fois la réponse à la consigne envoyée pour évaluation cest à toi dévaluer 2 à 3 autres participants.,body_ Bonsoir\n Non aucun prérequis.\n Si tu nas jamais fait de développement prend le temps de bien comprendre les exemples et le cours avance doucement sans sauter détapes (même celles qui te sembles simples) et ça devrait bien se passer.\n Tu as le wiki en complément et le forum si tu as des questions.\n ;-)\n Bon courage,body_ Pas nimporte quel whisky alors !!\n Sinon tu risques davoir encore plus de migraine quavant de poster ton commentaire très pédagogique...,body_ Tout à fait il suffit dappuyer sur le bouton poussoir avec ta souris (clic gauche). \n Et jajouterais que tu as un indicateur lumineux dans langle inférieur gauche du bouton poussoir qui tindique quil est bien enfoncé un point vert sallume quand tu simules lappuie avec la souris il séteint lorsque lon relâche le clic gauche,body_ Visiblement tu as aussi du mal à en expliquer les raisons... \n Cest dommage ça aurait pût aider pour te donner un peu délan... \n Aurais-tu un peu plus de détails à donner sur les difficultés rencontrées ou ce que tu ne comprends pas ?,body_Bonjour\n\nJe constate que le code utilisé dans les réponses aux TP nest pas indenté.\nSur le 1er TP que jai été amené à évaluer jai signalé dans les commentaires que la bonne pratique voulait que lon insère des retraits dans son code que lon aère les commentaires etc. Une remarque pour informer simplement...\n\nJusquà m’apercevoir que lindentation du code que jai fourni pour ce TP avait disparu... \nErreur de copié/collé jai dabord pensé. Mais idem pour le second TP lindentation du code avait sauté.\n\nPour le moment seules quelques lignes constituent nos programmes et il y a peu de complexité (je nai pas dis difficulté...).\nQuen sera-t-il si le code se densifie ?\nLes retraits permettent de conserver un peu de clarté dans le programme il respire...\n\nEst-il donc possible de conserver lindentation du code ? \nCertains ont-ils une astuce ou trouvé comment faire ?\n\nPar avance merci.\n\n;-)\n,body_Bonjour\n\nOui delay nest pas des plus adapté pour détecter réellement lappuie lorsque le feu tricolore est vert... \nOn se contente donc de détecter la pression sur la transition entre le vert et le orange. \n\nJe tinvite à parcourir le forum le sujet y est abordé plusieurs fois. Tu y trouvera des éléments de réponse complémentaires. \n,body_Bonjour\n\nPour ceux qui réalisent les TP sous 123d.circuits pourquoi ne pas fournir le lien vers le montage en commentaire au tout début du code soumis à évaluation ?\n\nPersonnellement je trouve que ce serait un plus aussi bien pour les évaluateurs qui pourront découvrir dautres montages ou un code mieux indenté que pour les évalués et notamment ceux qui ... euh... huuum... râlent un peu.\n\nAvec la limite que le code à prendre en compte sera toujours celui soumis à évaluation sur le site du MOOC et non celui qui est présent sur 123d.circuits puisquil peut être modifié à posteriori.\n\nVous en pensez quoi ?\n \n\n,...,education_level_m,education_level_none,education_level_p,country_BE,country_CN,country_FR,country_RE,eligibility,Label,Score
0,-0.333333,0.19375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.6
1,0.0,0.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.6
2,-1.0,0.433333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.6
3,1.0,0.338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1,0.6
4,1.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,1,0.6
5,-0.666667,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.6
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,0.8
7,1.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1,0.6
8,-1.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.8
9,-1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.6


In [31]:
save_model(best_models, 'classification_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[],
                                       target='eligibility', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_...
                 ('fix_perfect', Remove_100(target='eligibility')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
    