In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from utils import get_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
engine = create_engine(get_config('mysql'))
engine

Engine(mysql://root:***@127.0.0.1:3306/g3_MOOC)

In [3]:
query = """
    Select body, polarity, subjectivity, eligibility, gender, education_level, country from Message m
    join Result r
    on m.username = r.username
    join User u
    on m.username = u.username;
    """
df = pd.read_sql(query, engine)

In [4]:
df['gender'] = df['gender'].replace("", np.nan)
df['country'] = df['country'].replace("", np.nan)
df['education_level'] = df['education_level'].replace("", np.nan)

In [5]:
df = df.dropna().drop_duplicates()

In [6]:
df['education_level'].unique()
df['eligibility'].unique()

array([0, 1])

In [77]:
X = df.drop(columns='eligibility')
y = df['eligibility']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [78]:
X_cat = X.select_dtypes(include=[object])
X_num = X.select_dtypes(exclude=[object])

In [79]:
X_cat.columns

Index(['body', 'gender', 'education_level', 'country'], dtype='object')

In [80]:
col_tg = ColumnTransformer(
    transformers=[
        ('tf_num', RobustScaler(), X_num.columns),
        ('tf_cat', CountVectorizer(), 'body')
    ]
)
pipe = Pipeline(
    steps=[
        ('preparation', col_tg),
        ('model', KNeighborsClassifier())
    ]
)

In [81]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preparation',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('tf_num',
                                                  RobustScaler(copy=True,
                                                               quantile_range=(25.0,
                                                                               75.0),
                                                               with_centering=True,
                                                               with_scaling=True),
                                                  Index(['polarity', 'subjectivity'], dtype='object')),
                                                 ('tf_cat',
                                                  CountVectorizer(analyzer='word',
                                           

In [82]:
y_max_pred = pipe.predict(X_test)
print("RFR:", round(accuracy_score(y_test, y_max_pred), 5))

RFR: 0.21053


In [83]:
from pycaret.classification import *
exp_reg101 = setup(data = df, target = 'eligibility')

Unnamed: 0,Description,Value
0,session_id,6768
1,Target,eligibility
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(95, 7)"
5,Missing Values,False
6,Numeric Features,2
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


In [84]:
best_models = compare_models(sort='Prec.', n_select= 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.15,0.1917,0.3333,0.1517,0.2052,-0.482,-0.6881,0.005
knn,K Neighbors Classifier,0.5333,0.41,0.1167,0.15,0.13,-0.1355,-0.1421,0.007
qda,Quadratic Discriminant Analysis,0.2714,0.2525,0.2,0.1367,0.1583,-0.4256,-0.4866,0.005
svm,SVM - Linear Kernel,0.4452,0.0,0.1333,0.0619,0.0778,-0.224,-0.2467,0.004
gbc,Gradient Boosting Classifier,0.4452,0.2675,0.05,0.0333,0.04,-0.3024,-0.3295,0.015
lr,Logistic Regression,0.5476,0.1708,0.0,0.0,0.0,-0.1823,-0.2114,0.301
dt,Decision Tree Classifier,0.4286,0.3483,0.0,0.0,0.0,-0.3524,-0.3795,0.005
ridge,Ridge Classifier,0.4143,0.0,0.0,0.0,0.0,-0.3646,-0.3943,0.005
rf,Random Forest Classifier,0.4119,0.1908,0.0,0.0,0.0,-0.3738,-0.3979,0.066
ada,Ada Boost Classifier,0.4286,0.1979,0.0,0.0,0.0,-0.3524,-0.3795,0.029


In [85]:
tuned_top3 = [tune_model(i) for i in best_models] 

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7143,0.9167,0.3333,1.0,0.5,0.3636,0.4714
1,0.5714,0.9167,0.0,0.0,0.0,0.0,0.0
2,0.5714,1.0,0.0,0.0,0.0,0.0,0.0
3,0.7143,1.0,0.0,0.0,0.0,0.0,0.0
4,0.7143,0.75,0.0,0.0,0.0,0.0,0.0
5,0.7143,0.6,0.0,0.0,0.0,0.0,0.0
6,0.6667,1.0,0.0,0.0,0.0,0.0,0.0
7,0.6667,1.0,0.0,0.0,0.0,0.0,0.0
8,0.8333,0.625,0.5,1.0,0.6667,0.5714,0.6325
9,0.6667,1.0,0.0,0.0,0.0,0.0,0.0


In [86]:
blend = blend_models(tuned_top3) 

stack = stack_models(tuned_top3) 

best_auc_model = automl(optimize = 'Prec.') 

best_auc_model 

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5714,0.5833,0.0,0.0,0.0,0.0,0.0
1,0.5714,0.1667,0.0,0.0,0.0,0.0,0.0
2,0.4286,0.0,0.0,0.0,0.0,-0.2727,-0.3536
3,0.7143,0.0,0.0,0.0,0.0,0.0,0.0
4,0.7143,0.05,0.0,0.0,0.0,0.0,0.0
5,0.7143,0.3,0.0,0.0,0.0,0.0,0.0
6,0.6667,0.0,0.0,0.0,0.0,0.0,0.0
7,0.6667,0.5,0.0,0.0,0.0,0.0,0.0
8,0.6667,0.5,0.0,0.0,0.0,0.0,0.0
9,0.6667,0.25,0.0,0.0,0.0,0.0,0.0


QuadraticDiscriminantAnalysis(priors=None, reg_param=0.99,
                              store_covariance=False, tol=0.0001)

In [87]:
predict_model(best_auc_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Quadratic Discriminant Analysis,0.5172,0.8357,0.0667,1.0,0.125,0.0645,0.1826


Unnamed: 0,polarity,subjectivity,body_ Bonjour \n La partie devoir se décompose de la sorte \n - un QCM de 5 questions (chacune apporte 1 point)\n - un TP à réaliser pour compléter lévaluation (15 points)\n \n Il y a 2 étapes dans la partie TP \n 1=> tout dabord fournir un programme qui répond à la consigne. Le code est à mettre dans la partie Your response (copier/coller de ta production). Le code sera évalué par des pairs.\n 2=> une fois la réponse à la consigne envoyée pour évaluation cest à toi dévaluer 2 à 3 autres participants.,body_ Pas nimporte quel whisky alors !!\n Sinon tu risques davoir encore plus de migraine quavant de poster ton commentaire très pédagogique...,body_ Tout à fait il suffit dappuyer sur le bouton poussoir avec ta souris (clic gauche). \n Et jajouterais que tu as un indicateur lumineux dans langle inférieur gauche du bouton poussoir qui tindique quil est bien enfoncé un point vert sallume quand tu simules lappuie avec la souris il séteint lorsque lon relâche le clic gauche,body_ Visiblement tu as aussi du mal à en expliquer les raisons... \n Cest dommage ça aurait pût aider pour te donner un peu délan... \n Aurais-tu un peu plus de détails à donner sur les difficultés rencontrées ou ce que tu ne comprends pas ?,body_/*\n Feu tricolore\n Orange allumée pendant 1 seconde\n Rouge allumée pendant 3 secondes\n Verte allumée pendant 3 secondes\n*/\n \n// Numéros des broche utilisées\nint ledr = 13; //led rouge\nint ledo = 12; //led orange\nint ledv = 11; //led verte\n \nvoid setup() \n // indique que les broches des LEDS sont en sortie \n pinMode(ledr OUTPUT);\n pinMode(ledo OUTPUT);\n pinMode(ledv OUTPUT);\n\n \nvoid loop() \n digitalWrite(ledo HIGH); // allumer la LED orange \n delay(1000); // attendre 1000ms = 1s\n digitalWrite(ledo LOW); // éteindre la LED orange\n delay(500); // attendre 1.5 seconde\n digitalWrite(ledr HIGH); // allumer la LED rouge\n delay(3000); // attendre 3s\n digitalWrite(ledr LOW); // éteindre la LED rouge\n delay(500); // attendre 1.5 seconde\n digitalWrite(ledv HIGH); // allumer la LED verte\n delay(3000); // attendre 3s\n digitalWrite(ledv LOW); // éteindre la LED verte\n delay(500); // attendre 1.5 seconde\n,body_Bonjour Christophe\nJe suis électrotechnicien avec une spécialisation Domotique et GTC/GTB. Tu nas pas besoin dapprendre le langage C ou la programmation orientée objet (ce quArduino utilise en gros) pour faire de la Domotique. En domotique on utilise des systèmes paramétrables plus que programmable au sens informatique du terme. Et tu ne vendras pas dinstallation à base dArduino (bon courage pour les normes et les garanties!). A titre professionnel tu dois tintéresser aux gammes des différents fabriquants dappareillage électrique (solutions généralement propriétaire) ainsi quaux protocoles dédiés aux automatismes de bâtiment\nKNX DALI(pour léclairage) ZigBee Enocean...\nTu peux aussi utiliser des automates industrielles (WAGO Schneider Siemens SAIA...) ou des régulateurs communicants pour de grosses installations avec de la régulation (chauffage clim. piscine...). Dans ce cas tu auras de la programmation (soit IDE propriétaire soit CODESYS avec les langages ST FBD etc... de la norme CEI61131-3). Le langage ST (pour structuré) ressemble au Pascal et est de type procédural. Il est donc très abordable si tu as les bases de lalgorithmique.\nJe te souhaite une bonne continuation!\n,body_Bonjour a tous jeune ingenieur de 28 ans travaillant depuis peu en securite de fonctionnement applique au domaine de la signalisation ferroviaire je me suis trouve une veritable curiosite pour le domaine de lelectronique embarquee domaine que je cotoie de maniere indirecte au quotidien. Je vis a Shanghai Chine.,body_Bonjour\n\nJe constate que le code utilisé dans les réponses aux TP nest pas indenté.\nSur le 1er TP que jai été amené à évaluer jai signalé dans les commentaires que la bonne pratique voulait que lon insère des retraits dans son code que lon aère les commentaires etc. Une remarque pour informer simplement...\n\nJusquà m’apercevoir que lindentation du code que jai fourni pour ce TP avait disparu... \nErreur de copié/collé jai dabord pensé. Mais idem pour le second TP lindentation du code avait sauté.\n\nPour le moment seules quelques lignes constituent nos programmes et il y a peu de complexité (je nai pas dis difficulté...).\nQuen sera-t-il si le code se densifie ?\nLes retraits permettent de conserver un peu de clarté dans le programme il respire...\n\nEst-il donc possible de conserver lindentation du code ? \nCertains ont-ils une astuce ou trouvé comment faire ?\n\nPar avance merci.\n\n;-)\n,...,education_level_b,education_level_hs,education_level_jhs,education_level_m,country_BE,country_FR,country_RE,eligibility,Label,Score
0,-0.333333,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0,0.7408
1,-0.384615,0.105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.7741
2,1.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0,0.662
3,0.4,0.433333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.7711
4,-0.333333,0.075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0,0.7688
5,-0.333333,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.8264
6,0.142857,0.2875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0,0.5123
7,0.0,0.307692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.5564
8,0.0,0.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.5568
9,-0.333333,0.19375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.7742


In [88]:
save_model(best_auc_model, 'classification_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[],
                                       target='eligibility', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_...
                 ('dummy', Dummify(target='eligibility')),
                 ('fix_perfect', Remove_100(target='eligibility')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
          