In [158]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from utils import get_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [159]:
engine = create_engine(get_config('mysql'))
engine

Engine(mysql://root:***@127.0.0.1:3306/g3_MOOC)

In [160]:
query = """
    Select body, polarity, subjectivity, eligibility, gender, education_level, country from Message m
    join Result r
    on m.username = r.username
    join User u
    on m.username = u.username;
    """
df = pd.read_sql(query, engine)

In [161]:
df['gender'] = df['gender'].replace("", np.nan)
df['country'] = df['country'].replace("", np.nan)
df['education_level'] = df['education_level'].replace("", np.nan)

In [162]:
df = df.dropna().drop_duplicates()

In [163]:
X = df.drop(columns='eligibility')
y = df['eligibility']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [164]:
X_cat = X.select_dtypes(include=[object])
X_num = X.select_dtypes(exclude=[object])

In [165]:
X_cat.columns

Index(['body', 'gender', 'education_level', 'country'], dtype='object')

In [166]:
col_tg = ColumnTransformer(
    transformers=[
        ('tf_num', RobustScaler(), X_num.columns),
        ('tf_cat', CountVectorizer(), 'body')
    ]
)
pipe = Pipeline(
    steps=[
        ('preparation', col_tg),
        ('model', KNeighborsClassifier())
    ]
)

In [167]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preparation',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('tf_num',
                                                  RobustScaler(copy=True,
                                                               quantile_range=(25.0,
                                                                               75.0),
                                                               with_centering=True,
                                                               with_scaling=True),
                                                  Index(['polarity', 'subjectivity'], dtype='object')),
                                                 ('tf_cat',
                                                  CountVectorizer(analyzer='word',
                                           

In [168]:
y_max_pred = pipe.predict(X_test)
print("RFR:", round(accuracy_score(y_test, y_max_pred), 5))

RFR: 0.33333


In [169]:
from pycaret.classification import *
exp_reg101 = setup(data = df, target = 'eligibility')

Unnamed: 0,Description,Value
0,session_id,7411
1,Target,eligibility
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(87, 7)"
5,Missing Values,False
6,Numeric Features,2
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


In [170]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.6167,0.5,0.0,0.0,0.0,0.0,0.0,0.004
knn,K Neighbors Classifier,0.4833,0.3993,0.2833,0.27,0.2486,-0.0852,-0.1255,0.008
lr,Logistic Regression,0.4333,0.1611,0.05,0.025,0.0333,-0.2452,-0.2961,0.007
ridge,Ridge Classifier,0.4,0.0,0.05,0.025,0.0333,-0.3,-0.3592,0.004
rf,Random Forest Classifier,0.4,0.0653,0.05,0.025,0.0333,-0.3,-0.3592,0.084
et,Extra Trees Classifier,0.4,0.0875,0.0,0.0,0.0,-0.33,-0.3842,0.065
lda,Linear Discriminant Analysis,0.3833,0.0875,0.05,0.025,0.0333,-0.3333,-0.3852,0.006
gbc,Gradient Boosting Classifier,0.3667,0.1083,0.0833,0.045,0.0571,-0.3649,-0.4257,0.021
dt,Decision Tree Classifier,0.35,0.1875,0.0833,0.0583,0.0667,-0.3786,-0.4261,0.006
svm,SVM - Linear Kernel,0.35,0.0,0.3333,0.1667,0.2186,-0.303,-0.3681,0.005


DummyClassifier(constant=None, random_state=7411, strategy='prior')