In [2]:
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sp
import re
import joblib
import sklearn.feature_extraction.text as txt

from paths import joblib_dir
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import make_column_transformer

import classification_lib as cl
import cosine_normalisation_pipeline as cnp
import stop_words_perso as swp 
import mispell_dict as md

train = pd.read_csv('train.csv')

y = train.iloc[:, 11:]

# transformation des targets variables catégorielles
#y_transformed = y.apply(lambda x: pd.cut(x,
#                        [-0.1, .25, .5, .75, 1.1],
#                        labels=['low', 'medium-', 'medium+', 'high']))
y_transformed = y

# séparation en cas d'études séparés sur les questions ou answers
y_question = y_transformed.loc[:, y_transformed.columns.str.startswith('question')]
y_answer = y_transformed.loc[:, y_transformed.columns.str.startswith('answer')]

to_delete_var = ['qa_id', 'url',
                 'question_user_name', 'question_user_page',
                 'answer_user_name', 'answer_user_page']

X = train.iloc[:, :11].drop(to_delete_var, 1)
X_title = train.question_title
X_question = train.question_body
X_answer = train.answer

# nombre de lignes avec passage à la ligne comme proxy
linebreak_re = r'\n'
# longueur/verbosité avec nombre de caractères comme proxy
chars_re = r'.'

numbers_re = r'\d\.?\d*'
links_re = r'www[^\s]*(?=\s)|http[^\s]*(?=\s)'
demonstrations_re = r'(?<=\n).*[&\^=\+\_\[\]\{\}\|]+.*(?=\n)'
belonging_re = r'\'s'
# TODO: densité de ponctuation ?
# question_mark = r'\?'


count_encoder_union = make_union(
    cl.PatternCounter(chars_re),
    cl.PatternEncoder(numbers_re),
    cl.PatternEncoder(links_re),
    cl.PatternEncoder(demonstrations_re),
    verbose=True
)

full_count_encoder_union = make_union(
    cl.PatternCounter(linebreak_re),
    count_encoder_union,
    verbose=True
)

cleaner_pipeline = make_pipeline(
    cl.PatternRemover(numbers_re),
    cl.PatternRemover(links_re),
    cl.PatternRemover(demonstrations_re),
    cl.PatternRemover(belonging_re),
    cl.SpellingCorrecter(),
    verbose=True
)

cleaner_count_encoder_ct = make_column_transformer(
    ('passthrough', ['question_title']),
    (cleaner_pipeline, ['question_body']),
    (cleaner_pipeline, ['answer']),
    ('passthrough', ['category', 'host']),
    (count_encoder_union, ['question_title']),
    (full_count_encoder_union, ['question_body']),
    (full_count_encoder_union, ['answer']),
    remainder='drop',
    verbose=True
)

X_transformed = pd.DataFrame(
    data=cleaner_count_encoder_ct.fit_transform(train),
    columns=[
        'question_title', 'question_body', 'answer',
        'category', 'host',
        'title_chars',  'title_num', 'title_links', 'title_demo',
        'question_linebreak', 'question_chars', 'question_num', 
        'question_links', 'question_demo',
        'answer_linebreak', 'answer_chars', 'answer_num', 
        'answer_links', 'answer_demo'
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y_transformed,
    test_size=0.15
)

stop_words = list(txt.ENGLISH_STOP_WORDS)
for words in swp.stop_words_to_remove:
    stop_words.remove(words)
stop_words += swp.cs_stop_words \
              + swp.generated_during_tokenizing

title_tfidftransformer = cl.LemmaTfidfVectorizer(
    sublinear_tf=True,
    stop_words=stop_words,
    min_df=0.015,
    max_df=0.85,
    ngram_range=(1,2)
)

question_tfidftransformer = cl.LemmaTfidfVectorizer(
    sublinear_tf=True,
    stop_words=stop_words,
    min_df=0.015,
    max_df=0.85,
    ngram_range=(1,2)
)

answer_tfidftransformer = cl.LemmaTfidfVectorizer(
    sublinear_tf=True,
    stop_words=stop_words,
    min_df=0.015,
    max_df=0.85,
    ngram_range=(1,2)
)

title_tfidf_acp_pipe = make_pipeline(
    cl.Squeezer(),
    title_tfidftransformer,
    TruncatedSVD(n_components=15),
    verbose=True
)

question_tfidf_acp_pipe = make_pipeline(
    cl.Squeezer(),
    question_tfidftransformer,
    TruncatedSVD(n_components=220),
    verbose=True
)

answer_tfidf_acp_pipe = make_pipeline(
    cl.Squeezer(),
    answer_tfidftransformer,
    TruncatedSVD(n_components=250),
    verbose=True
)

cat_host_ohe = OneHotEncoder(drop='first', sparse=False)

tfidf_ohe_ct = make_column_transformer(
    (title_tfidf_acp_pipe, 0),
    (question_tfidf_acp_pipe, 1),
    (answer_tfidf_acp_pipe, 2),
    (cat_host_ohe, [3,4]),
    verbose=True,
    remainder='passthrough'
)

X_train_transformed = tfidf_ohe_ct.fit_transform(X_train).astype(float)

cosine_tfidftransformer = cl.LemmaTfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1,2)
)

cosine_tfidftransformer.fit(
    X_train.question_title
    + ' ' + X_train.question_body
    + ' ' + X_train.answer
)

norm_transformer = StandardScaler().fit(X_train_transformed)

X_train_transformed = cnp.do_and_stack_cosine(
    cosine_tfidftransformer,
    X_train_transformed,
    X_train
#    norm_transformer, 
#    norm=True
)

# for test usage:
X_test_transformed = cnp.do_and_stack_cosine(
    cosine_tfidftransformer,
    tfidf_ohe_ct.transform(X_test),
    X_test
#    norm_transformer, 
#    norm=True
)

[ColumnTransformer] . (1 of 7) Processing passthrough-1, total=   0.0s
[Pipeline] .. (step 1 of 5) Processing patternremover-1, total=   0.1s
[Pipeline] .. (step 2 of 5) Processing patternremover-2, total=   0.1s
[Pipeline] .. (step 3 of 5) Processing patternremover-3, total=   0.2s
[Pipeline] .. (step 4 of 5) Processing patternremover-4, total=   0.0s
[Pipeline] . (step 5 of 5) Processing spellingcorrecter, total=   0.2s
[ColumnTransformer] .... (2 of 7) Processing pipeline-1, total=   0.5s
[Pipeline] .. (step 1 of 5) Processing patternremover-1, total=   0.1s
[Pipeline] .. (step 2 of 5) Processing patternremover-2, total=   0.1s
[Pipeline] .. (step 3 of 5) Processing patternremover-3, total=   0.2s
[Pipeline] .. (step 4 of 5) Processing patternremover-4, total=   0.0s
[Pipeline] . (step 5 of 5) Processing spellingcorrecter, total=   0.2s
[ColumnTransformer] .... (3 of 7) Processing pipeline-2, total=   0.6s
[ColumnTransformer] . (4 of 7) Processing passthrough-2, total=   0.0s
[Featu

In [7]:
norm_transformer = StandardScaler().fit(X_train_transformed)

In [8]:
X_train_transformed = norm_transformer.transform(X_train_transformed)
X_test_transformed = norm_transformer.transform(X_test_transformed)

In [9]:
X_train = X_train_transformed
X_test = X_test_transformed

In [None]:
l=[]

for col in list(y_transformed.columns):
    for item in y_transformed[col].unique():
        l.append(item)
set(l)

In [None]:
# test Multiple models

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import RegressorChain

clf_rfr = RandomForestRegressor(random_state=0)

param_grid_rfr = [{'n_estimators': [10, 50, 100],
                   'min_samples_leaf': [1, 3, 5],
                   'max_features': ['sqrt', 'log2']}]


clf_chain = RegressorChain(RandomForestRegressor(random_state=0), order=None, cv=None, random_state=0)

param_grid_chain = [{'base_estimator__n_estimators': [10, 50, 100],
                   'base_estimator__min_samples_leaf': [1, 3, 5],
                   'base_estimator__max_features': ['sqrt', 'log2']}]

gridcvs={}

for pgrid, clf, name in zip((param_grid_rfr,
                             param_grid_chain),
                            (clf_rfr, 
                             clf_chain),
                            ('RFR', 'chained_RFR')):
    gcv = GridSearchCV(clf,
                       pgrid,
                       cv=3,
                       refit=True)
    gridcvs[name] = gcv


outer_cv = KFold(n_splits=3, shuffle=True)
outer_scores = {}

for name, gs in gridcvs.items():
    nested_score = cross_val_score(gs, 
                                   X_train, 
                                   y_train, cv=outer_cv)
    outer_scores[name] = nested_score
    
outer_scores

In [None]:
chain = gridcvs['chained_RFR']
chain.fit(X_train, y_train)

chain.best_params_

rfr = gridcvs['RFR']
rfr.fit(X_train, y_train)

In [48]:
# test one model 

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import RegressorChain

clf_rfr = RandomForestRegressor(random_state=0)

param_grid_rfr = [{'n_estimators': [10, 50, 100],
                   'min_samples_leaf': [1, 3, 5],
                   'max_features': ['sqrt', 'log2']}]


clf_chain = RegressorChain(RandomForestRegressor(random_state=0), 
                           order=[16,26,18,28,1,11,7,20,10,8,15,6,5,2,22,14,0,4,12,17,3,27,25,29,13,23,21,24,19,9],
                           cv=None, 
                           random_state=0)

param_grid_chain = [{'base_estimator__n_estimators': [100],
                   'base_estimator__min_samples_leaf': [1, 3],
                   'base_estimator__max_features': ['sqrt']}]


gcv = GridSearchCV(clf_chain,param_grid_chain,cv=3, refit=True)



In [49]:
gcv.fit(X_train, y_train)
y_pred = gcv.predict(X_train)



In [50]:
gcv.best_params_

{'base_estimator__max_features': 'sqrt',
 'base_estimator__min_samples_leaf': 1,
 'base_estimator__n_estimators': 100}

In [51]:
import numpy as np 
from scipy import stats

y_pred = gcv.predict(X_test)
corrs=[]
for col in range(len(y_test.columns)):
    corr = stats.spearmanr(pd.DataFrame(y_pred).iloc[:,col], y_test.iloc[:,col])
    corrs.append(corr.correlation)

mean_spearman = np.mean(corrs)

mean_spearman
pd.DataFrame({'target':list(y_test.columns), 'score':corrs}).sort_values(by='score', ascending=False).index

Int64Index([16, 26,  1, 28, 18, 11,  7, 10, 20, 22, 15,  6,  8, 14,  5,  2,  0,
             4, 12, 25, 17,  3, 21, 27, 24, 13, 19, 29, 23,  9],
           dtype='int64')

In [52]:

mean_spearman

0.29922502188829403

In [45]:
y_test

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
187,1.000000,0.444444,0.0,1.000000,1.000000,1.0,0.666667,0.666667,0.666667,0.0,...,0.666667,1.000000,0.555556,1.000000,1.000000,0.866667,0.666667,0.333333,0.666667,0.888889
1509,0.888889,0.666667,0.0,0.666667,0.666667,1.0,0.444444,0.333333,0.333333,0.0,...,0.444444,1.000000,0.666667,1.000000,1.000000,0.933333,1.000000,0.000000,0.000000,0.888889
1954,1.000000,0.666667,0.0,0.666667,0.666667,1.0,0.555556,0.444444,0.333333,0.0,...,0.888889,1.000000,0.666667,1.000000,1.000000,0.933333,0.666667,0.000000,0.666667,0.888889
370,1.000000,0.333333,0.0,0.500000,1.000000,1.0,0.500000,0.333333,0.000000,0.0,...,0.833333,1.000000,0.666667,1.000000,1.000000,0.800000,1.000000,0.500000,0.500000,1.000000
5945,0.888889,0.777778,0.0,0.666667,1.000000,1.0,0.444444,0.333333,0.000000,0.0,...,0.777778,1.000000,0.666667,1.000000,0.888889,0.933333,0.666667,0.000000,0.666667,0.888889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
932,0.777778,0.555556,0.0,0.333333,1.000000,1.0,0.666667,0.444444,0.333333,0.0,...,0.666667,0.666667,0.333333,1.000000,0.666667,0.400000,1.000000,0.000000,0.000000,0.833333
4561,1.000000,0.333333,0.0,0.666667,1.000000,1.0,0.444444,0.333333,0.000000,0.0,...,0.888889,0.888889,0.555556,1.000000,0.888889,0.733333,0.000000,0.000000,1.000000,0.888889
4706,0.666667,0.666667,0.0,1.000000,1.000000,1.0,0.666667,0.333333,0.000000,0.0,...,1.000000,0.666667,0.333333,0.666667,0.666667,0.600000,0.000000,0.000000,1.000000,1.000000
5791,1.000000,0.777778,0.0,0.666667,1.000000,1.0,0.777778,0.666667,0.000000,0.0,...,0.777778,0.777778,0.666667,0.888889,0.888889,0.800000,0.666667,0.333333,0.333333,0.888889


In [46]:
y_test_cut= y_test.apply(lambda x: pd.cut(x,
                        [-0.1, .25, .5, .75, 1.1],
                        labels=['low', 'medium-', 'medium+', 'high']))

y_pred_cut= pd.DataFrame(y_pred).apply(lambda x: pd.cut(x,
                        [-0.1, .25, .5, .75, 1.1],
                        labels=['low', 'medium-', 'medium+', 'high']))

from sklearn.metrics import accuracy_score
scores=[]
for col in range(len(y_test.columns)):
    scores.append(accuracy_score(y_test_cut.iloc[:,col], y_pred_cut.iloc[:,col]))
    
mean_scores = np.mean(scores)
mean_scores

0.6809210526315789

0.6827850877192982