In [1]:
# librairies
import numpy as np
import pandas as pd
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score

In [2]:
# Tokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

def tokenizer_fct(sentence) :
    # print(sentence)
    sentence_clean = sentence.replace('-', ' ').replace('+', ' ').replace('/', ' ').\
                             replace('#', ' ').replace('<p>', ' ').replace('>', ' ').replace('<', ' ')
    # Remove ponctuation (except # and ++ for c# and c++)
    sentence_clean1 = re.sub('[^\\w\\s#\\s++]', ' ', sentence_clean)

    # Remove numbers
    sentence_clean2 = re.sub(r'\w*\d+\w*', ' ', sentence_clean1)

    # Remove extra spaces
    sentence_clean = re.sub('\s+', ' ', sentence_clean2)
    word_tokens = word_tokenize(sentence_clean)
    return word_tokens

# Stop words
from nltk.corpus import stopwords
stop_w = list(set(stopwords.words('english'))) + ['[', ']', ',', '.', ':', '?', '(', ')']
stop_w.extend(['code', 'quot', 'use', 'http', 'com', 'error', 'work', 'want', 'one', 'would', 'need', 
                   'help', 'also', 'exampl', 'could', 'thing', 'well', 'dear', 'p'])

def stop_word_filter_fct(list_words) :
    filtered_w = [w for w in list_words if not w in stop_w]
#    filtered_w2 = [w for w in filtered_w if len(w) > 2]
    return filtered_w

# lower case et alpha
def lower_start_fct(list_words) :
    lw = [w.lower() for w in list_words if (not w.startswith("@")) 
                                      and (not w.startswith("#"))
                                       and (not w.startswith("http"))]
    return lw

# Lemmatizer (base d'un mot)
from nltk.stem import WordNetLemmatizer

def lemma_fct(list_words) :
    lemmatizer = WordNetLemmatizer()
    lem_w = [lemmatizer.lemmatize(w) for w in list_words]
    return lem_w

# Fonction de préparation du texte pour le bag of words (Countvectorizer et Tf_idf, Word2Vec)
def transform_bow_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(sw)
    # lem_w = lemma_fct(lw)    
    transf_desc_text = ' '.join(lw)
    return transf_desc_text
#    return lw

In [3]:
#import pickle
#file_X_tfidf = open('./models/X_tfidf.pkl', 'rb')
#X_tfidf = pickle.load(file_X_tfidf)
#file_X_tfidf.close()
#file_y_mlb = open('./models/y_mlb.pkl', 'rb')
#y_mlb = pickle.load(file_y_mlb)
#file_y_mlb.close()

In [30]:
def process_text(text):
    # mise en forme du texte
    text_prep = transform_bow_fct(text)
    tt = ""
    for word in text_prep.split(" "):
        tt = tt + word + " "
    text_final = []
    text_final.append(tt)
    print (text_final)
    
    # recherche du classifieur pré-entrainé
    file_clf = open('./models/clf.pkl', 'rb')
    clf = pickle.load(file_clf)
       
    # recherche du vectoriseur pré-entrainé
    file_vect = open('./models/vect.pkl', 'rb')
    vect = pickle.load(file_vect)

    X_text = vect.transform(text_final)    
    y_pred = clf.predict(X_text)
    
    # recherche du multiLabelBinarizer pré-entrainé
    file_mlb = open('./models/mlb.pkl', 'rb')
    mlb = pickle.load(file_mlb)

    y_pred_inversed = mlb.inverse_transform(y_pred)

    return y_pred_inversed

In [31]:
text_sample = "<p>python how to send pdf or other media as messages to people with WhatsApp click to chat.</p>"

In [32]:
print (process_text(text_sample))

['python send pdf media messages people whatsapp click chat ']
[()]


In [None]:
text_prep = preprocessor.fit(text)

In [None]:
text_prep

In [37]:
df = pd.read_csv("data/cleaned/df_final_version.csv",sep=';')

In [39]:
from ast import literal_eval
for col in ['Title', 'Body', 'Tags']:
     df[col] = df[col].apply(literal_eval)

In [41]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_mlb = mlb.fit_transform(df.Tags)

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer="word",
                             max_df=.97,
                             min_df= 3,
                             tokenizer=None,
                             preprocessor=' '.join,
                             stop_words=None,
                             lowercase=False)
X = df.Body
vectorizer.fit(X)
X_tfidf = vectorizer.transform(X)


In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_mlb, test_size=0.3, random_state=42)

In [47]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
final_model = OneVsRestClassifier(LinearSVC(dual=False, max_iter=10000))
final_model.fit(X_train,y_train)

OneVsRestClassifier(estimator=LinearSVC(dual=False, max_iter=10000))

In [49]:
from sklearn import cluster, metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, jaccard_score
y_pred = final_model.predict(X_test)
print('Score ROC_AUC : ', metrics.roc_auc_score(y_test, y_pred,multi_class="ovr"))
print ('Accuracy : ', round(accuracy_score(y_test, y_pred),2))

Score ROC_AUC :  0.6943480522471015
Accuracy :  0.24


In [14]:
#file_final_model = './models/final_model.pkl'
#pickle.dump(final_model, open(file_final_model, 'wb'))

In [50]:
text = "register go multiple route french version app web"
text_tok = transform_bow_fct(text)

In [51]:
tt = ""
for word in text_tok.split(" "):
    tt = tt + word + " "
print (tt)
ta = []
ta.append(tt)
ta

register go multiple route french version app web 


['register go multiple route french version app web ']

Transformation du document en vecteur et prédiction

In [52]:
X_text = vectorizer.transform(ta)
X_text.shape

(1, 26530)

In [53]:
y_pred = final_model.predict(X_text)

In [54]:
y_pred

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [22]:
y_pred_inversed = mlb.inverse_transform(y_pred)
y_pred_inversed

[()]

In [23]:
from mlflow.models.signature import infer_signature

In [24]:
signature = infer_signature(X_train, y_train)

In [26]:
import mlflow.sklearn

In [27]:
mlflow.sklearn.save_model(final_model, 'mlflow_model', signature=signature)

