# Classification du domaine

## A. Préparation du fichier

In [2]:
import pandas as pd

In [3]:
# Read the whole OpenSubtitles French corpus into memory: one list entry per
# line of the file, each entry still carrying its trailing newline.
with open('../sources_opus/OpenSubtitles_fr.txt', 'r', encoding='utf-8') as f:
    data_opus = list(f)

In [4]:
# Total number of lines read from the corpus file; bounds the sampling range.
nb_lignes = len(data_opus)

In [5]:
from random import sample, seed

# Fix the RNG state so that the same line indices are drawn on every run.
seed(1234)
# Draw 200 distinct line indices (sampling without replacement).
index_taked = sample(range(nb_lignes), 200)

In [6]:
# Pick the corpus lines located at the randomly drawn indices.
lignes_opus = [data_opus[i] for i in index_taked]

In [7]:
# Wrap the sampled lines in a DataFrame; every row gets the domain label 0.
df_opus = pd.DataFrame({'texte' : lignes_opus, 'domaine' : 0})

In [8]:
# Load the second dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df_faq = pd.read_pickle('df_concat.pkl')

In [9]:
# Label every row of this dataset with domain 1 and keep only the two columns
# it has in common with df_opus ('texte' and 'domaine').
df_faq = df_faq.assign(domaine=1)[['texte', 'domaine']]

In [10]:
# Stack both labelled datasets into a single frame.
# ignore_index=True renumbers the rows 0..n-1: without it the row labels of
# df_opus and df_faq are kept as-is, so any label they share would appear
# twice and label-based lookups (.loc) on the result would be ambiguous.
df_domaine = pd.concat([df_opus, df_faq], ignore_index=True)

In [11]:
# Save the combined, labelled dataset to disk (it is reloaded in section B).
df_domaine.to_pickle('df_classif_domaine.pkl')

## B. Nettoyage des données

In [12]:
import nltk
import string
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import unidecode

In [13]:
# French Snowball stemmer and small French spaCy pipeline, shared by the
# text-cleaning helpers defined in this cell.
stemmer = SnowballStemmer('french')
nlp = spacy.load('fr_core_news_sm')

# NLTK French stop words extended with two auxiliary verbs, in sorted order.
sw = sorted(nltk.corpus.stopwords.words('french') + ['être', 'avoir'])

def lemmatise_text(text):
    """Return *text* with each spaCy token replaced by its lemma.

    The lemmas are joined with single spaces and the resulting string is
    lower-cased.
    """
    doc = nlp(text)
    return ' '.join(tok.lemma_ for tok in doc).lower()


def stem_text(text):
    """Return *text* with every NLTK word token replaced by its French stem.

    The stems are joined with single spaces.
    """
    # NOTE(review): word_tokenize is called with its default language setting;
    # confirm whether language='french' was intended for this corpus.
    return ' '.join(map(stemmer.stem, word_tokenize(text)))


def substitute_punctuation(text):
    """Strip ASCII punctuation from *text* and normalise its whitespace.

    Apostrophes are turned into spaces, every other character listed in
    ``string.punctuation`` is deleted, and runs of whitespace are collapsed
    to a single space (leading/trailing whitespace is dropped).
    """
    # Apostrophes become spaces so elided forms ("l'ami") split into two words.
    without_apostrophes = text.replace("'", ' ')
    deletion_table = str.maketrans('', '', string.punctuation)
    words = without_apostrophes.translate(deletion_table).split()
    return ' '.join(words)


def supp(text):
    """Return *text* with the characters «, ’, • and ® removed."""
    # NOTE(review): the closing guillemet » is not removed here - confirm
    # whether that asymmetry is intentional.
    for unwanted in ("«", "’", "•", "®"):
        text = text.replace(unwanted, "")
    return text

def supprime_accent(txt):
    """Return the result of passing *txt* through ``unidecode.unidecode``.

    ``unidecode`` transliterates Unicode text to an ASCII approximation,
    which drops accents from accented letters.
    """
    sans_accent = unidecode.unidecode(txt)
    return sans_accent

In [14]:
# Load the previously saved vectorizer from 'vectorizer_classif_theme.joblib'.
# NOTE(review): only the vectorizer is loaded in this cell, no classifier.
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.

from joblib import load
vectoriser_theme = load('vectorizer_classif_theme.joblib')

In [15]:
from sklearn.model_selection import train_test_split

# Reload the labelled dataset saved in section A, then split texts and labels
# into train (70 %) and test (remaining 30 %) sets. random_state is fixed so
# the split is the same on every run.
df_domaine = pd.read_pickle('df_classif_domaine.pkl')
X_train, X_test, y_train, y_test = train_test_split(df_domaine['texte'], 
                                                    df_domaine['domaine'],
                                                    train_size=0.7,
                                                    random_state=5)

In [16]:
def _clean_texts(texts):
    """Run the cleaning steps, in order, over every element of *texts*."""
    # Order matters: lemmatise -> stem -> strip punctuation -> drop symbols.
    for step in (lemmatise_text, stem_text, substitute_punctuation, supp):
        texts = texts.apply(step)
    return texts


# Both splits go through exactly the same cleaning pipeline.
X_train_clean = _clean_texts(X_train)
X_test_clean = _clean_texts(X_test)

## C. Le classifieur

#### 1. Régression Logistique

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [18]:
# Vectorize both cleaned splits with the loaded vectorizer. Only transform()
# is called: the vectorizer is not re-fitted on this data.
X_train_clean_vectorized_tfidf = vectoriser_theme.transform(X_train_clean)
X_test_clean_vectorized_tfidf = vectoriser_theme.transform(X_test_clean)

In [19]:
# Logistic regression with the lbfgs solver, trained on the vectorized
# training split and scored by accuracy on the test split.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# check the installed version before upgrading.
model_lr = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=500,
)
model_lr.fit(X_train_clean_vectorized_tfidf, y_train)
predictions_valid = model_lr.predict(X_test_clean_vectorized_tfidf)
accuracy_score(y_test, predictions_valid)

0.8387096774193549

In [20]:
# Same logistic regression, this time with the newton-cg solver and an
# explicit L2 penalty; rebinds model_lr and predictions_valid.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# check the installed version before upgrading.
model_lr = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    max_iter=100,
    penalty="l2",
)
model_lr.fit(X_train_clean_vectorized_tfidf, y_train)
predictions_valid = model_lr.predict(X_test_clean_vectorized_tfidf)
accuracy_score(y_test, predictions_valid)

0.8387096774193549

#### 2. SVM

In [21]:
from sklearn import svm

In [22]:
# Linear-kernel support vector classifier (C=10), trained on the vectorized
# training split and scored by accuracy on the test split.
model_svm = svm.SVC(kernel='linear', C=10)
model_svm.fit(X_train_clean_vectorized_tfidf, y_train)
predictions_valid = model_svm.predict(X_test_clean_vectorized_tfidf)
accuracy_score(y_test, predictions_valid)

0.9032258064516129

In [23]:
from joblib import dump
# Persist the trained SVM classifier to 'model_classif_domaine.joblib'.
dump(model_svm, 'model_classif_domaine.joblib')

['model_classif_domaine.joblib']