# Classification du domaine

## A. Préparation du fichier

In [1]:
import pandas as pd

In [2]:
# Read every line of the OpenSubtitles source file; line endings are kept.
with open('sources_opus/OpenSubtitles_fr.txt', encoding='utf-8') as fichier_opus:
    data_opus = list(fichier_opus)

In [3]:
# Number of lines read from the OpenSubtitles file.
nb_lignes = len(data_opus)

In [4]:
from random import sample

# Draw distinct line positions at random (sampling without replacement).
# `sample` raises ValueError when asked for more items than the population
# holds, so the request of 200 is capped at the number of available lines.
index_taked = sample(range(nb_lignes), min(200, nb_lignes))

In [5]:
# Pick the subtitle lines found at the randomly drawn positions.
lignes_opus = [data_opus[position] for position in index_taked]

In [6]:
# One row per sampled subtitle line; every row is labelled with theme 0.
colonnes_opus = dict(texte=lignes_opus, theme=0)
df_opus = pd.DataFrame(colonnes_opus)

In [7]:
# Load the previously built labelled dataset.
# NOTE: unpickling executes code stored in the file; only load pickles that
# were produced locally.
df_concat = pd.read_pickle('df_concat.pkl')

# Merge themes 2, 3, 4 and 5 into theme 1.
# The result is assigned back explicitly: calling
# `df_concat["theme"].replace(..., inplace=True)` mutates a temporary Series
# under pandas Copy-on-Write and would leave `df_concat` unchanged.
df_concat["theme"] = df_concat["theme"].replace([2, 3, 4, 5], 1)

In [8]:
# Stack the labelled dataset and the OpenSubtitles sample into one frame.
# NOTE(review): no `ignore_index` is passed, so each frame keeps its own index
# labels and labels may repeat in `df_theme` — confirm nothing downstream
# relies on a unique index.
df_theme = pd.concat([df_concat, df_opus])

In [9]:
# Save the combined dataset to disk
df_theme.to_pickle('df_theme.pkl')

## B. Nettoyage des données

In [11]:
import nltk
import string
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import unidecode

In [12]:
# Shared NLP resources used by the text-cleaning helpers.
stemmer = SnowballStemmer('french')
nlp = spacy.load('fr_core_news_sm')

# French stop words from NLTK plus the two auxiliary verbs, sorted.
# NOTE(review): `sw` is not referenced in the visible part of this file.
sw = sorted(nltk.corpus.stopwords.words('french') + ['être', 'avoir'])

def lemmatise_text(text):
    """Return the lemmas of `text`, space-joined and lower-cased.

    The text is processed by the module-level spaCy pipeline `nlp`; the
    `lemma_` of every token is kept, in order.
    """
    document = nlp(text)
    joined_lemmas = ' '.join(token.lemma_ for token in document)
    return joined_lemmas.lower()


def stem_text(text):
    """Return `text` with every token replaced by its stem, space-joined.

    Tokens come from NLTK's `word_tokenize` and are stemmed with the
    module-level `stemmer`.
    NOTE(review): `word_tokenize` is called without a `language` argument —
    confirm the default tokenizer is the intended one for French text.
    """
    stems = map(stemmer.stem, word_tokenize(text))
    return ' '.join(stems)


def substitute_punctuation(text):
    """Strip ASCII punctuation from `text` and normalise its whitespace.

    Straight apostrophes become spaces (so elided words are split apart),
    every other character of `string.punctuation` is deleted, and runs of
    whitespace are collapsed to single spaces with the ends trimmed.
    """
    without_apostrophes = text.replace("'", ' ')
    drop_punctuation = str.maketrans('', '', string.punctuation)
    words = without_apostrophes.translate(drop_punctuation).split()
    return ' '.join(words)


def supp(text):
    """Return `text` without the characters «, ’, • and ®.

    NOTE(review): the closing guillemet » is not in the removed set —
    confirm whether that is intentional.
    """
    unwanted_symbols = str.maketrans('', '', "«’•®")
    return text.translate(unwanted_symbols)

def supprime_accent(txt: str) -> str:
    """Return `txt` passed through `unidecode.unidecode`.

    Presumably used to strip accents by transliterating to ASCII.
    NOTE(review): this helper is not called in the visible part of this file.
    """
    return unidecode.unidecode(txt)

In [13]:
# Load the vectorizer and the classifier

from joblib import load
# Only `transform` is called on this vectorizer in the visible part of the
# file, so it is presumably already fitted when loaded — TODO confirm.
vectoriser_theme = load('vectorizer_classif_question.joblib')
# NOTE(review): `classifier_theme` is not used again in the visible part of
# this file.
classifier_theme = load('model_classif_question.joblib') 

Using TensorFlow backend.


In [14]:
from sklearn.model_selection import train_test_split

# Reload the combined dataset so this section can run from the saved file.
# The frame is bound to `df_theme`, the name the split below reads from; it
# was previously bound to `df_concat`, which left the reload unused.
df_theme = pd.read_pickle('df_theme.pkl')

# 70 % of the rows go to training, the rest to evaluation; the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_theme['texte'],
                                                    df_theme['theme'],
                                                    train_size=0.7,
                                                    random_state=5)

In [15]:
def _clean_texts(textes):
    """Run the cleaning steps on a Series of texts, in a fixed order.

    Order: lemmatisation, stemming, punctuation removal, symbol removal.
    """
    for etape in (lemmatise_text, stem_text, substitute_punctuation, supp):
        textes = textes.apply(etape)
    return textes


# Train and test texts go through exactly the same cleaning steps.
X_train_clean = _clean_texts(X_train)
X_test_clean = _clean_texts(X_test)

## C. Le classifieur

#### 1. Régression Logistique

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [17]:
# Vectorise both splits with the loaded vectorizer (transform only, no fit),
# train split first, then test split.
X_train_clean_vectorized_tfidf, X_test_clean_vectorized_tfidf = map(
    vectoriser_theme.transform, (X_train_clean, X_test_clean)
)

In [18]:
# Logistic regression with the lbfgs solver, at most 500 iterations.
model_lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500)
model_lr.fit(X_train_clean_vectorized_tfidf, y_train)

# Accuracy on the held-out split (last expression, shown as the cell output).
predictions_valid = model_lr.predict(X_test_clean_vectorized_tfidf)
accuracy_score(y_test, predictions_valid)

0.8494623655913979

In [19]:
# Same model with the newton-cg solver, an L2 penalty and at most 100
# iterations; this rebinds `model_lr`.
model_lr = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    max_iter=100,
    penalty="l2",
)
model_lr.fit(X_train_clean_vectorized_tfidf, y_train)

# Accuracy on the held-out split (last expression, shown as the cell output).
predictions_valid = model_lr.predict(X_test_clean_vectorized_tfidf)
accuracy_score(y_test, predictions_valid)

0.8494623655913979

#### 2. SVM

In [20]:
from sklearn import svm

In [23]:
# Linear-kernel support vector classifier with C=10.
model_svm = svm.SVC(kernel='linear', C=10)
model_svm.fit(X_train_clean_vectorized_tfidf, y_train)

# Accuracy on the held-out split (last expression, shown as the cell output).
predictions_valid = model_svm.predict(X_test_clean_vectorized_tfidf)
accuracy_score(y_test, predictions_valid)

0.8924731182795699

In [24]:
from joblib import dump
# Persist the fitted SVM model to 'model_classif_domaine.joblib'.
dump(model_svm, 'model_classif_domaine.joblib')

['model_classif_domaine.joblib']