# Plateforme Agnostique de Traitement et d'Analyse des Textes
### Notebook d'expérimentation
---

## Sujet : Prise en compte des observations labelizer


---


## Observations et environnement
---

In [1]:
cd ../..

/Users/fm/Desktop/Work/Patat


In [2]:
import importlib
import pandas as pd

### Chargement des données

## Expérience
---

### Lemmatisation du corpus

In [None]:
import patat.db.article_db
import patat.db.label_db
import patat.ml.lex_analyser

# Reload the project modules so the objects below are built from the current
# source on disk (presumably to pick up edits without restarting the kernel).
importlib.reload(patat.db.article_db)
importlib.reload(patat.db.label_db)
importlib.reload(patat.ml.lex_analyser)

# Project helpers: article store, label store and lexical analyser.
article_db = patat.db.article_db.ArticleDb()
label_db = patat.db.label_db.LabelDb()
lex = patat.ml.lex_analyser.LexAnalyser()

# Two URL collections for the 'infox' label.
# NOTE(review): presumably _0 = observed as not infox, _1 = observed as infox — confirm in LabelDb.
urls_0, urls_1 = label_db.get_obs_urls('infox')

# Article texts matching each URL collection.
text_0,text_1 = article_db.get_texts(urls_0, urls_1)

# Lemmatise each group separately.
lemmas_0 = lex.get_lemmas_from_text(text_0)
lemmas_1 = lex.get_lemmas_from_text(text_1)

# Words to exclude from the vocabulary; reused later as the vectorizer stop-word list.
# NOTE(review): the selection rule lives in LexAnalyser.get_ignore_words — not visible here.
ignore_words = lex.get_ignore_words(lemmas_0,lemmas_1)

# Displayed as the cell output: number of ignored words.
len(ignore_words)

### Vectorizer

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# TF-IDF features over the lemmatised texts, excluding the ignore_words computed earlier.
# NOTE(review): scikit-learn documents stop_words as 'english', a list or None —
# confirm that ignore_words is a list (not a set) — TODO confirm.
vectorizer = TfidfVectorizer(stop_words=ignore_words)
#count_vectorizer = CountVectorizer()
# NOTE(review): df_recueil is not defined by any cell shown above this one; it must
# come from an unsaved/removed cell, so this fails on Restart & Run All.
X = vectorizer.fit_transform(df_recueil['lemmas'])

In [28]:
y = df_recueil['infox']

In [29]:
X.shape

(489, 9289)

In [30]:
# Convert the vectorizer output to a dense array.
# NOTE: X is rebound to its own conversion, so re-running this cell alone fails
# (the dense array has no .toarray()); re-run the vectorizer cell first.
X = X.toarray()

In [31]:
y.value_counts()

0.0    246
1.0    243
Name: infox, dtype: int64

In [32]:
y = y.astype(float)

### Construction des datasets d'entraînement et de test

In [33]:
import sklearn.model_selection

# Earlier variant kept for reference: 80/20 split without a fixed seed.
#X_train,X_test,y_train,y_test = sklearn.model_selection.train_test_split(X,y,train_size=0.8,shuffle=True)
# 70/30 split; random_state is fixed so the split is reproducible across runs.
X_train,X_test,y_train,y_test = sklearn.model_selection.train_test_split(X,y,random_state=17,train_size=0.7)

# Displayed as the cell output: (n_train_samples, n_features).
X_train.shape

(342, 9289)

### Entraînement du modèle

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Logistic regression with a large C (C is the inverse regularisation strength,
# so C=100 means weak regularisation) and a raised iteration cap.
# NOTE(review): multi_class is deprecated in recent scikit-learn releases and the
# target here is binary — confirm against the installed version.
classifier = LogisticRegression(C=100.0, random_state=42, solver='lbfgs', multi_class='ovr', max_iter=1000)
# Alternative kept for reference: default regularisation.
#classifier = LogisticRegression(max_iter=1000)

# Fit the model on the training split only.
classifier.fit(X_train, y_train)


### Mesure des résultats

In [35]:
y_pred = classifier.predict(X_test)

In [36]:
from sklearn import metrics
# Evaluate the hold-out predictions (y_pred comes from the previous cell).
# confusion_matrix rows are true classes and columns are predicted classes.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Matrice de confusion')
print(cnf_matrix)
# Both scores are printed as percentages with two decimals.
print(f'Accuracy score {metrics.accuracy_score(y_test, y_pred)*100:.2f}%')
# recall_score uses its default positive label (1).
print(f'Recall score {metrics.recall_score(y_test, y_pred)*100:.2f}%')

Matrice de confusion
[[63 17]
 [13 54]]
Accuracy score 79.59%
Recall score 80.60%


### Cross Validation

In [41]:
from sklearn.model_selection import cross_val_score
# 12-fold cross-validation of the classifier on the full feature matrix
# (default scoring for a classifier: accuracy).
# NOTE(review): X was produced by a TfidfVectorizer fitted on all rows, so each
# validation fold has already influenced the features (vocabulary / IDF weights).
# Wrapping vectorizer + classifier in a Pipeline would avoid this — TODO confirm intent.
scores = cross_val_score(classifier, X, y, cv=12)
scores

array([0.80487805, 0.87804878, 0.80487805, 0.75609756, 0.92682927,
       0.7804878 , 0.85365854, 0.65853659, 0.73170732, 0.775     ,
       0.85      , 0.775     ])

In [42]:
print("accuracy : %0.2f \nstandard deviation : %0.2f" % (scores.mean(), scores.std()))

accuracy : 0.80 
standard deviation : 0.07


In [43]:
from sklearn.model_selection import cross_validate
# 12-fold cross-validation reporting both accuracy and recall per fold.
# The result dict gets its own name so that `scores` (the accuracy array from the
# cross_val_score cell) is not silently replaced by an object of a different
# type, which would break re-running the summary print that follows that cell.
cv_results = cross_validate(classifier, X, y, cv=12, scoring=['accuracy', 'recall'])

# One row per fold; columns: fit_time, score_time, test_accuracy, test_recall.
df_scores = pd.DataFrame(cv_results)
df_scores

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall
0,0.174136,0.000691,0.804878,0.9
1,0.183455,0.000688,0.878049,0.85
2,0.187381,0.000648,0.804878,0.8
3,0.201525,0.00069,0.756098,0.85
4,0.157928,0.000642,0.926829,0.9
5,0.219782,0.007732,0.780488,0.65
6,0.309208,0.000659,0.853659,0.761905
7,0.283839,0.000686,0.658537,0.619048
8,0.203458,0.000645,0.731707,0.761905
9,0.143816,0.000648,0.775,0.95


In [56]:
# Summary of the per-fold scores held in df_scores.
# NOTE: pandas .std() is the sample standard deviation (ddof=1), whereas the
# earlier summary of `scores` used numpy's .std() (ddof=0); the two "deviation"
# figures are therefore not computed the same way.
print(f'Accuracy : Mean = {df_scores["test_accuracy"].mean():.2f} \
Deviation = {df_scores["test_accuracy"].std():.2f}')
print(f'Recall : Mean = {df_scores["test_recall"].mean():.2f} \
Deviation = {df_scores["test_recall"].std():.2f}')

Accuracy : Mean = 0.80 Deviation = 0.07
Recall : Mean = 0.79 Deviation = 0.11


### Benchmark algos

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay

# Display names and estimators are two parallel lists in the same order.
names = [
    "Logistic Regression",
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    LogisticRegression(C=100.0, random_state=42, solver='lbfgs', multi_class='ovr', max_iter=1000),
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# zip() silently truncates to the shorter list, which would drop or mislabel a
# classifier if the two lists drift apart; fail loudly instead.
assert len(names) == len(classifiers), 'names and classifiers must have the same length'

# Fit each candidate on the training split and score it on the hold-out split.
# Loop variables are prefixed with `bench_` so the loop does not overwrite the
# notebook-level `y_pred` / `cnf_matrix` that belong to the main classifier.
results = []
for bench_name, bench_clf in zip(names, classifiers):
    bench_clf.fit(X_train, y_train)
    bench_pred = bench_clf.predict(X_test)
    bench_cnf_matrix = metrics.confusion_matrix(y_test, bench_pred)
    print('------------------------------------')
    print(f'Classifier : {bench_name}')
    print('Matrice de confusion')
    print(bench_cnf_matrix)
    bench_acc = metrics.accuracy_score(y_test, bench_pred)
    print(f'Accuracy score {bench_acc*100:.2f}%')
    bench_recall = metrics.recall_score(y_test, bench_pred)
    print(f'Recall score {bench_recall*100:.2f}%')
    # Stored as whole percentages for the comparison table built in the next cell.
    results.append({
        'Classifier' : bench_name,
        'Accuracy' : round(bench_acc*100),
        'Recall' : round(bench_recall*100),
    })
print('------------------------------------')

In [None]:
# Benchmark table, best accuracy first (recall breaks ties).
df = pd.DataFrame(results).sort_values(by=['Accuracy', 'Recall'], ascending=False)
df

### Prédictions sur le corpus

In [None]:
# Work on a copy: the prediction columns added to df_pred further down would
# otherwise be written into df_corpus as well (plain assignment only aliases it).
df_pred = df_corpus.copy()

In [None]:
%%time
# Project the corpus lemmas onto the vocabulary learned by the fitted vectorizer
# (transform only, no refit).
# NOTE: this rebinds X, which until now held the labelled training matrix; the
# training / cross-validation cells above cannot be re-run after this point
# without rebuilding X.
X = vectorizer.transform(df_pred['lemmas'])

In [None]:
%%time
# Predicted class for every corpus row.
# NOTE: this rebinds y, which until now held the observed 'infox' labels.
y = classifier.predict(X)
# Second column of predict_proba: probability of the second entry of classifier.classes_
# (presumably the infox=1 class — TODO confirm).
y_proba = classifier.predict_proba(X)[:,1]

In [None]:
# Attach the predicted class and the predicted probability to the corpus frame.
# Relies on y / y_proba having been computed from df_pred in the same row order.
df_pred.loc[:,'infox_pred'] = y
df_pred.loc[:,'infox_proba'] = y_proba

In [None]:
df_pred

## Sauvegarde des résultats
---

### Sauvegarde du prédicteur

In [None]:
import pickle
def save_predictor(vectorizer,classifier,data='',filename=''):
    """Pickle a vectorizer/classifier pair (plus optional data) to disk.

    The three objects are stored in a single dict with the keys
    ``'vectorizer'``, ``'classifier'`` and ``'data'``.

    Args:
        vectorizer: Vectorizer object to persist.
        classifier: Classifier object to persist.
        data: Optional free-form payload stored alongside the two models.
        filename: Destination path. When empty (or None), a path of the form
            ``data/predictors/<unix-timestamp>.pp`` is generated.

    Raises:
        OSError: If the destination directory or file cannot be created.
        pickle.PicklingError: If one of the objects cannot be pickled.
    """
    # Imported locally: the notebook never imports `time` (the original default
    # path therefore raised NameError) nor `os`.
    import os
    import time

    predictor = {
        'vectorizer' : vectorizer,
        'classifier' : classifier,
        'data' : data
    }
    if not filename:
        filename = f'data/predictors/{str(int(time.time()))}.pp'
    # Create the target directory when the path has one and it does not exist yet.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filename, 'wb') as file:
        pickle.dump(predictor, file)

In [None]:
save_predictor(vectorizer,classifier,filename='221013-InfoxPred.pp')

In [None]:
df_label_pred = df_pred[['url','infox_proba']].copy()

In [None]:
# Shape the per-URL probabilities into label records: the probability becomes
# `value`, and every row is tagged with the label name, the owner id and the type.
df_label_pred = (
    df_label_pred
    .rename(columns={'infox_proba': 'value'})
    .assign(label='infox', owner='221013-InfoxPred', type='pred')
)

In [None]:
df_label_pred

In [None]:
label_db.merge_labels(df_label_pred)

In [None]:
label_db.df()

In [None]:
label_db.save_prod()

## Conclusions
---

- Production du corpus labellisé avec prédictions : OK

## Bricolages
---

In [None]:
df_pred