In [48]:
import os
import numpy as np
import pandas as pd

In [49]:
conda update scikit-learn

Collecting package metadata (current_repodata.json): ...working... done
Note: you may need to restart the kernel to use updated packages.

Solving environment: ...working... 

Updating scikit-learn is constricted by 

anaconda -> requires scikit-learn==0.21.3=py37h6288b17_0

If you are sure you want an update of your package either try `conda update --all` or install a specific version of the package you want using `conda install <pkg>=<version>`

done

## Package Plan ##

  environment location: C:\Users\utilisateur\Anaconda3

  added / updated specs:
    - scikit-learn


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    backports.functools_lru_cache-1.6.1|     pyhd3eb1b0_0          12 KB
    conda-4.9.2                |   py37haa95532_0         2.9 MB
    conda-package-handling-1.7.2|   py37h76e460a_0         724 KB
    future-0.18.2              |           py37_1         646 KB
    ---

In [52]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.22.1.


# 1. Charger le dataset avec les fichier CSV

In [53]:
# télécharger
df_train = pd.read_csv("Data/df_train.csv")
df_test = pd.read_csv("Data/df_test.csv")

df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

In [5]:
print(df_train.shape, df_test.shape)

(16737, 6) (4201, 6)


# 2. prepocessing

In [35]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords

def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Get the list of stop words 
    stop_words = stopwords.words('french')
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if not x in stop_words]
    
    # Create a Snowball stemmer 
    stemmer = FrenchStemmer()

    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return " ".join(tokens_stemmed)

In [13]:
df_train.Contenu_stemmed = df_train.Contenu_du_fichier_txt.apply(process)

  """Entry point for launching an IPython kernel.


In [14]:
df_test.Contenu_stemmed = df_test.Contenu_du_fichier_txt.apply(process)

  """Entry point for launching an IPython kernel.


## 2 bis Supprimer les mots qui ont moins de 3 lettres

In [None]:
import re
def delete_less_than_3_letters(contenu):
    return re.sub(r'\b\w{1,3}\b', '', contenu)

In [None]:
df_train["Contenu_more_than_3_letters"] = df_train.Contenu_stemmed.apply(delete_less_than_3_letters)
df_test["Contenu_more_than_3_letters"] = df_test.Contenu_stemmed.apply(delete_less_than_3_letters)

# 3. Feature engeneering

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.metrics import classification_report
import joblib

In [55]:
tfidf_word = TfidfVectorizer(max_features=1000)
# tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), lowercase=False, max_features=1000)

In [56]:
X_tfidf_word_train = tfidf_word.fit_transform(df_train.Contenu_more_than_3_letters)
# X_tfidf_char_train_complet = tfidf_char_train_complet.fit_transform(df_train.Contenu_du_fichier_txt)
                                     
# X_tfidf_train_complet = sparse.hstack([X_tfidf_word_train_complet, X_tfidf_char_train_complet])

In [40]:
X_tfidf_word_train.shape

(16737, 1000)

In [10]:
X_tfidf_word_test = tfidf_word.transform(df_test.Contenu_more_than_3_letters)
# X_tfidf_char_test_complet = tfidf_char_train_complet.transform(df_test.Contenu_du_fichier_txt)
# X_tfidf_test_complet = sparse.hstack([X_tfidf_word_test_complet, X_tfidf_char_test_complet])

In [57]:
# enregistrer le modèle tf-idf vectorizer
joblib.dump(tfidf_word,"tfidf_word.sav")

['tfidf_word.sav']

In [None]:
# télécharger le modèle tf-idf vectorizer
# tfidf_word=joblib.load("tfidf_word.sav")

## 3 bis Sauvegarder les données transformées par TFIDF ngram=(1,3)

In [None]:
# sauvegarder
sparse.save_npz("X_tfidf_train_complet.npz", X_tfidf_train_complet)
sparse.save_npz("X_tfidf_test_complet.npz", X_tfidf_test_complet)

In [None]:
# télécharger
X_tfidf_train_complet = sparse.load_npz("X_tfidf_train_complet.npz")
X_tfidf_test_complet = sparse.load_npz("X_tfidf_test_complet.npz")

# 4. Entraîner le modèle

In [13]:
from sklearn.svm import SVC

In [67]:
svc = SVC(probability=True)

In [68]:
svc.fit(X_tfidf_word_train, df_train.target)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [69]:
# Enregistrer le modèle de classification
joblib.dump(svc,"svc_for_web_typage.sav")

['svc_for_web_typage.sav']

# 5. Evaluer le modèle

In [21]:
# fonction pour la mise en forme de reporting
def report_mise_en_forme(df):
    df[["precision", "recall", "f1-score"]] = df[["precision", "recall", "f1-score"]].round(2)
    df.support = df.support.astype(int)
    return df

In [22]:
# reporting sur le test
report_svc_test = classification_report(df_test.target, svc.predict(X_tfidf_word_test), output_dict=True)
df_report_svc_test = pd.DataFrame(report_svc_test).transpose()
report_mise_en_forme(df_report_svc_test)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
ACTE DECES,0.99,1.0,0.99,202
ACTE DECES SUITE,1.0,0.6,0.75,5
ACTE NAISSANCE,0.88,0.35,0.5,20
ACTE NAISSANCE SUITE,0.0,0.0,0.0,2
ACTE NOTORIETE,0.96,0.89,0.92,53
ACTE NOTORIETE SUITE,0.86,0.91,0.89,386
AR HERITIERS,0.95,1.0,0.98,21
AR HERITIERS SUITE,0.95,0.9,0.93,21
ATTES HONNEUR,1.0,1.0,1.0,68
ATTES HONNEUR SUITE,1.0,1.0,1.0,24


In [None]:
# reporting sur le train afin de savoir s'il y a overfitting
report_svc_train = classification_report(df_train.target, svc.predict(X_tfidf_word_train), output_dict=True)
df_report_svc_train = pd.DataFrame(report_svc_train).transpose()
report_mise_en_forme(df_report_svc_train)

# 6. Amélioration
- merger certaines catégories principales avec leur suite

In [None]:
# Création du dico merge

dico_merge = {
    "ACTE DECES MERGE": ['ACTE DECES', 'ACTE DECES SUITE'],
    "ACTE NAISSANCE MERGE": ['ACTE NAISSANCE', 'ACTE NAISSANCE SUITE'],
    "ACTE NOTORIETE MERGE": ['ACTE NOTORIETE', 'ACTE NOTORIETE SUITE'],
    "AR HERITIERS MERGE": ['AR HERITIERS', 'AR HERITIERS SUITE'],
    "ATTES HONNEUR MERGE": ['ATTES HONNEUR', 'ATTES HONNEUR SUITE'],
    "AUTO PAIEMENT MERGE": ['AUTO PAIEMENT', 'AUTO PAIEMENT SUITE'],
    "AVIS MERGE": ['AVIS', 'AVIS SUITE'],
    "CERFA 2705 MERGE": ['CERFA 2705', 'CERFA 2705 SUITE'],
    "CERFA 2738 MERGE": ['CERFA 2738', 'CERFA 2738 SUITE'],
    "CERTIF HEREDITE MERGE": ['CERTIF HEREDITE', 'CERTIF HEREDITE SUITE'],
    "CNI MERGE": ['CNI', 'CNI SUITE'],
    "DELIBERA SOCIETE MERGE": ['DELIBERA SOCIETE', 'DELIBERA SOCIETE SUITE'],
    "DEM PRESTA MERGE": ['DEM PRESTA', 'DEM PRESTA SUITE'],
    "DEMANDE RENTE MERGE": ['DEMANDE RENTE', 'DEMANDE RENTE SUITE'],
    "DESIGN NOTAIRE MERGE": ['DESIGN NOTAIRE', 'DESIGN NOTAIRE SUITE'],
    "DEVOLUTION SUCC MERGE": ['DEVOLUTION SUCC', 'DEVOLUTION SUCC SUITE'],
    "DIVERS MERGE": ['DIVERS', 'DIVERS SUITE'],
    "FACTURE MERGE": ['FACTURE', 'FACTURE SUITE'],
    "INSTRUCT NOTAORE MERGE": ['INSTRUCT NOTAORE', 'INSTRUCT NOTAORE SUITE'],
    "LETTRE BENEFICIAIRE MERGE": ['LETTRE BENEFICIAIRE', 'LETTRE BENEFICIAIRE SUITE'],
    "LIVRET FAMILLE MERGE": ['LIVRET FAMILLE', 'LIVRET FAMILLE SUITE'],
    "MANDAT GESTION MERGE": ['MANDAT GESTION', 'MANDAT GESTION SUITE'],
    "MANDAT NOTAIRE MERGE": ['MANDAT NOTAIRE', 'MANDAT NOTAIRE SUITE'],
    "MANIF AYANT DROIT MERGE": ['MANIF AYANT DROIT', 'MANIF AYANT DROIT SUITE'],
    "ORD JUGE TUTELLE MERGE": ['ORD JUGE TUTELLE', 'ORD JUGE TUTELLE SUITE'], 
    "PASSEPORT MERGE": ['PASSEPORT', 'PASSEPORT SUITE'],
    "RENONCIATION MERGE": ['RENONCIATION', 'RENONCIATION SUITE'],
    "RIB MERGE": ['RIB', 'RIB SUITE'],
    "STATUT ASSOC MERGE": ['STATUT ASSOC', 'STATUT ASSOC SUITE']
}

In [None]:
def create_category_merge(colonne_target):
    liste_category_merge = []
    for target_ini in colonne_target:
        for key, value in dico_merge.items():
            if target_ini in value:
                liste_category_merge.append(key)
    return liste_category_merge

In [None]:
df_train["target_merge"] = create_category_merge(df_train.target)
df_test["target_merge"] = create_category_merge(df_test.target)

### Ensuite recommencer étape 5 et 6

### brouillon

In [71]:
svc.classes_

array(['ACTE DECES', 'ACTE DECES SUITE', 'ACTE NAISSANCE',
       'ACTE NAISSANCE SUITE', 'ACTE NOTORIETE', 'ACTE NOTORIETE SUITE',
       'AR HERITIERS', 'AR HERITIERS SUITE', 'ATTES HONNEUR',
       'ATTES HONNEUR SUITE', 'AUTO PAIEMENT', 'AVIS', 'AVIS SUITE',
       'CERFA 2705', 'CERFA 2705 SUITE', 'CERFA 2738', 'CERFA 2738 SUITE',
       'CERTIF HEREDITE', 'CERTIF HEREDITE SUITE', 'CNI', 'CNI SUITE',
       'DELIBERA SOCIETE SUITE', 'DEM PRESTA', 'DEM PRESTA SUITE',
       'DEMANDE RENTE', 'DEMANDE RENTE SUITE', 'DESIGN NOTAIRE',
       'DESIGN NOTAIRE SUITE', 'DEVOLUTION SUCC', 'DEVOLUTION SUCC SUITE',
       'DIVERS', 'DIVERS SUITE', 'FACTURE', 'FACTURE SUITE',
       'INSTRUCT NOTAORE', 'INSTRUCT NOTAORE SUITE',
       'LETTRE BENEFICIAIRE', 'LETTRE BENEFICIAIRE SUITE',
       'LIVRET FAMILLE', 'LIVRET FAMILLE SUITE', 'MANDAT GESTION',
       'MANDAT GESTION SUITE', 'MANDAT NOTAIRE', 'MANDAT NOTAIRE SUITE',
       'MANIF AYANT DROIT', 'MANIF AYANT DROIT SUITE', 'ORD JUGE TUTELL

In [114]:
targets = list(svc.classes_)

In [146]:
file = "901_00822-20200709115505002-SUCC-0102_page_0.txt"
tokenizer = RegexpTokenizer(r'\w+')


with open (file) as f:
    contenu = f.read()
    contenu = contenu.rstrip().encode('utf-8')
    contenu = contenu.decode('utf-8')
    contenu_preprocessed = process(contenu)
    contenu_tfidf = tfidf_word.transform([contenu_preprocessed])
    predictProba = svc.predict_proba(contenu_tfidf)
#     print(predictProba[0][1])
#     print(svc.classes_[1])
    preprobalist = list(predictProba[0])
    dico_proba = {targets[i]: round(preprobalist[i]*100,2) for i in range(len(preprobalist))} 
print(dico_proba)
    
#     index = np.argsort(prediction, axis=1)[:,-3:]
#     index_for_proba = index[0]
#     print(svc.classes_[index][0])
#     print(np.around(prediction[0][index_for_proba]*100, decimals=2))


{'ACTE DECES': 0.02, 'ACTE DECES SUITE': 0.02, 'ACTE NAISSANCE': 0.01, 'ACTE NAISSANCE SUITE': 0.01, 'ACTE NOTORIETE': 0.02, 'ACTE NOTORIETE SUITE': 0.04, 'AR HERITIERS': 0.01, 'AR HERITIERS SUITE': 0.02, 'ATTES HONNEUR': 0.01, 'ATTES HONNEUR SUITE': 0.02, 'AUTO PAIEMENT': 0.01, 'AVIS': 0.01, 'AVIS SUITE': 0.01, 'CERFA 2705': 0.01, 'CERFA 2705 SUITE': 0.04, 'CERFA 2738': 0.01, 'CERFA 2738 SUITE': 0.01, 'CERTIF HEREDITE': 0.01, 'CERTIF HEREDITE SUITE': 0.01, 'CNI': 0.02, 'CNI SUITE': 0.02, 'DELIBERA SOCIETE SUITE': 0.01, 'DEM PRESTA': 98.76, 'DEM PRESTA SUITE': 0.01, 'DEMANDE RENTE': 0.0, 'DEMANDE RENTE SUITE': 0.01, 'DESIGN NOTAIRE': 0.02, 'DESIGN NOTAIRE SUITE': 0.04, 'DEVOLUTION SUCC': 0.01, 'DEVOLUTION SUCC SUITE': 0.04, 'DIVERS': 0.12, 'DIVERS SUITE': 0.09, 'FACTURE': 0.03, 'FACTURE SUITE': 0.04, 'INSTRUCT NOTAORE': 0.05, 'INSTRUCT NOTAORE SUITE': 0.1, 'LETTRE BENEFICIAIRE': 0.02, 'LETTRE BENEFICIAIRE SUITE': 0.05, 'LIVRET FAMILLE': 0.02, 'LIVRET FAMILLE SUITE': 0.02, 'MANDAT GESTI

In [149]:
dico_proba_ordered = {k: v for k, v in sorted(dico_proba.items(), key=lambda item: item[1], reverse=True)}

In [150]:
from itertools import islice
top3 = list(islice(dico_proba_ordered.items(), 3))

In [157]:
top3[0][1]

98.76

[('DEM PRESTA', 0.9875500503378545),
 ('DIVERS', 0.0012453842194217705),
 ('INSTRUCT NOTAORE SUITE', 0.0010170327259706102)]