In [22]:
#Importation des librairies utilisées
import unicodedata 
import time
import pandas as pd
import numpy as np
import random
import nltk
import collections
import itertools
import csv
import warnings

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

In [23]:
# Répertoire de travail
DATA_DIR = "C:/Users/ETIENNE/Documents/Work/INSA/4A/Projets 4gmm 2018/"

# Nom des fichiers
training_reduit_path = DATA_DIR + "INSA_wefight_data_clean.csv"
# Variable Globale
HEADER_TEST = ['Question','Intent','BlockId', 'Action']
HEADER_TRAIN =['Question','Intent','BlockId', 'Action']

In [24]:
def split_dataset(input_path, nb_line, tauxValid,columns):
    time_start = time.time()
    data_all = pd.read_csv(input_path,sep=",",names=columns,nrows=nb_line) #cree data frame
    data_all = data_all.fillna("") #remplace les na par " "
    data_train, data_valid = train_test_split(data_all, test_size = tauxValid) # Split arrays or matrices into random train and test subsets
    time_end = time.time()
    print("Split Takes %d s" %(time_end-time_start))
    return data_train, data_valid

nb_line=20000  # part totale extraite du fichier initial ici déjà réduit
tauxValid=0.10 # part totale extraite du fichier initial ici déjà réduit
data_train, data_valid = split_dataset(training_reduit_path, nb_line, tauxValid, HEADER_TRAIN)
data_train.head()

Split Takes 3 s


Unnamed: 0,Question,Intent,BlockId,Action
434,j'ai une une séance de rayon pendant que j'éta...,#2-118_QVDP_Grossesse_Malformations,598d7089e4b03f0d40c73834,wiki_cancer
5001,ma dose de traitement,Profile_write_doseTraitement,596340c2e4b0a226d0f53612,conversation_FichePatientWrite:doseTraitement
3468,PAC et chimiothérapie,#6-60_TRTEINS_PAC,59919288e4b0feb288a76cc3,wiki_cancer
2583,quels sont les effets du taxol et de l hormono...,#6-24_TRTEINS_hormonotherapie,59632c41e4b0a226d067cc6d,wiki_cancer
1736,comment diminuer la fatigue après une mastecto...,#2-64-0_QVDP_Fatigue,59942265e4b068eebf52bb9f,wiki_cancer


In [25]:
# Librairies 
from bs4 import BeautifulSoup #Nettoyage d'HTML
import re # Regex
import nltk # Nettoyage des données

## listes de mots à supprimer dans la description des produits
## Depuis NLTK
nltk_stopwords = nltk.corpus.stopwords.words('french') 
## Depuis Un fichier externe.
lucene_stopwords = [unicode(w, "utf-8") for w in open(DATA_DIR+"lucene_stopwords.txt").read().split(",")] #En local

## Union des deux fichiers de stopwords 
stopwords = list(set(nltk_stopwords).union(set(lucene_stopwords)))

## Fonction de setmming de stemming permettant la racinisation
stemmer=nltk.stem.SnowballStemmer('french')

In [26]:
# Fonction clean générale
def clean_txt(txt):
    ### remove html stuff
    txt = BeautifulSoup(txt,"html.parser",from_encoding='utf-8').get_text() #nettoyage donnee html
    ### lower case
    txt = txt.lower()
    ### special escaping character '...'
    txt = txt.replace(u'\u2026','.')
    txt = txt.replace(u'\u00a0',' ')
    ### remove accent btw
    txt = unicodedata.normalize('NFD', txt).encode('ascii', 'ignore')
    ###txt = unidecode(txt)
    ### remove non alphanumeric char
    txt = re.sub('[^a-z_]', ' ', txt)
    ### remove french stop words
    tokens = [w for w in txt.split() if (len(w)>2) and (w not in stopwords)]
    ### french stemming
    tokens = [stemmer.stem(token) for token in tokens]
    #Stemmers remove morphological affixes from words, leaving only the word stem
    ### tokens = stemmer.stemWords(tokens)
    return ' '.join(tokens)
    #join() returns a string in which the string elements of sequence have been joined by str separator.

def clean_marque(txt):
    txt = re.sub('[^a-zA-Z0-9]', '_', txt).lower()
    return txt

In [27]:
# fonction de nettoyage du fichier(stemming et liste de mots à supprimer)
def clean_df(input_data, column_names= ['Question','Intent','BlockId', 'Action']):
    #Test if columns entry match columns names of input data
    column_names_diff= set(column_names).difference(set(input_data.columns))
    #set.difference   new set with elements in column_names but not in input_data.columns
    
    if column_names_diff: #rentre dans la boucle si column_names différent zero
        # warning = exception
        warnings.warn("Column(s) '"+", ".join(list(column_names_diff)) +"' do(es) not match columns of input data", Warning)
        
    nb_line = input_data.shape[0]
    print("Start Clean %d lines" %nb_line)
    
    # Cleaning start for each columns
    time_start = time.time()
    clean_list=[]
    for column_name in column_names:
        column = input_data[column_name].values
        if column_name == "Question":
            array_clean = np.array(map(clean_txt,column))
            
        elif column_name == "Intent":
            array_clean = np.asarray(input_data['Intent']) #on recopie telle quelle la colonne intent  
            
        else:
            array_clean = np.array(map(clean_marque,column))
            #applies a function to all the items in an input_list
            #map(function_to_apply, list_of_inputs)
        clean_list.append(array_clean)
    time_end = time.time()
    print("Cleaning time: %d secondes"%(time_end-time_start))
    
    #Convert list to DataFrame
    array_clean = np.array(clean_list).T
    data_clean = pd.DataFrame(array_clean, columns = column_names)
    return data_clean

In [28]:
# Take approximately 2 minutes fors 100.000 rows
data_valid_clean = clean_df(data_valid)
data_train_clean = clean_df(data_train)

Start Clean 502 lines
Cleaning time: 1 secondes
Start Clean 4511 lines
Cleaning time: 4 secondes


In [29]:
data_train_cleanOrigin = data_train_clean 

In [30]:
np.shape(data_train_clean)

(4511, 4)

In [31]:
#effectif de chaque categories
CatCount= collections.Counter(data_train_cleanOrigin['Intent']).items()
CatCount

[('#2-88_Reprise_Cycle', 7),
 ('#2-135_QVDP_Alimentation_Alcool', 24),
 ('#2-55_QVDP_Douleur', 78),
 ('#5-37_Soutien_Psychologique', 52),
 ('#2-97_QVDP_ResteACharge', 22),
 ('#2-60_QVDP_RadioT_Diarrh\xc3\x83\xc2\xa9e', 20),
 ('Profile_write_role', 1),
 ('#1-5_Menu_Aidant', 39),
 ('#6-41_TRTEINS_ManchonLymphodeme', 39),
 ('#8-9_DDP_Charte_patient', 7),
 ('#6-2_TRTEINS_Cycles', 14),
 ('#6-35_TRTEINS_PrevenirLymphoedeme', 8),
 ('#6-63_TRTEINS_PAC_Precautions', 13),
 ('#2-125_QVDP_Sexualite_Reconstruction', 50),
 ('#6-57_TRTEINS_Yeux_Secs', 77),
 ('#6-44_TRTEINS_AllergieManchon', 9),
 ('Profile_write_nomTraitement', 1),
 ('#9-53_Informations_depistagesein', 48),
 ('#2-122_QVDP_Sexualite_Alopecie', 12),
 ('#2-58_QVDP_RadioT_Eff', 25),
 ('#2-36_QVDP_Alopecie_Pourquoi', 104),
 ('#6-5_TRTEINS_ChimioInjectable', 3),
 ('#2-130_QVDP_Alimentation', 114),
 ('#9-53_Depistage_cancer_sein', 8),
 ('#2-75_QVDP_SportPrecautions', 17),
 ('Conversation_Historique', 6),
 ('#2-73_QVDP_Sport', 42),
 ('#2-53_Q

In [32]:
#on echantillonne pour avoir le même nbre de questions dans chaque categorie
for k in range (np.shape(CatCount)[0]):
    if (CatCount[k][1] < 193): #le plus quand nbre de question est 193
        DfCat = (data_train_clean.loc[data_train_clean['Intent']== CatCount[k][0],:]) #con recupere les dataframe avec l'intent souhaite
        nbreLigneManquant = 193 - CatCount[k][1] #on regarde combien il manque de question
        if nbreLigneManquant < np.shape(DfCat)[0]: #lorsque on doit prendre quelques colonnes on tire aleatoirement
            DfCatRand = DfCat.sample(n=nbreLigneManquant)
            data_train_clean = pd.concat([data_train_clean, DfCatRand], ignore_index=True)#on concatene avec l'ancien dataframe        
        else:
            taille = np.shape(DfCat)[0]
            DfCatTemp = DfCat.append([DfCat]*193,ignore_index=True) #on cree un dataframe des categories manquante assez gros pour pouvoirtirer aleatoirement les questions qu'on garde
            DfCatTemp = DfCatTemp.sample(n=nbreLigneManquant)
            data_train_clean = pd.concat([data_train_clean, DfCatTemp], ignore_index=True)
np.shape(data_train_clean)

(27985, 4)

In [33]:
CatCount= collections.Counter(data_train_clean['Intent']).items()
CatCount

[('#2-88_Reprise_Cycle', 193),
 ('#2-135_QVDP_Alimentation_Alcool', 193),
 ('#2-55_QVDP_Douleur', 193),
 ('#5-37_Soutien_Psychologique', 193),
 ('#2-97_QVDP_ResteACharge', 193),
 ('#2-60_QVDP_RadioT_Diarrh\xc3\x83\xc2\xa9e', 193),
 ('Profile_write_role', 193),
 ('#1-5_Menu_Aidant', 193),
 ('#6-41_TRTEINS_ManchonLymphodeme', 193),
 ('#8-9_DDP_Charte_patient', 193),
 ('#6-2_TRTEINS_Cycles', 193),
 ('#6-35_TRTEINS_PrevenirLymphoedeme', 193),
 ('#6-63_TRTEINS_PAC_Precautions', 193),
 ('#2-125_QVDP_Sexualite_Reconstruction', 193),
 ('#6-57_TRTEINS_Yeux_Secs', 193),
 ('#6-44_TRTEINS_AllergieManchon', 193),
 ('Profile_write_nomTraitement', 193),
 ('#9-53_Informations_depistagesein', 193),
 ('#2-122_QVDP_Sexualite_Alopecie', 193),
 ('#2-58_QVDP_RadioT_Eff', 193),
 ('#2-36_QVDP_Alopecie_Pourquoi', 193),
 ('#6-5_TRTEINS_ChimioInjectable', 193),
 ('#2-130_QVDP_Alimentation', 193),
 ('#9-53_Depistage_cancer_sein', 193),
 ('#2-75_QVDP_SportPrecautions', 193),
 ('Conversation_Historique', 193),
 ('#

In [34]:
np.shape(data_train_clean)

(27985, 4)

In [35]:
#L5 = (data_train_clean.loc[data_train_clean['Intent']=='#8-9_DDP_Charte_patient',:])

In [36]:
data_train_clean

Unnamed: 0,Question,Intent,BlockId,Action
0,seanc rayon etais enceint beb peut malform nai...,#2-118_QVDP_Grossesse_Malformations,598d7089e4b03f0d40c73834,wiki_cancer
1,dos trait,Profile_write_doseTraitement,596340c2e4b0a226d0f53612,conversation_fichepatientwrite_dosetraitement
2,pac chimiotherap,#6-60_TRTEINS_PAC,59919288e4b0feb288a76cc3,wiki_cancer
3,effet taxol hormonotherap,#6-24_TRTEINS_hormonotherapie,59632c41e4b0a226d067cc6d,wiki_cancer
4,diminu fatigu apre mastectom reconstruct,#2-64-0_QVDP_Fatigue,59942265e4b068eebf52bb9f,wiki_cancer
5,veux infos catheter,#6-60_TRTEINS_PAC,59919288e4b0feb288a76cc3,wiki_cancer
6,chimiotherap ambulatoir,#2-106_QVDP_Social_HAD,598b4541e4b03f0d36de29d9,wiki_cancer
7,fertilit march,#2-87_QVDP_Fertilite,598b4175e4b03f0d36ca2350,wiki_cancer
8,obstruct chambr implant,#6-64_TRTEINS_PAC_Risques,5991933de4b0feb288ac2399,wiki_cancer
9,soulag douleur suit trait pouvon fair cur,#2-56_QVDP_DiminuerDouleur,59843d02e4b03f0d130d8199,wiki_cancer


In [44]:
#ajout d'un data frame ayant 2 lignes dont l intent est Charte patient
L5 = (data_train_clean.loc[data_train_clean['Intent']=='#8-9_DDP_Charte_patient',:])
L2 = L5.sample(n=3)
AAA = pd.concat([data_train_clean, L2], ignore_index=True)

In [43]:
#ajout d'un data frame * 5 ayant 2 lignes dont l intent est Charte patient
#L5 = (data_train_clean.loc[data_train_clean['Intent']=='#8-9_DDP_Charte_patient',:])
#L2 = L5.sample(n=2)
#data_train_clean.append([L5]*5)

# Prediction

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import FeatureHasher

In [49]:
def vectorizer_train(df,column):
    col = df[column]
    # TFIDF
    vec = TfidfVectorizer(
            min_df = 1, #on prend tous les mots
            stop_words =stopwords,
            smooth_idf=True,
            norm='l2',
            sublinear_tf=True,
            use_idf=True, #tf avec idf
            ngram_range=(1,1)) 
    tfidf=vec.fit_transform(col)
    return vec,tfidf

def apply_vectorizer(df, vec, columns):
    
    data_hash = map(lambda x : " ".join(x), df[columns].values)  
    tfidf=vec.transform(df[columns])

    # TFIDF
    #tfidf=vec.transform(df)
    return tfidf

In [50]:
vec,X = vectorizer_train(data_train_clean,"Question")
Y = data_train_clean["Intent"].values
Xv = apply_vectorizer(data_valid_clean,vec,"Question")
Yv = data_valid_clean["Intent"].values

In [51]:
# Regression Logistique 
## estimation
from sklearn.linear_model import LogisticRegression
#si on augmente C, on augmente bcp le score
cla = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True,
                          intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear',
                          max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
cla.fit(X,Y)
score=cla.score(X,Y)
Y_predict = cla.predict(X)
                
print('# training score:',score)

('# training score:', 0.96401643737716636)


In [52]:
## erreur en validation
scoreValidation=cla.score(Xv,Yv)
predict_v = cla.predict(Xv)
##probleme car tfidf et Xv pas le même nbre de colonne 
print('# validation score:',scoreValidation)

('# validation score:', 0.69521912350597614)


Score d'apprentissage bien meilleur qu'avec les données brutes, mais le score de validation gagne que 2%