In [1]:
#Importation des librairies utilisées
import unicodedata 
import time
import pandas as pd
import numpy as np
import random
import nltk
import collections
import itertools
import csv
import warnings

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt



In [4]:
# Répertoire de travail
DATA_DIR = "/home/bouchero/Documents/projet/"

# Nom des fichiers
training_reduit_path = DATA_DIR + "INSA_wefight_data_clean.csv"
# Variable Globale
HEADER_TEST = ['Question','Intent','BlockId', 'Action']
HEADER_TRAIN =['Question','Intent','BlockId', 'Action']

In [5]:
def split_dataset(input_path, nb_line, tauxValid,columns):
    time_start = time.time()
    data_all = pd.read_csv(input_path,sep=",",names=columns,nrows=nb_line) #cree data frame
    data_all = data_all.fillna("") #remplace les na par " "
    data_train, data_valid = train_test_split(data_all, test_size = tauxValid) # Split arrays or matrices into random train and test subsets
    time_end = time.time()
    print("Split Takes %d s" %(time_end-time_start))
    return data_train, data_valid

nb_line=20000  # part totale extraite du fichier initial ici déjà réduit
tauxValid=0.10 # part totale extraite du fichier initial ici déjà réduit
data_train, data_valid = split_dataset(training_reduit_path, nb_line, tauxValid, HEADER_TRAIN)
data_train.head()

Split Takes 0 s


Unnamed: 0,Question,Intent,BlockId,Action
1013,chimio et perte de cheveux,#2-36_QVDP_Alopecie_Pourquoi,5984243ae4b03f0d12766ac7,wiki_cancer
281,Grossesse,#2-107_QVDP_Grossesse,598d6de3e4b03f0d40b7891f,wiki_cancer
360,cancer opération et grossesse,#2-111_QVDP_Grossesse_Chirurgie,598d6edce4b03f0d40bd4b11,wiki_cancer
1852,quand faire des siestes ?,#2-72_QVDP_Fatigue_Sieste,59846316e4b03f0d141962b2,wiki_cancer
271,@Sexualite_Grossesse @Cancer,#2-107_QVDP_Grossesse,598d6de3e4b03f0d40b7891f,wiki_cancer


In [6]:
# Librairies 
from bs4 import BeautifulSoup #Nettoyage d'HTML
import re # Regex
import nltk # Nettoyage des données

## listes de mots à supprimer dans la description des produits
## Depuis NLTK
nltk_stopwords = nltk.corpus.stopwords.words('french') 
## Depuis Un fichier externe.
lucene_stopwords = [unicode(w, "utf-8") for w in open(DATA_DIR+"lucene_stopwords.txt").read().split(",")] #En local

## Union des deux fichiers de stopwords 
stopwords = list(set(nltk_stopwords).union(set(lucene_stopwords)))

## Fonction de setmming de stemming permettant la racinisation
stemmer=nltk.stem.SnowballStemmer('french')

In [7]:
# Fonction clean générale
def clean_txt(txt):
    ### remove html stuff
    txt = BeautifulSoup(txt,"html.parser",from_encoding='utf-8').get_text() #nettoyage donnee html
    ### lower case
    txt = txt.lower()
    ### special escaping character '...'
    txt = txt.replace(u'\u2026','.')
    txt = txt.replace(u'\u00a0',' ')
    ### remove accent btw
    txt = unicodedata.normalize('NFD', txt).encode('ascii', 'ignore')
    ###txt = unidecode(txt)
    ### remove non alphanumeric char
    txt = re.sub('[^a-z_]', ' ', txt)
    ### remove french stop words
    tokens = [w for w in txt.split() if (len(w)>2) and (w not in stopwords)]
    ### french stemming
    tokens = [stemmer.stem(token) for token in tokens]
    #Stemmers remove morphological affixes from words, leaving only the word stem
    ### tokens = stemmer.stemWords(tokens)
    return ' '.join(tokens)
    #join() returns a string in which the string elements of sequence have been joined by str separator.

def clean_marque(txt):
    txt = re.sub('[^a-zA-Z0-9]', '_', txt).lower()
    return txt

In [8]:
# fonction de nettoyage du fichier(stemming et liste de mots à supprimer)
def clean_df(input_data, column_names= ['Question','Intent','BlockId', 'Action']):
    #Test if columns entry match columns names of input data
    column_names_diff= set(column_names).difference(set(input_data.columns))
    #set.difference   new set with elements in column_names but not in input_data.columns
    
    if column_names_diff: #rentre dans la boucle si column_names différent zero
        # warning = exception
        warnings.warn("Column(s) '"+", ".join(list(column_names_diff)) +"' do(es) not match columns of input data", Warning)
        
    nb_line = input_data.shape[0]
    print("Start Clean %d lines" %nb_line)
    
    # Cleaning start for each columns
    time_start = time.time()
    clean_list=[]
    for column_name in column_names:
        column = input_data[column_name].values
        if column_name == "Question":
            array_clean = np.array(map(clean_txt,column))
            
        elif column_name == "Intent":
            array_clean = np.asarray(input_data['Intent']) #on recopie telle quelle la colonne intent  
            
        else:
            array_clean = np.array(map(clean_marque,column))
            #applies a function to all the items in an input_list
            #map(function_to_apply, list_of_inputs)
        clean_list.append(array_clean)
    time_end = time.time()
    print("Cleaning time: %d secondes"%(time_end-time_start))
    
    #Convert list to DataFrame
    array_clean = np.array(clean_list).T
    data_clean = pd.DataFrame(array_clean, columns = column_names)
    return data_clean

In [9]:
stopwords_perso = ["veux", "apres", "faire", "peux", "dois", "vais", "peut", "fait"]
stopwords2 = list(set(stopwords).union(set(stopwords_perso)))

# Fonction clean générale
def clean_txt2(txt):
    ### remove html stuff
    txt = BeautifulSoup(txt,"html.parser",from_encoding='utf-8').get_text() #nettoyage donnee html
    ### lower case
    txt = txt.lower()
    ### special escaping character '...'
    txt = txt.replace(u'\u2026','.')
    txt = txt.replace(u'\u00a0',' ')
    ### remove accent btw
    txt = unicodedata.normalize('NFD', txt).encode('ascii', 'ignore')
    ###txt = unidecode(txt)
    ### remove non alphanumeric char
    txt = re.sub('[^a-z_]', ' ', txt)
    ### remove french stop words
    tokens = [w for w in txt.split() if (len(w)>2) and (w not in stopwords2)]
    ### french stemming
    tokens = [stemmer.stem(token) for token in tokens]
    #Stemmers remove morphological affixes from words, leaving only the word stem
    ### tokens = stemmer.stemWords(tokens)
    return ' '.join(tokens)
    #join() returns a string in which the string elements of sequence have been joined by str separator.

    # fonction de nettoyage du fichier(stemming et liste de mots à supprimer)
def clean_df2(input_data, column_names= ['Question','Intent','BlockId', 'Action']):
    #Test if columns entry match columns names of input data
    column_names_diff= set(column_names).difference(set(input_data.columns))
    #set.difference   new set with elements in column_names but not in input_data.columns
    
    if column_names_diff: #rentre dans la boucle si column_names différent zero
        # warning = exception
        warnings.warn("Column(s) '"+", ".join(list(column_names_diff)) +"' do(es) not match columns of input data", Warning)
        
    nb_line = input_data.shape[0]
    print("Start Clean %d lines" %nb_line)
    
    # Cleaning start for each columns
    time_start = time.time()
    clean_list=[]
    for column_name in column_names:
        column = input_data[column_name].values
        if column_name == "Question":
            array_clean = np.array(map(clean_txt2,column))
            
        elif column_name == "Intent":
            array_clean = np.asarray(input_data['Intent']) #on recopie telle quelle la colonne intent  
            
        else:
            array_clean = np.array(map(clean_marque,column))
            #applies a function to all the items in an input_list
            #map(function_to_apply, list_of_inputs)
        clean_list.append(array_clean)
    time_end = time.time()
    print("Cleaning time: %d secondes"%(time_end-time_start))
    
    #Convert list to DataFrame
    array_clean = np.array(clean_list).T
    data_clean = pd.DataFrame(array_clean, columns = column_names)
    return data_clean

data_valid_clean1 = clean_df2(data_valid)
data_train_clean1 = clean_df2(data_train)

Start Clean 502 lines
Cleaning time: 0 secondes
Start Clean 4511 lines
Cleaning time: 1 secondes


In [28]:
# Take approximately 2 minutes fors 100.000 rows
data_valid_clean = clean_df(data_valid)
data_train_clean = clean_df(data_train)

Start Clean 502 lines
Cleaning time: 1 secondes
Start Clean 4511 lines
Cleaning time: 4 secondes


In [11]:
data_train_cleanOrigin = data_train_clean1 

In [12]:
np.shape(data_train_clean1)

(4511, 4)

In [13]:
#effectif de chaque categories
CatCount= collections.Counter(data_train_cleanOrigin['Intent']).items()
CatCount

[('#2-135_QVDP_Alimentation_Alcool', 22),
 ('#2-55_QVDP_Douleur', 79),
 ('#5-37_Soutien_Psychologique', 52),
 ('#2-97_QVDP_ResteACharge', 24),
 ('#2-60_QVDP_RadioT_Diarrh\xc3\x83\xc2\xa9e', 18),
 ('Profile_write_role', 1),
 ('#1-5_Menu_Aidant', 43),
 ('#6-41_TRTEINS_ManchonLymphodeme', 37),
 ('#8-9_DDP_Charte_patient', 7),
 ('#6-6_TRTEINS_ChimioOrale', 18),
 ('#6-35_TRTEINS_PrevenirLymphoedeme', 6),
 ('#6-63_TRTEINS_PAC_Precautions', 15),
 ('#2-125_QVDP_Sexualite_Reconstruction', 50),
 ('#6-57_TRTEINS_Yeux_Secs', 79),
 ('#6-44_TRTEINS_AllergieManchon', 8),
 ('Profile_write_nomTraitement', 1),
 ('#9-53_Informations_depistagesein', 44),
 ('#2-122_QVDP_Sexualite_Alopecie', 12),
 ('#2-58_QVDP_RadioT_Eff', 24),
 ('#2-36_QVDP_Alopecie_Pourquoi', 99),
 ('#6-5_TRTEINS_ChimioInjectable', 4),
 ('#2-130_QVDP_Alimentation', 119),
 ('#9-53_Depistage_cancer_sein', 8),
 ('#2-53_QVDP_Cils', 29),
 ('Conversation_Historique', 6),
 ('#6-96_TRTEINS_Perte_Poids', 68),
 ('#2-75_QVDP_SportPrecautions', 16),


In [14]:
#on echantillonne pour avoir le même nbre de questions dans chaque categorie
for k in range (np.shape(CatCount)[0]):
    if (CatCount[k][1] < 193): #le plus quand nbre de question est 193
        DfCat = (data_train_clean1.loc[data_train_clean1['Intent']== CatCount[k][0],:]) #con recupere les dataframe avec l'intent souhaite
        nbreLigneManquant = 193 - CatCount[k][1] #on regarde combien il manque de question
        if nbreLigneManquant < np.shape(DfCat)[0]: #lorsque on doit prendre quelques colonnes on tire aleatoirement
            DfCatRand = DfCat.sample(n=nbreLigneManquant)
            data_train_clean = pd.concat([data_train_clean1, DfCatRand], ignore_index=True)#on concatene avec l'ancien dataframe        
        else:
            taille = np.shape(DfCat)[0]
            DfCatTemp = DfCat.append([DfCat]*193,ignore_index=True) #on cree un dataframe des categories manquante assez gros pour pouvoirtirer aleatoirement les questions qu'on garde
            DfCatTemp = DfCatTemp.sample(n=nbreLigneManquant)
            data_train_clean1 = pd.concat([data_train_clean1, DfCatTemp], ignore_index=True)
np.shape(data_train_clean)

(24140, 4)

In [15]:
CatCount= collections.Counter(data_train_clean1['Intent']).items()
CatCount

[('#2-135_QVDP_Alimentation_Alcool', 193),
 ('#2-55_QVDP_Douleur', 193),
 ('#5-37_Soutien_Psychologique', 193),
 ('#2-97_QVDP_ResteACharge', 193),
 ('#2-60_QVDP_RadioT_Diarrh\xc3\x83\xc2\xa9e', 193),
 ('Profile_write_role', 193),
 ('#1-5_Menu_Aidant', 193),
 ('#6-41_TRTEINS_ManchonLymphodeme', 193),
 ('#8-9_DDP_Charte_patient', 193),
 ('#6-6_TRTEINS_ChimioOrale', 193),
 ('#6-35_TRTEINS_PrevenirLymphoedeme', 193),
 ('#6-63_TRTEINS_PAC_Precautions', 193),
 ('#2-125_QVDP_Sexualite_Reconstruction', 193),
 ('#6-57_TRTEINS_Yeux_Secs', 193),
 ('#6-44_TRTEINS_AllergieManchon', 193),
 ('Profile_write_nomTraitement', 193),
 ('#9-53_Informations_depistagesein', 193),
 ('#2-122_QVDP_Sexualite_Alopecie', 193),
 ('#2-58_QVDP_RadioT_Eff', 193),
 ('#2-36_QVDP_Alopecie_Pourquoi', 99),
 ('#6-5_TRTEINS_ChimioInjectable', 193),
 ('#2-130_QVDP_Alimentation', 119),
 ('#9-53_Depistage_cancer_sein', 193),
 ('#2-53_QVDP_Cils', 193),
 ('Conversation_Historique', 193),
 ('#6-96_TRTEINS_Perte_Poids', 193),
 ('#2-

In [16]:
np.shape(data_train_clean1)

(27241, 4)

In [35]:
#L5 = (data_train_clean.loc[data_train_clean['Intent']=='#8-9_DDP_Charte_patient',:])

In [17]:
data_train_clean1

Unnamed: 0,Question,Intent,BlockId,Action
0,chimio pert cheveux,#2-36_QVDP_Alopecie_Pourquoi,5984243ae4b03f0d12766ac7,wiki_cancer
1,grossess,#2-107_QVDP_Grossesse,598d6de3e4b03f0d40b7891f,wiki_cancer
2,canc oper grossess,#2-111_QVDP_Grossesse_Chirurgie,598d6edce4b03f0d40bd4b11,wiki_cancer
3,siest,#2-72_QVDP_Fatigue_Sieste,59846316e4b03f0d141962b2,wiki_cancer
4,sexualite_grossess canc,#2-107_QVDP_Grossesse,598d6de3e4b03f0d40b7891f,wiki_cancer
5,infos canc,#9-2_Informations_cancer,59632c41e4b0a226d067cd48,wiki_cancer
6,cach cheveux lor repouss,#2-45_QVDP_Alopecie_Perruque,5984386ee4b03f0d12ec6c33,wiki_cancer
7,pris charg securit social nouvel ald mem,#2-96_QVDP_Social_Priseencharge,598b43bee4b03f0d36d5f410,wiki_cancer
8,perdr cheveux,#2-41_QVDP_Alopecie_Diminuer,59843588e4b03f0d12d83b08,wiki_cancer
9,manchon compress,#6-41_TRTEINS_ManchonLymphodeme,598b5110e4b03f0d3718d13f,wiki_cancer


In [18]:
#ajout d'un data frame ayant 2 lignes dont l intent est Charte patient
L5 = (data_train_clean1.loc[data_train_clean1['Intent']=='#8-9_DDP_Charte_patient',:])
L2 = L5.sample(n=3)
AAA = pd.concat([data_train_clean1, L2], ignore_index=True)

In [43]:
#ajout d'un data frame * 5 ayant 2 lignes dont l intent est Charte patient
#L5 = (data_train_clean.loc[data_train_clean['Intent']=='#8-9_DDP_Charte_patient',:])
#L2 = L5.sample(n=2)
#data_train_clean.append([L5]*5)

# Prediction

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import FeatureHasher

In [20]:
def vectorizer_train(df,column):
    col = df[column]
    # TFIDF
    vec = TfidfVectorizer(
            min_df = 1, #on prend tous les mots
            stop_words =stopwords,
            smooth_idf=True,
            norm='l2',
            sublinear_tf=True,
            use_idf=True, #tf avec idf
            ngram_range=(1,1)) 
    tfidf=vec.fit_transform(col)
    return vec,tfidf

def apply_vectorizer(df, vec, columns):
    
    data_hash = map(lambda x : " ".join(x), df[columns].values)  
    tfidf=vec.transform(df[columns])

    # TFIDF
    #tfidf=vec.transform(df)
    return tfidf

In [21]:
vec,X = vectorizer_train(data_train_clean1,"Question")
Y = data_train_clean1["Intent"].values
Xv = apply_vectorizer(data_valid_clean1,vec,"Question")
Yv = data_valid_clean1["Intent"].values

In [22]:
# Regression Logistique 
## estimation
from sklearn.linear_model import LogisticRegression
#si on augmente C, on augmente bcp le score
cla = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True,
                          intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear',
                          max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
cla.fit(X,Y)
score=cla.score(X,Y)
Y_predict = cla.predict(X)
                
print('# training score:',score)

('# training score:', 0.95668294115487684)


In [24]:
## erreur en validation
scoreValidation=cla.score(Xv,Yv)
predict_v = cla.predict(Xv)
##probleme car tfidf et Xv pas le même nbre de colonne 
print('# validation score:',scoreValidation)

('# validation score:', 0.68326693227091628)


Score d'apprentissage bien meilleur qu'avec les données brutes, mais le score de validation gagne que 2%