In [1]:
#Importation des librairies utilisées
import unicodedata 
import time
import pandas as pd
import numpy as np
import random
import nltk
import collections
import itertools
import csv
import warnings

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt



In [2]:
# Répertoire de travail
DATA_DIR = "C:/Users/ETIENNE/Documents/Work/INSA/4A/Projets 4gmm 2018/"

# Nom des fichiers
training_reduit_path = DATA_DIR + "INSA_wefight_data_clean.csv"
# Variable Globale
HEADER_TEST = ['Question','Intent','BlockId', 'Action']
HEADER_TRAIN =['Question','Intent','BlockId', 'Action']

In [3]:
def split_dataset(input_path, nb_line, tauxValid,columns):
    time_start = time.time()
    data_all = pd.read_csv(input_path,sep=",",names=columns,nrows=nb_line) #cree data frame
    data_all = data_all.fillna("") #remplace les na par " "
    data_train, data_valid = train_test_split(data_all, test_size = tauxValid) # Split arrays or matrices into random train and test subsets
    time_end = time.time()
    print("Split Takes %d s" %(time_end-time_start))
    return data_train, data_valid

nb_line=20000  # part totale extraite du fichier initial ici déjà réduit
tauxValid=0.10 # part totale extraite du fichier initial ici déjà réduit
data_train, data_valid = split_dataset(training_reduit_path, nb_line, tauxValid, HEADER_TRAIN)
data_train.head()

Split Takes 0 s


Unnamed: 0,Question,Intent,BlockId,Action
2583,quels sont les effets du taxol et de l hormono...,#6-24_TRTEINS_hormonotherapie,59632c41e4b0a226d067cc6d,wiki_cancer
1134,Qd les cheveux repoussent ils ?,#2-38_QVDP_Alopecie_Repousse,598434d5e4b03f0d12d31a20,wiki_cancer
4886,j'ai un rendez 25/01/2011,conversation_rappel_rendezvous,596340b8e4b0a226d0f4e811,conversation_rappelRendezVous
5001,ma dose de traitement,Profile_write_doseTraitement,596340c2e4b0a226d0f53612,conversation_FichePatientWrite:doseTraitement
822,influence de l'alimentation sur la maladie,#2-130_QVDP_Alimentation,5991c543e4b0b2045b0c568f,wiki_cancer


In [4]:
# Librairies 
from bs4 import BeautifulSoup #Nettoyage d'HTML
import re # Regex
import nltk # Nettoyage des données

## listes de mots à supprimer dans la description des produits
## Depuis NLTK
nltk_stopwords = nltk.corpus.stopwords.words('french') 
## Depuis Un fichier externe.
lucene_stopwords = [unicode(w, "utf-8") for w in open(DATA_DIR+"lucene_stopwords.txt").read().split(",")] #En local

## Union des deux fichiers de stopwords 
stopwords = list(set(nltk_stopwords).union(set(lucene_stopwords)))

## Fonction de setmming de stemming permettant la racinisation
stemmer=nltk.stem.SnowballStemmer('french')

In [5]:
# Fonction clean générale
def clean_txt(txt):
    ### remove html stuff
    txt = BeautifulSoup(txt,"html.parser",from_encoding='utf-8').get_text() #nettoyage donnee html
    ### lower case
    txt = txt.lower()
    ### special escaping character '...'
    txt = txt.replace(u'\u2026','.')
    txt = txt.replace(u'\u00a0',' ')
    ### remove accent btw
    txt = unicodedata.normalize('NFD', txt).encode('ascii', 'ignore')
    ###txt = unidecode(txt)
    ### remove non alphanumeric char
    txt = re.sub('[^a-z_]', ' ', txt)
    ### remove french stop words
    tokens = [w for w in txt.split() if (len(w)>2) and (w not in stopwords)]
    ### french stemming
    tokens = [stemmer.stem(token) for token in tokens]
    #Stemmers remove morphological affixes from words, leaving only the word stem
    ### tokens = stemmer.stemWords(tokens)
    return ' '.join(tokens)
    #join() returns a string in which the string elements of sequence have been joined by str separator.

def clean_marque(txt):
    txt = re.sub('[^a-zA-Z0-9]', '_', txt).lower()
    return txt

In [6]:
# fonction de nettoyage du fichier(stemming et liste de mots à supprimer)
def clean_df(input_data, column_names= ['Question','Intent','BlockId', 'Action']):
    #Test if columns entry match columns names of input data
    column_names_diff= set(column_names).difference(set(input_data.columns))
    #set.difference   new set with elements in column_names but not in input_data.columns
    
    if column_names_diff: #rentre dans la boucle si column_names différent zero
        # warning = exception
        warnings.warn("Column(s) '"+", ".join(list(column_names_diff)) +"' do(es) not match columns of input data", Warning)
        
    nb_line = input_data.shape[0]
    print("Start Clean %d lines" %nb_line)
    
    # Cleaning start for each columns
    time_start = time.time()
    clean_list=[]
    for column_name in column_names:
        column = input_data[column_name].values
        if column_name == "Question":
            array_clean = np.array(map(clean_txt,column))
            
        elif column_name == "Intent":
            array_clean = np.asarray(input_data['Intent']) #on recopie telle quelle la colonne intent  
            
        else:
            array_clean = np.array(map(clean_marque,column))
            #applies a function to all the items in an input_list
            #map(function_to_apply, list_of_inputs)
        clean_list.append(array_clean)
    time_end = time.time()
    print("Cleaning time: %d secondes"%(time_end-time_start))
    
    #Convert list to DataFrame
    array_clean = np.array(clean_list).T
    data_clean = pd.DataFrame(array_clean, columns = column_names)
    return data_clean

In [7]:
# Take approximately 2 minutes fors 100.000 rows
data_valid_clean = clean_df(data_valid)
data_train_clean = clean_df(data_train)

Start Clean 502 lines
Cleaning time: 0 secondes
Start Clean 4511 lines
Cleaning time: 4 secondes


In [8]:
#data_valid_clean

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import FeatureHasher

In [10]:
def vectorizer_train(df,column):
    col = df[column]
    # TFIDF
    vec = TfidfVectorizer(
            min_df = 1, #on prend tous les mots
            stop_words =stopwords,
            smooth_idf=True,
            norm='l2',
            sublinear_tf=True,
            use_idf=True, #tf avec idf
            ngram_range=(1,1)) 
    tfidf=vec.fit_transform(col)
    return vec,tfidf

def apply_vectorizer(df, vec, columns):
    
    data_hash = map(lambda x : " ".join(x), df[columns].values)  
    tfidf=vec.transform(df[columns])

    # TFIDF
    #tfidf=vec.transform(df)
    return tfidf

# prediction

In [11]:
vec,X = vectorizer_train(data_train_clean,"Question")
Y = data_train_clean["Intent"].values
Xv = apply_vectorizer(data_valid_clean,vec,"Question")
Yv = data_valid_clean["Intent"].values

In [12]:
Xv

<502x1935 sparse matrix of type '<type 'numpy.float64'>'
	with 1341 stored elements in Compressed Sparse Row format>

In [13]:
# Regression Logistique 
## estimation
from sklearn.linear_model import LogisticRegression
#si on augmente C, on augmente bcp le score
cla = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True,
                          intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear',
                          max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
cla.fit(X,Y)
score=cla.score(X,Y)
Y_predict = cla.predict(X)
                
print('# training score:',score)

('# training score:', 0.79716249168698738)


In [14]:
## erreur en validation
scoreValidation=cla.score(Xv,Yv)
predict_v = cla.predict(Xv)
##probleme car tfidf et Xv pas le même nbre de colonne 
print('# validation score:',scoreValidation)

('# validation score:', 0.6394422310756972)


In [88]:
sorted(collections.Counter(predict_v).items(), key=lambda x : x[1], reverse=True)

[('#6-49_TRTEINS_Peau', 59),
 ('#2-130_QVDP_Alimentation', 29),
 ('#2-36_QVDP_Alopecie_Pourquoi', 22),
 ('#6-60_TRTEINS_PAC', 21),
 ('#6-1_TRTEINS_Chimiotherapie', 19),
 ('#2-55_QVDP_Douleur', 17),
 ('#9-2_Informations_cancer', 17),
 ('#6-98_TRTEINS_EI_Frequents', 15),
 ('#2-107_QVDP_Grossesse', 15),
 ('#6-97_TRTEINS_Nausees_Vomissements', 14),
 ('#6-18_TRTEINS_Radiotherapie', 14),
 ('#6-53_TRTEINS_Aphtes', 11),
 ('#2-45_QVDP_Alopecie_Perruque', 11),
 ('#6-90_TRTEINS_Mauvais_Gout', 10),
 ('#2-64-0_QVDP_Fatigue', 10),
 ('#2-96_QVDP_Social_Priseencharge', 9),
 ('#6-41_TRTEINS_ManchonLymphodeme', 8),
 ('#6-73_TRTEINS_Chirurgie_Mastectomie', 8),
 ('#6-92_TRTEINS_Diarrhee', 8),
 ('#6-57_TRTEINS_Yeux_Secs', 7),
 ('#2-120_QVDP_Sexualite', 7),
 ('#2-115_QVDP_Grossesse_Allaitement', 7),
 ('#9-53_Informations_depistagesein', 6),
 ('#6-58_TRTEINS_Bouche_Seche', 6),
 ('conversation_rappel_rendezvous', 6),
 ('#2-125_QVDP_Sexualite_Reconstruction', 5),
 ('#6-67_TRTEINS_Chirurgie_Questions', 5),
 ('#

### F test

precision (also called positive predictive value) is the fraction of relevant instances among the retrieved (= extrait) instances



recall (also known as sensitivity) is the fraction of relevant instances that have been retrieved over the total amount of relevant instances. Both precision and recall are therefore based on an understanding and measure of relevance.



Example : Suppose a computer program for recognizing dogs in photographs identifies 8 dogs in a picture containing 12 dogs and some cats. Of the 8 dogs identified, 5 actually are dogs (true positives), while the rest are cats (false positives). The program's precision is 5/8 while its recall is 5/12

In [17]:
from sklearn.metrics import f1_score

Les problèmes : Xv et X ne sont pas de même taille, Xv est plus petit. On cut X pour avoir la même taille que Xv ou on fait un tirage aleatoire?


En essayant de cut, j'ai tjrs pas reussis ...

In [84]:
f1_score(XvListe,XListe,"macro")

ValueError: continuous is not supported

In [31]:
np.shape(Xv)

(502, 1935)

In [61]:
np.shape(X[:502,:])

(502, 1935)

In [85]:
y_true = [0, 1, 2, 2, 0, 2]
y_pred = [6, 2, 1, 5, 0, 1]
f1_score(y_true, y_pred, average='macro')

0.13333333333333333

ValueError: multiclass format is not supported