In [1]:
import os
import sys
import re
import pickle
import numpy as np
import pandas as pd
import gensim

import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv1D,GlobalMaxPooling1D
from keras.layers.core import Dense, Activation,Dropout
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [15]:
np.random.seed(7)

In [16]:
DIR_DATA = 'C:/Users/amel/Desktop/PFE/'
MAX_SEQUENCE_LENGTH = 100
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
TEST_SPLIT = 0.2
VALIDATION_SPLIT = 0.1
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

In [17]:
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [18]:
print('\nTraining word2vec...')
sentences, labels = loadData('data_prétraitement.csv')
words =[nltk.word_tokenize(sent) for sent in sentences]
print(words[1])
word_model = gensim.models.Word2Vec(words, size=100, min_count=1, window=5, iter=100)

pretrained_weights = word_model.wv.syn0


Training word2vec...
['dai', 'feel', 'close', 'partner', 'friend', 'feel', 'peac', 'also', 'experi', 'close', 'contact', 'peopl', 'regard', 'greatli']


  import sys


In [19]:
vocab_size, emdedding_size = pretrained_weights.shape


In [20]:
print('Result embedding shape:', pretrained_weights.shape)

Result embedding shape: (6184, 100)


In [21]:
def loadData(filename):
    df = pd.read_csv(DIR_DATA + filename,delimiter=';')
    selected = ['label', 'text']
    non_selected = list(set(df.columns) - set(selected))
    df = df.drop(non_selected, axis=1)
    df = df.dropna(axis=0, how='any', subset=selected)
    labels = sorted(list(set(df[selected[0]].tolist())))
    dict.fromkeys(set(df[selected[0]].tolist()))
    label_dict = {}
    for i in range(len(labels)):
        label_dict[labels[i]] = i

    x_train = df[selected[1]].apply(lambda x: clean_str(x)).tolist()
    y_train = df[selected[0]].apply(lambda y: label_dict[y]).tolist()
    y_train = to_categorical(np.asarray(y_train))
    return x_train,y_train

In [22]:
def createVocabAndData(sentences):
    
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    vocab = tokenizer.word_index
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return vocab,data

In [23]:
def cnnModel(pretrained_weights,epoch):
    model = Sequential()
    n, embedding_dims = pretrained_weights.shape
    model.add(Embedding(n, embedding_dims, weights=[pretrained_weights], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Dropout(0.2))
    model.add(Conv1D(128,3,padding='valid',strides=1))
    model.add(GlobalMaxPooling1D())
    #model.add(Embedding(n, embedding_dims, weights=[pretrained_weights], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    #model.add(LSTM(128, dropout=0.6, recurrent_dropout=0.6))
    model.add(Dense(7))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    
    model.fit(X_train, y_train, validation_split=VALIDATION_SPLIT, epochs=epoch, batch_size=128)
    model.save_weights('text_lstm_weights.h5')

    scores= model.evaluate(X_test, y_test, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
    return model


In [24]:
if __name__ == "__main__":

    #sentences, labels = loadData('data_prétraitement.csv')
    
    vocab, data = createVocabAndData(sentences)
    print ("Data created")
    print("Train Test split")
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=TEST_SPLIT, random_state=42)
    model=cnnModel(pretrained_weights,40)

Data created
Train Test split
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          618400    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 98, 128)           38528     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 903       
_________________________________________________________________
activation_2 (Activation)    (None, 7)                 0         
Total params: 657,831
Trainable params: 39,431
Non-trainable params: 618,400
___________________________________