# Text classification

### Necessary Libraries

In [1]:
# libraries for dataset preparation, feature engineering, model training 
import numpy as np
import os
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing, linear_model, svm
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import ast
import os
from keras.layers import Dense, Embedding, Input
from keras.layers import GRU, Dropout, MaxPooling1D, Conv1D, Flatten, BatchNormalization, GlobalMaxPool1D
from keras.models import Model, Sequential

import numpy as np
import itertools
from keras.utils import np_utils
from sklearn.metrics import (classification_report, 
                             precision_recall_fscore_support, 
                             accuracy_score)

from keras.preprocessing import text, sequence
import seaborn as sns
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Loading data

In [2]:
# Read the tab-separated training and development sets into DataFrames
df_train, df_dev = (
    pd.read_csv(csv_path, sep="\t")
    for csv_path in ("../Data/train2/train_text.csv", "../Data/dev2/dev_text.csv")
)

# Preview the first ten training rows
df_train.head(10)

Unnamed: 0,Filename,Text,Class,Language
0,../Data/train2/txt/AAAWNA.txt,REPUBLIQUE FRANCAISE Nationalité Française Car...,C2,fr
1,../Data/train2/txt/AAUVHC.txt,17 08 12 15:10 MLTC 00212537680032 p.2 في ليلة...,C3,ar
2,../Data/train2/txt/ABBABC.txt,Hi sweetie ! just a quick note to say ... HAPP...,C3,en
3,../Data/train2/txt/ABBABD.txt,BE THE CHANGE YOU WANT TO SEE IN THE WORLD !,C3,en
4,../Data/train2/txt/ABBABE.txt,scenario 1 Bridgewater House Barlow Street Wor...,C3,en
5,../Data/train2/txt/ABBABF.txt,remember - Jane got 100 % on the vast test she...,C3,en
6,../Data/train2/txt/ABBACE.txt,- get map from hostel - get Euros visit eat . ...,C3,en
7,../Data/train2/txt/ABBADF.txt,"Hi Lucie, The postman left your delivery with ...",C3,en
8,../Data/train2/txt/ABBAGH.txt,"To whom it may concern, I am writing to inform...",C3,en
9,../Data/train2/txt/ABBAJL.txt,"Dear Jay, I hope ¤{eveything/everything}¤ is o...",C3,en


### Class classification

In [3]:
# Inputs are the raw document texts; targets are the document classes.
X_train = df_train['Text']
y_train = df_train['Class']
X_dev = df_dev['Text']
y_dev = df_dev['Class']

# Label-encode the target variable.
# The encoder is fitted on the training labels only and then *applied* to the
# dev labels, so both splits share one class -> integer mapping. Re-fitting on
# the dev labels could assign different integers whenever the dev set does not
# contain exactly the same set of classes as the training set.
# Note: transform() raises ValueError if the dev set holds a class that never
# occurs in the training set.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_dev = encoder.transform(y_dev)

print('X_train: ',X_train.shape)
print('X_dev: ',X_dev.shape)

X_train:  (6129,)
X_dev:  (1000,)


- Create document vectors:

In [4]:
# Bag-of-words features: word tokens of one or more characters, vocabulary
# capped at 3000 entries. The vocabulary is learned from the training texts only.
vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=3000,
).fit(df_train['Text'])

# Project both splits onto the learned vocabulary
X_train_counts, X_dev_counts = (
    vectorizer.transform(docs) for docs in (X_train, X_dev)
)

- Naive Bayes classifier:

In [5]:
# Fit a multinomial Naive Bayes model on the count vectors, then score it on
# the dev set (score() is the mean accuracy).
clf = MultinomialNB().fit(X_train_counts, y_train)
score_dev = clf.score(X_dev_counts, y_dev)

print(' - Validation du classifieur: \n')
print('Score : ', score_dev)

 - Validation du classifieur: 

Score :  0.726


- Linear classifier:

In [6]:
# Fit a logistic-regression model (library defaults) on the count vectors,
# then score it on the dev set (score() is the mean accuracy).
lc = linear_model.LogisticRegression().fit(X_train_counts, y_train)
score_dev = lc.score(X_dev_counts, y_dev)

print(' - Validation du classifieur: \n')
print('Score : ', score_dev)

 - Validation du classifieur: 

Score :  0.891


- CNN classifier:

In [23]:
def get_train_test(train_raw_text, test_raw_text):
    """Convert raw texts into fixed-length integer sequences for the network.

    A Keras ``Tokenizer`` limited to ``MAX_FEATURES`` words is fitted on the
    training texts only and then applied to both collections. Every resulting
    sequence is padded/truncated to ``MAX_TEXT_LENGTH`` entries.

    ``MAX_FEATURES`` and ``MAX_TEXT_LENGTH`` are read from the notebook's
    global namespace at call time, so they must be defined before calling.

    Args:
        train_raw_text: iterable of training documents (strings).
        test_raw_text: iterable of evaluation documents (strings).

    Returns:
        A ``(train_sequences, test_sequences)`` tuple of padded arrays.
    """
    word_indexer = text.Tokenizer(num_words=MAX_FEATURES)
    word_indexer.fit_on_texts(list(train_raw_text))

    train_padded, test_padded = (
        sequence.pad_sequences(word_indexer.texts_to_sequences(raw_docs),
                               maxlen=MAX_TEXT_LENGTH)
        for raw_docs in (train_raw_text, test_raw_text)
    )
    return train_padded, test_padded

def get_model(n_classes=None):
    """Build and compile the 1-D convolutional text classifier.

    Architecture: embedding -> three (Conv1D, MaxPooling1D, BatchNormalization,
    Dropout) stages -> Flatten -> Dense(250, relu) -> Dense(n_classes, softmax).
    The model is compiled with categorical cross-entropy and the Adam
    optimizer, and its summary is printed as a side effect.

    ``MAX_TEXT_LENGTH``, ``MAX_FEATURES`` and ``EMBED_SIZE`` are read from the
    notebook's global namespace at call time.

    Args:
        n_classes: number of units in the softmax output layer. When ``None``
            (the default) the notebook-level ``n_out`` is used if it exists,
            so the output layer matches the labels being trained on; if
            ``n_out`` is not defined the previous hard-coded value of 5 is
            used.

    Returns:
        The compiled Keras ``Model``.
    """
    if n_classes is None:
        # The output size used to be hard-coded to 5, which only fits the
        # document-class task; follow the current label set when available.
        n_classes = globals().get('n_out', 5)

    inp = Input(shape=(MAX_TEXT_LENGTH,))
    x = Embedding(MAX_FEATURES, EMBED_SIZE)(inp)

    # One row per convolutional stage: (filters, kernel_size, pool_size, dropout)
    conv_stages = (
        (32, 7, 2, 0.2),
        (64, 5, 3, 0.3),
        (128, 3, 5, 0.5),
    )
    for n_filters, kernel_size, pool_size, drop_rate in conv_stages:
        x = Conv1D(filters=n_filters, kernel_size=kernel_size,
                   padding='same', activation='relu')(x)
        x = MaxPooling1D(pool_size=pool_size)(x)
        # axis=1 normalises along the sequence-position axis of the
        # (batch, steps, channels) tensor.
        # NOTE(review): axis=-1 (channels) is the more common choice - confirm intent.
        x = BatchNormalization(axis=1)(x)
        x = Dropout(drop_rate)(x)

    x = Flatten()(x)
    x = Dense(250, activation="relu")(x)
    x = Dense(n_classes, activation="softmax")(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
    model.summary()

    return model


In [None]:
# Training callbacks (EarlyStopping is imported here but not used below)
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Hyper-parameters of the CNN and of the training loop
MAX_FEATURES = 500        # tokenizer vocabulary size / embedding input dimension
MAX_TEXT_LENGTH = 500     # every document is padded or truncated to this many tokens
EMBED_SIZE  = 32          # embedding vector size
BATCH_SIZE = 128
EPOCHS = 100
VALIDATION_SPLIT = 0.1    # fraction of the training data passed to fit() as validation_split

# Distinct (integer-encoded) classes present in the training labels
CLASSES_LIST = np.unique(y_train)
n_out = len(CLASSES_LIST)
print(CLASSES_LIST)

# One-hot encode the integer labels for the softmax / cross-entropy output
train_y_cat = np_utils.to_categorical(y_train, n_out)
test_y_cat = np_utils.to_categorical(y_dev, n_out)

# Tokenise and pad the texts so they can be fed to the network
x_vec_train, x_vec_test = get_train_test(X_train, X_dev)
print(len(x_vec_train), len(x_vec_test))

# Build and compile the network
model = get_model()

# Callbacks: write the model to disk during training and shrink the learning
# rate by a factor of 0.2 once val_loss has not improved for 5 epochs
filepath = 'model_cnn' + '.hdf5'
multi_checkpointer = ModelCheckpoint(filepath=filepath, verbose=0)
multi_lr_reduction = ReduceLROnPlateau(monitor='val_loss', patience=5, verbose=0, factor=0.2)
training_callbacks = [multi_checkpointer, multi_lr_reduction]

# Train
model.fit(
    x_vec_train,
    train_y_cat,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=VALIDATION_SPLIT,
    callbacks=training_callbacks,
)


[0 1 2 3 4]
6129 1000
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 500, 32)           16000     
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 500, 32)           7200      
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 250, 32)           0         
_________________________________________________________________
batch_normalization_10 (Batc (None, 250, 32)           1000      
_________________________________________________________________
dropout_10 (Dropout)         (None, 250, 32)           0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 250, 64)          

In [22]:
# Matrix of class-membership probabilities: one row per dev document,
# one column per class
y_predicted = model.predict(x_vec_test)

# Assign each document its most probable class. A vectorised argmax over the
# class axis replaces the per-row Python loop and yields integer labels that
# are directly comparable with y_dev.
y_pred = np.argmax(y_predicted, axis=1)

print("Test Accuracy:", accuracy_score(y_dev, y_pred))

# Score over every class seen in the training labels
eval_labels = np.unique(y_train)

# With average='micro' the three scores are single floats and the returned
# support is None, so they are printed directly without further averaging.
p, r, f1, s = precision_recall_fscore_support(y_dev, y_pred,
                                              average='micro',
                                              labels=eval_labels)

print('p r f1 %.1f %.2f %.3f' % (p * 100.0, r * 100.0, f1 * 100.0))

# Per-class precision / recall / F1 table
print(classification_report(y_dev, y_pred, labels=eval_labels))

Test Accuracy: 0.792
p r f1 79.2 79.20 79.200
             precision    recall  f1-score   support

          0       0.98      0.90      0.94       146
          1       0.91      0.85      0.88       426
          2       0.57      0.83      0.67       212
          3       0.76      0.59      0.66       169
          4       0.86      0.51      0.64        47

avg / total       0.82      0.79      0.80      1000



### Language classification

In [None]:
# Same input texts as before; the target is now the document language.
X_train = df_train['Text']
y_train = df_train['Language']
X_dev = df_dev['Text']
y_dev = df_dev['Language']

# Label-encode the target variable.
# The encoder is fitted on the training labels only and then *applied* to the
# dev labels, so both splits share one language -> integer mapping. Re-fitting
# on the dev labels could assign different integers whenever the dev set does
# not contain exactly the same set of languages as the training set.
# Note: transform() raises ValueError if the dev set holds a language that
# never occurs in the training set.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_dev = encoder.transform(y_dev)

- Naive Bayes classifier:

In [None]:
# Fit a multinomial Naive Bayes model on the language labels, then score it on
# the dev set (score() is the mean accuracy). X_train_counts / X_dev_counts
# are the count matrices built earlier in the notebook from the same 'Text' column.
clf = MultinomialNB().fit(X_train_counts, y_train)
score_dev = clf.score(X_dev_counts, y_dev)

print(' - Validation du classifieur: \n')
print('Score : ', score_dev)

- Linear classifier:

In [None]:
# Fit a logistic-regression model (library defaults) on the language labels,
# then score it on the dev set (score() is the mean accuracy). X_train_counts /
# X_dev_counts are the count matrices built earlier from the same 'Text' column.
lc = linear_model.LogisticRegression().fit(X_train_counts, y_train)
score_dev = lc.score(X_dev_counts, y_dev)

print(' - Validation du classifieur: \n')
print('Score : ', score_dev)

- CNN classifier:

In [None]:
# Hyper-parameters of the CNN and of the training loop
MAX_FEATURES = 500        # tokenizer vocabulary size / embedding input dimension
MAX_TEXT_LENGTH = 500     # every document is padded or truncated to this many tokens
EMBED_SIZE  = 32          # embedding vector size
BATCH_SIZE = 128
EPOCHS = 100
VALIDATION_SPLIT = 0.1    # fraction of the training data passed to fit() as validation_split

# Distinct (integer-encoded) languages present in the training labels
CLASSES_LIST = np.unique(y_train)
n_out = len(CLASSES_LIST)
print(CLASSES_LIST)

# One-hot encode the integer labels for the softmax / cross-entropy output
train_y_cat = np_utils.to_categorical(y_train, n_out)
test_y_cat = np_utils.to_categorical(y_dev, n_out)

# Tokenise and pad the texts so they can be fed to the network
x_vec_train, x_vec_test = get_train_test(X_train, X_dev)
print(len(x_vec_train), len(x_vec_test))

# Build and compile the network
# NOTE(review): the softmax layer built by get_model() must have n_out units
# for these language labels - confirm before training.
model = get_model()

# Callbacks. ModelCheckpoint and ReduceLROnPlateau are imported in the earlier
# CNN training cell, which must have been run first.
# The language model gets its own checkpoint file: it previously wrote to
# 'model_cnn.hdf5', silently overwriting the document-class model saved there.
filepath = 'model_cnn_language' + '.hdf5'
multi_checkpointer = ModelCheckpoint(filepath=filepath, verbose=0)
multi_lr_reduction = ReduceLROnPlateau(monitor='val_loss', patience=5, verbose=0, factor=0.2)

# Train
model.fit(x_vec_train, train_y_cat,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS, verbose=1, validation_split=VALIDATION_SPLIT,callbacks=[multi_checkpointer,multi_lr_reduction])