# SVM Classifier for smaller dataset

http://www.developintelligence.com/blog/2017/06/practical-neural-networks-keras-classifying-yelp-reviews/


Conversely, for smaller datasets, the SVM is much better than the neural network.
The SVM, on the other hand, will perform well even for much smaller datasets.

Now, instead of converting each word to a single number and learning an Embedding layer, we use a term-frequency inverse document frequency (TF-IDF) vectorisation process. Using this vectorisation scheme, we ignore the order of the words completely, and represent each review as a large sparse matrix, with each cell representing a specific word, and how often it appears in that review. We normalize the counts by the total number of times the word appears in all of the reviews, so rare words are given a higher importance than common ones (though we ignore all words that aren’t seen in at least three different reviews.

    Line six sets up the vectorizer. We set ngram_range to (1,2) which means we’ll consider all words on their own but also look at all pairs of words. This is useful because we don’t have a concept of word order anymore, so looking at pairs of words as single tokens allows the classifier to learn that word pairs such as “not good” are usually negative, even though “good” is positive. We also set min_df to 3, which means that we’ll ignore words that aren’t seen at least three times (in three different reviews).

In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"]="1"

from collections import Counter
from datetime import datetime
import numpy as np
import time
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
import pathlib
from collections import Counter
from datetime import datetime
import matplotlib.pyplot as plt
import json

from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
import numpy as np

from sklearn.model_selection import StratifiedKFold

%matplotlib inline 



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
captions_root = "/home/elkhand/git-repos/human-emotions-classifier/dataset/metadata"
captions_root_path = pathlib.Path(captions_root)
human_output_caption_to_label_csv_path = captions_root_path/'humanCaptionWithLabeldf.csv'
dataset_path = human_output_caption_to_label_csv_path

stop_words = set(stopwords.words('english'))

def get_words_withoutstopwords(words):
    words_without_stopwords = []
    for word in words:
        if word not in stop_words:
            words_without_stopwords.append(word)
    return words_without_stopwords

def change_label_str_to_int_binary(labelStr):
    """this is only for binary classification, negative sentiment has value 0, positive sentiment has value 1"""
    if labelStr == "negative":
        return 0
    elif  labelStr == "neutral":
        raise "Neutral Should not exist"
    elif  labelStr == "positive":
        return 1
    else:
        raise labelStr+ " Should not exist"

def get_df_for_binary_classification():
    df = pd.read_csv(dataset_path, header=0, sep="|")
    # Doing only binary classification
    df = df[(df.label == "negative") | (df.label == "positive")]
    df = df.reset_index()
    df["label"] = df["label"].apply(lambda x: change_label_str_to_int_binary(x))
    df["caption"] = df["caption"].apply(lambda x: " ".join(get_words_withoutstopwords(x.lower().split())))
    return df    
    
df = get_df_for_binary_classification()

balanced_texts = []
balanced_labels = []

for index, row in df.iterrows():
    label = int(row['label'])
    caption = row['caption']
    balanced_texts.append(caption)
    balanced_labels.append(label)

        
print(Counter(balanced_labels))


vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=0)
classifier = LinearSVC()
Xs = vectorizer.fit_transform(balanced_texts)
print("Vectors shape:", Xs.shape)

kFoldSplits = 10

scores = cross_val_score(classifier, Xs, balanced_labels, cv=kFoldSplits, n_jobs=-1)

print("Cross-Validation scores: ",scores)
print("Cross-validation results: %.2f%% (+/- %.2f%%)" % (np.mean(scores), np.std(scores)))




Counter({1: 317, 0: 163})
Vectors shape: (480, 12137)
Cross-Validation scores:  [0.65306122 0.67346939 0.67346939 0.64583333 0.75       0.66666667
 0.6875     0.65957447 0.70212766 0.65957447]
Cross-validation results: 0.68% (+/- 0.03%)


# New Neural Network based on Tf-IDF

In [11]:
##############################################################
# New Neural Network with Embedding layer

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
import tensorflow as tf

num_words = 10000
maxlen = 40


config = tf.ConfigProto()
config.gpu_options.allow_growth = True

def generate_model_name(filename, best_acc_val):
    timestamp = str(time.time()).split(".")[0]
    best_acc_val = round(best_acc_val,4)
    filename += "-" + str(best_acc_val) + "-" + timestamp
    return filename

def plot_model_accuracy(history):
    """plot acc and loss for train and val"""
    filename = "custom-hec" 
    filename = generate_model_name(filename + "-acc", max(history.history['val_acc']))
    fig = plt.figure()
    print(history.history.keys())
    print("best_val_acc", max(history.history['val_acc']))
    print("best_train_acc", max(history.history['acc']))
    print("lowest_val_loss", min(history.history['val_loss']))
    print("lowest_train_loss", min(history.history['loss']))
    #  "Accuracy"
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
    fig.savefig("custom-model/" + filename + ".png") 
    
    # "Loss"
    fig = plt.figure()
    filename = "hec" 
    filename = generate_model_name(filename + "-loss", min(history.history['val_loss']))
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
    fig.savefig("custom-model/" + filename + ".png") 

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(balanced_texts)
sequences = tokenizer.texts_to_sequences(balanced_texts)
data = pad_sequences(sequences, maxlen=maxlen)

#This might take a while to run. Here, we use the most common 20000 words instead of 5. 
#The only other difference is that we pass maxlen=300 when we pad the sequences. 
#This means that as well as padding the very short texts with zeros,
#we’ll also truncate the very long ones. All of our texts will then be represented by 300 numbers.


filepath="custom-model/customNN.weights.best.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max')
early_stopping = EarlyStopping(monitor='val_loss', patience=25)
callbacks_list = [ checkpoint, early_stopping] # reduce_lr,

#In line two, we add an Embedding layer. This layer lets the network expand each token to a larger vector, 
#allowing the network to represent words in a meaningful way. We pass 20000 as the first argument, 
#which is the size of our vocabulary (remember, we told the tokenizer to only use the 20 000 most common words 
#earlier), and 128 as the second, which means that each token can be expanded to a vector of size 128. 
#We give it an input_length of 300, which is the length of each of our sequences.


def get_LSTM_model(dropout, recurrent_dropout):
    # Cross-validation results: 0.84% (+/- 0.06%)
    model = Sequential()
    model.add(Embedding(num_words, 32, input_length=maxlen))
    model.add(Dropout(dropout))
    model.add(LSTM(32, dropout=dropout, recurrent_dropout=recurrent_dropout))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def get_CNN_model(dropout, recurrent_dropout):
    # Cross-validation results: 0.80% (+/- 0.03%), with dropout = 0.2
    # Cross-validation results: 0.83% (+/- 0.05%)
    model = Sequential()
    model.add(Embedding(num_words, 32, input_length=maxlen))# 128
    model.add(Dropout(dropout))
    model.add(Conv1D(32, 3, activation='relu'))# 64, 5
    model.add(Dropout(dropout))
    model.add(MaxPooling1D(pool_size=2)) #4
    model.add(Dropout(dropout)) # 0.2
    model.add(LSTM(32, dropout=dropout, recurrent_dropout=recurrent_dropout)) # 128
    model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def get_label_count(labels):
    labelToCount = {}
    for label in labels:
        if label not in labelToCount:
            labelToCount[label] = 1
        else:
            labelToCount[label] += 1
    return labelToCount

# Instantiate the cross validator
kfold_splits = 5
skf = StratifiedKFold(n_splits=kfold_splits, shuffle=True)
printCnt = 5
cvscores = []

balanced_labels = np.array(balanced_labels)
# Loop through the indices the split() method returns
for index, (train_indices, val_indices) in enumerate(skf.split(data, balanced_labels)):
    X_train, X_val = data[train_indices], data[val_indices]
    y_train, y_val = balanced_labels[train_indices], balanced_labels[val_indices]

    print("Train label distribution: ", get_label_count(y_train))
    print("Val label distribution: ", get_label_count(y_val))
    dropout = 0.65
    recurrent_dropout = 0.65
    #model = get_CNN_model(dropout, recurrent_dropout)
    model = get_LSTM_model(dropout, recurrent_dropout)
    #history = model.fit(data, np.array(balanced_labels), validation_split=0.2, epochs=100, callbacks=callbacks_list)
    history = model.fit(x=X_train,
                          y=y_train, 
                          batch_size=16,# 64 seems fine, 32 is better 
                          epochs=100, 
                          verbose=0, 
                          validation_data = (X_val, y_val),
                          #shuffle=True,
                          callbacks=callbacks_list) 


    val_acc_list = history.history['val_acc']
    best_val_acc =  max(val_acc_list)
    cvscores.append(best_val_acc)
    print("best_val_acc: ", best_val_acc)
    plot_model_accuracy(history)
    
print("Cross-validation results: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))    

Train label distribution:  {1: 253, 0: 130}
Val label distribution:  {0: 33, 1: 64}


InternalError: GPU sync failed