# MODEL 3 - Grid search

This notebook shows the grid search over the Doc2Vec parameters of model 3. See the documentation for further details.

#### The preprocessing we apply is the simple one; 
#### We consider the dataset union of violations and train samples;
#### The Doc2vec model is trained on all the possible samples;
#### The SVM is trained on the train of the N-th article and tested on the test20 of the N-th article

In [2]:
import os
import sys
import shutil
import random
import re
import regex
import string
from random import shuffle
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import KFold
from gensim import models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import svm, metrics
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from sklearn.preprocessing import normalize
from module_preprocessing import apply_preprocessing
from ast import literal_eval
from sklearn.model_selection import ParameterGrid
from time import time
from datetime import timedelta

In [10]:
# Load the pre-processed "violations" corpus. The raw_text column is stored in
# the CSV as the string form of a Python literal, so it is parsed back with
# literal_eval.
violations_csv = "crystal_ball_data/SIMPLE_PREP/all_violations_simple_rd.csv"
data_violations = pd.read_csv(violations_csv, index_col="index")
data_violations["raw_text"] = data_violations["raw_text"].map(literal_eval)
print("dataset violations loaded. Shape: ", data_violations.shape)

dataset violations loaded. Shape:  (8388, 2)


In [11]:
# Load the pre-processed training corpus; raw_text is stored as the string
# form of a Python literal and is parsed back with literal_eval.
data_train = pd.read_csv(
    "crystal_ball_data/SIMPLE_PREP/all_train_simple_rd.csv", index_col="index")
data_train.raw_text = data_train.raw_text.apply(literal_eval)
print("dataset train loaded. Shape: ", data_train.shape)

# Union of violations and train samples, used to train Doc2Vec.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data_vt = pd.concat([data_violations, data_train])
print("dataset violations + train created. Shape: ", data_vt.shape)

# One TaggedDocument per row; the row's 'tag' value is used as the document tag.
tagged_documents = [
    TaggedDocument(words=words, tags=[tag])
    for words, tag in zip(data_vt['raw_text'], data_vt['tag'])
]
print("corpus tagged. Length: ", len(tagged_documents))

dataset train loaded. Shape:  (3125, 2)
dataset violations + train created. Shape:  (11513, 2)
corpus tagged. Length:  11513


In [13]:
# Load the per-article test sets. Each entry of `datasets` is a
# [shuffled DataFrame, article name] pair, the name being the first nine
# characters of the file name.
path = "crystal_ball_data/SIMPLE_PREP/test_RAW_DATASET/"
datasets = []
# NOTE(review): the first directory entry is skipped, presumably a non-data
# file; os.listdir does not guarantee any ordering -- confirm this is intended.
for filename in os.listdir(path)[1:]:
    article = filename[:9]
    dataset = pd.read_csv(path + filename, index_col="index")
    dataset["raw_text"] = dataset["raw_text"].map(literal_eval)
    random.seed(6789)
    # Shuffle the rows with a fixed seed and renumber them from zero.
    dataset = dataset.sample(frac=1, random_state=6789).reset_index(drop=True)
    print("dataset " + article + " has been loaded. Shape: ", dataset.shape)
    datasets.append([dataset, article])

dataset Article02 has been loaded. Shape:  (26, 2)
dataset Article03 has been loaded. Shape:  (140, 2)
dataset Article05 has been loaded. Shape:  (74, 2)
dataset Article06 has been loaded. Shape:  (226, 2)
dataset Article08 has been loaded. Shape:  (111, 2)
dataset Article10 has been loaded. Shape:  (52, 2)
dataset Article11 has been loaded. Shape:  (14, 2)
dataset Article13 has been loaded. Shape:  (52, 2)
dataset Article14 has been loaded. Shape:  (70, 2)


In [16]:
# Load the per-article training sets. Each entry of `datasets_train` is a
# [shuffled DataFrame, article name] pair, the name being the first nine
# characters of the file name.
path = "crystal_ball_data/SIMPLE_PREP/train_RAW_DATASET/"
datasets_train = []
# NOTE(review): the first directory entry is skipped, presumably a non-data
# file; os.listdir does not guarantee any ordering -- confirm this is intended.
for filename in os.listdir(path)[1:]:
    article = filename[:9]
    dataset = pd.read_csv(path + filename, index_col="index")
    dataset["raw_text"] = dataset["raw_text"].map(literal_eval)
    random.seed(6789)
    # Shuffle the rows with a fixed seed and renumber them from zero.
    dataset = dataset.sample(frac=1, random_state=6789).reset_index(drop=True)
    datasets_train.append([dataset, article])
    print("dataset " + article + " has been loaded. Shape: ", dataset.shape)

dataset Article02 has been loaded. Shape:  (112, 2)
dataset Article03 has been loaded. Shape:  (565, 2)
dataset Article05 has been loaded. Shape:  (298, 2)
dataset Article06 has been loaded. Shape:  (914, 2)
dataset Article08 has been loaded. Shape:  (456, 2)
dataset Article10 has been loaded. Shape:  (210, 2)
dataset Article11 has been loaded. Shape:  (62, 2)
dataset Article13 has been loaded. Shape:  (208, 2)
dataset Article14 has been loaded. Shape:  (286, 2)


In [17]:
def create_vector_dataset(df, model, verbose):
    '''
    Turn tokenised documents into a table of normalised Doc2Vec features.

    Every entry of df['raw_text'] is passed to model.infer_vector and the
    components of the resulting vector are stored in the columns
    'y_0' ... 'y_<n-1>'; the matching df['tag'] value goes in the 'tag' column.
    Each feature column is then scaled to unit L2 norm across the samples
    (normalize(..., norm='l2', axis=0)).

      Args:
         df (DataFrame): must provide the columns 'raw_text' and 'tag'.
         model (Doc2Vec): a trained model; only infer_vector is used.
         verbose (int): when >= 2 the first five rows of the result are printed.
      Returns:
         (DataFrame, int): a DataFrame with shape (n_samples, n_features+1)
         and the number of features n_features.
      Raises:
         ValueError: if df has no rows, since the vector size cannot be
         determined and there is nothing to normalise.
    '''
    if len(df) == 0:
        raise ValueError("create_vector_dataset received an empty dataframe")

    # Infer all vectors first and build the frame in one go; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas >= 2.0.
    vectors = [model.infer_vector(words) for words in df['raw_text']]
    features = np.asarray(vectors, dtype=np.float64)
    vector_size = features.shape[1]

    # Explicit column names keep 'y_0' ... 'y_<n-1>' in numeric order, which the
    # label slices used by callers rely on.
    feature_columns = ['y_' + str(i) for i in range(vector_size)]
    df_vector = pd.DataFrame(
        normalize(features, norm='l2', axis=0), columns=feature_columns)
    df_vector['tag'] = df['tag'].values

    if verbose >= 2:
        print(df_vector.head(5))
    return df_vector, vector_size

In [18]:
def fit_once(df_test, df_train, model_D2V, verbose=0):
    '''
    Train a linear SVM on one article's training set and score it on its test set.

      Args:
         df_test (DataFrame): samples used for evaluation.
         df_train (DataFrame): samples used for fitting.
         model_D2V (Doc2Vec): model used to turn both sets into feature vectors.
         verbose (int): forwarded to create_vector_dataset; the accuracy is
             printed whenever it is >= 0.
      Returns:
         (float) the accuracy of the predictions on df_test.
    '''
    # Vectorise both sets with the same Doc2Vec model.
    train_vectors, n_features = create_vector_dataset(df_train, model_D2V,
                                                      verbose)
    test_vectors = create_vector_dataset(df_test, model_D2V, verbose)[0]
    print("Training the SVM with", len(train_vectors), "samples")
    print("Testing the SVM with", len(test_vectors), "samples")

    # Label of the last feature column; features span 'y_0' .. last_feature.
    last_feature = 'y_' + str(n_features - 1)

    # Fit on the training vectors.
    classifier = svm.LinearSVC(C=1)
    classifier.fit(train_vectors.loc[:, 'y_0':last_feature].values,
                   train_vectors.loc[:, 'tag'].values)

    # Predict on the test vectors and compare with the true tags.
    predicted = classifier.predict(
        test_vectors.loc[:, 'y_0':last_feature].values)
    expected = test_vectors.loc[:, 'tag'].values
    acc = metrics.accuracy_score(expected, predicted)
    if verbose >= 0:
        print("ACCURACY:", acc)
    return acc

In [35]:
# main() is only defined when this code runs as the top-level module (which is
# the case inside a notebook kernel).
if __name__ == "__main__":

    def main():
        '''
        Grid search over the Doc2Vec parameters.

        For every combination in the parameter grid a Doc2Vec model is trained
        on `tagged_documents`; then, for every article in `datasets`, an SVM is
        fitted on the training set with the same article name from
        `datasets_train` and scored on that article's test set (see fit_once).

          Returns:
             (dict) maps str(parameter combination) to a list of
             (article name, accuracy) tuples.
          Raises:
             ValueError: if a test article has no training set with the same
             name.
        '''
        summary = {}
        param_grid = {
            'vector_size': [100],
            'epochs': [20],
            'min_count': [100],
            'window': [10],
            'hs': [1],
            'negative': [20],
            'ns_exponent': [0.75],
            'dm_mean': [0],
            'dbow_words': [0]
        }
        parameters = list(ParameterGrid(param_grid))
        print("\nTotal combination of parameters: %d" % len(parameters))

        # Index the training sets by article name once; setdefault keeps the
        # first entry for a name, matching a first-match linear scan.
        train_by_article = {}
        for train_df, article in datasets_train:
            train_by_article.setdefault(article, train_df)

        for parameter in parameters:
            print("parameters:", parameter)
            start = time()
            model_D2V = Doc2Vec(
                tagged_documents,
                negative=parameter['negative'],
                ns_exponent=parameter['ns_exponent'],
                hs=parameter['hs'],
                window=parameter['window'],
                dm_mean=parameter['dm_mean'],
                dbow_words=parameter['dbow_words'],
                vector_size=parameter['vector_size'],
                epochs=parameter['epochs'],
                min_count=parameter['min_count'],
                workers=os.cpu_count())
            print("model Doc2Vec created. Time elasped: "+str(timedelta(seconds=(time() - start))))

            accuracies = []
            for dataset, filename in datasets:
                print("\n", filename)
                print("Number of samples: ", dataset.shape[0])
                # Fail loudly instead of handing an empty training set to the
                # SVM pipeline, which would only break later and obscurely.
                if filename not in train_by_article:
                    raise ValueError(
                        "no training dataset found for " + filename)
                train = train_by_article[filename]
                print("fitting ", filename)
                acc = fit_once(dataset, train, model_D2V, verbose=0)
                accuracies.append((filename, acc))
            summary.update({str(parameter): accuracies})

        print("Finished.")
        return summary

In [36]:
# Run the grid search, timing the call with IPython's %time magic; the mapping
# returned by main() is kept in `summary`.
%time summary=main()


Total combination of parameters: 1
parameters: {'dbow_words': 0, 'dm_mean': 0, 'epochs': 20, 'hs': 1, 'min_count': 100, 'negative': 20, 'ns_exponent': 0.75, 'vector_size': 100, 'window': 10}
model Doc2Vec created. Time elasped: 0:20:19.763975

 Article02
Number of samples:  26
fitting  Article02
Training the SVM with 112 samples
Testing the SVM with 26 samples
ACCURACY: 0.7692307692307693

 Article03
Number of samples:  140
fitting  Article03
Training the SVM with 565 samples
Testing the SVM with 140 samples
ACCURACY: 0.7642857142857142

 Article05
Number of samples:  74
fitting  Article05
Training the SVM with 298 samples
Testing the SVM with 74 samples
ACCURACY: 0.7027027027027027

 Article06
Number of samples:  226
fitting  Article06
Training the SVM with 914 samples
Testing the SVM with 226 samples
ACCURACY: 0.7433628318584071

 Article08
Number of samples:  111
fitting  Article08
Training the SVM with 456 samples
Testing the SVM with 111 samples
ACCURACY: 0.7027027027027027

 Art

In [37]:
# Average the per-article accuracies of every parameter combination stored in
# `summary` and report the combination with the highest average.
max_a = 0
best_par = None
for params, results in summary.items():
    print("\nParameters: ", params)
    if not results:
        # Nothing was evaluated for this combination: no average to compute.
        print("No results recorded for these parameters.")
        continue
    total_average = sum(acc for _, acc in results) / len(results)
    # `best_par is None` lets the first combination win even when its average
    # is 0, so best_par is always bound once any result exists.
    if best_par is None or total_average > max_a:
        max_a = total_average
        best_par = params
    print("Total average: %f" % total_average)
if best_par is None:
    print("No results available: nothing to rank.")
else:
    print("BEST PARAMETER: ", best_par, "\nBEST ACC:", max_a)


Parameters:  {'dbow_words': 0, 'dm_mean': 0, 'epochs': 20, 'hs': 1, 'min_count': 100, 'negative': 20, 'ns_exponent': 0.75, 'vector_size': 100, 'window': 10}
Total average: 0.706273
BEST PARAMETER:  {'dbow_words': 0, 'dm_mean': 0, 'epochs': 20, 'hs': 1, 'min_count': 100, 'negative': 20, 'ns_exponent': 0.75, 'vector_size': 100, 'window': 10} 
BEST ACC: 0.7062733938840133



Parameters:  {'dbow_words': 1, 'dm_mean': 1, 'epochs': 20, 'hs': 0, 'min_count': 1, 'negative': 20, 'ns_exponent': 0.75, 'vector_size': 500, 'window': 3}
Total average: 0.733849
BEST PARAMETER:  {'dbow_words': 1, 'dm_mean': 1, 'epochs': 20, 'hs': 0, 'min_count': 1, 'negative': 20, 'ns_exponent': 0.75, 'vector_size': 500, 'window': 3} 
BEST ACC: 0.7338490294242506



Parameters:  {'dbow_words': 0, 'dm_mean': 0, 'epochs': 20, 'hs': 0, 'min_count': 1, 'negative': 5, 'ns_exponent': 1, 'vector_size': 500, 'window': 3}
Total average: 0.737149
BEST PARAMETER:  {'dbow_words': 0, 'dm_mean': 0, 'epochs': 20, 'hs': 0, 'min_count': 1, 'negative': 5, 'ns_exponent': 1, 'vector_size': 500, 'window': 3} 
BEST ACC: 0.7371488331665322
