# MODEL 4

The simple preprocessing is applied.
The dataset under consideration is composed of: violations + train + test20, except for the N-th article.
The Doc2Vec model is trained on the dataset described above and tested on the N-th article of test20.

In [1]:
import os
import sys
import shutil
import random
import re
import regex
import string
from random import shuffle
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import KFold
from gensim import models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import svm, metrics
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from sklearn.preprocessing import normalize
from module_preprocessing import apply_preprocessing
from ast import literal_eval
from sklearn.model_selection import ParameterGrid
from time import time
from datetime import timedelta

#### Loading the data

In [2]:
# Load the "violations" corpus. literal_eval parses each raw_text cell from
# its textual form back into a Python literal (presumably a list of tokens,
# since it is later passed as `words` to TaggedDocument -- confirm).
data_violations = pd.read_csv(
    "crystal_ball_data/SIMPLE_PREP/all_violations_simple_rd.csv",
    index_col="index")
data_violations.raw_text = data_violations.raw_text.apply(literal_eval)
print("dataset violations loaded. Shape: ", data_violations.shape)

# Load the "train" corpus the same way. The original message labelled this
# frame "test20", which does not match the file being read.
data_all_train = pd.read_csv(
    "crystal_ball_data/SIMPLE_PREP/all_train_simple_rd.csv", index_col="index")
data_all_train.raw_text = data_all_train.raw_text.apply(literal_eval)
print("dataset train loaded. Shape: ", data_all_train.shape)

# Stack both corpora row-wise. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
data_vt = pd.concat([data_violations, data_all_train])
print("dataset violations + train created. Shape: ", data_vt.shape)

dataset violations loaded. Shape:  (8388, 2)
dataset test20 loaded. Shape:  (3125, 2)
dataset violations + train created. Shape:  (11513, 2)


In [4]:
# Load every per-article test file into `datasets` as [DataFrame, name] pairs.
path = "crystal_ball_data/SIMPLE_PREP/test_RAW_DATASET/"
datasets = []
# os.listdir returns entries in arbitrary order, so slicing it directly skips
# an unpredictable entry. Sorting first makes the skipped entry deterministic
# (the alphabetically first one) and matches the alphabetical load order seen
# in the recorded output.
# NOTE(review): the skipped entry is presumably a non-data file such as a
# hidden OS/metadata file -- confirm against the directory contents.
for filename in sorted(os.listdir(path))[1:]:
    dataset = pd.read_csv(path + filename, index_col="index")
    # raw_text is stored as text in the CSV; parse it back into a Python literal.
    dataset.raw_text = dataset.raw_text.apply(literal_eval)
    random.seed(6789)
    # Shuffle the rows reproducibly and discard the original index.
    dataset = dataset.sample(frac=1, random_state=6789).reset_index(drop=True)
    # The first nine characters of the file name (e.g. "Article02") identify the article.
    print("dataset " + filename[:9] + " has been loaded. Shape: ",dataset.shape)
    datasets.append([dataset, filename[:9]])

dataset Article02 has been loaded. Shape:  (26, 2)
dataset Article03 has been loaded. Shape:  (140, 2)
dataset Article05 has been loaded. Shape:  (74, 2)
dataset Article06 has been loaded. Shape:  (226, 2)
dataset Article08 has been loaded. Shape:  (111, 2)
dataset Article10 has been loaded. Shape:  (52, 2)
dataset Article11 has been loaded. Shape:  (14, 2)
dataset Article13 has been loaded. Shape:  (52, 2)
dataset Article14 has been loaded. Shape:  (70, 2)


#### Main

In [5]:
def fit(df_test, model_D2V, verbose=0):
    '''
     Evaluate a trained Doc2Vec model as a 1-nearest-neighbour classifier.

     For every row of df_test a vector is inferred from row['raw_text'], the
     most similar training document is looked up in the model, and the class
     stored in that document's tag is taken as the prediction. The accuracy
     of the predictions against df_test['tag'] is then computed.

     Document tags are expected to be (index, class) pairs, so the class is
     the second element of the tag.

      Args:
        df_test (pandas.DataFrame): samples to classify; must provide the
            columns 'raw_text' and 'tag'.
        model_D2V (gensim.models.doc2vec.Doc2Vec): trained Doc2Vec model.
        verbose (int): with verbose >= 0 the accuracy is printed; with
            verbose >= 2 the inferred vector, the nearest tag and its
            similarity are printed for every sample.
      Returns :
        float, accuracy

    '''
    print("Testing applying 1-NN on", len(df_test), "samples")
    predictions = []
    for index, row in df_test.iterrows():
        vector = model_D2V.infer_vector(row['raw_text'])
        # Only the single nearest neighbour is needed for 1-NN.
        nearest_tag, similarity = model_D2V.docvecs.most_similar(
            [vector], topn=1)[0]
        predictions.append(nearest_tag[1])
        if verbose >= 2:
            print("vector: ", vector)
            # Report the similarity of the neighbour whose tag is shown; the
            # previous code printed the second neighbour's similarity here.
            print("tagged: ", nearest_tag, "similarity: ", similarity)
    tag_test = df_test.loc[:, 'tag'].values
    acc = metrics.accuracy_score(tag_test, predictions)
    if verbose >= 0:
        print("ACCURACY:", acc)
    return acc

In [13]:
# Leave-one-article-out evaluation: for every parameter combination and every
# test article, train Doc2Vec on violations + train + all *other* test
# articles, then score the held-out article with 1-NN via fit().
# `summary` maps str(parameter dict) -> list of (article name, accuracy).
summary = {}
param_grid = {
    'vector_size': [100],
    'epochs': [20],
    'min_count': [1],
    'window': [10],
    'hs': [1],
    'negative': [5],
    'ns_exponent': [1],
    'dm_mean': [1],
    'dbow_words': [1]
}
parameters = list(ParameterGrid(param_grid))
print("\nTotal combination of parameters: %d" % len(parameters))
for parameter in parameters:
    print("\nparameters:", parameter)
    fold_results = []
    for i, (dataset, filename) in enumerate(datasets):
        print(filename)
        print("Number of samples: ", dataset.shape[0])
        # Build the training frame in a single concat: the base corpus plus
        # every test article except the held-out one. This replaces repeated
        # DataFrame.append calls (removed in pandas 2.0) that re-copied the
        # growing frame on every iteration.
        other_articles = [d[0] for d in datasets[:i] + datasets[i + 1:]]
        data = pd.concat([data_vt] + other_articles)
        data = data.sample(frac=1, random_state=6789).reset_index(drop=True)
        # Each document is tagged with (row index, class) so that fit() can
        # read the class back from the tag of the nearest neighbour.
        tagged_documents = [
            TaggedDocument(words=row['raw_text'], tags=[(index, row['tag'])])
            for index, row in data.iterrows()
        ]
        print("Training the Doc2Vec with", len(data), "samples")
        start = time()
        model_D2V = Doc2Vec(
            tagged_documents,
            negative=parameter['negative'],
            ns_exponent=parameter['ns_exponent'],
            hs=parameter['hs'],
            window=parameter['window'],
            dm_mean=parameter['dm_mean'],
            dm_concat=0,
            dbow_words=parameter['dbow_words'],
            vector_size=parameter['vector_size'],
            epochs=parameter['epochs'],
            min_count=parameter['min_count'],
            workers=os.cpu_count())
        print("model Doc2Vec created. Time elasped: " +
              str(timedelta(seconds=(time() - start))))
        accuracy = fit(dataset, model_D2V, verbose=0)
        fold_results.append((filename, accuracy))
    summary[str(parameter)] = fold_results

print("Finished.")


Total combination of parameters: 1

parameters: {'dbow_words': 1, 'dm_mean': 1, 'epochs': 20, 'hs': 1, 'min_count': 1, 'negative': 5, 'ns_exponent': 1, 'vector_size': 100, 'window': 10}
Article02
Number of samples:  26
Training the Doc2Vec with 12252 samples
model Doc2Vec created. Time elasped: 0:26:03.231175
Testing applying 1-NN on 26 samples
ACCURACY: 0.7307692307692307
Article03
Number of samples:  140
Training the Doc2Vec with 12138 samples
model Doc2Vec created. Time elasped: 0:33:25.022874
Testing applying 1-NN on 140 samples
ACCURACY: 0.75
Article05
Number of samples:  74
Training the Doc2Vec with 12204 samples
model Doc2Vec created. Time elasped: 0:33:06.417797
Testing applying 1-NN on 74 samples
ACCURACY: 0.6621621621621622
Article06
Number of samples:  226
Training the Doc2Vec with 12052 samples
model Doc2Vec created. Time elasped: 0:29:00.850104
Testing applying 1-NN on 226 samples
ACCURACY: 0.672566371681416
Article08
Number of samples:  111
Training the Doc2Vec with 1216

In [14]:
# Report, for every evaluated parameter set, the accuracy obtained on each
# held-out article followed by the unweighted mean over those articles.
for params_repr, article_scores in summary.items():
    print("\nParameters: ", params_repr)
    total_average = 0
    for article_name, article_accuracy in article_scores:
        print(article_name, "average: %f" % article_accuracy)
        total_average += article_accuracy
    total_average = total_average / len(article_scores)
    print("Total average: %f" % total_average)
    


Parameters:  {'dbow_words': 1, 'dm_mean': 1, 'epochs': 20, 'hs': 1, 'min_count': 1, 'negative': 5, 'ns_exponent': 1, 'vector_size': 100, 'window': 10}
Article02 average: 0.730769
Article03 average: 0.750000
Article05 average: 0.662162
Article06 average: 0.672566
Article08 average: 0.720721
Article10 average: 0.653846
Article11 average: 0.642857
Article13 average: 0.750000
Article14 average: 0.742857
Total average: 0.702864



Parameters:  {'dbow_words': 0, 'dm_mean': 0, 'epochs': 20, 'hs': 0, 'min_count': 1, 'negative': 5, 'ns_exponent': 1, 'vector_size': 500, 'window': 3}
Article02 average: 0.653846
Article03 average: 0.742857
Article05 average: 0.662162
Article06 average: 0.663717
Article08 average: 0.720721
Article10 average: 0.673077
Article11 average: 0.714286
Article13 average: 0.769231
Article14 average: 0.728571
Total average: 0.703163



Parameters:  {'dbow_words': 1, 'dm_mean': 1, 'epochs': 20, 'hs': 1, 'min_count': 1, 'negative': 20, 'ns_exponent': 1, 'vector_size': 100, 'window': 3}
Article02 average: 0.653846
Article03 average: 0.735714
Article05 average: 0.675676
Article06 average: 0.663717
Article08 average: 0.702703
Article10 average: 0.750000
Article11 average: 0.642857
Article13 average: 0.788462
Article14 average: 0.714286
Total average: 0.703029
