## **MIT 807**: Mini-dissertation, Department of Computer Science, University of Pretoria

**Title**: Identifying financial risk through Natural Language Processing of company annual reports

**Student**: Lamont Theron, u19333634@tuks.co.za


---
Last updated: 2020-11-22

Note: Pipeline of pre-processing and classifiers (pred > 0.5)

Requirements

1.   pageText.csv, docReadability.csv in GitHub\data\processed folder

In [None]:
#Define paths
#Relative paths for a local checkout. This cell has no execution count (In [None]), i.e. it was not run.
#NOTE(review): the same three names are reassigned to Google Drive paths in a later cell; the last cell executed wins.
pathData='../data/'
pathModels='../models/'
pathFigures='../reports/figures/'

### Colab

In [1]:
from google.colab import drive

In [2]:
#Mount Google Drive to get paths
#NOTE(review): Colab-only step; the Drive-based pathData/pathModels/pathFigures defined further down rely on this mount.
drive.mount('/content/drive')

Mounted at /content/drive


#1 Variables

In [3]:
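#Code control flags. Each flag below is assigned in two consecutive cells (off, then on, or vice versa);
#the value in effect is the one from whichever cell was executed last.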

In [4]:
#File name for the F1 results CSV.
#NOTE(review): presumably read/written under pathData or pathModels when writeF1 is True - usage is outside this excerpt, confirm.
F1filename='6_F1.csv'

In [6]:
#F1 output switch: off.
#NOTE(review): the next cell sets writeF1=True and was executed later (In [106]), so this value does not survive a top-to-bottom run.
writeF1=False

In [106]:
#F1 output switch: on. funcLoop prints this flag when it starts.
#NOTE(review): executed as In [106], after the preceding cell that sets it to False; the last cell run decides the value.
writeF1=True

In [8]:
#CSV output switch: off.
#NOTE(review): the next cell sets writeCSV=True and was executed later (In [107]), so this value does not survive a top-to-bottom run.
writeCSV=False

In [107]:
#CSV output switch: on. funcLoop prints this flag when it starts.
#NOTE(review): executed as In [107], after the preceding cell that sets it to False; the last cell run decides the value.
writeCSV=True

In [10]:
#Diagnostic printing switch: on. funcLoop checks this flag before printing its '...appended <column>' and 'No ... roots to delete' messages.
#NOTE(review): the next cell sets printError=False and was executed later (In [133]).
printError=True

In [133]:
#Diagnostic printing switch: off (silences the messages funcLoop prints when printError is True).
#NOTE(review): executed as In [133], after the preceding cell that sets it to True; the last cell run decides the value.
printError=False

In [11]:
#Switch for printing data frame heads: on.
#NOTE(review): not referenced in the visible part of the notebook - presumably gates .head() output later; confirm. Overridden by the next cell.
printHead=True

In [12]:
#Switch for printing data frame heads: off.
#NOTE(review): not referenced in the visible part of the notebook - presumably gates .head() output later; confirm. This cell follows the one that sets it to True.
printHead=False

In [13]:
#Define paths
#Google Drive locations, relative to the working directory; they require Drive to be mounted first.
#These assignments replace the local relative paths ('../data/' etc.) defined at the top of the notebook.
pathData='./drive/My Drive/MIT 807 Big Data Science Mini-Dissertation/GitHub/data/'
pathModels='./drive/My Drive/MIT 807 Big Data Science Mini-Dissertation/GitHub/models/'
pathFigures='./drive/My Drive/MIT 807 Big Data Science Mini-Dissertation/GitHub/reports/figures/'

In [14]:
# Word2vec properties
# NOTE(review): presumably passed to gensim Word2Vec as its `window` argument - the call is outside this excerpt, confirm.

window_size = 5 #Maximum distance between the current and predicted word within a sentence

In [15]:
# NOTE(review): presumably passed to gensim Word2Vec as its `workers` argument - the call is outside this excerpt, confirm.
threads = 2 #worker threads to train the model (checked on Lenovo to run on CPU0-3)

#2 Import and install packages

In [16]:
#To read CSV into dataframe
import pandas as pd         

In [17]:
#Vectorisers         
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [18]:
import sklearn

In [19]:
#for top n tokens
from collections import Counter

In [20]:
import nltk

In [21]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [22]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [23]:
import time

In [24]:
#Cross validation and testing
from sklearn.model_selection import train_test_split, cross_val_score

In [25]:
from sklearn.model_selection import RepeatedKFold

In [26]:
#F1 score
from sklearn.metrics import f1_score #, make_scorer

In [27]:
# Word Representations in Vector Space (word embedding)
from gensim.models import Word2Vec

In [28]:
#to load saved Word2Vec models
from gensim.models import KeyedVectors

In [29]:
#RegexpTokenizer
from nltk.tokenize import RegexpTokenizer

In [30]:
#Required by nltk.corpus.stopwords (stopwords.words('english') is used in reg_tokenize)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [31]:
from nltk.corpus import stopwords

In [32]:
# To transform the word embedding
import numpy as np

In [33]:
#Used in main loop
import csv

In [34]:
#LogisticRegression
from sklearn.linear_model import LogisticRegression

In [35]:
#Support Vector Machine
#from sklearn import svm
from sklearn.svm import SVC

In [36]:
#Ensemble/Bagged decision tree: Random Forest
from sklearn.ensemble import RandomForestClassifier

In [37]:
#Ensemble/Bagged decision tree: Extra Trees
from sklearn.ensemble import ExtraTreesClassifier

In [38]:
#Multi-Layer Perceptron Classifier
from sklearn.neural_network import MLPClassifier

In [39]:
#For NN models
import tensorflow as tf

In [40]:
from keras.models import Sequential

In [41]:
#from keras.layers import Input

In [42]:
from keras.layers.embeddings import Embedding

In [43]:
#For LSTM
from keras.layers import LSTM,Dropout,Dense,Activation

In [44]:
#For CNN
from keras.layers import Conv1D,Flatten

In [45]:
from keras.layers import MaxPooling1D

In [46]:
#For Keras F1 function
from keras import backend as K

In [47]:
#Keras wrapper for use with sklearn.
from keras.wrappers.scikit_learn import KerasClassifier

In [48]:
#Normalise
from sklearn.preprocessing import MinMaxScaler

In [49]:
#Standardise
from sklearn.preprocessing import StandardScaler

In [50]:
#For printing in main loop
from datetime import datetime

In [51]:
from pytz import timezone

In [52]:
#Show the installed scikit-learn version in the cell output so the environment of this run is recorded
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.22.2.post1.


#3 Read text

In [53]:
#Read pageText from <pathData>processed/pageText.csv; the first CSV column is used as the index
try:
    pageText_df=pd.read_csv(pathData+'processed/pageText.csv', index_col=[0])
except FileNotFoundError:
    #Report the folder that was actually searched (pathData differs between local and Colab runs),
    #then re-raise: continuing would either fail later with a NameError or silently reuse a
    #stale pageText_df left in the kernel by an earlier run.
    print('File pageText.csv is missing from '+pathData+'processed/ folder')
    raise

In [54]:
#Check the shape
#(60825, 26)

pageText_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60825 entries, 0 to 60824
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   docID       60825 non-null  int64 
 1   docName     60825 non-null  object
 2   year        60825 non-null  int64 
 3   coID        60825 non-null  int64 
 4   coName      60825 non-null  object
 5   risk        60825 non-null  int64 
 6   pageRatio   60825 non-null  int64 
 7   pages       60825 non-null  int64 
 8   oldPdfPage  60825 non-null  int64 
 9   pdfPage     60825 non-null  int64 
 10  text        60212 non-null  object
 11  wordCount   60825 non-null  int64 
 12  forward1    60825 non-null  int64 
 13  forward2    60825 non-null  int64 
 14  LMneg       60825 non-null  int64 
 15  LMpos       60825 non-null  int64 
 16  H6neg       60825 non-null  int64 
 17  H6pos       60825 non-null  int64 
 18  H8neg       60825 non-null  int64 
 19  H8pos       60825 non-null  int64 
 20  uncert

In [55]:
#Group pages into documents
#Pages without text are left out. Within each document the page texts are joined with a
#single space and every count column is summed.
aggFunc=dict(
    [('text',' '.join)]
    +[(countCol,'sum') for countCol in ('wordCount','forward1','forward2',
                                        'LMneg','LMpos','H6neg','H6pos',
                                        'H8neg','H8pos','uncert','causal',
                                        'causalM','causalM50','perf','strat')]
)
docText_df=(
    pageText_df.loc[pageText_df['text'].notnull()]
    .groupby(['docID', 'docName', 'year', 'coID', 'coName', 'risk', 'pageRatio','pages'],
             as_index=False)
    .agg(aggFunc)
)

In [56]:
#Check the shape
#(261, 24)

docText_df.shape

(261, 24)

In [57]:
#Check docText before dropping pageText dataframe
docText_df.head()

Unnamed: 0,docID,docName,year,coID,coName,risk,pageRatio,pages,text,wordCount,forward1,forward2,LMneg,LMpos,H6neg,H6pos,H8neg,H8pos,uncert,causal,causalM,causalM50,perf,strat
0,0,2009-ABSA Group annual-report.pdf,2009,2,Absa Group,0,1,353,Absa Group Limited. Authorised financial serv...,163172,949,1116,3511,2068,1620,1273,1608,1229,4455,2421,1115,767,2285,7766
1,1,2010- ABSA Group annual-report.pdf,2010,2,Absa Group,0,0,551,Absa Group Limited Annual report ������������...,93166,88,142,822,297,520,133,519,126,965,469,210,164,335,1834
2,2,2009-12-31-Absa-Bank-Annual-Report.pdf,2009,3,Absa Bank Ltd,0,1,268,Absa Bank Limited. Authorised financial servi...,118457,698,803,2693,1266,1108,662,1106,638,3025,1896,792,538,1837,5122
3,3,2010-12-31-Absa-Bank-Annual-Report.pdf,2010,3,Absa Bank Ltd,0,0,304,Absa Bank Limited Annual report for the year ...,120593,782,860,2667,1219,1212,629,1207,595,3095,2060,952,671,1936,5240
4,4,1-african-bank-holdings-ir-2017.pdf,2017,34,African Bank Ltd (N),0,1,94,Integrated Report 2017\nwww.africanbank.co.za...,36079,340,403,678,806,390,588,387,562,1314,410,352,263,205,2646


In [58]:
#Read document readability from <pathData>processed/docReadability.csv; the first CSV column is used as the index
try:
    docRead_df=pd.read_csv(pathData+'processed/docReadability.csv', index_col=[0])
except FileNotFoundError:
    #Report the folder that was actually searched (pathData differs between local and Colab runs),
    #then re-raise: continuing would either fail later with a NameError or silently reuse a
    #stale docRead_df left in the kernel by an earlier run.
    print('File docReadability.csv is missing from '+pathData+'processed/ folder')
    raise

In [59]:
#Check the shape
#265 entries
#r_sm     258 non-null

docRead_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 265 entries, 0 to 264
Data columns (total 23 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   coID     265 non-null    int64  
 1   coName   265 non-null    object 
 2   docID    265 non-null    int64  
 3   docName  265 non-null    object 
 4   year     265 non-null    int64  
 5   r_gf     261 non-null    float64
 6   r_f      261 non-null    float64
 7   r_fk     261 non-null    float64
 8   r_dc     261 non-null    float64
 9   r_ari    261 non-null    float64
 10  r_cl     261 non-null    float64
 11  r_lw     261 non-null    float64
 12  r_sm     258 non-null    float64
 13  r_sp     261 non-null    float64
 14  ts_gf    265 non-null    float64
 15  ts_f     265 non-null    float64
 16  ts_fk    265 non-null    float64
 17  ts_dc    265 non-null    float64
 18  ts_ari   265 non-null    float64
 19  ts_cl    265 non-null    float64
 20  ts_lw    265 non-null    float64
 21  ts_sm    265 non

#4 Functions

Can save/load transformed data and features

In [60]:
#Term frequency
#Defaults: analyzer='word', lowercase=True
#Consider strip_accents='ascii', max_df  to ignore corpus-specific stop words, min_df cut-off, dtype to specify dataframe
#Ngram range from 1 for classifier, from n for EDA

#vectorizer=CountVectorizer() #token_pattern='\w+', feature_names=['00', '000', '0000', '00000',..]
#vectorizer=CountVectorizer(token_pattern=r'\b[A-Za-z]{2,}\b')

def fit_tf_vectorizer(text,nTo=1,nFrom=1,featureMax=None):
    """Fit a term-frequency (bag-of-words) vectoriser and transform ``text`` with it.

    Tokens are runs of two or more ASCII letters and scikit-learn's built-in
    English stop-word list is applied.

    Parameters:
        text: iterable of document strings.
        nTo: upper bound of the n-gram range.
        nFrom: lower bound of the n-gram range.
        featureMax: passed as ``max_features`` (None keeps every feature).

    Returns:
        (X, vectorizer): the document-term matrix and the fitted CountVectorizer.
    """
    options=dict(token_pattern='[A-Za-z]{2,}',
                 stop_words='english',
                 ngram_range=(nFrom,nTo),
                 max_features=featureMax)
    vectorizer=CountVectorizer(**options)
    return vectorizer.fit_transform(text), vectorizer

In [61]:
#Term frequency inverse document frequency
#Defaults: analyzer='word', lowercase=True

def fit_tfidf_vectorizer(text,nTo=1,nFrom=1,featureMax=None):
    """Fit a TF-IDF vectoriser and transform ``text`` with it.

    Tokens are runs of two or more ASCII letters and scikit-learn's built-in
    English stop-word list is applied.

    Parameters:
        text: iterable of document strings.
        nTo: upper bound of the n-gram range.
        nFrom: lower bound of the n-gram range.
        featureMax: passed as ``max_features`` (None keeps every feature).

    Returns:
        (X, vectorizer): the TF-IDF matrix and the fitted TfidfVectorizer.
    """
    options=dict(token_pattern='[A-Za-z]{2,}',
                 stop_words='english',
                 ngram_range=(nFrom,nTo),
                 max_features=featureMax)
    vectorizer=TfidfVectorizer(**options)
    return vectorizer.fit_transform(text), vectorizer

In [62]:
#Helper function to tokenize documents using regular expression pattern

#Regex:
#A-Z: upper case
#a-z: lower case
#{2,}: 2 or more characters

#Can also use from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english'))

def reg_tokenize(documents):
    """Tokenise each document into lower-case words with English stop words removed.

    A token is a run of two or more ASCII letters (same pattern as the vectorisers).
    Tokens are lower-cased before they are compared with NLTK's English stop-word list.

    Parameters:
        documents: iterable of document strings.

    Returns:
        A list with one list of tokens per document, in input order.
    """
    #Set stopwords; a frozenset gives constant-time membership tests instead of scanning a list per token
    stop_words = frozenset(stopwords.words('english'))

    #Build the tokenizer once rather than once per document
    tokenizer = RegexpTokenizer(r'[A-Za-z]{2,}') #Same as vectoriser

    documents_tokenized = []
    for doc in documents:
        lowered = (word.lower() for word in tokenizer.tokenize(doc))  # Convert to lower case
        documents_tokenized.append([word for word in lowered if word not in stop_words])
    return documents_tokenized

In [63]:
# Function to transform document word vectors

def embedding_transform(documents_tokenized, w2v_model, embedding_size, transform_type='mean'):
    """Pool the word vectors of each tokenised document into one vector per document.

    Parameters:
        documents_tokenized: iterable of token lists, one list per document.
        w2v_model: any object supporting ``word in w2v_model`` and ``w2v_model[word]``
            (NOTE(review): presumably a gensim Word2Vec/KeyedVectors instance - confirm).
        embedding_size: length of the zero vector used for words missing from
            ``w2v_model`` and for documents without tokens; it should equal the
            length of the model's vectors.
        transform_type: pooling over the words of a document -
            'mean', 'sum', 'max', 'power2' (root of the mean of squares),
            'power-1' (inverse of the mean of inverses) or
            'power-2' (inverse root of the mean of inverse squares).

    Returns:
        numpy array with one pooled vector per document.

    Raises:
        NotImplementedError: if ``transform_type`` is not one of the values above.
    """
    # One reducer per transform_type; each collapses the word axis (axis 0)
    pooling = {
        'mean': lambda vectors: np.mean(vectors, axis=0),
        'sum': lambda vectors: np.sum(vectors, axis=0),
        'max': lambda vectors: np.amax(vectors, axis=0),
        'power2': lambda vectors: np.power(np.mean(np.power(vectors, 2), axis=0), 0.5),     # Power of 2 mean
        'power-1': lambda vectors: np.power(np.mean(np.power(vectors, -1), axis=0), -1),    # Power of -1 mean
        'power-2': lambda vectors: np.power(np.mean(np.power(vectors, -2), axis=0), -0.5),  # Power of -2 mean
    }
    if transform_type not in pooling:
        # The original spelled this 'raiseNotImplementedError()', which raised NameError instead
        raise NotImplementedError('Unknown transform_type: %r' % (transform_type,))
    pool = pooling[transform_type]

    transformed_docs = []
    for words in documents_tokenized:
        # Unknown words contribute a default 0 vector
        vectors = [w2v_model[w] if w in w2v_model else np.zeros(embedding_size) for w in words]
        if vectors:
            transformed_docs.append(pool(vectors))
        else:
            # A document without tokens has nothing to pool; use the 0 vector so the
            # output keeps a regular (documents, embedding_size) shape instead of NaN or an error
            transformed_docs.append(np.zeros(embedding_size))
    return np.array(transformed_docs)

In [64]:
#Function to create training data and resample

#Code='00' gives shape (210, 59)
#Code='20' gives shape (245, 59)
#Code='50' gives shape (408, 59)

def getDocTrain(docText_df,code,testDocIDs=None):
  """Build the training set: risk documents (optionally oversampled) plus non-risk documents.

  Parameters:
    docText_df: document-level data frame with at least 'docID' and 'risk' columns.
    code: resampling code. '50' draws 204 risk documents and '20' draws 41, both with
      replacement and random_state=1; any other value keeps the risk documents as they are.
    testDocIDs: docIDs held out for testing, excluded from the non-risk part.
      Defaults to the notebook-level docTest_df['docID'], as the original did implicitly.

  Returns:
    Data frame with the selected risk rows followed by the non-risk rows;
    the original index labels are kept.
  """
  if testDocIDs is None:
    #Backward-compatible default: the test set defined elsewhere in the notebook
    testDocIDs=docTest_df['docID']

  #Start training set with risk documents not in test set
  #NOTE(review): docIDs 6 and 358 are hard-coded as the held-out risk documents - confirm they match the test set
  isRisk=docText_df['risk']==1
  docTrain_df=docText_df[isRisk & (~docText_df['docID'].isin([6,358]))]

  #Sample other risk documents with replacement
  nRisk={'50':204,'20':41}.get(code)
  if nRisk is not None:
    docTrain_df=docTrain_df.sample(n=nRisk,axis='index',replace=True,random_state=1)

  #Append the non-risk documents that are not in the test set
  #pd.concat replaces DataFrame.append, which was removed in pandas 2.0
  nonRisk_df=docText_df[(docText_df['risk']==0) & (~docText_df['docID'].isin(testDocIDs))]
  return pd.concat([docTrain_df,nonRisk_df])

In [65]:
#Begin by clearing f1_df under 7.3.1

#50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.98 (+/- 0.02)|Test F1: 0.57|7s

def funcLoop(docToken_df,docTokenPS_df,docTokenLS_df,docTokenSS_df,docTokenWL_df,resample,clfs,features,transforms,f1_df,x_test,y_test):
    totalStart = time.time()
    loopTotal=len(resample)*len(clfs)*len(features)*len(transforms)*len(scales)
    loop=1

    #Display code control parameters
    print('writeCSV',writeCSV)
    print('writeF1',writeF1)

    #Delete root tokens on feature change
    try:
        r_train
    except NameError:
        if printError:
            print('No training roots to delete')
    else:
        del(r_train)

    try:
        r_test
    except NameError:
        if printError:
            print('No test roots to delete')
    else:
        del(r_test)

    #Resample data loop####################### 
    for r in resample:
        r_code=r[0:2]

        #Classifier loop####################### 
        for clfAbb, clfDesc, clf in clfs:

            #Scaling loop####################### 
            for scale in scales:

                #Feature loop####################### 
                for f, fDesc in features:

                    #Transform loop#######################
                    for transAbb,transDesc,nStart,nEnd,nStep,sizeStart,sizeEnd,sizeStep in transforms:
                        
                        iterStart = time.time()
                        #Distinguish BOW from WV
                        if transAbb[0:2]=='tf':
                            nDesc='nTo:'
                            sizeDesc='FeatureMax:'
                        else:
                            nDesc='Min_word_count:'
                            sizeDesc='Embedding_size:'

                        trainTotal=round(1+(nEnd-1-nStart)/nStep)*round(1+(sizeEnd-1-sizeStart)/sizeStep)
                        train=1
                        #Word count training loop#######################
                        for n in range(nStart,nEnd,nStep):
                            
                            #Size training loop#######################
                            for s in range(sizeStart,sizeEnd,sizeStep):
                                print('Iteration',loop,'of',loopTotal,'| Training',train,'of',trainTotal,'|',datetime.now(tz=timezone('Africa/Johannesburg')).strftime("%d/%m/%Y %H:%M:%S"))
                                trainStart = time.time()

                                condition=r+','+clfAbb+','+f+','+scale[0]+','+transAbb+','+nDesc+str(n)+','+sizeDesc+str(s)
                                if len(f1_df[f1_df['condition']==condition])==0:
                                    #if printError:
                                    #      print('...not in F1 data frame')

                                    #2 Get training data (checked)####################### 
                                    try:
                                        y_train
                                    except NameError:
                                        docTrain_df=getDocTrain(docText_df,r_code)
                                        #Shuffle rows
                                        x_train=docTrain_df.sample(frac=1).reset_index(drop=True)
                                        y_train=x_train.risk

                                    #3 Get individual counts ####################### 
                                    if 'c0' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['wordCount']])
                                        except NameError:
                                            xc_train=x_train[['wordCount']]
                                        try:
                                            xc_test=xc_test.join(x_test[['wordCount']])
                                        except NameError:
                                            xc_test=x_test[['wordCount']]
                                        if printError:
                                            print('...appended wordCount')
                                    if 'c1' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['forward1']])
                                        except NameError:
                                            xc_train=x_train[['forward1']]
                                        try:
                                            xc_test=xc_test.join(x_test[['forward1']])
                                        except NameError:
                                            xc_test=x_test[['forward1']]
                                        if printError:
                                            print('...appended forward1')
                                    if 'c2' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['forward2']])
                                        except NameError:
                                            xc_train=x_train[['forward2']]
                                        try:
                                            xc_test=xc_test.join(x_test[['forward2']])
                                        except NameError:
                                            xc_test=x_test[['forward2']]
                                        if printError:
                                            print('...appended forward2')
                                    if 'c3' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['LMneg']])
                                        except NameError:
                                            xc_train=x_train[['LMneg']]
                                        try:
                                            xc_test=xc_test.join(x_test[['LMneg']])
                                        except NameError:
                                            xc_test=x_test[['LMneg']]
                                        if printError:
                                            print('...appended LMneg')
                                    if 'c4' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['LMpos']])
                                        except NameError:
                                            xc_train=x_train[['LMpos']]
                                        try:
                                            xc_test=xc_test.join(x_test[['LMpos']])
                                        except NameError:
                                            xc_test=x_test[['LMpos']]
                                        if printError:
                                            print('...appended LMpos')
                                    if 'c5' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['H6neg']])
                                        except NameError:
                                            xc_train=x_train[['H6neg']]
                                        try:
                                            xc_test=xc_test.join(x_test[['H6neg']])
                                        except NameError:
                                            xc_test=x_test[['H6neg']]
                                        if printError:
                                            print('...appended H6neg')
                                    if 'c6' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['H6pos']])
                                        except NameError:
                                            xc_train=x_train[['H6pos']]
                                        try:
                                            xc_test=xc_test.join(x_test[['H6pos']])
                                        except NameError:
                                            xc_test=x_test[['H6pos']]
                                        if printError:
                                            print('...appended H6pos')
                                    if 'c7' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['H8neg']])
                                        except NameError:
                                            xc_train=x_train[['H8neg']]
                                        try:
                                            xc_test=xc_test.join(x_test[['H8neg']])
                                        except NameError:
                                            xc_test=x_test[['H8neg']]
                                        if printError:
                                            print('...appended H8neg')
                                    if 'c8' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['H8pos']])
                                        except NameError:
                                            xc_train=x_train[['H8pos']]
                                        try:
                                            xc_test=xc_test.join(x_test[['H8pos']])
                                        except NameError:
                                            xc_test=x_test[['H8pos']]
                                        if printError:
                                            print('...appended H8pos')
                                    if 'c9' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['uncert']])
                                        except NameError:
                                            xc_train=x_train[['uncert']]
                                        try:
                                            xc_test=xc_test.join(x_test[['uncert']])
                                        except NameError:
                                            xc_test=x_test[['uncert']]
                                        if printError:
                                            print('...appended uncert')
                                    if 'cA' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['causal']])
                                        except NameError:
                                            xc_train=x_train[['causal']]
                                        try:
                                            xc_test=xc_test.join(x_test[['causal']])
                                        except NameError:
                                            xc_test=x_test[['causal']]
                                        if printError:
                                            print('...appended causal')
                                    if 'cB' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['causalM']])
                                        except NameError:
                                            xc_train=x_train[['causalM']]
                                        try:
                                            xc_test=xc_test.join(x_test[['causalM']])
                                        except NameError:
                                            xc_test=x_test[['causalM']]
                                        if printError:
                                            print('...appended causalM')
                                    if 'cC' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['causalM50']])
                                        except NameError:
                                            xc_train=x_train[['causalM50']]
                                        try:
                                            xc_test=xc_test.join(x_test[['causalM50']])
                                        except NameError:
                                            xc_test=x_test[['causalM50']]
                                        if printError:
                                            print('...appended causalM50')
                                    if 'cD' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['perf']])
                                        except NameError:
                                            xc_train=x_train[['perf']]
                                        try:
                                            xc_test=xc_test.join(x_test[['perf']])
                                        except NameError:
                                            xc_test=x_test[['perf']]
                                        if printError:
                                            print('...appended perf')
                                    if 'cE' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['strat']])
                                        except NameError:
                                            xc_train=x_train[['strat']]
                                        try:
                                            xc_test=xc_test.join(x_test[['strat']])
                                        except NameError:
                                            xc_test=x_test[['strat']]
                                        if printError:
                                            print('...appended strat')
                                    if 'cF' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['forward1_scaled']])
                                        except NameError:
                                            xc_train=x_train[['forward1_scaled']]
                                        try:
                                            xc_test=xc_test.join(x_test[['forward1_scaled']])
                                        except NameError:
                                            xc_test=x_test[['forward1_scaled']]
                                        if printError:
                                            print('...appended forward1_scaled')
                                    if 'cG' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['forward2_scaled']])
                                        except NameError:
                                            xc_train=x_train[['forward2_scaled']]
                                        try:
                                            xc_test=xc_test.join(x_test[['forward2_scaled']])
                                        except NameError:
                                            xc_test=x_test[['forward2_scaled']]
                                        if printError:
                                            print('...appended forward2_scaled')
                                    if 'cH' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['LMneg_scaled']])
                                        except NameError:
                                            xc_train=x_train[['LMneg_scaled']]
                                        try:
                                            xc_test=xc_test.join(x_test[['LMneg_scaled']])
                                        except NameError:
                                            xc_test=x_test[['LMneg_scaled']]
                                        if printError:
                                            print('...appended LMneg_scaled')
                                    if 'cI' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['LMpos_scaled']])
                                        except NameError:
                                            xc_train=x_train[['LMpos_scaled']]
                                        try:
                                            xc_test=xc_test.join(x_test[['LMpos_scaled']])
                                        except NameError:
                                            xc_test=x_test[['LMpos_scaled']]
                                        if printError:
                                            print('...appended LMpos_scaled')
                                    if 'cJ' in f:
                                        try:
                                            xc_train=xc_train.join(x_train[['H6neg_scaled']])
                                        except NameError:
                                            xc_train=x_train[['H6neg_scaled']]
                                        try:
                                            xc_test=xc_test.join(x_test[['H6neg_scaled']])
                                        except NameError:
                                            xc_test=x_test[['H6neg_scaled']]
                                        if printError:
                                            print('...appended H6neg_scaled')
                                    #Append individual scaled count / tone columns (feature codes cK to cV).
                                    #For every code present in f the matching column of x_train/x_test is added to
                                    #xc_train/xc_test: if the target name is not bound yet (NameError) the column
                                    #starts the frame, otherwise it is joined on the data frame index.
                                    #Order of the pairs fixes the column order of xc_train/xc_test.
                                    for count_code, count_col in (
                                        ('cK','H6pos_scaled'),
                                        ('cL','H8neg_scaled'),
                                        ('cM','H8pos_scaled'),
                                        ('cN','uncert_scaled'),
                                        ('cO','causal_scaled'),
                                        ('cP','causalM_scaled'),
                                        ('cQ','causalM50_scaled'),
                                        ('cR','perf_scaled'),
                                        ('cS','strat_scaled'),
                                        ('cT','LMtone'),
                                        ('cU','H6tone'),
                                        ('cV','H8tone'),
                                    ):
                                        if count_code not in f:
                                            continue
                                        try:
                                            xc_train=xc_train.join(x_train[[count_col]])
                                        except NameError:
                                            xc_train=x_train[[count_col]]
                                        try:
                                            xc_test=xc_test.join(x_test[[count_col]])
                                        except NameError:
                                            xc_test=x_test[[count_col]]
                                        if printError:
                                            print('...appended '+count_col)
                                    #3 Bulk count selection (checked): 'ca' = all counts, 'cs' = scaled counts and tone ############
                                    #Only takes effect for a frame that has not been built yet: when xc_train/xc_test
                                    #is already bound the bare-name probe succeeds and the frame is left untouched.
                                    if 'ca' in f:
                                        #word count, raw counts, scaled counts and tone
                                        bulk_count_cols=['wordCount',
                                                         'forward1','forward2','LMneg','LMpos','H6neg','H6pos','H8neg','H8pos',
                                                         'uncert','causal','causalM','causalM50','perf','strat',
                                                         'forward1_scaled','forward2_scaled','LMneg_scaled','LMpos_scaled',
                                                         'H6neg_scaled','H6pos_scaled','H8neg_scaled','H8pos_scaled',
                                                         'uncert_scaled','causal_scaled','causalM_scaled','causalM50_scaled',
                                                         'perf_scaled','strat_scaled',
                                                         'LMtone','H6tone','H8tone']
                                    elif 'cs' in f:
                                        #word count, scaled counts and tone
                                        bulk_count_cols=['wordCount',
                                                         'forward1_scaled','forward2_scaled','LMneg_scaled','LMpos_scaled',
                                                         'H6neg_scaled','H6pos_scaled','H8neg_scaled','H8pos_scaled',
                                                         'uncert_scaled','causal_scaled','causalM_scaled','causalM50_scaled',
                                                         'perf_scaled','strat_scaled',
                                                         'LMtone','H6tone','H8tone']
                                    else:
                                        bulk_count_cols=None

                                    if bulk_count_cols is not None:
                                        try:
                                            xc_train
                                        except NameError:
                                            xc_train=x_train[bulk_count_cols]
                                        try:
                                            xc_test
                                        except NameError:
                                            xc_test=x_test[bulk_count_cols]

                                    #3 Get individual readability indices #######################
                                    #For every code present in f the matching column of x_train/x_test is added to
                                    #xr_train/xr_test: if the target name is not bound yet (NameError) the column
                                    #starts the frame, otherwise it is joined on the data frame index.
                                    #Order of the pairs fixes the column order of xr_train/xr_test.
                                    for read_code, read_col in (
                                        ('r1','r_gf'),
                                        ('r2','r_f'),
                                        ('r3','r_fk'),
                                        ('r4','r_dc'),
                                        ('r5','r_ari'),
                                        ('r6','r_cl'),
                                        ('r7','r_lw'),
                                        ('r8','r_sm'),
                                        ('r9','r_sp'),
                                        ('rB','ts_gf'),
                                        ('rC','ts_f'),
                                        ('rD','ts_fk'),
                                        ('rE','ts_dc'),
                                        ('rF','ts_ari'),
                                        ('rG','ts_cl'),
                                        ('rH','ts_lw'),
                                        ('rI','ts_sm'),
                                        ('rJ','ts_ts'),
                                    ):
                                        if read_code not in f:
                                            continue
                                        try:
                                            xr_train=xr_train.join(x_train[[read_col]])
                                        except NameError:
                                            xr_train=x_train[[read_col]]
                                        try:
                                            xr_test=xr_test.join(x_test[[read_col]])
                                        except NameError:
                                            xr_test=x_test[[read_col]]
                                        #Reported for every appended column (the ts_dc message used to be
                                        #printed only when xr_test was created by that column)
                                        if printError:
                                            print('...appended '+read_col)
                                    #4 Bulk readability selection (checked): 'ra' = all indices, 'rs' = select 3 indices ############
                                    #Only takes effect for a frame that has not been built yet: when xr_train/xr_test
                                    #is already bound the bare-name probe succeeds and the frame is left untouched.
                                    if 'ra' in f:
                                        bulk_read_cols=['r_gf', 'r_f', 'r_fk', 'r_dc', 'r_ari','r_cl', 'r_lw', 'r_sm', 'r_sp',
                                                        'ts_gf', 'ts_f', 'ts_fk', 'ts_dc','ts_ari', 'ts_cl', 'ts_lw', 'ts_sm', 'ts_ts']
                                    elif 'rs' in f:
                                        bulk_read_cols=['ts_gf','ts_f','ts_ts']
                                    else:
                                        bulk_read_cols=None

                                    if bulk_read_cols is not None:
                                        try:
                                            xr_train
                                        except NameError:
                                            xr_train=x_train[bulk_read_cols]
                                        try:
                                            xr_test
                                        except NameError:
                                            xr_test=x_test[bulk_read_cols]

                                    #5 Get frequency or word vectors as data frame
                                    #Both frequency branches first try to reuse vectors cached as CSV under pathData+'interim/'
                                    #and fall back to fitting a vectoriser on x_train.text when the cache cannot be used.
                                    #Result in either case: v_train_df and v_test_df.
                                    if transAbb=='tf':
                                        #5.1 Get TF vectors as data frame (did not load so slow but correct)#######################ValueError: Object arrays cannot be loaded when allow_pickle=False
                                        pathTrain=pathData+'interim/'+r_code+'_'+f[0]+'_'+transAbb+'_'+str(n)+'_'+str(s)
                                        pathTest=pathData+'interim/'+f[0]+'_'+transAbb+'_'+str(n)+'_'+str(s)
                                        try:
                                            #load vectorised training data frame from disk
                                            #original training sample file is shared (TF values are same on 50:50 and 20:80 samples), hence pathTest and not pathTrain
                                            v_train=pd.read_csv(pathTest+'_v_train.csv', index_col=[0])
                                            #Join loaded data to docID of randomised training data
                                            v_train_df=x_train[['docID']].merge(v_train,how='left',on='docID')

                                            #load vectorised test data frame from disk
                                            v_test=pd.read_csv(pathTest+'_v_test.csv', index_col=[0])
                                            #Join loaded data to docID of randomised test data
                                            v_test_df=x_test[['docID']].merge(v_test,how='left',on='docID')
                                            if printError:
                                                print('...loaded vectorized file')
                                        except Exception as vec_load_err:
                                            #Any failure while loading/merging the cache falls back to vectorising.
                                            #Exception (not a bare except) so KeyboardInterrupt/SystemExit still stop the run.
                                            if printError:
                                                print('...cached vectors not loaded:',repr(vec_load_err))
                                                print('...vectorizing data')
                                            v_train, t_vectorizer=fit_tf_vectorizer(x_train.text,nTo=n,featureMax=s)
                                            v_test = t_vectorizer.transform(x_test.text)

                                            #Join sparse matrix to docID and docName of training data
                                            #NOTE(review): join aligns on the index and the new frame gets a default 0..n-1 index - confirm x_train/x_test indexes match
                                            v_train_df=x_train[['docID','docName']].join(pd.DataFrame(v_train.todense(),columns=t_vectorizer.get_feature_names()))
                                            #Join sparse matrix to docID and docName of test data
                                            v_test_df=x_test[['docID','docName']].join(pd.DataFrame(v_test.todense(),columns=t_vectorizer.get_feature_names()))

                                            #save vectorised data frames to disk
                                            if writeCSV:
                                                v_train_df.drop_duplicates().to_csv(pathTest+'_v_train.csv') #duplicates explode with join (TF values are same on 50:50 and 20:80 samples)
                                                v_test_df.to_csv(pathTest+'_v_test.csv')
                                                if printError:
                                                    print('...saved vectorized file')
                                            #delete to save memory
                                            del t_vectorizer
                                            del v_train
                                            del v_test

                                    elif transAbb=='tfidf':
                                        #5.2 Get TFIDF vectors as data frame (did not load so slow but correct)#######################ValueError: Object arrays cannot be loaded when allow_pickle=False
                                        pathTrain=pathData+'interim/'+r_code+'_'+f[0]+'_'+transAbb+'_'+str(n)+'_'+str(s)
                                        pathTest=pathData+'interim/'+f[0]+'_'+transAbb+'_'+str(n)+'_'+str(s)
                                        try:
                                            #load vectorised training data frame from disk
                                            #sample-specific file (pathTrain): TFIDF values differ across the original, 50:50 and 20:80 training samples
                                            v_train=pd.read_csv(pathTrain+'_v_train.csv', index_col=[0])
                                            #Join loaded data to docID of randomised training data
                                            v_train_df=x_train[['docID']].merge(v_train,how='left',on='docID')

                                            #load vectorised test data frame from disk
                                            v_test=pd.read_csv(pathTest+'_v_test.csv', index_col=[0])
                                            #Join loaded data to docID of randomised test data
                                            v_test_df=x_test[['docID']].merge(v_test,how='left',on='docID')
                                            if printError:
                                                print('...loaded vectorized file')
                                        except Exception as vec_load_err:
                                            #Any failure while loading/merging the cache falls back to vectorising.
                                            #Exception (not a bare except) so KeyboardInterrupt/SystemExit still stop the run.
                                            if printError:
                                                print('...cached vectors not loaded:',repr(vec_load_err))
                                                print('...vectorizing data')
                                            v_train, t_vectorizer=fit_tfidf_vectorizer(x_train.text,nTo=n,featureMax=s)
                                            v_test = t_vectorizer.transform(x_test.text)

                                            #Join sparse matrix to docID and docName of training data
                                            #NOTE(review): join aligns on the index and the new frame gets a default 0..n-1 index - confirm x_train/x_test indexes match
                                            v_train_df=x_train[['docID','docName']].join(pd.DataFrame(v_train.todense(),columns=t_vectorizer.get_feature_names()))
                                            #Join sparse matrix to docID and docName of test data
                                            v_test_df=x_test[['docID','docName']].join(pd.DataFrame(v_test.todense(),columns=t_vectorizer.get_feature_names()))

                                            #save vectorised data frames to disk
                                            if writeCSV:
                                                v_train_df.drop_duplicates().to_csv(pathTrain+'_v_train.csv') #duplicates explode with join
                                                v_test_df.to_csv(pathTest+'_v_test.csv')
                                                if printError:
                                                    print('...saved vectorized file')
                                            #delete to save memory
                                            del t_vectorizer
                                            del v_train
                                            del v_test
                                    else:
                                        #5.3 Get word vectors (WV) as data frame #######################
                                        #if printError:
                                        #    print('...get RegEx tokens')

                                        #6. Get RegEx tokens (fixed was wrong order)#######################                                    
                                        #Reuse t_train when an earlier pass already built it; the bare name lookup raises NameError otherwise
                                        try:
                                            t_train
                                        except NameError:
                                            try:
                                                #Preferred: fetch the stored token lists by docID. Both frames have a 'text' column,
                                                #so after the merge the tokens from docToken_df sit in 'text_y'
                                                t_train=list(x_train.merge(docToken_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Fallback (e.g. docToken_df is not defined or the merge fails): tokenize the raw training text.
                                                #Catching Exception rather than everything lets KeyboardInterrupt/SystemExit stop the run
                                                t_train=reg_tokenize(x_train.text)
                                                print('...tokenized training data')
                                                
                                        #Reuse t_test when an earlier pass already built it; the bare name lookup raises NameError otherwise
                                        try:
                                            t_test
                                        except NameError:
                                            try:
                                                #Preferred: fetch the stored token lists by docID. Both frames have a 'text' column,
                                                #so after the merge the tokens from docToken_df sit in 'text_y'
                                                t_test=list(x_test.merge(docToken_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Fallback (e.g. docToken_df is not defined or the merge fails): tokenize the raw test text.
                                                #Catching Exception rather than everything lets KeyboardInterrupt/SystemExit stop the run
                                                t_test=reg_tokenize(x_test.text)
                                                print('...tokenized test data')
                                        #End of 6. Get RegEx tokens#######################
                                        #End of 6. Get RegEx tokens#######################

                                        #7 Get root tokens (fixed was wrong order)#######################
                                        #Reuse r_train when an earlier pass already built it; otherwise derive root tokens for the training data.
                                        #The feature set f selects the method: 'ps' Porter, 'ls' Lancaster, 'ss' Snowball, 'wl' WordNet lemmatizer, none = plain tokens.
                                        #Each branch first tries the stored root tokens (merged by docID, landing in 'text_y') and only
                                        #computes them from t_train when that lookup fails. The stemmer/lemmatizer is created once per
                                        #corpus instead of once per token.
                                        try:
                                            r_train
                                        except NameError:
                                            if 'ps' in f:
                                                try:
                                                    #Get training tokens as list of lists
                                                    r_train=list(x_train.merge(docTokenPS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                                except Exception:
                                                    #Stem
                                                    stemmer=PorterStemmer()
                                                    r_train=[[stemmer.stem(token) for token in doc] for doc in t_train]
                                                    print('...PorterStemming training data')
                                            elif 'ls' in f:
                                                try:
                                                    #Get training tokens as list of lists
                                                    r_train=list(x_train.merge(docTokenLS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                                except Exception:
                                                    #Stem
                                                    stemmer=LancasterStemmer()
                                                    r_train=[[stemmer.stem(token) for token in doc] for doc in t_train]
                                                    print('...LancasterStemming training data')
                                            elif 'ss' in f:
                                                try:
                                                    #Get training tokens as list of lists
                                                    r_train=list(x_train.merge(docTokenSS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                                except Exception:
                                                    #Stem
                                                    stemmer=SnowballStemmer("english")
                                                    r_train=[[stemmer.stem(token) for token in doc] for doc in t_train]
                                                    print('...SnowballStemming training data')
                                            elif 'wl' in f:
                                                try:
                                                    #Get training tokens as list of lists
                                                    r_train=list(x_train.merge(docTokenWL_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                                except Exception:
                                                    #Lemmatize
                                                    lemmatizer=WordNetLemmatizer()
                                                    r_train=[[lemmatizer.lemmatize(token) for token in doc] for doc in t_train]
                                                    print('...Lemmatizing training data')
                                            else:
                                                #No stemming/lemmatization requested: use a (shallow) copy of the plain tokens
                                                r_train=t_train.copy()
                                        #Reuse r_test when an earlier pass already built it; otherwise derive root tokens for the test data.
                                        #The feature set f selects the method: 'ps' Porter, 'ls' Lancaster, 'ss' Snowball, 'wl' WordNet lemmatizer, none = plain tokens.
                                        #Each branch first tries the stored root tokens (merged by docID, landing in 'text_y') and only
                                        #computes them from t_test when that lookup fails. The stemmer/lemmatizer is created once per
                                        #corpus instead of once per token.
                                        try:
                                            r_test
                                        except NameError:
                                            if 'ps' in f:
                                                try:
                                                    #Get test tokens as list of lists
                                                    r_test=list(x_test.merge(docTokenPS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                                except Exception:
                                                    #Stem
                                                    stemmer=PorterStemmer()
                                                    r_test=[[stemmer.stem(token) for token in doc] for doc in t_test]
                                                    print('...PorterStemming test data') 
                                            elif 'ls' in f:
                                                try:
                                                    #Get test tokens as list of lists
                                                    r_test=list(x_test.merge(docTokenLS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                                except Exception:
                                                    #Stem
                                                    stemmer=LancasterStemmer()
                                                    r_test=[[stemmer.stem(token) for token in doc] for doc in t_test]
                                                    print('...LancasterStemming test data') 
                                            elif 'ss' in f:
                                                try:
                                                    #Get test tokens as list of lists
                                                    r_test=list(x_test.merge(docTokenSS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                                except Exception:
                                                    #Stem
                                                    stemmer=SnowballStemmer("english")
                                                    r_test=[[stemmer.stem(token) for token in doc] for doc in t_test]
                                                    print('...SnowballStemming test data') 
                                            elif 'wl' in f:
                                                try:
                                                    #Get test tokens as list of lists
                                                    r_test=list(x_test.merge(docTokenWL_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                                except Exception:
                                                    #Lemmatize
                                                    lemmatizer=WordNetLemmatizer()
                                                    r_test=[[lemmatizer.lemmatize(token) for token in doc] for doc in t_test]
                                                    print('...Lemmatizing test data') 
                                            else:
                                                #No stemming/lemmatization requested: use a (shallow) copy of the plain tokens
                                                r_test=t_test.copy()
                                        #End of 7 Get root tokens#######################
                                        
                                        #The third underscore-separated term of transAbb names the transform applied to the embeddings
                                        transform_type=transAbb.split('_')[2]
                                        #Number of times 'wv_sg' occurs in transAbb (0 when absent); handed to Word2Vec as its sg argument
                                        wv_sg=transAbb.count('wv_sg')
                                        #File stems under data/interim for the keyed vectors and the cached training/test vectors
                                        pathWV=pathData+'interim/'+'_'.join([r_code,f[0],transAbb[3:5],str(n),str(s)])
                                        pathTrain='_'.join([pathWV,transform_type])
                                        #NOTE(review): unlike pathWV/pathTrain the test stem carries no r_code prefix - confirm this is intended
                                        pathTest=pathData+'interim/'+'_'.join([f[0],transAbb[3:5],str(n),str(s),transform_type])

                                        #if printError:
                                        #    print('...start embedding')
                                        #8 Embed training words as vectors (fixed was wrong order)#######################
                                        #Build v_train_df (docID, docName, embedding columns) for the training data.
                                        #Order of preference: cached CSV -> saved keyed vectors -> freshly trained Word2Vec model.
                                        try:
                                            #load training data set from disk
                                            v_train_df=pd.read_csv(pathTrain+'_v_train.csv', index_col=[0])
                                            
                                            #merge loaded data to docID of training data to deal with shuffle
                                            v_train_df=x_train[['docID']].merge(v_train_df,how='left',on='docID')

                                            if printError:
                                                print('...loaded training word vector file')
                                        except Exception:
                                            #Any load/merge failure (typically a missing cache file) leads to recomputation.
                                            #Exception instead of a bare except, so an interrupt is not mistaken for a cache miss
                                            if printError:
                                                print('...did not load training word vector file')
                                            #Embed                            
                                            try:
                                                #load from disk
                                                wv=KeyedVectors.load(pathWV, mmap='r')
                                                print('...loading keyed vector file: '+r_code+'_'+f[0]+'_'+transAbb[3:5]+'_'+str(n)+'_'+str(s))
                                            except Exception:
                                                print('...converting words with count>=',str(n),'to vector of size',str(s))
                                                #limit the model to a single worker thread (workers=1), to eliminate ordering jitter from OS thread scheduling.
                                                #seed is passed explicitly so this call matches the fallback used for the test data
                                                w2v_model = Word2Vec(r_train, size=s, window=window_size, min_count=n, workers=1, sg=wv_sg, seed=1) #Removed threads for reproduceability
                                                w2v_model.wv.save(pathWV) #Save keyed vectors
                                                wv=w2v_model.wv
                                                del w2v_model #save memory

                                            #Transform (align with Embed header)
                                            print('...embedding training vectors and transforming')
                                            v_train = embedding_transform(r_train, wv, embedding_size=s, transform_type=transform_type)

                                            #Join numpy array to docID and docName of training data
                                            #NOTE(review): join aligns on index labels, so this presumably relies on x_train having a 0..n-1 index - confirm
                                            v_train_df=x_train[['docID','docName']].join(pd.DataFrame(v_train)) 

                                            #save vectorised training data frame to disk
                                            if writeCSV:
                                                v_train_df.drop_duplicates().to_csv(pathTrain+'_v_train.csv') #duplicates explode with join 

                                            #delete to save memory
                                            del(v_train)

                                            if printError and writeCSV:
                                                print('...saved training word vector file')

                                        #8 Embed test words as vectors (fixed was wrong order)#######################
                                        #Build v_test_df (docID, docName, embedding columns) for the test data.
                                        #Order of preference: cached CSV -> wv still bound from the training step -> saved keyed vectors -> freshly trained Word2Vec model.
                                        try:
                                            #load test data set from disk
                                            v_test_df=pd.read_csv(pathTest+'_v_test.csv', index_col=[0])
                                            
                                            #merge loaded data to docID of test data to deal with shuffle
                                            v_test_df=x_test[['docID']].merge(v_test_df,how='left',on='docID')

                                            if printError:
                                                print('...loaded test word vector file')
                                        except Exception:
                                            #Any load/merge failure (typically a missing cache file) leads to recomputation.
                                            #Exception instead of a bare except, so an interrupt is not mistaken for a cache miss
                                            #Embed
                                            try:
                                                #The bare name lookup raises NameError when no keyed vectors are bound yet
                                                wv
                                                print('...reusing keyed vector file: '+r_code+'_'+f[0]+'_'+transAbb[3:5]+'_'+str(n)+'_'+str(s))
                                            except NameError:
                                                try:
                                                    #load from disk
                                                    wv=KeyedVectors.load(pathWV, mmap='r')
                                                    print('...loading keyed vector file: '+r_code+'_'+f[0]+'_'+transAbb[3:5]+'_'+str(n)+'_'+str(s))
                                                except Exception:
                                                    #min_count=n keeps words whose count is at least n, hence '>=' in the message (same wording as the training step)
                                                    print('...converting words with count>=',str(n),'to vector of size',str(s))
                                                    #limit the model to a single worker thread (workers=1), to eliminate ordering jitter from OS thread scheduling.
                                                    #The model is trained on the training tokens (r_train), never on the test tokens
                                                    w2v_model = Word2Vec(r_train, size=s, window=window_size, min_count=n, workers=1, sg=wv_sg, seed=1) #Removed threads for reproduceability
                                                    w2v_model.wv.save(pathWV) #Save keyed vectors
                                                    wv=w2v_model.wv
                                                    del w2v_model #save memory

                                            #Transform (align with Embed header)
                                            print('...embedding test vectors and transforming')
                                            v_test = embedding_transform(r_test, wv, embedding_size=s, transform_type=transform_type)

                                            #Join numpy array to docID and docName of test data
                                            #NOTE(review): join aligns on index labels, so this presumably relies on x_test having a 0..n-1 index - confirm
                                            v_test_df=x_test[['docID','docName']].join(pd.DataFrame(v_test))

                                            #save vectorised test data frame to disk
                                            if writeCSV:
                                                v_test_df.to_csv(pathTest+'_v_test.csv')
                                                if printError:
                                                    print('...saved test word vector  file')
                                            #delete to save memory
                                            del(v_test)
                                            del(wv)

                                        #Release the keyed vectors if they are still bound; deleting an unbound name raises NameError,
                                        #which only means there was nothing left to free
                                        try:
                                            del wv
                                        except NameError:
                                            if printError:
                                                print('No wv to delete')
                                        #End of 8 Embed words as vectors#######################

                                    #End of 5 Get frequency or word vectors as data frame#######################

                                    #Remove document identifier and name from dataframe
                                    #Keep only the feature columns: the first two columns carry the document identifier and name
                                    v_train=v_train_df.iloc[:,2:]
                                    v_test=v_test_df.iloc[:,2:]

                                    #The identifier-bearing frames are no longer needed
                                    del v_train_df,v_test_df

                                    #Optionally append the count features (index-aligned join)
                                    if 'c' in f:
                                        v_train=v_train.join(xc_train)
                                        v_test=v_test.join(xc_test)

                                    #Optionally append the readability indices (index-aligned join)
                                    if 'r' in f:
                                        v_train=v_train.join(xr_train)
                                        v_test=v_test.join(xr_test)

                                    #Scale: the scaler is fitted on the training features only and applied to both sets.
                                    #The scaled values are wrapped back into data frames with the original column names.
                                    if scale[0]=='norm':
                                        #Min-max normalisation
                                        scalerNorm=MinMaxScaler()
                                        scalerNorm.fit(v_train)
                                        v_train=pd.DataFrame(scalerNorm.transform(v_train),columns=v_train.columns)
                                        v_test=pd.DataFrame(scalerNorm.transform(v_test),columns=v_test.columns)
                                    elif scale[0]=='std':
                                        #Standardisation
                                        scalerStd=StandardScaler()
                                        scalerStd.fit(v_train)
                                        v_train=pd.DataFrame(scalerStd.transform(v_train),columns=v_train.columns)
                                        v_test=pd.DataFrame(scalerStd.transform(v_test),columns=v_test.columns)
                                    
                                    #Cross-Validate (includes fit)
                                    #Progress / debug output before cross-validation
                                    if printError:
                                        print('...cross validating on training data')
                                    if printHead:
                                        print(v_train.head())

                                    #Keras models need 3-D input; insert a length-1 axis into the 2-D feature table
                                    if clfAbb.startswith('LSTM'):
                                        #LSTM: (samples, timestep=1, features)
                                        v_train=np.expand_dims(np.array(v_train),axis=1)
                                        v_test=np.expand_dims(np.array(v_test),axis=1)
                                    elif clfAbb.startswith('CNN'):
                                        #CNN: (samples, features, channel=1)
                                        v_train=np.expand_dims(np.array(v_train),axis=2)
                                        v_test=np.expand_dims(np.array(v_test),axis=2)

                                    if clfAbb=='LSTM1':    #2-layer LSTM with 50% dropout
                                        def createLSTM():
                                            """Build and compile the LSTM1 network (two LSTM layers, 50% dropout after each).

                                            The input shape is (1, v_train.shape[2]); v_train is read when the function is called.
                                            Returns the compiled Sequential model (adam optimizer, binary cross-entropy loss, f1 metric).
                                            """
                                            stack=[
                                                #First LSTM layer returns the full sequence so the second LSTM can consume it
                                                LSTM(128, return_sequences=True,input_shape=(1, v_train.shape[2])),
                                                Dropout(0.5),
                                                LSTM(64),
                                                Dropout(0.5),
                                                #Single sigmoid unit for binary classification
                                                Dense(1, activation='sigmoid'),
                                            ]
                                            network=Sequential(stack)
                                            network.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])
                                            return network
                                    elif clfAbb=='LSTM2':    #2-layer LSTM with 20% dropout
                                        def createLSTM():
                                            """Build and compile the LSTM2 network (two LSTM layers, 20% dropout after each).

                                            The input shape is (1, v_train.shape[2]); v_train is read when the function is called.
                                            Returns the compiled Sequential model (adam optimizer, binary cross-entropy loss, f1 metric).
                                            """
                                            stack=[
                                                #First LSTM layer returns the full sequence so the second LSTM can consume it
                                                LSTM(128, return_sequences=True,input_shape=(1, v_train.shape[2])),
                                                Dropout(0.2),
                                                LSTM(64),
                                                Dropout(0.2),
                                                #Single sigmoid unit for binary classification
                                                Dense(1, activation='sigmoid'),
                                            ]
                                            network=Sequential(stack)
                                            network.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])
                                            return network
                                    elif clfAbb=='CNN1a':    #4 convolution layers with sigmoid activation (no dropout)
                                        def createCNN():
                                            """Build and compile the CNN1a network (four Conv1D layers with sigmoid activation, no dropout).

                                            The input shape is (v_train.shape[1], 1); v_train is read when the function is called.
                                            Returns the compiled Sequential model (adam optimizer, binary cross-entropy loss, f1 metric).
                                            """
                                            stack=[
                                                #Four convolution layers, all with sigmoid activation
                                                Conv1D(filters=64, kernel_size=3, activation='sigmoid',input_shape=(v_train.shape[1], 1)),
                                                Conv1D(filters=100, kernel_size=3, activation='sigmoid'),
                                                Conv1D(filters=100, kernel_size=3, activation='sigmoid'),
                                                Conv1D(filters=48, kernel_size=3, activation='sigmoid'),
                                                Flatten(),
                                                #Single sigmoid unit for binary classification
                                                Dense(1, activation='sigmoid'),
                                            ]
                                            network=Sequential(stack)
                                            network.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])
                                            return network
                                    elif clfAbb=='CNN1b':    #4 convolution layers with ReLU activation (no dropout)
                                        def createCNN():
                                            """Build and compile the CNN1b network (four Conv1D layers with ReLU activation, no dropout).

                                            The input shape is (v_train.shape[1], 1); v_train is read when the function is called.
                                            Returns the compiled Sequential model (adam optimizer, binary cross-entropy loss, f1 metric).
                                            """
                                            stack=[
                                                #Four convolution layers, all with ReLU activation
                                                Conv1D(filters=64, kernel_size=3, activation='relu',input_shape=(v_train.shape[1], 1)),
                                                Conv1D(filters=100, kernel_size=3, activation='relu'),
                                                Conv1D(filters=100, kernel_size=3, activation='relu'),
                                                Conv1D(filters=48, kernel_size=3, activation='relu'),
                                                Flatten(),
                                                #Single sigmoid unit for binary classification
                                                Dense(1, activation='sigmoid'),
                                            ]
                                            network=Sequential(stack)
                                            network.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])
                                            return network
                                    elif clfAbb=='CNN1c':    #4 convolution layers with ReLU activation (20% dropout)
                                        def createCNN():
                                            """Build and compile the CNN1c network (four Conv1D layers with ReLU activation, 20% dropout before the last one).

                                            The input shape is (v_train.shape[1], 1); v_train is read when the function is called.
                                            Returns the compiled Sequential model (adam optimizer, binary cross-entropy loss, f1 metric).
                                            """
                                            stack=[
                                                #Three convolution layers with ReLU activation
                                                Conv1D(filters=64, kernel_size=3, activation='relu',input_shape=(v_train.shape[1], 1)),
                                                Conv1D(filters=100, kernel_size=3, activation='relu'),
                                                Conv1D(filters=100, kernel_size=3, activation='relu'),
                                                #20% dropout ahead of the final convolution layer
                                                Dropout(0.2),
                                                Conv1D(filters=48, kernel_size=3, activation='relu'),
                                                Flatten(),
                                                #Single sigmoid unit for binary classification
                                                Dense(1, activation='sigmoid'),
                                            ]
                                            network=Sequential(stack)
                                            network.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])
                                            return network
                                    elif clfAbb=='CNN1d':    #4 convolution layers with no activation (no dropout)
                                        def createCNN():
                                            """Build and compile the CNN1d network (four Conv1D layers created without an activation argument, no dropout).

                                            The input shape is (v_train.shape[1], 1); v_train is read when the function is called.
                                            Returns the compiled Sequential model (adam optimizer, binary cross-entropy loss, f1 metric).
                                            """
                                            stack=[
                                                #Four convolution layers; no activation is specified for any of them
                                                Conv1D(filters=64, kernel_size=3,input_shape=(v_train.shape[1], 1)),
                                                Conv1D(filters=100, kernel_size=3),
                                                Conv1D(filters=100, kernel_size=3),
                                                Conv1D(filters=48, kernel_size=3),
                                                Flatten(),
                                                #Single sigmoid unit for binary classification
                                                Dense(1, activation='sigmoid'),
                                            ]
                                            network=Sequential(stack)
                                            network.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])
                                            return network
                                    elif clfAbb=='CNN2':    #1 convolution layer with max pooling and LSTM layers
                                        def createCNN():
                                            """Build and compile the CNN2 network (one Conv1D layer, max pooling, then an LSTM layer).

                                            The input shape is (v_train.shape[1], 1); v_train is read when the function is called.
                                            Returns the compiled Sequential model (adam optimizer, binary cross-entropy loss, f1 metric).
                                            """
                                            stack=[
                                                #Convolution layer with Rectified Linear Unit (ReLU) activation and 'same' padding
                                                Conv1D(filters=32, kernel_size=3, padding='same', activation='relu',input_shape=(v_train.shape[1], 1)),
                                                MaxPooling1D(pool_size=2),
                                                LSTM(100),
                                                #Single sigmoid unit for binary classification
                                                Dense(1, activation='sigmoid'),
                                            ]
                                            network=Sequential(stack)
                                            network.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])
                                            return network
                                    elif clfAbb=='CNN3':    #3 convolution layers with max pooling
                                        def createCNN():
                                            """Build and compile the CNN3 network (three Conv1D + max pooling pairs).

                                            The input shape is (v_train.shape[1], 1); v_train is read when the function is called.
                                            Returns the compiled Sequential model (adam optimizer, binary cross-entropy loss, f1 metric).
                                            """
                                            stack=[
                                                #Each pair: ReLU convolution with 'same' padding followed by max pooling with pool_size=3
                                                Conv1D(filters=128, kernel_size=5, activation='relu',input_shape=(v_train.shape[1], 1),padding='same'),
                                                MaxPooling1D(pool_size=3),
                                                Conv1D(filters=128, kernel_size=5, activation='relu',padding='same'),
                                                MaxPooling1D(pool_size=3),
                                                Conv1D(filters=128, kernel_size=5, activation='relu',padding='same'),
                                                MaxPooling1D(pool_size=3),
                                                Flatten(),
                                                #Single sigmoid unit for binary classification
                                                Dense(1, activation='sigmoid'),
                                            ]
                                            network=Sequential(stack)
                                            network.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])
                                            return network

                                    #Wrap Keras models
                                    #Only LSTM*/CNN* models are wrapped here; for any other clfAbb, clf is left untouched.
                                    #The factory name is only looked up inside the matching branch.
                                    if clfAbb.startswith('LSTM'):
                                        clf=KerasClassifier(
                                            build_fn=createLSTM,
                                            epochs=5,
                                            validation_split=0.2,
                                            verbose=0)
                                    elif clfAbb.startswith('CNN'):
                                        clf=KerasClassifier(
                                            build_fn=createCNN,
                                            epochs=5,
                                            validation_split=0.2,
                                            verbose=0)

                                    #5 times repeated 5-fold cross-validation on the training data, scored with F1
                                    cv=RepeatedKFold(n_splits=5,n_repeats=5,random_state=0)
                                    f1_train=cross_val_score(clf,v_train,y_train,cv=cv,scoring='f1',n_jobs=threads)

                                    #Fit on the complete training set
                                    clf.fit(v_train,y_train)

                                    #Test
                                    if printHead:
                                        #v_test is a numpy array after the Keras reshape (no .head()), a data frame otherwise
                                        print(v_test.head() if hasattr(v_test,'head') else v_test[:5])
                                    #Threshold at 0.5 so probability-like outputs become 0/1 labels
                                    pred_test=(clf.predict(v_test) > 0.5).astype("int32") #clf.predict_proba(t_test); Addresses warning using sigmoid
                                    f1_test=f1_score(y_test,pred_test)

                                    trainDuration=time.time()-trainStart
                                    #Add one result row for this condition. pd.concat is used because DataFrame.append was removed in pandas 2.0.
                                    #Only the first five fold scores are stored individually; mean and 2*std are taken over all scores in f1_train.
                                    f1_df=pd.concat([f1_df,
                                                     pd.DataFrame([{'condition':condition,
                                                        'data':r,'model':clfAbb,'feature_set':f,'scale':scale[0],'feature':transAbb,'h1_desc':nDesc,'h1':n,'h2_desc':sizeDesc,'h2':s,
                                                          'f1Train0':f1_train[0],'f1Train1':f1_train[1],'f1Train2':f1_train[2],'f1Train3':f1_train[3],'f1Train4':f1_train[4],
                                                          'f1Train_mean':f1_train.mean(),'f1Train_2std':f1_train.std()*2, #95% of values fall within +/- 2 standard deviations
                                                          'f1Test':f1_test,
                                                          'duration':trainDuration}])],
                                                    ignore_index=True)
                                    
                                    print(condition+'|Train F1: %0.4f (+/- %0.4f)' % (f1_train.mean(),f1_train.std()*2)+'|Test F1: %0.4f' % (f1_test)+'|%0.1ds' % trainDuration)
                                else:
                                    print(condition+' in df|Train F1: %0.4f (+/- %0.4f)' % (f1_df.loc[f1_df['condition']==condition,'f1Train_mean'],f1_df.loc[f1_df['condition']==condition,'f1Train_2std'])+'|Test F1: %0.4f' % f1_df.loc[f1_df['condition']==condition,'f1Test']+'|%0.1ds' % f1_df.loc[f1_df['condition']==condition,'duration'])
                                train+=1
                            #Size training loop#######################

                        #End of Word count training loop#######################   

                        #Write to disk
                        if writeF1:
                            f1_df.to_csv(pathFigures+F1filename)
                        
                        #Print duration for iteration and total
                        iterDuration=(time.time()-iterStart)/60
                        totalDuration=(time.time()-totalStart)/60/60                            
                        print('Iteration',loop,'of',loopTotal,'| Duration %0.2fm' % iterDuration+' | %0.2fh since start' % totalDuration)
                        loop+=1

                        try:
                            xc_train
                        except NameError:
                            if printError:
                                print('No training XC to delete')
                        else:
                            del(xc_train)

                        try:
                            xc_test
                        except NameError:
                            if printError:
                                print('No test XC to delete')
                        else:
                            del(xc_test)      

                        try:
                            xr_train
                        except NameError:
                            if printError:
                                print('No training XR to delete')
                        else:
                            del(xr_train)

                        try:
                            xr_test
                        except NameError:
                            if printError:
                                print('No test XR to delete')
                        else:
                            del(xr_test)                                                    
                    #End of Transform loop#######################

                    #Delete root tokens on feature change
                    try:
                        r_train
                    except NameError:
                        if printError:
                            print('No training roots to delete')
                    else:
                        del(r_train)

                    try:
                        r_test
                    except NameError:
                        if printError:
                            print('No test roots to delete')
                    else:
                        del(r_test)
                #End of Feature loop#######################

            #End of Scaling loop#######################
          
        #End of Classifier loop#######################

        #Delete features on data set change
        if len(resample)>1:
            try:
                x_train
            except NameError:
                if printError:
                    print('No training X to delete')
            else:
                del(x_train)

            try:
                y_train
            except NameError:
                if printError:
                    print('No training Y to delete')
            else:
                del(y_train)
                
            try:
                t_train
            except NameError:
                if printError:
                    print('No training tokens to delete')
            else:
                del(t_train)
    #End of Resample data loop#######################

In [66]:
#Define function for LSTM1: 2-layer LSTM with 50% dropout

def createLSTM1():
    """Build and compile the LSTM1 network: two stacked LSTM layers, each followed by 50% dropout.

    The input shape is (1, v_train_stf.shape[2]); v_train_stf is read from the
    notebook namespace at call time, so it must exist before this is called.

    Returns:
        The compiled Sequential model (adam optimiser, binary cross-entropy
        loss, custom f1 metric).
    """
    #No embedding layer: the first LSTM takes its input shape from v_train_stf
    n_features=v_train_stf.shape[2]

    model=Sequential([
        #First LSTM returns the whole sequence so the second LSTM can be stacked on top
        LSTM(128, return_sequences=True,input_shape=(1, n_features)),
        Dropout(0.5),
        #Second LSTM returns only its final output
        LSTM(64),
        Dropout(0.5),
        #Single sigmoid unit for binary classification
        Dense(1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

    return model

In [67]:
#Define function for LSTM2: 2-layer LSTM with 20% dropout

def createLSTM2():
    """Build and compile the LSTM2 network: two stacked LSTM layers, each followed by 20% dropout.

    The input shape is (1, v_train_stf.shape[2]); v_train_stf is read from the
    notebook namespace at call time, so it must exist before this is called.

    Returns:
        The compiled Sequential model (adam optimiser, binary cross-entropy
        loss, custom f1 metric).
    """
    dropout_rate=0.2
    #No embedding layer: the first LSTM takes its input shape from v_train_stf
    lstm_input_shape=(1, v_train_stf.shape[2])

    model=Sequential()
    for layer in (
        LSTM(128, return_sequences=True,input_shape=lstm_input_shape), #full sequence out, feeds the next LSTM
        Dropout(dropout_rate),
        LSTM(64), #final output only
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'), #single sigmoid unit for binary classification
    ):
        model.add(layer)

    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

    return model

In [68]:
#Define function for CNN1a: 4 convolution layers with sigmoid activation (no dropout)

def createCNN1a():
    """Build and compile CNN1a: four 1-D convolution layers with sigmoid activation, no dropout.

    The input shape is (v_train.shape[1], 1); v_train is read from the notebook
    namespace at call time. Note that the other CNN builders in this notebook
    read v_train_sfd instead.

    Returns:
        The compiled Sequential model (adam optimiser, binary cross-entropy
        loss, custom f1 metric).
    """
    #No embedding layer: the first convolution takes its input shape from v_train
    conv_input_shape=(v_train.shape[1], 1) #v_train_sfd

    model=Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='sigmoid',input_shape=conv_input_shape))
    #Remaining three convolutions differ only in their filter count
    for n_filters in (100, 100, 48):
        model.add(Conv1D(filters=n_filters, kernel_size=3, activation='sigmoid'))
    model.add(Flatten())
    #Single sigmoid unit for binary classification
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

    return model

In [69]:
#Define function for CNN1b: 4 convolution layers with ReLU activation (no dropout)

def createCNN1b():
    """Build and compile CNN1b: four 1-D convolution layers with ReLU activation, no dropout.

    The input shape is (v_train_sfd.shape[1], 1); v_train_sfd is read from the
    notebook namespace at call time.

    Returns:
        The compiled Sequential model (adam optimiser, binary cross-entropy
        loss, custom f1 metric).
    """
    #No embedding layer: the first convolution takes its input shape from v_train_sfd
    stack=[Conv1D(filters=64, kernel_size=3, activation='relu',input_shape=(v_train_sfd.shape[1], 1))]
    #Three further ReLU convolutions with 100, 100 and 48 filters
    stack+=[Conv1D(filters=n_filters, kernel_size=3, activation='relu') for n_filters in (100, 100, 48)]
    #Flatten, then a single sigmoid unit for binary classification
    stack+=[Flatten(), Dense(1, activation='sigmoid')]

    model=Sequential(stack)
    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

    return model

In [70]:
#Define function for CNN1c: 4 convolution layers with ReLU activation (20% dropout)

def createCNN1c():
    """Build and compile CNN1c: four 1-D ReLU convolution layers with 20% dropout before the last one.

    The input shape is (v_train_sfd.shape[1], 1); v_train_sfd is read from the
    notebook namespace at call time.

    Returns:
        The compiled Sequential model (adam optimiser, binary cross-entropy
        loss, custom f1 metric).
    """
    #No embedding layer: the first convolution takes its input shape from v_train_sfd
    sequence_length=v_train_sfd.shape[1]

    model=Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu',input_shape=(sequence_length, 1)),
        Conv1D(filters=100, kernel_size=3, activation='relu'),
        Conv1D(filters=100, kernel_size=3, activation='relu'),
        #20% dropout sits between the third and fourth convolution
        Dropout(0.2),
        Conv1D(filters=48, kernel_size=3, activation='relu'),
        Flatten(),
        #Single sigmoid unit for binary classification
        Dense(1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

    return model

In [71]:
#Define function for CNN1d: 4 convolution layers with no activation (no dropout)
def createCNN1d():
    """Build and compile CNN1d: four 1-D convolution layers with no activation argument, no dropout.

    The input shape is (v_train_sfd.shape[1], 1); v_train_sfd is read from the
    notebook namespace at call time.

    Returns:
        The compiled Sequential model (adam optimiser, binary cross-entropy
        loss, custom f1 metric).
    """
    #No embedding layer: the first convolution takes its input shape from v_train_sfd
    first_conv=Conv1D(filters=64, kernel_size=3,input_shape=(v_train_sfd.shape[1], 1))

    model=Sequential()
    model.add(first_conv)
    #Remaining convolutions are created without an activation argument as well
    for filter_count in [100, 100, 48]:
        model.add(Conv1D(filters=filter_count, kernel_size=3))
    model.add(Flatten())
    #Only the output unit has an activation (sigmoid, for binary classification)
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

    return model

In [72]:
#Define function for CNN2: 1 convolution layer with max pooling and LSTM layers

def createCNN2():
    """Build and compile CNN2: one ReLU convolution, max pooling, then an LSTM layer.

    The input shape is (v_train_sfd.shape[1], 1); v_train_sfd is read from the
    notebook namespace at call time.

    Returns:
        The compiled Sequential model (adam optimiser, binary cross-entropy
        loss, custom f1 metric).
    """
    #No embedding layer: the convolution takes its input shape from v_train_sfd
    conv_input_shape=(v_train_sfd.shape[1], 1)

    model=Sequential([
        #Convolution with Rectified Linear Unit (ReLU) activation and 'same' padding
        Conv1D(filters=32, kernel_size=3, padding='same', activation='relu',input_shape=conv_input_shape),
        MaxPooling1D(pool_size=2),
        #LSTM reads the pooled convolution output
        LSTM(100),
        #Single sigmoid unit for binary classification
        Dense(1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

    return model

In [73]:
#Define function for CNN3: 3 convolution layers with max pooling

#ValueError: Negative dimension size caused by subtracting 35 from 1 

def createCNN3():
    """Build and compile CNN3: three ReLU convolution layers, each followed by max pooling.

    The input shape is (v_train_sfd.shape[1], 1); v_train_sfd is read from the
    notebook namespace at call time.

    Returns:
        The compiled Sequential model (adam optimiser, binary cross-entropy
        loss, custom f1 metric).
    """
    #Settings shared by all three convolutions
    conv_settings=dict(filters=128, kernel_size=5, activation='relu',padding='same')

    model=Sequential()

    #No embedding layer: the first convolution takes its input shape from v_train_sfd
    model.add(Conv1D(input_shape=(v_train_sfd.shape[1], 1), **conv_settings))
    model.add(MaxPooling1D(pool_size=3)) #pool_size=5

    #Two more convolution + pooling pairs (earlier experiments used pool_size=5 and pool_size=35)
    for _ in range(2):
        model.add(Conv1D(**conv_settings))
        model.add(MaxPooling1D(pool_size=3))

    model.add(Flatten())
    #Single sigmoid unit for binary classification
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

    return model

In [74]:
#Define own f1 function for Keras
def f1(y_true, y_pred):
    """Batch-wise F1 score, usable as a Keras metric.

    Both inputs are clipped to [0, 1] and rounded before counting, so
    y_pred is effectively thresholded at 0.5. Precision and recall are
    computed over the batch passed in (not over the whole data set) and
    combined as their harmonic mean. K.epsilon() is added to every
    denominator to avoid division by zero.

    Args:
        y_true: tensor of true labels.
        y_pred: tensor of predicted scores.

    Returns:
        2 * precision * recall / (precision + recall + epsilon) for the batch.
    """
    eps = K.epsilon()

    #Counts over the batch
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))      #true positives
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))            #predicted positives
    relevant = K.sum(K.round(K.clip(y_true, 0, 1)))           #possible positives

    #How many selected items are relevant / how many relevant items are selected
    batch_precision = hits / (flagged + eps)
    batch_recall = hits / (relevant + eps)

    return 2*((batch_precision*batch_recall)/(batch_precision+batch_recall+eps))

#5 Add linguistic indices and tokenise

*   Scale word counts and calculate tone
*   Add all readability indices
*   Tokenise for word vectors

##5.1 Scale word counts and calculate tone

*   Use scale and tone as predictors instead of absolute counts

In [75]:
#Scale word counts
#Each lexicon count becomes a share of the document's total word count (<name>_scaled)
lexicon_count_cols=['forward1','forward2','LMneg','LMpos','H6neg','H6pos','H8neg','H8pos',
                    'uncert','causal','causalM','causalM50','perf','strat']
for lexicon_count_col in lexicon_count_cols:
    docText_df[lexicon_count_col+'_scaled']=docText_df[lexicon_count_col]/docText_df['wordCount']

#Calculate net tone
#(positive - negative) / (positive + negative) for each sentiment word list
for tone_prefix in ['LM','H6','H8']:
    docText_df[tone_prefix+'tone']=(
        (docText_df[tone_prefix+'pos']-docText_df[tone_prefix+'neg'])
        /(docText_df[tone_prefix+'pos']+docText_df[tone_prefix+'neg'])
    )

##5.2 Join readability indices

*   Use textstat (`ts_` prefix): Gunning fog (gf), Flesch-Kincaid Reading Ease (f) and Text Standard (ts)

In [76]:
#Merge readability indices to document data frame
#Takes the docRead_df columns from position 5 onwards, re-attaches docID as the key and merges on docID
#NOTE(review): assumes the first five docRead_df columns are descriptors docText_df already holds - confirm
docText_df=docText_df.merge(docRead_df.iloc[:,5:].join(docRead_df['docID']),on='docID')

In [77]:
#Check enhanced document data frame (scaled counts, tone and readability columns added above)
docText_df

Unnamed: 0,docID,docName,year,coID,coName,risk,pageRatio,pages,text,wordCount,forward1,forward2,LMneg,LMpos,H6neg,H6pos,H8neg,H8pos,uncert,causal,causalM,causalM50,perf,strat,forward1_scaled,forward2_scaled,LMneg_scaled,LMpos_scaled,H6neg_scaled,H6pos_scaled,H8neg_scaled,H8pos_scaled,uncert_scaled,causal_scaled,causalM_scaled,causalM50_scaled,perf_scaled,strat_scaled,LMtone,H6tone,H8tone,r_gf,r_f,r_fk,r_dc,r_ari,r_cl,r_lw,r_sm,r_sp,ts_gf,ts_f,ts_fk,ts_dc,ts_ari,ts_cl,ts_lw,ts_sm,ts_ts
0,0,2009-ABSA Group annual-report.pdf,2009,2,Absa Group,0,1,353,Absa Group Limited. Authorised financial serv...,163172,949,1116,3511,2068,1620,1273,1608,1229,4455,2421,1115,767,2285,7766,0.005816,0.006839,0.021517,0.012674,0.009928,0.007802,0.009855,0.007532,0.027302,0.014837,0.006833,0.004701,0.014004,0.047594,-0.258649,-0.119945,-0.133592,20.839232,27.015038,17.958110,13.767369,19.490907,12.286263,25.916595,35.478918,11.098509,24.30,2.83,27.6,3.35,33.0,13.25,26.666667,22.8,28.0
1,1,2010- ABSA Group annual-report.pdf,2010,2,Absa Group,0,0,551,Absa Group Limited Annual report ������������...,93166,88,142,822,297,520,133,519,126,965,469,210,164,335,1834,0.000945,0.001524,0.008823,0.003188,0.005581,0.001428,0.005571,0.001352,0.010358,0.005034,0.002254,0.001760,0.003596,0.019685,-0.469169,-0.592649,-0.609302,105.779594,-154.391804,97.243008,31.386752,118.044060,-5.284238,142.570531,25.131209,45.914175,121.52,-202.14,116.7,15.29,196.2,65.23,65.000000,31.1,65.0
2,2,2009-12-31-Absa-Bank-Annual-Report.pdf,2009,3,Absa Bank Ltd,0,1,268,Absa Bank Limited. Authorised financial servi...,118457,698,803,2693,1266,1108,662,1106,638,3025,1896,792,538,1837,5122,0.005892,0.006779,0.022734,0.010687,0.009354,0.005589,0.009337,0.005386,0.025537,0.016006,0.006686,0.004542,0.015508,0.043239,-0.360445,-0.251977,-0.268349,21.979868,25.429146,18.836096,13.881922,20.607270,12.020478,27.722046,31.995903,11.538132,25.13,9.36,27.2,3.48,34.0,13.36,26.333333,22.9,27.0
3,3,2010-12-31-Absa-Bank-Annual-Report.pdf,2010,3,Absa Bank Ltd,0,0,304,Absa Bank Limited Annual report for the year ...,120593,782,860,2667,1219,1212,629,1207,595,3095,2060,952,671,1936,5240,0.006485,0.007131,0.022116,0.010108,0.010050,0.005216,0.010009,0.004934,0.025665,0.017082,0.007894,0.005564,0.016054,0.043452,-0.372620,-0.316676,-0.339623,22.374741,24.574999,19.115607,13.876534,21.039939,12.154128,28.249444,29.740934,11.653701,24.56,10.68,26.7,3.38,33.1,13.07,37.000000,22.7,27.0
4,4,1-african-bank-holdings-ir-2017.pdf,2017,34,African Bank Ltd (N),0,1,94,Integrated Report 2017\nwww.africanbank.co.za...,36079,340,403,678,806,390,588,387,562,1314,410,352,263,205,2646,0.009424,0.011170,0.018792,0.022340,0.010810,0.016298,0.010726,0.015577,0.036420,0.011364,0.009756,0.007290,0.005682,0.073339,0.086253,0.202454,0.184405,20.962552,22.417591,18.585266,12.384232,20.771345,13.920283,26.355480,26.497887,10.319961,25.14,3.03,27.5,7.62,34.0,14.75,22.333333,22.9,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,341,State Bank of India_AR_2017_English.pdf,2017,28,Bank of India - Johannesburg Branch,0,1,106,"Printed in Japan SBI Holdings, Inc. Ann...",59609,489,536,992,1028,415,680,410,646,1789,1179,589,325,1109,2978,0.008203,0.008992,0.016642,0.017246,0.006962,0.011408,0.006878,0.010837,0.030012,0.019779,0.009881,0.005452,0.018605,0.049959,0.017822,0.242009,0.223485,24.222745,15.441370,21.417951,13.309255,25.046873,14.539908,32.241343,28.139361,11.784310,23.82,5.46,26.6,7.27,33.8,15.45,18.750000,22.9,27.0
257,342,State Bank of India_AR_2018_English.pdf,2018,28,Bank of India - Johannesburg Branch,0,1,260,TRANSFORMATION \nON TRACK ANNUAL REPORT 2018...,126772,580,672,2493,2074,1160,1304,1155,1239,3676,1450,685,443,1111,5721,0.004575,0.005301,0.019665,0.016360,0.009150,0.010286,0.009111,0.009773,0.028997,0.011438,0.005403,0.003494,0.008764,0.045128,-0.091745,0.058442,0.035088,16.095672,39.634343,14.263356,12.988064,16.474915,13.339028,19.666632,20.736967,9.691879,18.54,9.49,23.0,2.78,27.9,15.33,68.000000,19.9,23.0
258,343,State Bank of India_AR_2019_English.pdf,2019,28,Bank of India - Johannesburg Branch,0,1,260,TRANSFORMATION \nON TRACK ANNUAL REPORT 2018...,126772,580,672,2493,2074,1160,1304,1155,1239,3676,1450,685,443,1111,5721,0.004575,0.005301,0.019665,0.016360,0.009150,0.010286,0.009111,0.009773,0.028997,0.011438,0.005403,0.003494,0.008764,0.045128,-0.091745,0.058442,0.035088,16.095672,39.634343,14.263356,12.988064,16.474915,13.339028,19.666632,20.736967,9.691879,18.54,9.49,23.0,2.78,27.9,15.33,68.000000,19.9,23.0
259,357,VBS-2015-Annual-Report.pdf,2015,1,VBS Mutual Bank,1,1,56,People with a purpose. “People with a Purpos...,19318,118,134,396,236,172,116,170,116,510,284,108,74,252,709,0.006108,0.006937,0.020499,0.012217,0.008904,0.006005,0.008800,0.006005,0.026400,0.014701,0.005591,0.003831,0.013045,0.036702,-0.253165,-0.194444,-0.188811,22.335384,24.774939,18.995163,13.780807,21.108055,12.473820,28.080321,16.404323,11.415405,19.95,-76.44,35.3,7.05,26.2,13.19,19.000000,20.6,13.0


##5.3 Delete data frames and drop null documents

*   Delete pageText_df and docRead_df
*   Drop 3 documents with index 36,37,38



In [78]:
#Delete pageText dataframe to avoid memory error with tfidf_vectorizer later
#Run once only: a second run raises NameError because the name no longer exists
del pageText_df

In [79]:
#Delete docRead dataframe to avoid memory error with tfidf_vectorizer later
#Run once only, and only after the readability merge above, which still reads docRead_df
del docRead_df

In [80]:
#Drop documents with nulls
#Labels 36, 37 and 38 refer to the index as it stands BEFORE the reset_index cell below;
#re-running this cell after that reset would drop three different documents
docText_df.drop([36,37,38],inplace=True)

In [81]:
#reset index to help with adding tokens
#drop=True discards the old labels, giving a fresh 0..n-1 index that matches
#the default index of the token Series assigned to the 'text' column later
docText_df.reset_index(drop=True,inplace=True)

##5.4 Read raw and normalised tokens from disk, else tokenise and save



###5.4.1 Raw tokens

In [82]:
%%time
#1.83 s with file
#51.5 s without file

try:
    #Fast path: reuse the tokens saved by an earlier run
    docToken_df=pd.read_csv(pathData+'processed/docToken.csv', index_col=[0])
except Exception:
    #Best-effort cache: any failure to read the saved file (typically it does not exist yet) triggers a rebuild

    #Get document description
    #.copy() makes an independent frame so adding 'text' below does not write to a slice of docText_df (SettingWithCopyWarning)
    docToken_df=docText_df.iloc[:,:8].copy()

    #Get RegEx tokens
    #The Series gets a default 0..n-1 index, so docText_df must have had its index reset before this cell
    docToken_df['text']=pd.Series(reg_tokenize(docText_df.text))

    #Check data frame
    print(docToken_df)

    #Check that there are no nulls
    print(docToken_df[docToken_df['text'].isna()])

    #Save to disk
    docToken_df.to_csv(pathData+'processed/docToken.csv')

CPU times: user 1.43 s, sys: 302 ms, total: 1.73 s
Wall time: 2.87 s


###5.4.2 PorterStemmer

6min 10s

In [83]:
%%time
#2.44 s with file
#6min 10s without file

try:
    #Fast path: reuse the Porter stems saved by an earlier run
    docTokenPS_df=pd.read_csv(pathData+'processed/docTokenPS.csv', index_col=[0])
except Exception:
    #Best-effort cache: any failure to read the saved file (typically it does not exist yet) triggers a rebuild

    #Get document description
    #.copy() makes an independent frame so adding 'text' below does not write to a slice of docText_df (SettingWithCopyWarning)
    docTokenPS_df=docText_df.iloc[:,:8].copy()

    #Get PorterStems
    #Build the stemmer once and reuse it for every token instead of constructing a new one per token
    #NOTE(review): iterating doc assumes each docToken_df['text'] entry is a list of tokens; if docToken_df
    #was loaded from CSV the entries may be strings - confirm
    porter_stemmer=PorterStemmer()
    docTokenPS_df['text']=pd.Series([[porter_stemmer.stem(token) for token in doc] for doc in docToken_df['text']])

    #Check data frame
    print(docTokenPS_df)

    #Check that there are no nulls
    print(docTokenPS_df[docTokenPS_df['text'].isna()])

    #Save to disk
    docTokenPS_df.to_csv(pathData+'processed/docTokenPS.csv')

CPU times: user 1.25 s, sys: 106 ms, total: 1.36 s
Wall time: 2.62 s


###5.4.3 LancasterStemmer

21min 37s: Slower than other Stemmers


In [84]:
%%time
#2.44 s with file
#21min 37s without file

try:
    #Fast path: reuse the Lancaster stems saved by an earlier run
    docTokenLS_df=pd.read_csv(pathData+'processed/docTokenLS.csv', index_col=[0])
except Exception:
    #Best-effort cache: any failure to read the saved file (typically it does not exist yet) triggers a rebuild

    #Get document description
    #.copy() makes an independent frame so adding 'text' below does not write to a slice of docText_df (SettingWithCopyWarning)
    docTokenLS_df=docText_df.iloc[:,:8].copy()

    #Get LancasterStems
    #Build the stemmer once and reuse it for every token instead of constructing a new one per token
    #NOTE(review): iterating doc assumes each docToken_df['text'] entry is a list of tokens; if docToken_df
    #was loaded from CSV the entries may be strings - confirm
    lancaster_stemmer=LancasterStemmer()
    docTokenLS_df['text']=pd.Series([[lancaster_stemmer.stem(token) for token in doc] for doc in docToken_df['text']])

    #Check data frame
    print(docTokenLS_df)

    #Check that there are no nulls
    print(docTokenLS_df[docTokenLS_df['text'].isna()])

    #Save to disk
    docTokenLS_df.to_csv(pathData+'processed/docTokenLS.csv')

CPU times: user 1.14 s, sys: 77.3 ms, total: 1.21 s
Wall time: 2.19 s


###5.4.4 SnowballStemmer

3min 54s

In [85]:
%%time
#2.5 s with file
#3min 54s without file

try:
    #Fast path: reuse the Snowball stems saved by an earlier run
    docTokenSS_df=pd.read_csv(pathData+'processed/docTokenSS.csv', index_col=[0])
except Exception:
    #Best-effort cache: any failure to read the saved file (typically it does not exist yet) triggers a rebuild

    #Get document description
    #.copy() makes an independent frame so adding 'text' below does not write to a slice of docText_df (SettingWithCopyWarning)
    docTokenSS_df=docText_df.iloc[:,:8].copy()

    #Get SnowballStems
    #Build the stemmer once and reuse it for every token instead of constructing a new one per token
    #NOTE(review): iterating doc assumes each docToken_df['text'] entry is a list of tokens; if docToken_df
    #was loaded from CSV the entries may be strings - confirm
    snowball_stemmer=SnowballStemmer("english")
    docTokenSS_df['text']=pd.Series([[snowball_stemmer.stem(token) for token in doc] for doc in docToken_df['text']])

    #Check data frame
    print(docTokenSS_df)

    #Check that there are no nulls
    print(docTokenSS_df[docTokenSS_df['text'].isna()])

    #Save to disk
    docTokenSS_df.to_csv(pathData+'processed/docTokenSS.csv')

CPU times: user 1.23 s, sys: 165 ms, total: 1.4 s
Wall time: 2.36 s


###5.4.5 WordNetLemmatizer

In [86]:
%%time
#2.69 s with file
##6min 10s without file

try:
    #Fast path: reuse the WordNet lemmas saved by an earlier run
    docTokenWL_df=pd.read_csv(pathData+'processed/docTokenWL.csv', index_col=[0])
except Exception:
    #Best-effort cache: any failure to read the saved file (typically it does not exist yet) triggers a rebuild

    #Get document description
    #.copy() makes an independent frame so adding 'text' below does not write to a slice of docText_df (SettingWithCopyWarning)
    docTokenWL_df=docText_df.iloc[:,:8].copy()

    #Get WordNet Lemmas
    #Build the lemmatizer once and reuse it for every token instead of constructing a new one per token
    #NOTE(review): iterating doc assumes each docToken_df['text'] entry is a list of tokens; if docToken_df
    #was loaded from CSV the entries may be strings - confirm
    wordnet_lemmatizer=WordNetLemmatizer()
    docTokenWL_df['text']=pd.Series([[wordnet_lemmatizer.lemmatize(token) for token in doc] for doc in docToken_df['text']])

    #Check data frame
    print(docTokenWL_df)

    #Check that there are no nulls
    print(docTokenWL_df[docTokenWL_df['text'].isna()])

    #Save to disk
    docTokenWL_df.to_csv(pathData+'processed/docTokenWL.csv')

CPU times: user 1.35 s, sys: 176 ms, total: 1.52 s
Wall time: 2.68 s


#6 Create test set

*   Test on 2018 and 2019 and last year of risk reports

Need more companies flagged for risk (only 8 documents)

In [87]:
#Start the test set with the risk-flagged document whose docID is 6
docTest_df=docText_df.loc[docText_df['docID']==6]

In [88]:
#Add the risk-flagged document whose docID is 358
#pd.concat replaces DataFrame.append, which is deprecated in pandas 1.4 and removed in 2.0; row order and index labels are unchanged
docTest_df=pd.concat([docTest_df,docText_df[docText_df['docID']==358]])

In [89]:
#Add every document from 2018
#pd.concat replaces DataFrame.append, which is deprecated in pandas 1.4 and removed in 2.0; row order and index labels are unchanged
docTest_df=pd.concat([docTest_df,docText_df[docText_df['year']==2018]])

In [90]:
#Add every document from 2019
#pd.concat replaces DataFrame.append, which is deprecated in pandas 1.4 and removed in 2.0; row order and index labels are unchanged
docTest_df=pd.concat([docTest_df,docText_df[docText_df['year']==2019]])

In [91]:
#Check the assembled test set (docID 6, docID 358 and all 2018 and 2019 documents)
docTest_df

Unnamed: 0,docID,docName,year,coID,coName,risk,pageRatio,pages,text,wordCount,forward1,forward2,LMneg,LMpos,H6neg,H6pos,H8neg,H8pos,uncert,causal,causalM,causalM50,perf,strat,forward1_scaled,forward2_scaled,LMneg_scaled,LMpos_scaled,H6neg_scaled,H6pos_scaled,H8neg_scaled,H8pos_scaled,uncert_scaled,causal_scaled,causalM_scaled,causalM50_scaled,perf_scaled,strat_scaled,LMtone,H6tone,H8tone,r_gf,r_f,r_fk,r_dc,r_ari,r_cl,r_lw,r_sm,r_sp,ts_gf,ts_f,ts_fk,ts_dc,ts_ari,ts_cl,ts_lw,ts_sm,ts_ts
6,6,1464958705-4144_African-Bank-2014annualreport.pdf,2014,35,African Bank Ltd (O),1,1,59,report 2014 Annual Contents Board of director...,24953,249,270,496,230,173,111,172,107,538,499,226,148,302,939,0.009979,0.01082,0.019877,0.009217,0.006933,0.004448,0.006893,0.004288,0.021561,0.019998,0.009057,0.005931,0.012103,0.037631,-0.366391,-0.21831,-0.232975,23.726228,15.411503,20.238319,13.199735,22.296245,14.116434,29.79638,27.004461,11.140464,23.51,-1.48,27.2,7.34,32.1,14.75,89.0,24.1,24.0
257,358,VBS-2016-Annual-Report.pdf,2016,1,VBS Mutual Bank,1,2,47,A n n u A l \nR e p o R t\n2 0 1 6 People w...,36696,214,243,693,367,307,164,303,161,873,497,177,111,380,1212,0.005832,0.006622,0.018885,0.010001,0.008366,0.004469,0.008257,0.004387,0.02379,0.013544,0.004823,0.003025,0.010355,0.033028,-0.307547,-0.303609,-0.306034,23.873462,25.284331,20.134767,14.630884,21.617547,10.02975,30.776711,18.7741,11.695419,26.9,-2.67,29.7,3.92,33.6,10.69,8.142857,22.5,30.0
13,13,african-bank-integrated-report-2018-final-webs...,2018,34,African Bank Ltd (N),0,1,119,Integrated Report 2018\nAdvancing lives R1.0...,47505,409,499,987,1304,617,838,609,810,2076,678,489,387,337,3588,0.00861,0.010504,0.020777,0.02745,0.012988,0.01764,0.01282,0.017051,0.043701,0.014272,0.010294,0.008147,0.007094,0.075529,0.138368,0.15189,0.141649,23.707975,15.078705,20.89989,12.670502,23.643166,14.236283,30.750212,24.27398,11.136485,25.28,2.01,27.9,7.49,34.8,15.16,74.0,23.5,28.0
20,22,alBaraka 2018 AnnualReport.pdf,2018,19,Albaraka Bank Ltd,0,1,96,INTEGRATED\nANNUAL REPORT Your Partner Bank ...,46792,327,356,1011,788,409,443,407,424,1493,711,355,244,646,2090,0.006988,0.007608,0.021606,0.01684,0.008741,0.009467,0.008698,0.009061,0.031907,0.015195,0.007587,0.005215,0.013806,0.044666,-0.123958,0.039906,0.020457,21.531462,23.974219,18.58404,13.281631,20.278276,12.76032,26.658706,23.647325,10.85803,22.05,1.98,25.9,7.13,29.5,13.65,24.333333,21.5,22.0
34,36,Bank of Baroda Annualreport2017-18.pdf,2018,26,Bank of Baroda,0,1,270,Chairman's Statement 05 MD & CEO’s Statemen...,155021,362,417,1414,1346,582,892,576,850,2066,757,393,270,744,3101,0.002335,0.00269,0.009121,0.008683,0.003754,0.005754,0.003716,0.005483,0.013327,0.004883,0.002535,0.001742,0.004799,0.020004,-0.024638,0.210312,0.192146,35.200776,26.973975,29.978002,22.379777,30.121447,-5.406596,46.01413,19.253489,20.939934,36.1,-1.95,35.6,4.79,51.9,17.66,75.0,19.6,36.0
42,47,Bank of Canara Annual Report 2018 (5324830319)...,2018,30,Bank of Canara,0,1,356,1 dm{f©H$ [anmoQ>©\n2018-2019\nANNUAL REPORT ...,179412,496,564,1470,1015,633,617,626,599,1757,860,319,209,533,2406,0.002765,0.003144,0.008193,0.005657,0.003528,0.003439,0.003489,0.003339,0.009793,0.004793,0.001778,0.001165,0.002971,0.01341,-0.183099,-0.0128,-0.022041,21.905451,53.505065,17.560037,18.374934,27.920733,14.412825,29.995608,14.45898,14.692188,35.6,-0.83,35.2,4.73,50.9,17.37,8.5,18.5,36.0
55,62,"Bidvest+Bank+Annual+Report,+year+ended+30+June...",2018,20,Bidvest Bank Ltd,0,1,147,ABRIDGED \nANNUAL REPORT \n2017/2018 Who we...,37423,171,203,733,799,446,534,439,517,1363,460,298,227,348,2218,0.004569,0.005424,0.019587,0.021351,0.011918,0.014269,0.011731,0.013815,0.036421,0.012292,0.007963,0.006066,0.009299,0.059268,0.043081,0.089796,0.08159,19.214709,27.691326,17.425142,13.691936,19.936971,13.945948,24.862267,19.979281,10.805937,19.2,17.64,21.9,6.79,27.4,15.1,82.0,20.5,22.0
56,63,2018_bnp_paribas_integrated_report.pdf,2018,12,Bnp Paribas South Africa Branch,0,1,56,The bank for \na changing \nworld 2018 INT...,18258,108,140,172,587,110,426,107,414,680,233,175,124,71,1556,0.005915,0.007668,0.009421,0.03215,0.006025,0.023332,0.00586,0.022675,0.037244,0.012762,0.009585,0.006792,0.003889,0.085223,0.546772,0.589552,0.589251,19.49005,24.608836,17.267496,12.990149,18.917906,14.142628,23.424669,17.916177,9.908065,22.48,2.38,25.7,7.61,31.2,15.91,18.0,22.1,26.0
66,73,capitec_bank_integrated_annual_report_2018.pdf,2018,36,Capitec Bank,0,2,127,Integrated Annual Report 2018 Why we are in ...,129279,759,905,2770,2427,1357,1337,1344,1311,4259,1815,1033,804,1318,6384,0.005871,0.007,0.021427,0.018773,0.010497,0.010342,0.010396,0.010141,0.032944,0.014039,0.00799,0.006219,0.010195,0.049382,-0.066,-0.007424,-0.012429,20.45278,30.332499,17.367941,13.959615,19.048979,12.055481,25.122903,22.190621,11.173025,21.04,11.05,24.4,2.93,28.5,12.84,10.166667,20.7,13.0
85,92,China Construction Bank Corporation Annual Rep...,2018,31,China Construction Bank Corporation - Jhb Branch,0,1,299,China Construction Bank Corporation Annual R...,130532,763,895,2788,2062,1327,1314,1320,1219,3804,1910,722,427,1876,6149,0.005845,0.006857,0.021359,0.015797,0.010166,0.010066,0.010112,0.009339,0.029142,0.014632,0.005531,0.003271,0.014372,0.047107,-0.149691,-0.004922,-0.039779,24.532807,18.735809,20.682198,13.501333,25.078692,15.284003,31.59064,19.421281,11.939988,20.53,4.01,25.1,2.93,30.9,16.2,13.8,22.0,22.0


In [92]:
#Get other risk documents
#Shows rows flagged risk==1, leaving out docIDs 6 and 358 (hard-coded).
#NOTE(review): presumably 6 and 358 are the risk documents held out in docTest_df - confirm
docText_df[(docText_df['risk']==1) & (~docText_df['docID'].isin([6,358]))]

Unnamed: 0,docID,docName,year,coID,coName,risk,pageRatio,pages,text,wordCount,forward1,forward2,LMneg,LMpos,H6neg,H6pos,H8neg,H8pos,uncert,causal,causalM,causalM50,perf,strat,forward1_scaled,forward2_scaled,LMneg_scaled,LMpos_scaled,H6neg_scaled,H6pos_scaled,H8neg_scaled,H8pos_scaled,uncert_scaled,causal_scaled,causalM_scaled,causalM50_scaled,perf_scaled,strat_scaled,LMtone,H6tone,H8tone,r_gf,r_f,r_fk,r_dc,r_ari,r_cl,r_lw,r_sm,r_sp,ts_gf,ts_f,ts_fk,ts_dc,ts_ari,ts_cl,ts_lw,ts_sm,ts_ts
5,5,1464958532-AfricanBank_IR_FY20131.pdf,2013,35,African Bank Ltd (O),1,1,332,Integrated Report \nI for the year ended 30...,111518,833,952,2551,1782,1118,1195,1105,1171,3450,2039,997,700,1684,5381,0.00747,0.008537,0.022875,0.015979,0.010025,0.010716,0.009909,0.010501,0.030937,0.018284,0.00894,0.006277,0.015101,0.048252,-0.177475,0.03329,0.028998,22.589726,22.264731,19.046237,13.71256,20.880175,12.946425,27.700429,29.720487,11.308212,20.85,20.52,22.9,3.04,28.8,13.77,33.5,21.5,23.0
8,8,1517467421-ABILIAR2012completelowres.pdf,2012,35,African Bank Ltd (O),1,1,360,>ABIL in perspective African Bank Investments...,148280,989,1138,2548,2409,1166,1567,1154,1528,4475,2310,1224,921,1591,7692,0.00667,0.007675,0.017184,0.016246,0.007864,0.010568,0.007783,0.010305,0.030179,0.015579,0.008255,0.006211,0.01073,0.051875,-0.028041,0.146725,0.139448,21.979161,21.876332,18.729265,13.3374,20.335909,13.204049,26.612999,20.203746,10.948028,22.12,0.46,26.4,3.23,30.4,13.88,18.75,22.0,22.0
9,9,2009 Full report - PDF 7MB - African Bank.pdf,2009,35,African Bank Ltd (O),1,1,208,Annual Report 2009 Our vision is to enable \...,78638,431,493,1514,1188,669,681,660,667,2298,1214,573,414,1288,3057,0.005481,0.006269,0.019253,0.015107,0.008507,0.00866,0.008393,0.008482,0.029223,0.015438,0.007287,0.005265,0.016379,0.038874,-0.120651,0.008889,0.005275,22.626269,23.106608,19.648376,14.259305,21.809318,12.28197,29.32429,19.620378,11.996173,24.43,3.23,27.4,3.57,32.5,12.96,21.0,22.1,13.0
10,10,2011 Full integrated annual report - African B...,2011,35,African Bank Ltd (O),1,1,130,for the year ended 30 September 2011 Giving ...,45937,348,396,567,895,334,678,328,663,1469,501,289,207,342,2515,0.007576,0.008621,0.012343,0.019483,0.007271,0.014759,0.00714,0.014433,0.031979,0.010906,0.006291,0.004506,0.007445,0.054749,0.22435,0.339921,0.338042,22.199623,20.395354,19.586503,13.549493,21.419155,12.91168,28.339506,16.567215,11.159838,24.86,3.23,27.4,7.54,33.1,13.71,23.0,22.4,23.0
11,11,ABIL_ar_sep10.pdf,2010,35,African Bank Ltd (O),1,1,268,Annual Report 2 0 1 0 A focus on our people ...,92136,606,689,1703,1454,793,878,788,852,2744,1372,714,522,1384,4030,0.006577,0.007478,0.018484,0.015781,0.008607,0.009529,0.008553,0.009247,0.029782,0.014891,0.007749,0.005666,0.015021,0.04374,-0.078872,0.050868,0.039024,22.192729,23.98455,18.986276,13.781339,20.983495,12.62012,27.884223,20.299047,11.447347,27.65,-21.72,32.9,4.0,36.8,13.72,72.0,23.7,33.0
256,357,VBS-2015-Annual-Report.pdf,2015,1,VBS Mutual Bank,1,1,56,People with a purpose. “People with a Purpos...,19318,118,134,396,236,172,116,170,116,510,284,108,74,252,709,0.006108,0.006937,0.020499,0.012217,0.008904,0.006005,0.0088,0.006005,0.0264,0.014701,0.005591,0.003831,0.013045,0.036702,-0.253165,-0.194444,-0.188811,22.335384,24.774939,18.995163,13.780807,21.108055,12.47382,28.080321,16.404323,11.415405,19.95,-76.44,35.3,7.05,26.2,13.19,19.0,20.6,13.0


In [93]:
#Get other non-risk documents
#Update report for doc 36 removed

#Shapes noted from earlier runs (NOTE(review): presumably recorded when the frame had 41 columns; the current frame has more):
#(205, 41)
#(204, 41)

#Shape of the non-risk rows whose docID does not appear in docTest_df
docText_df[(docText_df['risk']==0) & (~docText_df['docID'].isin(docTest_df['docID']))].shape

(204, 59)

#7 Loop through classifiers

##7.1 Code control

###7.1.1 Resampling

In [140]:
#Resampling ratios to run. The main loop hands the first two characters of each
#entry (e.g. '50' for '50:50') to getDocTrain as the resampling code.
#Alternatives used for other experiments: '20:80' and '00'
#NOTE(review): '00' presumably means no resampling - confirm against getDocTrain
resample = ['50:50']

###7.1.2 Classifiers

In [181]:
#Prepare classifiers
#Entries appended to clfs are tuples (abbreviation, description, classifier);
#the main loop unpacks them as clfAbb, clfDesc, clf and uses clfAbb in the condition string.

#Notes for the SVC entries:
#Random state is ignored when probability=False
#Probability=True is slower but needed for application of model

clfs = []

In [182]:
#Logistic regression with the liblinear solver; every other hyperparameter is left at its default
clfs.append(('LR', 'LogisticRegression(liblinear)', LogisticRegression(solver='liblinear')))

In [156]:
#NOTE: class_weight is NOT passed to the SVC below, so no class re-weighting is applied by this entry.
#For reference: class_weight='balanced' automatically adjust weights inversely proportional to class frequencies = n_samples / (n_classes * np.bincount(y))
#random_state=0 is ignored by SVC while probability=False
clfs.append(('SVMa', 'Support Vector Machine(auto)', SVC(gamma='auto', random_state=0, probability=False))) #Consider StandardScaler() in pipeline

In [183]:
#SVC with gamma='scale'; random_state=0 is ignored by SVC while probability=False
clfs.append(('SVMs', 'Support Vector Machine(scale)', SVC(gamma='scale', random_state=0, probability=False)))

In [184]:
#Modern classifiers
#Three hidden layers of 8 units each, ReLU activation, adam solver, at most 500 iterations.
#random_state=0 pins the weight initialisation and adam batch sampling so that repeated
#runs of the same condition give the same F1 scores (same seed as the SVC entries).
clfs.append(('MLP8', 'Multi-Layer Perceptron', MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500, random_state=0)))

In [185]:
#createCNN1a is registered by reference (not called here), unlike the scikit-learn entries which are instances.
#NOTE(review): presumably a model-builder function defined earlier in the notebook - confirm
clfs.append(('CNN1a', '4 convolution layers with sigmoid activation (no dropout)', createCNN1a))

In [186]:
#createCNN1b is registered by reference (not called here); presumably a model-builder defined elsewhere - confirm
clfs.append(('CNN1b', '4 convolution layers with ReLU activation (no dropout)', createCNN1b))

In [187]:
#createCNN1c is registered by reference (not called here); presumably a model-builder defined elsewhere - confirm
clfs.append(('CNN1c', '4 convolution layers with ReLU activation (20% dropout)', createCNN1c))

In [188]:
#createCNN1d is registered by reference (not called here); presumably a model-builder defined elsewhere - confirm
clfs.append(('CNN1d', '4 convolution layers with no activation (no dropout)', createCNN1d))

In [189]:
#createCNN2 is registered by reference (not called here); presumably a model-builder defined elsewhere - confirm
clfs.append(('CNN2', '1 convolution layer with max pooling and LSTM layers', createCNN2))

In [190]:
#Does not run on 20 features
#(skip this cell when the transforms only produce 20 features)
#createCNN3 is registered by reference (not called here); presumably a model-builder defined elsewhere - confirm
clfs.append(('CNN3', '3 convolution layers with max pooling', createCNN3))

In [191]:
#createLSTM1 is registered by reference (not called here); presumably a model-builder defined elsewhere - confirm
clfs.append(('LSTM1', '2-layer LSTM with 50% dropout', createLSTM1))

In [192]:
#createLSTM2 is registered by reference (not called here); presumably a model-builder defined elsewhere - confirm
clfs.append(('LSTM2', '2-layer LSTM with 20% dropout', createLSTM2))

In [None]:
#Was not in proposal or method
#clfs.append(('RF20', 'Random Forest', RandomForestClassifier(max_depth=20, random_state=0,n_jobs=threads)))
#clfs.append(('ET20', 'Extra Trees', ExtraTreesClassifier(max_depth=20, random_state=0,n_jobs=threads)))

###7.1.3 Features

In [97]:
#Prepare features
#Entries appended to features are (code, description) tuples; the main loop unpacks them as f, fDesc
features = []

In [98]:
#Base feature set: document text only.
#The code string (first element) is written into the condition string and is searched
#for two-character sub-codes (e.g. 'c0') by the main loop to decide which extra columns to join.
features.append(('x','text'))
#features.append(('ps','Porter Stemmer'))
#features.append(('ls','Lancaster Stemmer'))
#features.append(('ss','Snowball Stemmer'))
#features.append(('wl','WordNet Lemmatizer'))

#features.append(('x+cs','text and scaled wordcounts'))
#features.append(('ps+cs','Porter Stemmer and scaled wordcounts'))
#features.append(('ls+cs','Lancaster Stemmer and scaled wordcounts'))
#features.append(('ss+cs','Snowball Stemmer and scaled wordcounts'))
#features.append(('wl+cs','WordNet Lemmatizer and scaled wordcounts'))

#features.append(('x+ca','text and all wordcounts'))

#features.append(('x+cs+rs','text and scaled wordcounts and readability'))
#features.append(('x+ca+ra','text and all wordcounts and all readability'))

####7.1.3.1 Individual linguistic features

In [None]:
#Append individual word counts
#Each pair below is (two-character feature code, column name). One feature set
#'x+<code>' described as 'text and <column>' is appended per pair, in this order.
features.extend(
    ('x+'+code, 'text and '+column)
    for code, column in (
        ('c0', 'wordCount'),
        ('c1', 'forward1'),
        ('c2', 'forward2'),
        ('c3', 'LMneg'),
        ('c4', 'LMpos'),
        ('c5', 'H6neg'),
        ('c6', 'H6pos'),
        ('c7', 'H8neg'),
        ('c8', 'H8pos'),
        ('c9', 'uncert'),
        ('cA', 'causal'),
        ('cB', 'causalM'),
        ('cC', 'causalM50'),
        ('cD', 'perf'),
        ('cE', 'strat'),
        ('cF', 'forward1_scaled'),
        ('cG', 'forward2_scaled'),
        ('cH', 'LMneg_scaled'),
        ('cI', 'LMpos_scaled'),
        ('cJ', 'H6neg_scaled'),
        ('cK', 'H6pos_scaled'),
        ('cL', 'H8neg_scaled'),
        ('cM', 'H8pos_scaled'),
        ('cN', 'uncert_scaled'),
        ('cO', 'causal_scaled'),
        ('cP', 'causalM_scaled'),
        ('cQ', 'causalM50_scaled'),
        ('cR', 'perf_scaled'),
        ('cS', 'strat_scaled'),
        ('cT', 'LMtone'),
        ('cU', 'H6tone'),
        ('cV', 'H8tone'),
    )
)

In [None]:
#Append individual readability indices
#Each pair below is (two-character feature code, column name). One feature set
#'x+<code>' described as 'text and <column>' is appended per pair, in this order.
#The code 'rA' is not used: the sequence goes from r9 straight to rB.
features.extend(
    ('x+'+code, 'text and '+column)
    for code, column in (
        ('r1', 'r_gf'),
        ('r2', 'r_f'),
        ('r3', 'r_fk'),
        ('r4', 'r_dc'),
        ('r5', 'r_ari'),
        ('r6', 'r_cl'),
        ('r7', 'r_lw'),
        ('r8', 'r_sm'),
        ('r9', 'r_sp'),
        ('rB', 'ts_gf'),
        ('rC', 'ts_f'),
        ('rD', 'ts_fk'),
        ('rE', 'ts_dc'),
        ('rF', 'ts_ari'),
        ('rG', 'ts_cl'),
        ('rH', 'ts_lw'),
        ('rI', 'ts_sm'),
        ('rJ', 'ts_ts'),
    )
)

####7.1.3.2 Subsets of linguistic features

In [None]:
#Append features selected by RFECV on 40TFIDF
#Codes: r8=r_sm, rC=ts_f, rH=ts_lw
features.append(('x+r8+rC+rH','text, r_sm, ts_f, ts_lw'))

In [None]:
#Append features selected by RFECV on 40TFIDF plus next 3
#Adds c0=wordCount, cR=perf_scaled and rF=ts_ari to the x+r8+rC+rH set
features.append(('x+r8+rC+rH+c0+cR+rF','text, r_sm, ts_f, ts_lw, wordCount, perf_scaled, ts_ari'))

In [None]:
#Append features selected by RFECV on 40TFIDF plus next 4
#features.append(('x+r8+rC+rH+c0+cR+r2+rF','text, r_sm, ts_f, ts_lw, wordCount, perf_scaled, r_f, ts_ari'))

In [None]:
#Append features selected by ANOVA18
#Codes: cJ=H6neg_scaled, rG=ts_cl, cV=H8tone, c0=wordCount
features.append(('x+cJ+rG+cV+c0','text, H6neg_scaled, ts_cl, H8tone, wordCount'))

In [None]:
#Append unique features with positive training impact on LR with 20 features
#Codes: cG=forward2_scaled, cI=LMpos_scaled, cL=H8neg_scaled, r8=r_sm
features.append(('x+cG+cI+cL+r8','text, forward2_scaled, LMpos_scaled, H8neg_scaled, r_sm'))

In [None]:
#Append features with positive training impact on LR with 20 features
#Adds rC=ts_f and rJ=ts_ts to the x+cG+cI+cL+r8 set
features.append(('x+cG+cI+cL+r8+rC+rJ','text, forward2_scaled, LMpos_scaled, H8neg_scaled, r_sm, ts_f, ts_ts'))

In [None]:
#Append features with positive impact on LR with 20 features
#Adds rC=ts_f, rH=ts_lw and rJ=ts_ts to the x+cG+cI+cL+r8 set
features.append(('x+cG+cI+cL+r8+rC+rH+rJ','text, forward2_scaled, LMpos_scaled, H8neg_scaled, r_sm, ts_f, ts_lw, ts_ts'))

###7.1.4 Feature scaling

In [99]:
#Scaling options to loop over. Entries are (code, description) tuples; the code (scale[0])
#is written into the condition string, and every entry multiplies loopTotal in the main loop.
scales = []

In [100]:
#Normalisation
#Recorded in the condition string as 'norm'
scales.append(('norm','normalise [0,1]'))

In [None]:
#Standardisation
#Recorded in the condition string as 'std'
scales.append(('std','standardise'))

In [None]:
#No scaling
#Recorded in the condition string as 'raw'
scales.append(('raw','no scaling'))

###7.1.5 Feature transformation

####7.1.5.1 Bag-of-words transforms

In [176]:
#Transforms to loop over. Entries are 8-field tuples unpacked by the main loop as
#(transAbb, transDesc, nStart, nEnd, nStep, sizeStart, sizeEnd, sizeStep)
transforms = []

In [172]:
#TFIDF
#Tuple layout, as unpacked by the main loop:
#  (transAbb, transDesc, nStart, nEnd, nStep, sizeStart, sizeEnd, sizeStep)
#n runs over range(nStart, nEnd, nStep) and s over range(sizeStart, sizeEnd, sizeStep),
#so both end values are exclusive (e.g. 60,61,20 yields the single size 60).
#For abbreviations starting with 'tf' the condition string labels n as 'nTo:' and s as 'FeatureMax:'.

#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,20,101,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,4,1,20,101,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,7,1,20,101,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,7,1,20,21,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,40,41,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',2,7,1,40,41,20))
transforms.append(('tfidf','Term Frequency Inverse Document Frequency',2,7,1,60,61,20))

#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,40,101,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,60,61,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,40,41,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,30,31,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,20,41,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,20,21,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,10,11,20))
#transforms.append(('tfidf','Term Frequency Inverse Document Frequency',1,2,1,10,41,10))

In [None]:
#TF

#transforms.append(('tf','Term Frequency',1,2,1,40,101,20))
#transforms.append(('tf','Term Frequency',1,2,1,40,41,20))
#transforms.append(('tf','Term Frequency',1,4,1,20,101,20))
#transforms.append(('tf','Term Frequency',1,7,1,20,101,20))
#transforms.append(('tf','Term Frequency',1,7,1,20,21,20))

####7.1.5.2 Word embedding

In [193]:
#NOTE: this rebinds transforms to an empty list, discarding every transform appended before this cell
#(including the bag-of-words entries). Skip this cell to run bag-of-words and word-embedding transforms together.
transforms = []

In [178]:
#CBOW
#Tuple layout, as unpacked by the main loop:
#  (transAbb, transDesc, nStart, nEnd, nStep, sizeStart, sizeEnd, sizeStep)
#For abbreviations not starting with 'tf' the condition string labels n as 'Min_word_count:'
#and s as 'Embedding_size:'. Both ranges exclude their end value.

transforms.append(('wv_cbow_mean','Word Vectors using Continuous Bag of Words and mean function',4,9,1,200,201,100))
#transforms.append(('wv_cbow_mean','Word Vectors using Continuous Bag of Words and mean function',4,9,1,300,501,100))
#transforms.append(('wv_cbow_mean','Word Vectors using Continuous Bag of Words and mean function',4,9,1,100,501,100))
#transforms.append(('wv_cbow_mean','Word Vectors using Continuous Bag of Words and mean function',8,9,1,300,301,100))
#transforms.append(('wv_cbow_mean','Word Vectors using Continuous Bag of Words and mean function',4,5,1,300,301,100))

#transforms.append(('wv_cbow_sum','Word Vectors using Continuous Bag of Words and sum function',4,9,1,300,501,100))

#transforms.append(('wv_cbow_max','Word Vectors using Continuous Bag of Words and max function',4,9,1,300,501,100))
#transforms.append(('wv_cbow_max','Word Vectors using Continuous Bag of Words and max function',4,9,1,100,501,100))
#transforms.append(('wv_cbow_max','Word Vectors using Continuous Bag of Words and max function',8,9,1,300,301,100))

#transforms.append(('wv_cbow_power2','Word Vectors using Continuous Bag of Words and power of 2 function',4,5,1,500,501,100))
#transforms.append(('wv_cbow_power2','Word Vectors using Continuous Bag of Words and power of 2 function',4,9,1,300,501,100))
#transforms.append(('wv_cbow_power2','Word Vectors using Continuous Bag of Words and power of 2 function',5,6,1,300,301,100))
#transforms.append(('wv_cbow_power2','Word Vectors using Continuous Bag of Words and power of 2 function',4,9,1,100,501,100))
#transforms.append(('wv_cbow_power2','Word Vectors using Continuous Bag of Words and power of 2 function',8,9,1,300,301,100))

#transforms.append(('wv_cbow_power-1','Word Vectors using Continuous Bag of Words and power of -1 function',4,9,1,300,501,100))

#transforms.append(('wv_cbow_power-2','Word Vectors using Continuous Bag of Words and power of -2 function',4,9,1,300,501,100))

In [194]:
#SG
#Same 8-field tuple layout as the other transforms:
#  (transAbb, transDesc, nStart, nEnd, nStep, sizeStart, sizeEnd, sizeStep)
#NOTE(review): the active entry sweeps embedding sizes 200..500 (200,501,100) whereas the active
#CBOW entry only uses size 200 (200,201,100) - confirm the difference is intended.

transforms.append(('wv_sg_mean','Word Vectors using Skip Gram and mean function',4,9,1,200,501,100))
#transforms.append(('wv_sg_mean','Word Vectors using Skip Gram and mean function',4,9,1,100,501,100))
#transforms.append(('wv_sg_mean','Word Vectors using Skip Gram and mean function',8,9,1,300,301,100))

#transforms.append(('wv_sg_sum','Word Vectors using Skip Gram and sum function',4,9,1,300,501,100))

#transforms.append(('wv_sg_max','Word Vectors using Skip Gram and max function',4,9,1,300,501,100))
#transforms.append(('wv_sg_max','Word Vectors using Skip Gram and max function',8,9,1,300,301,100))

#transforms.append(('wv_sg_power2','Word Vectors using Skip Gram and power of 2 function',4,9,1,300,501,100))
#transforms.append(('wv_sg_power2','Word Vectors using Skip Gram and power of 2 function',4,9,1,100,501,100))
#transforms.append(('wv_sg_power2','Word Vectors using Skip Gram and power of 2 function',8,9,1,300,301,100))

#transforms.append(('wv_sg_power-1','Word Vectors using Skip Gram and power of -1 function',4,9,1,300,501,100))

#transforms.append(('wv_sg_power-2','Word Vectors using Skip Gram and power of -2 function',4,9,1,300,501,100))

##7.2 Initialise

In [195]:
#Shuffle test rows
#random_state pins the shuffle so the test row order is identical after every kernel restart;
#the index is rebuilt so rows are numbered 0..n-1 in their shuffled order.
x_test=docTest_df.sample(frac=1, random_state=0).reset_index(drop=True)
#Labels are taken from the shuffled frame so they stay aligned with x_test
y_test=x_test['risk']

In [196]:
#Load the F1 results recorded by earlier runs (the main loop only trains conditions
#that are not yet present in f1_df). When the CSV does not exist, report it and
#start from an empty frame carrying the expected result columns.
try:
    f1_df = pd.read_csv(pathFigures + F1filename, index_col=[0])
except FileNotFoundError:
    print(F1filename+' is missing from folder '+pathFigures)
    f1_df = pd.DataFrame(
        columns=(
            ['condition']
            + ['data', 'model', 'feature_set', 'scale', 'feature', 'h1_desc', 'h1', 'h2_desc', 'h2']
            + ['f1Train' + str(fold) for fold in range(5)]
            + ['f1Train_mean', 'f1Train_2std', 'f1Test', 'duration']
        )
    )

In [197]:
f1_df.shape

(1185, 19)

In [None]:
f1_df

Unnamed: 0,condition,data,model,feature_set,scale,feature,h1_desc,h1,h2_desc,h2,f1Train0,f1Train1,f1Train2,f1Train3,f1Train4,f1Train_mean,f1Train_2std,f1Test,duration
0,"50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:10",50:50,MLP8,x,norm,tfidf,nTo:,1,FeatureMax:,10,0.964706,0.857143,0.977273,1.0,0.969072,0.977248,0.057273,0.5,15.283632
1,"50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:20",50:50,MLP8,x,norm,tfidf,nTo:,1,FeatureMax:,20,0.987952,0.986301,0.988506,0.986667,1.0,0.992572,0.017196,0.5,10.996095
2,"50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:30",50:50,MLP8,x,norm,tfidf,nTo:,1,FeatureMax:,30,0.876712,0.986301,1.0,0.986667,0.989474,0.987998,0.048094,0.8,8.585283
3,"50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:40",50:50,MLP8,x,norm,tfidf,nTo:,1,FeatureMax:,40,1.0,0.986301,1.0,0.986667,1.0,0.994472,0.01562,1.0,8.009985
4,"50:50,MLP8,x+r8+rC+rH,norm,tfidf,nTo:1,Feature...",50:50,MLP8,x+r8+rC+rH,norm,tfidf,nTo:,1,FeatureMax:,40,1.0,1.0,1.0,0.985915,0.987654,0.994639,0.012225,0.666667,9.82345
5,"50:50,MLP8,x+r8+rC+rH+c0+cR+r2+rF,norm,tfidf,n...",50:50,MLP8,x+r8+rC+rH+c0+cR+r2+rF,norm,tfidf,nTo:,1,FeatureMax:,40,1.0,1.0,1.0,0.985915,0.987654,0.995124,0.012016,1.0,7.856045
6,"50:50,MLP8,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,Feat...",50:50,MLP8,x+cJ+rG+cV+c0,norm,tfidf,nTo:,1,FeatureMax:,40,1.0,1.0,1.0,0.985915,0.987654,0.994183,0.014292,0.666667,7.767228
7,"50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:40",50:50,LR,x,norm,tfidf,nTo:,1,FeatureMax:,40,1.0,1.0,1.0,0.978261,1.0,0.995054,0.01727,1.0,1.400445
8,"50:50,SVMa,x,norm,tfidf,nTo:1,FeatureMax:40",50:50,SVMa,x,norm,tfidf,nTo:,1,FeatureMax:,40,1.0,1.0,1.0,0.978261,1.0,0.995054,0.01727,1.0,0.256699
9,"50:50,SVMs,x,norm,tfidf,nTo:1,FeatureMax:40",50:50,SVMs,x,norm,tfidf,nTo:,1,FeatureMax:,40,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.666667,0.211314


In [None]:
f1_df.tail(5)

Unnamed: 0,condition,data,model,feature_set,scale,feature,h1_desc,h1,h2_desc,h2,f1Train0,f1Train1,f1Train2,f1Train3,f1Train4,f1Train_mean,f1Train_2std,f1Test,duration
710,"50:50,CNN3,x,raw,tfidf,nTo:1,FeatureMax:100",50:50,CNN3,x,raw,tfidf,nTo:,1,FeatureMax:,100,0.84375,1.0,0.976744,0.976744,0.793651,0.961556,0.120872,0.8,50.092592
711,"50:50,CNN3,x,std,tfidf,nTo:1,FeatureMax:40",50:50,CNN3,x,std,tfidf,nTo:,1,FeatureMax:,40,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.666667,37.129726
712,"50:50,CNN3,x,std,tfidf,nTo:1,FeatureMax:60",50:50,CNN3,x,std,tfidf,nTo:,1,FeatureMax:,60,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,40.650623
713,"50:50,CNN3,x,std,tfidf,nTo:1,FeatureMax:80",50:50,CNN3,x,std,tfidf,nTo:,1,FeatureMax:,80,1.0,1.0,1.0,1.0,1.0,0.999365,0.006221,0.0,52.232672
714,"50:50,CNN3,x,std,tfidf,nTo:1,FeatureMax:100",50:50,CNN3,x,std,tfidf,nTo:,1,FeatureMax:,100,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,50.167661


In [None]:
#List the distinct models that already have a recorded result with h2 == 20
#(h2 holds the size value of the condition, e.g. the number after 'FeatureMax:')
f1_df.loc[f1_df['h2']==20,'model'].unique()

array(['MLP8', 'LR', 'SVMa', 'SVMs', 'LSTM1', 'LSTM2', 'CNN1a', 'CNN1b',
       'CNN1c', 'CNN1d', 'CNN2'], dtype=object)

##7.3 Results

###7.3.0 Debug

In [None]:
#gets all .vec files (don't need glob)
import os  

In [None]:
#Folder (under pathData) that is walked recursively for .vec files in the next cells
pathWV=pathData+'interim/'

In [None]:
#Collect every file below pathWV whose name ends in ".vec".
#Each entry is [file name, containing folder, full path].
fileList = [
    [vecName, folder, os.path.join(folder, vecName)]
    for folder, _subFolders, folderFiles in os.walk(pathWV)
    for vecName in folderFiles
    if vecName.endswith(".vec")
]

In [None]:
len(fileList)

29

In [None]:
#Rename each collected file to the same path without its last 4 characters (the ".vec" suffix).
#At most 41 files are renamed per run: the cap is 41 rather than 40 because the previous
#counter check (fileno>40) only triggered after the 41st rename.
#fileno ends up holding the number of files renamed.
fileno=0
for name, root, path in fileList[:41]:
    os.rename(path, path[:-4])
    fileno+=1

In [None]:
#Debug run: switch on the diagnostic messages that the main loop prints only when printError is set
printError=True

In [None]:
#Begin by clearing f1_df under 7.3.1

#50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.98 (+/- 0.02)|Test F1: 0.57|7s
#50:50,LR,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9975 (+/- 0.0101)|Test F1: 0.6667|1s

#def funcLoop(docToken_df,docTokenPS_df,docTokenLS_df,docTokenSS_df,docTokenWL_df,resample,clfs,features,transforms,f1_df,x_test,y_test):
totalStart = time.time()
loopTotal=len(resample)*len(clfs)*len(features)*len(transforms)*len(scales)
loop=1

#Display code control parameters
print('writeCSV',writeCSV)
print('writeF1',writeF1)

#Delete root tokens on feature change
try:
    r_train
except NameError:
    if printError:
        print('No training roots to delete')
else:
    del(r_train)

try:
    r_test
except NameError:
    if printError:
        print('No test roots to delete')
else:
    del(r_test)

#Resample data loop####################### 
for r in resample:
    r_code=r[0:2]

    #Classifier loop####################### 
    for clfAbb, clfDesc, clf in clfs:

        #Scaling loop####################### 
        for scale in scales:

            #Feature loop####################### 
            for f, fDesc in features:

                #Transform loop#######################
                for transAbb,transDesc,nStart,nEnd,nStep,sizeStart,sizeEnd,sizeStep in transforms:
                    
                    iterStart = time.time()
                    #Distinguish BOW from WV
                    if transAbb[0:2]=='tf':
                        nDesc='nTo:'
                        sizeDesc='FeatureMax:'
                    else:
                        nDesc='Min_word_count:'
                        sizeDesc='Embedding_size:'

                    trainTotal=round(1+(nEnd-1-nStart)/nStep)*round(1+(sizeEnd-1-sizeStart)/sizeStep)
                    train=1
                    #Word count training loop#######################
                    for n in range(nStart,nEnd,nStep):
                        
                        #Size training loop#######################
                        for s in range(sizeStart,sizeEnd,sizeStep):
                            print('Iteration',loop,'of',loopTotal,'| Training',train,'of',trainTotal,'|',datetime.now(tz=timezone('Africa/Johannesburg')).strftime("%d/%m/%Y %H:%M:%S"))
                            trainStart = time.time()

                            condition=r+','+clfAbb+','+f+','+scale[0]+','+transAbb+','+nDesc+str(n)+','+sizeDesc+str(s)
                            if len(f1_df[f1_df['condition']==condition])==0:
                                #if printError:
                                #      print('...not in F1 data frame')

                                #2 Get training data (checked)####################### 
                                try:
                                    y_train
                                except NameError:
                                    docTrain_df=getDocTrain(docText_df,r_code)
                                    #Shuffle rows
                                    x_train=docTrain_df.sample(frac=1).reset_index(drop=True)
                                    y_train=x_train.risk

                                #3 Get individual counts ####################### 
                                if 'c0' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['wordCount']])
                                    except NameError:
                                        xc_train=x_train[['wordCount']]
                                    try:
                                        xc_test=xc_test.join(x_test[['wordCount']])
                                    except NameError:
                                        xc_test=x_test[['wordCount']]
                                    if printError:
                                        print('...appended wordCount')
                                if 'c1' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['forward1']])
                                    except NameError:
                                        xc_train=x_train[['forward1']]
                                    try:
                                        xc_test=xc_test.join(x_test[['forward1']])
                                    except NameError:
                                        xc_test=x_test[['forward1']]
                                    if printError:
                                        print('...appended forward1')
                                if 'c2' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['forward2']])
                                    except NameError:
                                        xc_train=x_train[['forward2']]
                                    try:
                                        xc_test=xc_test.join(x_test[['forward2']])
                                    except NameError:
                                        xc_test=x_test[['forward2']]
                                    if printError:
                                        print('...appended forward2')
                                if 'c3' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['LMneg']])
                                    except NameError:
                                        xc_train=x_train[['LMneg']]
                                    try:
                                        xc_test=xc_test.join(x_test[['LMneg']])
                                    except NameError:
                                        xc_test=x_test[['LMneg']]
                                    if printError:
                                        print('...appended LMneg')
                                if 'c4' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['LMpos']])
                                    except NameError:
                                        xc_train=x_train[['LMpos']]
                                    try:
                                        xc_test=xc_test.join(x_test[['LMpos']])
                                    except NameError:
                                        xc_test=x_test[['LMpos']]
                                    if printError:
                                        print('...appended LMpos')
                                if 'c5' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['H6neg']])
                                    except NameError:
                                        xc_train=x_train[['H6neg']]
                                    try:
                                        xc_test=xc_test.join(x_test[['H6neg']])
                                    except NameError:
                                        xc_test=x_test[['H6neg']]
                                    if printError:
                                        print('...appended H6neg')
                                if 'c6' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['H6pos']])
                                    except NameError:
                                        xc_train=x_train[['H6pos']]
                                    try:
                                        xc_test=xc_test.join(x_test[['H6pos']])
                                    except NameError:
                                        xc_test=x_test[['H6pos']]
                                    if printError:
                                        print('...appended H6pos')
                                if 'c7' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['H8neg']])
                                    except NameError:
                                        xc_train=x_train[['H8neg']]
                                    try:
                                        xc_test=xc_test.join(x_test[['H8neg']])
                                    except NameError:
                                        xc_test=x_test[['H8neg']]
                                    if printError:
                                        print('...appended H8neg')
                                if 'c8' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['H8pos']])
                                    except NameError:
                                        xc_train=x_train[['H8pos']]
                                    try:
                                        xc_test=xc_test.join(x_test[['H8pos']])
                                    except NameError:
                                        xc_test=x_test[['H8pos']]
                                    if printError:
                                        print('...appended H8pos')
                                if 'c9' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['uncert']])
                                    except NameError:
                                        xc_train=x_train[['uncert']]
                                    try:
                                        xc_test=xc_test.join(x_test[['uncert']])
                                    except NameError:
                                        xc_test=x_test[['uncert']]
                                    if printError:
                                        print('...appended uncert')
                                if 'cA' in f:
                                    try:
                                        xc_train=xc_train.join(x_train[['causal']])
                                    except NameError:
                                        xc_train=x_train[['causal']]
                                    try:
                                        xc_test=xc_test.join(x_test[['causal']])
                                    except NameError:
                                        xc_test=x_test[['causal']]
                                    if printError:
                                        print('...appended causal')
                                #Append the individually selected count/tone features (codes cB..cV).
                                #Each pair below is (selection code looked up in f, column of x_train/x_test).
                                #For every code present in f the column is added to xc_train and xc_test:
                                #if the target frame does not exist yet (NameError) it is created from that
                                #column, otherwise the column is joined onto the existing frame by index.
                                #The pairs are processed in this order, so columns and log lines keep their sequence.
                                for _countCode,_countCol in (('cB','causalM'),
                                                             ('cC','causalM50'),
                                                             ('cD','perf'),
                                                             ('cE','strat'),
                                                             ('cF','forward1_scaled'),
                                                             ('cG','forward2_scaled'),
                                                             ('cH','LMneg_scaled'),
                                                             ('cI','LMpos_scaled'),
                                                             ('cJ','H6neg_scaled'),
                                                             ('cK','H6pos_scaled'),
                                                             ('cL','H8neg_scaled'),
                                                             ('cM','H8pos_scaled'),
                                                             ('cN','uncert_scaled'),
                                                             ('cO','causal_scaled'),
                                                             ('cP','causalM_scaled'),
                                                             ('cQ','causalM50_scaled'),
                                                             ('cR','perf_scaled'),
                                                             ('cS','strat_scaled'),
                                                             ('cT','LMtone'),
                                                             ('cU','H6tone'),
                                                             ('cV','H8tone')):
                                    if _countCode not in f:
                                        continue
                                    #training frame: join onto xc_train, or start it with this column
                                    try:
                                        xc_train=xc_train.join(x_train[[_countCol]])
                                    except NameError:
                                        xc_train=x_train[[_countCol]]
                                    #test frame: handled separately because it may exist independently of xc_train
                                    try:
                                        xc_test=xc_test.join(x_test[[_countCol]])
                                    except NameError:
                                        xc_test=x_test[[_countCol]]
                                    if printError:
                                        print('...appended '+_countCol)
                                #3 Bulk count selections (checked): 'ca' = all counts, otherwise 'cs' = scaled counts and tone.
                                #A bulk selection only applies when xc_train / xc_test does not exist yet;
                                #a frame that already exists at this point is left untouched.
                                #(The scaled selection used to be taken positionally with iloc[:,-35:-18]; it is now by name.)
                                _rawCountCols=['forward1','forward2','LMneg','LMpos','H6neg','H6pos','H8neg','H8pos',
                                               'uncert','causal','causalM','causalM50','perf','strat']
                                _scaledCountCols=['forward1_scaled','forward2_scaled','LMneg_scaled','LMpos_scaled',
                                                  'H6neg_scaled','H6pos_scaled','H8neg_scaled','H8pos_scaled',
                                                  'uncert_scaled','causal_scaled','causalM_scaled','causalM50_scaled',
                                                  'perf_scaled','strat_scaled','LMtone','H6tone','H8tone']
                                if 'ca' in f:
                                    _bulkCountCols=['wordCount']+_rawCountCols+_scaledCountCols
                                elif 'cs' in f:
                                    _bulkCountCols=['wordCount']+_scaledCountCols
                                else:
                                    _bulkCountCols=None

                                if _bulkCountCols is not None:
                                    try:
                                        xc_train
                                    except NameError:
                                        #no count features selected so far: take the whole bulk set
                                        xc_train=x_train[_bulkCountCols]
                                    try:
                                        xc_test
                                    except NameError:
                                        xc_test=x_test[_bulkCountCols]

                                #3 Get individual readability indices #######################
                                #Each pair below is (selection code looked up in f, column of x_train/x_test).
                                #For every code present in f the column is added to xr_train and xr_test:
                                #if the target frame does not exist yet (NameError) it is created from that
                                #column, otherwise the column is joined onto the existing frame by index.
                                #The log line is printed for every appended column whenever printError is set
                                #(for ts_dc it used to sit inside the NameError branch and was usually skipped).
                                for _readCode,_readCol in (('r1','r_gf'),
                                                           ('r2','r_f'),
                                                           ('r3','r_fk'),
                                                           ('r4','r_dc'),
                                                           ('r5','r_ari'),
                                                           ('r6','r_cl'),
                                                           ('r7','r_lw'),
                                                           ('r8','r_sm'),
                                                           ('r9','r_sp'),
                                                           ('rB','ts_gf'),
                                                           ('rC','ts_f'),
                                                           ('rD','ts_fk'),
                                                           ('rE','ts_dc'),
                                                           ('rF','ts_ari'),
                                                           ('rG','ts_cl'),
                                                           ('rH','ts_lw'),
                                                           ('rI','ts_sm'),
                                                           ('rJ','ts_ts')):
                                    if _readCode not in f:
                                        continue
                                    #training frame: join onto xr_train, or start it with this column
                                    try:
                                        xr_train=xr_train.join(x_train[[_readCol]])
                                    except NameError:
                                        xr_train=x_train[[_readCol]]
                                    #test frame: handled separately because it may exist independently of xr_train
                                    try:
                                        xr_test=xr_test.join(x_test[[_readCol]])
                                    except NameError:
                                        xr_test=x_test[[_readCol]]
                                    if printError:
                                        print('...appended '+_readCol)
                                #4 Bulk readability selections (checked): 'ra' = all indices, otherwise 'rs' = select 3 indices.
                                #A bulk selection only applies when xr_train / xr_test does not exist yet;
                                #a frame that already exists at this point is left untouched.
                                if 'ra' in f:
                                    _bulkReadCols=['r_gf','r_f','r_fk','r_dc','r_ari','r_cl','r_lw','r_sm','r_sp',
                                                   'ts_gf','ts_f','ts_fk','ts_dc','ts_ari','ts_cl','ts_lw','ts_sm','ts_ts']
                                elif 'rs' in f:
                                    _bulkReadCols=['ts_gf','ts_f','ts_ts']
                                else:
                                    _bulkReadCols=None

                                if _bulkReadCols is not None:
                                    try:
                                        xr_train
                                    except NameError:
                                        #no readability features selected so far: take the whole bulk set
                                        xr_train=x_train[_bulkReadCols]
                                    try:
                                        xr_test
                                    except NameError:
                                        xr_test=x_test[_bulkReadCols]

                                #5 Get frequency or word vectors as data frame
                                if transAbb=='tf':
                                    #5.1 Get TF vectors as data frame.
                                    #First try the cached CSVs under pathData+'interim/'; if loading or merging fails
                                    #for any reason, fit the vectoriser on x_train.text, transform x_test.text and
                                    #(when writeCSV is set) write the cache for the next run.
                                    #Historical note from earlier runs: "did not load so slow but correct",
                                    #"ValueError: Object arrays cannot be loaded when allow_pickle=False".
                                    #pathTrain (with the r_code prefix) is not read in this branch;
                                    #presumably kept for code outside this block - TODO confirm.
                                    pathTrain=pathData+'interim/'+r_code+'_'+f[0]+'_'+transAbb+'_'+str(n)+'_'+str(s)
                                    pathTest=pathData+'interim/'+f[0]+'_'+transAbb+'_'+str(n)+'_'+str(s)
                                    try:
                                        #load vectorised training data frame from disk; the cache is keyed by pathTest
                                        #(original training sample: TF values are same on 50:50 and 20:80 samples)
                                        v_train=pd.read_csv(pathTest+'_v_train.csv', index_col=[0])
                                        #Attach the loaded vectors to the docIDs of the randomised training data
                                        v_train_df=x_train[['docID']].merge(v_train,how='left',on='docID')

                                        #load vectorised test data frame from disk
                                        v_test=pd.read_csv(pathTest+'_v_test.csv', index_col=[0])
                                        #Attach the loaded vectors to the docIDs of the randomised test data
                                        v_test_df=x_test[['docID']].merge(v_test,how='left',on='docID')
                                        if printError:
                                            print('...loaded vectorized file')
                                    except Exception as _tfLoadError:
                                        #Any ordinary failure (missing file, missing docID column, ...) falls back to
                                        #re-fitting; KeyboardInterrupt/SystemExit are deliberately not caught here.
                                        if printError:
                                            print('...could not load vectorized file, fitting vectorizer instead:',repr(_tfLoadError))
                                        v_train, t_vectorizer=fit_tf_vectorizer(x_train.text,nTo=n,featureMax=s)
                                        v_test = t_vectorizer.transform(x_test.text)

                                        #get_feature_names() was removed in scikit-learn 1.2; use its replacement
                                        #when the installed version has it and fall back on older versions.
                                        if hasattr(t_vectorizer,'get_feature_names_out'):
                                            _tfFeatureNames=t_vectorizer.get_feature_names_out()
                                        else:
                                            _tfFeatureNames=t_vectorizer.get_feature_names()

                                        #NOTE(review): join aligns on index labels and the dense frames get a default
                                        #0..n-1 index, so this assumes x_train/x_test carry that same index - confirm.
                                        #Join sparse matrix to docID and docName of training data
                                        v_train_df=x_train[['docID','docName']].join(pd.DataFrame(v_train.todense(),columns=_tfFeatureNames))
                                        #Join sparse matrix to docID and docName of test data
                                        v_test_df=x_test[['docID','docName']].join(pd.DataFrame(v_test.todense(),columns=_tfFeatureNames))

                                        if writeCSV:
                                            #save vectorised training data frame to disk under the pathTest key;
                                            #duplicates explode with join, so they are dropped before writing
                                            #(TF values are same on 50:50 and 20:80 samples)
                                            v_train_df.drop_duplicates().to_csv(pathTest+'_v_train.csv')
                                            #save vectorised test data frame to disk
                                            v_test_df.to_csv(pathTest+'_v_test.csv')
                                            if printError:
                                                print('...saved vectorized file')
                                        #delete to save memory
                                        del t_vectorizer
                                        del v_train
                                        del v_test

                                elif transAbb=='tfidf':
                                    #5.2 Get TFIDF vectors as data frame#######################
                                    #NOTE(review): the original header also recorded "did not load so slow but correct" and
                                    #"ValueError: Object arrays cannot be loaded when allow_pickle=False" - presumably left over from an earlier caching attempt; confirm
                                    #The training cache name includes r_code; the test cache name does not
                                    pathTrain=pathData+'interim/'+r_code+'_'+f[0]+'_'+transAbb+'_'+str(n)+'_'+str(s)
                                    pathTest=pathData+'interim/'+f[0]+'_'+transAbb+'_'+str(n)+'_'+str(s)
                                    try:
                                        #load vectorised training data frame from disk
                                        #v_train=pd.read_csv(pathTrain+'_v_train.csv.ignore', index_col=[0]) #TFIDF values differ across the original, 50:50 and 20:80 training samples
                                        v_train=pd.read_csv(pathTrain+'_v_train.csv', index_col=[0]) #TFIDF values differ across the original, 50:50 and 20:80 training samples
                                        #Left-merge on docID so the cached rows follow the row order of the randomised training data
                                        v_train_df=x_train[['docID']].merge(v_train,how='left',on='docID')

                                        #load vectorised test data frame from disk
                                        v_test=pd.read_csv(pathTest+'_v_test.csv', index_col=[0])
                                        #Left-merge on docID so the cached rows follow the row order of the randomised test data
                                        v_test_df=x_test[['docID']].merge(v_test,how='left',on='docID')
                                        if printError:
                                            print('...loaded vectorized file')
                                    except Exception:
                                        #Any load/merge failure falls back to vectorising from the raw text.
                                        #Was a bare except, which also trapped KeyboardInterrupt/SystemExit and then silently re-vectorised.
                                        if printError:
                                            print('...vectorizing data')
                                        v_train, t_vectorizer=fit_tfidf_vectorizer(x_train.text,nTo=n,featureMax=s)
                                        v_test = t_vectorizer.transform(x_test.text)

                                        #Vocabulary terms become the column names of both frames (looked up once instead of twice)
                                        featureNames=t_vectorizer.get_feature_names()

                                        #Join sparse matrix to docID and docName of training data
                                        v_train_df=x_train[['docID','docName']].join(pd.DataFrame(v_train.todense(),columns=featureNames))
                                        #Join sparse matrix to docID and docName of test data
                                        v_test_df=x_test[['docID','docName']].join(pd.DataFrame(v_test.todense(),columns=featureNames))
                                        #NOTE(review): DataFrame.join aligns on index labels - assumes x_train/x_test carry a default 0..n-1 index; confirm

                                        #save vectorised data frames to disk
                                        if writeCSV:
                                            v_train_df.drop_duplicates().to_csv(pathTrain+'_v_train.csv') #duplicates explode with join
                                            v_test_df.to_csv(pathTest+'_v_test.csv')
                                            if printError:
                                                print('...saved vectorized file')
                                        #delete to save memory
                                        del(t_vectorizer)
                                        del(v_train)
                                        del(v_test)
                                        del(featureNames)
                                else:
                                    #5.3 Get word vectors (WV) as data frame #######################
                                    #if printError:
                                    #    print('...get RegEx tokens')

                                    #6. Get RegEx tokens (fixed was wrong order)#######################
                                    #t_train/t_test are only built when the name is not defined yet, so existing token lists are reused as-is.
                                    #Cached tokens are looked up by docID; 'text_y' is the cached column, suffixed because both frames have a 'text' column.
                                    #NOTE(review): a left merge leaves NaN for any docID missing from docToken_df - confirm the cache covers every document
                                    try:
                                        t_train
                                    except NameError:
                                        try:
                                            #Get training tokens as list of lists
                                            t_train=list(x_train.merge(docToken_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                        except Exception:
                                            #No usable token cache: tokenize the raw text.
                                            #Was a bare except, which also trapped KeyboardInterrupt/SystemExit.
                                            t_train=reg_tokenize(x_train.text)
                                            print('...tokenized training data')

                                    try:
                                        t_test
                                    except NameError:
                                        try:
                                            #Get test tokens as list of lists
                                            t_test=list(x_test.merge(docToken_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                        except Exception:
                                            #No usable token cache: tokenize the raw text
                                            t_test=reg_tokenize(x_test.text)
                                            print('...tokenized test data')
                                    #End of 6. Get RegEx tokens#######################

                                    #7 Get root tokens (fixed was wrong order)#######################
                                    #r_train/r_test are only built when the name is not defined yet.
                                    #The first code found in f picks the root form: 'ps' Porter, 'ls' Lancaster, 'ss' Snowball, 'wl' WordNet lemmas;
                                    #with none of them the RegEx tokens are copied unchanged.
                                    #Cached root tokens are merged in by docID when possible; otherwise the RegEx tokens are stemmed/lemmatized here.
                                    #The fallback creates ONE stemmer/lemmatizer and reuses it for every token (the original built a new object per token).
                                    #The fallbacks catch Exception instead of using a bare except, so KeyboardInterrupt/SystemExit are no longer swallowed.
                                    try:
                                        r_train
                                    except NameError:
                                        #Stemming/Lemmatization of training data
                                        if 'ps' in f:
                                            try:
                                                #Get training tokens as list of lists
                                                r_train=list(x_train.merge(docTokenPS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Stem
                                                rootWord=PorterStemmer().stem
                                                r_train=[[rootWord(word) for word in doc] for doc in t_train]
                                                print('...PorterStemming training data')
                                        elif 'ls' in f:
                                            try:
                                                #Get training tokens as list of lists
                                                r_train=list(x_train.merge(docTokenLS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Stem
                                                rootWord=LancasterStemmer().stem
                                                r_train=[[rootWord(word) for word in doc] for doc in t_train]
                                                print('...LancasterStemming training data')
                                        elif 'ss' in f:
                                            try:
                                                #Get training tokens as list of lists
                                                r_train=list(x_train.merge(docTokenSS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Stem
                                                rootWord=SnowballStemmer("english").stem
                                                r_train=[[rootWord(word) for word in doc] for doc in t_train]
                                                print('...SnowballStemming training data')
                                        elif 'wl' in f:
                                            try:
                                                #Get training tokens as list of lists
                                                r_train=list(x_train.merge(docTokenWL_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Lemmatize
                                                rootWord=WordNetLemmatizer().lemmatize
                                                r_train=[[rootWord(word) for word in doc] for doc in t_train]
                                                print('...Lemmatizing training data')
                                        else:
                                            #No root form requested: use the RegEx tokens as they are
                                            r_train=t_train.copy()
                                    try:
                                        r_test
                                    except NameError:
                                        #Stemming/Lemmatization of test data
                                        if 'ps' in f:
                                            try:
                                                #Get test tokens as list of lists
                                                r_test=list(x_test.merge(docTokenPS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Stem
                                                rootWord=PorterStemmer().stem
                                                r_test=[[rootWord(word) for word in doc] for doc in t_test]
                                                print('...PorterStemming test data')
                                        elif 'ls' in f:
                                            try:
                                                #Get test tokens as list of lists
                                                r_test=list(x_test.merge(docTokenLS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Stem
                                                rootWord=LancasterStemmer().stem
                                                r_test=[[rootWord(word) for word in doc] for doc in t_test]
                                                print('...LancasterStemming test data')
                                        elif 'ss' in f:
                                            try:
                                                #Get test tokens as list of lists
                                                r_test=list(x_test.merge(docTokenSS_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Stem
                                                rootWord=SnowballStemmer("english").stem
                                                r_test=[[rootWord(word) for word in doc] for doc in t_test]
                                                print('...SnowballStemming test data')
                                        elif 'wl' in f:
                                            try:
                                                #Get test tokens as list of lists
                                                r_test=list(x_test.merge(docTokenWL_df[['docID','text']],how='left',on='docID')['text_y']) #Consider keeping data frame or nparray
                                            except Exception:
                                                #Lemmatize
                                                rootWord=WordNetLemmatizer().lemmatize
                                                r_test=[[rootWord(word) for word in doc] for doc in t_test]
                                                print('...Lemmatizing test data')
                                        else:
                                            #No root form requested: use the RegEx tokens as they are
                                            r_test=t_test.copy()
                                    #End of 7 Get root tokens#######################
                                    
                                    #Decode the transformer abbreviation: the third '_'-separated term names the transform function
                                    abbTerms=transAbb.split('_')
                                    transform_type=abbTerms[2]
                                    #Number of times 'wv_sg' occurs in the abbreviation; handed to Word2Vec as its sg argument
                                    wv_sg=transAbb.count('wv_sg')
                                    #Shared file stem built from the first feature code, characters 3-4 of the abbreviation, n and s
                                    wvStem=f[0]+'_'+transAbb[3:5]+'_'+str(n)+'_'+str(s)
                                    #Keyed vectors and the training cache include r_code in the name; the test cache does not
                                    pathWV=pathData+'interim/'+r_code+'_'+wvStem
                                    pathTrain=pathWV+'_'+transform_type
                                    pathTest=pathData+'interim/'+wvStem+'_'+transform_type

                                    #if printError:
                                    #    print('...start embedding')
                                    #8 Embed training words as vectors (fixed was wrong order)#######################
                                    #Three-level fallback: cached document vectors (CSV) -> cached keyed vectors -> train Word2Vec on r_train.
                                    #The fallbacks catch Exception instead of using a bare except, so KeyboardInterrupt/SystemExit
                                    #abort the run instead of silently triggering an expensive retrain.
                                    try:
                                        #load training data set from disk
                                        #v_train_df=pd.read_csv(pathTrain+'_v_train.csv.ignore', index_col=[0])
                                        v_train_df=pd.read_csv(pathTrain+'_v_train.csv', index_col=[0])

                                        #merge loaded data to docID of training data to deal with shuffle
                                        v_train_df=x_train[['docID']].merge(v_train_df,how='left',on='docID')

                                        if printError:
                                            print('...loaded training word vector file')
                                    except Exception:
                                        if printError:
                                            print('...did not load training word vector file')
                                        #Embed
                                        try:
                                            #load from disk
                                            wv=KeyedVectors.load(pathWV, mmap='r')
                                            print('...loading keyed vector file: '+r_code+'_'+f[0]+'_'+transAbb[3:5]+'_'+str(n)+'_'+str(s))
                                        except Exception:
                                            print('...converting words with count>=',str(n),'to vector of size',str(s))
                                            #limit the model to a single worker thread (workers=1), to eliminate ordering jitter from OS thread scheduling.
                                            #seed=1 is passed explicitly so this call matches the one used when embedding the test data
                                            w2v_model = Word2Vec(r_train, size=s, window=window_size, min_count=n, workers=1, sg=wv_sg, seed=1) #Removed threads for reproduceability
                                            w2v_model.wv.save(pathWV) #Save keyed vectors
                                            wv=w2v_model.wv
                                            del w2v_model #save memory

                                        #Transform (align with Embed header)
                                        print('...embedding training vectors and transforming')
                                        v_train = embedding_transform(r_train, wv, embedding_size=s, transform_type=transform_type)

                                        #Join numpy array to docID and docName of training data
                                        v_train_df=x_train[['docID','docName']].join(pd.DataFrame(v_train))

                                        #delete to save memory
                                        del(v_train)

                                        #save vectorised training data frame to disk
                                        if writeCSV:
                                            v_train_df.drop_duplicates().to_csv(pathTrain+'_v_train.csv') #duplicates explode with join
                                            if printError:
                                                print('...saved training word vector file')

                                    #8 Embed test words as vectors (fixed was wrong order)#######################
                                    #Fallback order: cached document vectors (CSV) -> keyed vectors still in memory -> keyed vectors on disk -> train Word2Vec on r_train.
                                    #The fallbacks catch Exception (or NameError for the existence probe) instead of using a bare except.
                                    try:
                                        #load test data set from disk
                                        v_test_df=pd.read_csv(pathTest+'_v_test.csv', index_col=[0])

                                        #merge loaded data to docID of test data to deal with shuffle
                                        v_test_df=x_test[['docID']].merge(v_test_df,how='left',on='docID')

                                        if printError:
                                            print('...loaded test word vector file')
                                    except Exception:
                                        #Embed
                                        try:
                                            #Probe whether keyed vectors are already defined (e.g. from embedding the training data)
                                            wv
                                            print('...reusing keyed vector file: '+r_code+'_'+f[0]+'_'+transAbb[3:5]+'_'+str(n)+'_'+str(s))
                                        except NameError:
                                            try:
                                                #load from disk
                                                wv=KeyedVectors.load(pathWV, mmap='r')
                                                print('...loading keyed vector file: '+r_code+'_'+f[0]+'_'+transAbb[3:5]+'_'+str(n)+'_'+str(s))
                                            except Exception:
                                                #Message fixed from "count >" to match min_count=n and the training-path wording
                                                print('...converting words with count>=',str(n),'to vector of size',str(s))
                                                #limit the model to a single worker thread (workers=1), to eliminate ordering jitter from OS thread scheduling.
                                                w2v_model = Word2Vec(r_train, size=s, window=window_size, min_count=n, workers=1, sg=wv_sg, seed=1) #Removed threads for reproduceability
                                                w2v_model.wv.save(pathWV) #Save keyed vectors
                                                wv=w2v_model.wv
                                                del w2v_model #save memory

                                        #Transform (align with Embed header)
                                        print('...embedding test vectors and transforming')
                                        v_test = embedding_transform(r_test, wv, embedding_size=s, transform_type=transform_type)

                                        #Join numpy array to docID and docName of test data
                                        v_test_df=x_test[['docID','docName']].join(pd.DataFrame(v_test))

                                        #save vectorised test data frame to disk
                                        if writeCSV:
                                            v_test_df.to_csv(pathTest+'_v_test.csv')
                                            if printError:
                                                print('...saved test word vector  file')
                                        #delete to save memory
                                        del(v_test)
                                        del(wv)

                                    #Release the keyed vectors if they are still defined (they are when the test vectors came from the cache)
                                    try:
                                        del wv
                                    except NameError:
                                        #wv was never created or has already been deleted
                                        if printError:
                                            print('No wv to delete')
                                    #End of 8 Embed words as vectors#######################

                                #End of 5 Get frequency or word vectors as data frame#######################

                                #Keep only the feature columns: the first two columns (document identifier and name) are dropped by position
                                v_train=v_train_df.iloc[:,2:]
                                v_test=v_test_df.iloc[:,2:]

                                #release the full frames to save memory
                                del v_train_df
                                del v_test_df

                                #Append the count features when feature code 'c' is requested
                                #NOTE(review): DataFrame.join aligns on index labels - assumes xc_*/xr_* share the index of v_train/v_test; confirm
                                if 'c' in f:
                                    v_train=v_train.join(xc_train)
                                    v_test=v_test.join(xc_test)

                                #Append the readability indices when feature code 'r' is requested
                                if 'r' in f:
                                    v_train=v_train.join(xr_train)
                                    v_test=v_test.join(xr_test)

                                #Scale: the scaler is fitted on the training features only and then applied to both sets
                                if scale[0]=='norm':
                                    #Normalise to [0,1]
                                    scalerNorm=fittedScaler=MinMaxScaler().fit(v_train)
                                elif scale[0]=='std':
                                    #Standardise
                                    scalerStd=fittedScaler=StandardScaler().fit(v_train)
                                else:
                                    #No scaling requested
                                    fittedScaler=None

                                if fittedScaler is not None:
                                    #transform returns a numpy array, so re-wrap it with the original column names
                                    v_train=pd.DataFrame(fittedScaler.transform(v_train),columns=v_train.columns)
                                    v_test=pd.DataFrame(fittedScaler.transform(v_test),columns=v_test.columns)

                                
                                #Progress/debug output ahead of cross-validation
                                if printError:
                                    print('...cross validating on training data')
                                if printHead:
                                    print(v_train.head())

                                #Adjust for Keras models: add the extra axis the network type expects
                                forLSTM=clfAbb.startswith('LSTM')
                                forCNN=clfAbb.startswith('CNN')
                                if forLSTM or forCNN:
                                    trainRows,trainCols=v_train.shape
                                    testRows,testCols=v_test.shape
                                    if forLSTM:
                                        #LSTM has shape (samples,timestep,features)
                                        trainShape=(trainRows,1,trainCols)
                                        testShape=(testRows,1,testCols)
                                    else:
                                        #CNN has shape (samples,features,dimensions/channel)
                                        trainShape=(trainRows,trainCols,1)
                                        testShape=(testRows,testCols,1)
                                    v_train=np.array(v_train).reshape(trainShape)
                                    v_test=np.array(v_test).reshape(testShape)

                                if clfAbb=='LSTM1':    #2-layer LSTM with 50% dropout
                                    def createLSTM():
                                        model=Sequential()

                                        #LSTM layer
                                        model.add(LSTM(128, return_sequences=True,input_shape=(1, v_train.shape[2])))

                                        #50% dropout layer
                                        model.add(Dropout(0.5))

                                        #LSTM layer
                                        model.add(LSTM(64))

                                        #50% dropout layer
                                        model.add(Dropout(0.5))

                                        #Dense output layer with softmax activation
                                        model.add(Dense(1, activation='sigmoid')) #For binary classification

                                        model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

                                        return model
                                elif clfAbb=='LSTM2':    #2-layer LSTM with 20% dropout
                                    def createLSTM():
                                        model=Sequential()

                                        #LSTM layer
                                        model.add(LSTM(128, return_sequences=True,input_shape=(1, v_train.shape[2])))

                                        #20% dropout layer
                                        model.add(Dropout(0.2))

                                        #LSTM layer
                                        model.add(LSTM(64))

                                        #20% dropout layer
                                        model.add(Dropout(0.2))

                                        #Dense output layer with softmax activation
                                        model.add(Dense(1, activation='sigmoid')) #For binary classification

                                        model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

                                        return model
                                elif clfAbb=='CNN1a':    #4 convolution layers with sigmoid activation (no dropout)
                                    def createCNN():
                                        model=Sequential()

                                        #Convolution layers with sigmoid activation
                                        model.add(Conv1D(filters=64, kernel_size=3, activation='sigmoid',input_shape=(v_train.shape[1], 1))) #v_train_sfd
                                        model.add(Conv1D(filters=100, kernel_size=3, activation='sigmoid'))
                                        model.add(Conv1D(filters=100, kernel_size=3, activation='sigmoid'))
                                        #50% dropout layer
                                        #model.add(Dropout(0.5))
                                        #Convolution layer
                                        model.add(Conv1D(filters=48, kernel_size=3, activation='sigmoid'))
                                        model.add(Flatten())
                                        #Dense output layer with sigmoid activation
                                        model.add(Dense(1, activation='sigmoid'))

                                        model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

                                        return model
                                elif clfAbb=='CNN1b':    #4 convolution layers with ReLU activation (no dropout)
                                    def createCNN():
                                        model=Sequential()

                                        #Convolution layers with sigmoid activation
                                        model.add(Conv1D(filters=64, kernel_size=3, activation='relu',input_shape=(v_train.shape[1], 1)))
                                        model.add(Conv1D(filters=100, kernel_size=3, activation='relu'))
                                        model.add(Conv1D(filters=100, kernel_size=3, activation='relu'))
                                        #50% dropout layer
                                        #model.add(Dropout(0.5))
                                        #Convolution layer
                                        model.add(Conv1D(filters=48, kernel_size=3, activation='relu'))
                                        model.add(Flatten())
                                        #Dense output layer with sigmoid activation
                                        model.add(Dense(1, activation='sigmoid'))

                                        model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

                                        return model
                                elif clfAbb=='CNN1c':    #4 convolution layers with ReLU activation (20% dropout)
                                    def createCNN():
                                        model=Sequential()

                                        #Convolution layers with sigmoid activation
                                        model.add(Conv1D(filters=64, kernel_size=3, activation='relu',input_shape=(v_train.shape[1], 1)))
                                        model.add(Conv1D(filters=100, kernel_size=3, activation='relu'))
                                        model.add(Conv1D(filters=100, kernel_size=3, activation='relu'))
                                        #20% dropout layer
                                        model.add(Dropout(0.2))
                                        #Convolution layer
                                        model.add(Conv1D(filters=48, kernel_size=3, activation='relu'))
                                        model.add(Flatten())
                                        #Dense output layer with sigmoid activation
                                        model.add(Dense(1, activation='sigmoid'))

                                        model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

                                        return model  
                                elif clfAbb=='CNN1d':    #4 convolution layers with no activation (no dropout)
                                    def createCNN():
                                        model=Sequential()

                                        #Convolution layers with sigmoid activation
                                        model.add(Conv1D(filters=64, kernel_size=3,input_shape=(v_train.shape[1], 1)))
                                        model.add(Conv1D(filters=100, kernel_size=3))
                                        model.add(Conv1D(filters=100, kernel_size=3))
                                        #50% dropout layer
                                        #model.add(Dropout(0.5))
                                        #Convolution layer
                                        model.add(Conv1D(filters=48, kernel_size=3))
                                        model.add(Flatten())
                                        #Dense output layer with sigmoid activation
                                        model.add(Dense(1, activation='sigmoid'))


                                        model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

                                        return model   
                                elif clfAbb=='CNN2':    #1 convolution layer with max pooling and LSTM layers
                                    def createCNN():
                                        model=Sequential()

                                        #Convolution layer with Rectified Linear Unit (ReLU) activation
                                        model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu',input_shape=(v_train.shape[1], 1)))
                                        #Max pooling layer
                                        model.add(MaxPooling1D(pool_size=2))
                                        #LSTM layer
                                        model.add(LSTM(100))
                                        #Dense output layer with sigmoid activation
                                        model.add(Dense(1, activation='sigmoid'))

                                        model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

                                        return model  
                                elif clfAbb=='CNN3':    #3 convolution layers with max pooling
                                    def createCNN():
                                        """Build and compile the CNN3 network.

                                        Architecture: three identical ReLU Conv1D layers, each followed by
                                        max pooling (pool size 3), then a flatten step and a single sigmoid
                                        output unit. Only the first convolution declares the input shape,
                                        whose length is read from ``v_train`` in the surrounding scope.

                                        Returns:
                                            The compiled Sequential model (Adam optimiser, binary
                                            cross-entropy loss, externally defined ``f1`` metric).
                                        """
                                        #Settings shared by all three convolution layers
                                        conv_kwargs=dict(filters=128, kernel_size=5, activation='relu', padding='same')

                                        net=Sequential()

                                        #First convolution + pooling pair carries the input shape
                                        net.add(Conv1D(input_shape=(v_train.shape[1], 1), **conv_kwargs))
                                        net.add(MaxPooling1D(pool_size=3))
                                        #Second and third convolution + pooling pairs
                                        for _ in range(2):
                                            net.add(Conv1D(**conv_kwargs))
                                            net.add(MaxPooling1D(pool_size=3))

                                        net.add(Flatten())
                                        #Dense output layer with sigmoid activation
                                        net.add(Dense(1, activation='sigmoid'))

                                        net.compile(optimizer='adam', loss='binary_crossentropy',metrics=[f1])

                                        return net

                                #Wrap Keras models so they expose the scikit-learn estimator interface
                                #(other classifier abbreviations keep the clf that is already set)
                                if clfAbb[0:4]=='LSTM':
                                    clf=KerasClassifier(build_fn=createLSTM,epochs=5,validation_split=0.2,verbose=0)
                                elif clfAbb[0:3]=='CNN':
                                    clf=KerasClassifier(build_fn=createCNN,epochs=5,validation_split=0.2,verbose=0)

                                #5 times 5-fold Cross-Validate on the training data (25 F1 scores in total)
                                cv=RepeatedKFold(n_splits=5,n_repeats=5,random_state=0)
                                f1_train=cross_val_score(clf,v_train,y_train,cv=cv,scoring='f1',n_jobs=threads)

                                #Fit the classifier on the full training set
                                #(no extra arguments such as max_iter or n_jobs are passed to fit here)
                                clf.fit(v_train,y_train)

                                #Test
                                if printHead:
                                    print(v_test.head())
                                #Threshold predictions at 0.5 to get hard 0/1 labels
                                #(original note: addresses a warning when using sigmoid outputs)
                                pred_test=(clf.predict(v_test) > 0.5).astype("int32")
                                f1_test=f1_score(y_test,pred_test)

                                trainDuration=time.time()-trainStart

                                #Record the result as one new row.
                                #pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
                                #Only the first five fold scores are stored individually; the mean and
                                #2*std are computed over all cross-validation scores.
                                f1_row=pd.DataFrame([{'condition':condition,
                                                      'data':r,'model':clfAbb,'feature_set':f,'scale':scale[0],'feature':transAbb,'h1_desc':nDesc,'h1':n,'h2_desc':sizeDesc,'h2':s,
                                                      'f1Train0':f1_train[0],'f1Train1':f1_train[1],'f1Train2':f1_train[2],'f1Train3':f1_train[3],'f1Train4':f1_train[4],
                                                      'f1Train_mean':f1_train.mean(),'f1Train_2std':f1_train.std()*2, #95% of values fall within +/- 2 standard deviations
                                                      'f1Test':f1_test,
                                                      'duration':trainDuration}])
                                f1_df=pd.concat([f1_df,f1_row],ignore_index=True)

                                print(condition+'|Train F1: %0.4f (+/- %0.4f)' % (f1_train.mean(),f1_train.std()*2)+'|Test F1: %0.4f' % (f1_test)+'|%0.1ds' % trainDuration)
                            else:
                                #Report the scores already stored in f1_df for this condition
                                #(presumably reached when the condition was scored in an earlier run).
                                #Take the first matching row so scalars are formatted: formatting a Series
                                #directly relies on implicit float()/int() conversion of a one-element Series,
                                #which recent pandas deprecates and which fails when several rows match.
                                f1_prev=f1_df.loc[f1_df['condition']==condition].iloc[0]
                                print(condition+' in df|Train F1: %0.4f (+/- %0.4f)' % (f1_prev['f1Train_mean'],f1_prev['f1Train_2std'])+'|Test F1: %0.4f' % f1_prev['f1Test']+'|%0.1ds' % f1_prev['duration'])
                            train+=1
                        #Size training loop#######################

                    #End of Word count training loop#######################                             

writeCSV True
writeF1 True
No training roots to delete
No test roots to delete
Iteration 1 of 1 | Training 1 of 1 | 20/11/2020 23:11:39
...did not load training word vector file
...loading keyed vector file: 50_x_cb_5_300
...embedding training vectors and transforming


KeyboardInterrupt: ignored

In [None]:
printError=True

In [None]:
#Path prefix for the transformed vector CSVs: word-vector path plus the transform name
#(a later cell reads pathTrain+'_v_train.csv')
pathTrain=pathWV+'_'+transform_type

In [None]:
pathTrain

'./drive/My Drive/MIT 807 Big Data Science Mini-Dissertation/GitHub/data/interim/50_x_cb_5_300_power2'

In [None]:
pd.read_csv(pathTrain+'_v_train.csv', index_col=[0])

In [None]:
y_test[:10]

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    1
9    0
Name: risk, dtype: int64

In [None]:
x_test[:10]

Unnamed: 0,docID,docName,year,coID,coName,risk,pageRatio,pages,text,wordCount,forward1,forward2,LMneg,LMpos,H6neg,H6pos,H8neg,H8pos,uncert,causal,causalM,causalM50,perf,strat,forward1_scaled,forward2_scaled,LMneg_scaled,LMpos_scaled,H6neg_scaled,H6pos_scaled,H8neg_scaled,H8pos_scaled,uncert_scaled,causal_scaled,causalM_scaled,causalM50_scaled,perf_scaled,strat_scaled,LMtone,H6tone,H8tone,r_gf,r_f,r_fk,r_dc,r_ari,r_cl,r_lw,r_sm,r_sp,ts_gf,ts_f,ts_fk,ts_dc,ts_ari,ts_cl,ts_lw,ts_sm,ts_ts
0,36,Bank of Baroda Annualreport2017-18.pdf,2018,26,Bank of Baroda,0,1,270,Chairman's Statement 05 MD & CEO’s Statemen...,155021,362,417,1414,1346,582,892,576,850,2066,757,393,270,744,3101,0.002335,0.00269,0.009121,0.008683,0.003754,0.005754,0.003716,0.005483,0.013327,0.004883,0.002535,0.001742,0.004799,0.020004,-0.024638,0.210312,0.192146,35.200776,26.973975,29.978002,22.379777,30.121447,-5.406596,46.01413,19.253489,20.939934,36.1,-1.95,35.6,4.79,51.9,17.66,75.0,19.6,36.0
1,249,NASDAQ_MBWM_2018.pdf,2018,22,Mercantile Bank Ltd,0,1,161,OUR MISSION STATEMENT The mission of Mercanti...,63732,667,743,1364,1086,523,761,514,749,2180,1630,632,410,1014,2311,0.010466,0.011658,0.021402,0.01704,0.008206,0.011941,0.008065,0.011752,0.034206,0.025576,0.009917,0.006433,0.01591,0.036261,-0.113469,0.185358,0.186065,21.945081,25.721905,18.417551,12.940322,22.060766,14.790474,27.438244,16.445233,11.150541,58.61,-168.06,72.5,7.76,78.2,16.15,10.571429,35.1,73.0
2,290,Societe Generale_Financial Statements 2018.pdf,2018,17,Societe Generale Johannesburg Branch,0,1,185,1 31.12.2018 ...,70143,410,475,1738,539,621,245,614,244,1606,1176,449,256,1641,2811,0.005845,0.006772,0.024778,0.007684,0.008853,0.003493,0.008754,0.003479,0.022896,0.016766,0.006401,0.00365,0.023395,0.040075,-0.52657,-0.43418,-0.431235,23.549473,20.819342,20.524207,13.782186,24.441015,14.149893,31.387333,28.698559,12.095677,47.13,-88.27,54.3,6.34,63.0,14.93,23.666667,30.9,31.0
3,133,2019 Integrated Annual Report.pdf,2019,37,Grindrod Bank,0,2,126,cover_FINAL_09042020.indd 1 2020/04/21 12...,100281,561,649,1957,1569,980,1125,964,1091,2970,1517,1008,766,691,6425,0.005594,0.006472,0.019515,0.015646,0.009773,0.011218,0.009613,0.010879,0.029617,0.015127,0.010052,0.007639,0.006891,0.06407,-0.11004,0.068884,0.0618,21.159743,19.832112,18.499253,13.998226,20.5139,14.721598,25.730604,32.977858,11.004429,21.09,11.35,24.3,3.04,30.6,15.45,73.0,22.8,23.0
4,102,Citi Bank NA Annual Report 2018.pdf,2018,13,Citibank N.A,0,1,324,2018 ANNUAL REPORT Citi’s Value Proposition:...,173923,1444,1735,4967,2773,2215,1962,2145,1906,6262,4194,2194,1627,3165,7853,0.008303,0.009976,0.028559,0.015944,0.012736,0.011281,0.012333,0.010959,0.036004,0.024114,0.012615,0.009355,0.018198,0.045152,-0.283463,-0.06057,-0.058998,20.991866,24.745682,18.059231,13.324605,20.62685,14.246561,25.67792,23.407338,10.842558,18.35,17.85,21.8,2.58,27.9,15.45,12.8,20.7,13.0
5,114,Deutsche_Bank_Annual_Report_2019.pdf,2019,14,Deutsche Bank AG,0,1,470,Annual Report \n2019 Deutsche Bank Christia...,274817,2030,2462,8445,3330,4333,2265,4282,2188,9419,5987,3098,2221,4646,13700,0.007387,0.008959,0.03073,0.012117,0.015767,0.008242,0.015581,0.007962,0.034274,0.021785,0.011273,0.008082,0.016906,0.049851,-0.434395,-0.313428,-0.323648,21.501506,24.296577,18.548628,13.58788,21.361475,14.088487,26.903253,17.238542,11.197093,25.21,-42.12,34.5,3.38,35.6,14.64,13.8,23.8,35.0
6,62,"Bidvest+Bank+Annual+Report,+year+ended+30+June...",2018,20,Bidvest Bank Ltd,0,1,147,ABRIDGED \nANNUAL REPORT \n2017/2018 Who we...,37423,171,203,733,799,446,534,439,517,1363,460,298,227,348,2218,0.004569,0.005424,0.019587,0.021351,0.011918,0.014269,0.011731,0.013815,0.036421,0.012292,0.007963,0.006066,0.009299,0.059268,0.043081,0.089796,0.08159,19.214709,27.691326,17.425142,13.691936,19.936971,13.945948,24.862267,19.979281,10.805937,19.2,17.64,21.9,6.79,27.4,15.1,82.0,20.5,22.0
7,73,capitec_bank_integrated_annual_report_2018.pdf,2018,36,Capitec Bank,0,2,127,Integrated Annual Report 2018 Why we are in ...,129279,759,905,2770,2427,1357,1337,1344,1311,4259,1815,1033,804,1318,6384,0.005871,0.007,0.021427,0.018773,0.010497,0.010342,0.010396,0.010141,0.032944,0.014039,0.00799,0.006219,0.010195,0.049382,-0.066,-0.007424,-0.012429,20.45278,30.332499,17.367941,13.959615,19.048979,12.055481,25.122903,22.190621,11.173025,21.04,11.05,24.4,2.93,28.5,12.84,10.166667,20.7,13.0
8,6,1464958705-4144_African-Bank-2014annualreport.pdf,2014,35,African Bank Ltd (O),1,1,59,report 2014 Annual Contents Board of director...,24953,249,270,496,230,173,111,172,107,538,499,226,148,302,939,0.009979,0.01082,0.019877,0.009217,0.006933,0.004448,0.006893,0.004288,0.021561,0.019998,0.009057,0.005931,0.012103,0.037631,-0.366391,-0.21831,-0.232975,23.726228,15.411503,20.238319,13.199735,22.296245,14.116434,29.79638,27.004461,11.140464,23.51,-1.48,27.2,7.34,32.1,14.75,89.0,24.1,24.0
9,113,Deutsche_Bank_Annual_Report_2018.pdf,2018,14,Deutsche Bank AG,0,1,448,Annual Report \n2018 Deutsche Bank Deutsche...,262835,2078,2389,8052,3267,4040,2251,3997,2168,9068,5532,2695,1928,4240,12635,0.007906,0.009089,0.030635,0.01243,0.015371,0.008564,0.015207,0.008249,0.034501,0.021047,0.010254,0.007335,0.016132,0.048072,-0.422741,-0.284375,-0.296675,21.42832,23.885777,18.631529,13.613708,21.430435,14.109941,27.033883,21.463367,11.210719,16.22,23.02,19.8,2.28,24.4,14.4,66.0,19.7,20.0


In [None]:
v_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,0.219152,0.35408,0.684174,0.408055,0.568585,0.741791,0.22226,0.229904,0.535209,0.40625,0.292684,0.463378,0.23298,0.344141,0.342674,0.348846,0.411006,0.356117,0.345569,0.465857,0.722016,0.375758,0.864226,0.71839,0.445592,0.500519,0.259818,0.876196,0.449295,0.53865,0.318511,0.351132,0.51259,0.607331,0.801137,0.332977,0.791872,0.422101,0.645615,0.783875,...,0.351004,0.205652,0.416691,0.646701,0.679212,0.272443,0.732497,0.47969,0.339191,0.31325,0.569802,0.19222,0.749151,0.698813,0.780825,0.57406,0.374179,0.479459,0.631605,0.852961,0.667949,0.622391,0.590484,0.548548,0.60788,0.452655,0.509833,0.320372,0.204127,0.476084,0.206292,0.189745,0.479,0.00524,0.772201,0.402365,0.720858,0.661222,0.371651,0.593418
1,0.156299,0.339694,0.705365,0.573063,0.57627,0.628396,0.346461,0.209017,0.199471,0.525605,0.216903,0.317838,0.207208,0.320695,0.333549,0.563707,0.615347,0.383805,0.278365,0.210757,0.764123,0.427318,0.777317,0.700662,0.333459,0.347957,0.214675,0.903805,0.531112,0.454888,0.304247,0.314058,0.382158,0.714595,0.863253,0.26252,0.872694,0.534593,0.853709,0.867744,...,0.20482,0.2832,0.335754,0.539962,0.781944,0.171208,0.773153,0.487858,0.226845,0.247531,0.543006,0.07133,0.695502,0.621179,0.595487,0.616104,0.377394,0.321857,0.717089,0.955796,0.884945,0.499846,0.6784,0.594407,0.563776,0.353679,0.46333,0.099308,0.15272,0.337527,0.512012,0.142004,0.475473,0.080977,0.735116,0.389873,0.714008,0.816865,0.557116,0.499451
2,0.159472,0.568805,0.831852,0.866241,0.553696,0.719708,0.148148,0.448538,0.230329,0.391416,0.216163,0.306274,0.278526,0.573422,0.39294,0.411317,0.531316,0.189797,0.17277,0.349894,0.693559,0.324053,0.952717,0.756181,0.684122,0.395752,0.616572,0.909785,0.288273,0.239256,0.596611,0.266318,0.138519,0.814783,0.373069,0.079271,0.552339,0.584065,0.52287,0.631211,...,0.385551,0.600129,0.535366,0.286958,0.79686,0.007183,0.708671,0.026491,0.276963,0.247074,0.534136,0.41412,0.868303,0.734091,0.753621,0.535806,0.39141,0.704435,0.766716,0.801792,0.844662,0.31382,0.181289,0.594722,0.689214,0.435185,0.840613,0.133078,0.163621,0.232796,0.660008,0.361893,0.49651,0.22255,0.594053,0.120402,0.503004,0.568528,0.470583,0.70115
3,0.152679,0.376569,0.712671,0.595082,0.545283,0.584727,0.245059,0.213031,0.401238,0.48979,0.26863,0.336761,0.21383,0.458381,0.321966,0.720426,0.835788,0.444377,0.406625,0.268266,0.747503,0.369674,0.769912,0.685304,0.252895,0.382082,0.168709,0.938452,0.542826,0.481767,0.223612,0.328911,0.557058,0.685736,0.920185,0.293279,0.868037,0.452301,0.845122,0.889321,...,0.190656,0.217559,0.327132,0.582507,0.690969,0.169485,0.784713,0.493908,0.210549,0.303662,0.57554,0.031096,0.712151,0.599257,0.46435,0.572283,0.372617,0.283257,0.675999,0.909799,0.954237,0.514509,0.636144,0.613878,0.537782,0.333793,0.42355,0.132224,0.167181,0.469562,0.430286,0.166238,0.458015,0.061539,0.775917,0.422273,0.758,0.863075,0.544605,0.551607
4,0.432074,0.203289,0.403265,0.278763,0.068988,0.868511,0.335556,0.329906,0.484375,0.74355,0.278268,0.469078,0.514495,0.255286,0.484512,0.780598,-0.020062,0.52477,0.172798,0.666881,0.442953,0.523841,0.72179,0.603312,0.301454,0.58064,0.451015,0.431031,0.120197,1.008983,0.188217,0.530989,0.422756,0.502659,0.614556,0.489224,0.796933,0.167143,0.6187,0.952359,...,0.338446,0.0945,-0.077286,0.928921,0.71351,0.355178,0.626648,0.960287,0.60705,1.002035,0.76705,0.185629,0.56114,0.492918,0.40729,0.397606,0.42575,0.417165,0.564475,0.889781,0.805848,0.835261,0.997625,0.887134,0.685575,0.078172,0.437851,0.851442,0.258505,0.923585,0.214569,0.028326,0.265571,0.228159,0.683311,0.684077,0.836461,0.927542,0.273359,0.419598


In [None]:
#Fit the current classifier on the training vectors and labels
#NOTE(review): earlier remarks here described LogisticRegression's n_jobs/max_iter; neither is passed to fit() in this cell
clf.fit(v_train,y_train) #in the saved run the fitted estimator printed below is an SVC

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [None]:
#Test: predict labels for the test vectors and score them against y_test with F1
pred_test=clf.predict(v_test) #alternative noted by the author: clf.predict_proba(t_test) - TODO confirm t_test is meant to be v_test
f1_test=f1_score(y_test,pred_test)

In [None]:
#std
f1_test.mean()

0.0

In [None]:
#5-times, 5-fold cv (n_jobs=2, random state=0,std)
f1_train.mean()

1.0

In [None]:
#5-fold cv (n_jobs=2, no random state, std)
f1_train.mean()

1.0

In [None]:
#5-fold cv (n_jobs=2, no random state, norm)
f1_train.mean()

0.7944343426531549

In [None]:
#5-times, 5-fold cv (n_jobs=2, no random state, norm)
f1_train.mean()

0.8055325024495843

In [None]:
#5-times, 5-fold cv (n_jobs=2, no random state)
f1_train.mean()

0.26911065949997476

In [None]:
#1-times, 5-fold cv (n_jobs=2)
f1_train.mean()

0.3979843053846849

In [None]:
#5-times, 5-fold cv (n_jobs=2)
f1_train.mean()

0.36230790027139664

In [None]:
#5-times, 5-fold cv (n_jobs=1)
f1_train.mean()

0.36230790027139664

In [None]:
#5-fold cv (n_jobs=2)
f1_train.mean()

0.5960245084069535

In [None]:
#5-fold cv (n_jobs=1)
f1_train.mean()

0.5960245084069535

In [None]:
x_train.head()

Unnamed: 0,docID,docName,year,coID,coName,risk,pageRatio,pages,text,wordCount,forward1,forward2,LMneg,LMpos,H6neg,H6pos,H8neg,H8pos,uncert,causal,causalM,causalM50,perf,strat,forward1_scaled,forward2_scaled,LMneg_scaled,LMpos_scaled,H6neg_scaled,H6pos_scaled,H8neg_scaled,H8pos_scaled,uncert_scaled,causal_scaled,causalM_scaled,causalM50_scaled,perf_scaled,strat_scaled,LMtone,H6tone,H8tone,r_gf,r_f,r_fk,r_dc,r_ari,r_cl,r_lw,r_sm,r_sp,ts_gf,ts_f,ts_fk,ts_dc,ts_ari,ts_cl,ts_lw,ts_sm,ts_ts
0,61,"Bidvest+Bank+Annual+Report,+year+ended+30+June...",2017,20,Bidvest Bank Ltd,0,2,59,Bidvest Bank has planted many seeds over th...,41217,188,222,877,868,477,571,468,544,1493,495,292,217,365,2541,0.004561,0.005386,0.021278,0.021059,0.011573,0.013854,0.011355,0.013198,0.036223,0.01201,0.007084,0.005265,0.008856,0.061649,-0.005158,0.089695,0.075099,19.500847,30.966501,17.280738,14.230697,19.755907,12.934926,25.297174,17.353724,11.163409,20.29,14.19,23.2,3.13,28.7,14.52,8.833333,20.8,21.0
1,8,1517467421-ABILIAR2012completelowres.pdf,2012,35,African Bank Ltd (O),1,1,360,>ABIL in perspective African Bank Investments...,148280,989,1138,2548,2409,1166,1567,1154,1528,4475,2310,1224,921,1591,7692,0.00667,0.007675,0.017184,0.016246,0.007864,0.010568,0.007783,0.010305,0.030179,0.015579,0.008255,0.006211,0.01073,0.051875,-0.028041,0.146725,0.139448,21.979161,21.876332,18.729265,13.3374,20.335909,13.204049,26.612999,20.203746,10.948028,22.12,0.46,26.4,3.23,30.4,13.88,18.75,22.0,22.0
2,357,VBS-2015-Annual-Report.pdf,2015,1,VBS Mutual Bank,1,1,56,People with a purpose. “People with a Purpos...,19318,118,134,396,236,172,116,170,116,510,284,108,74,252,709,0.006108,0.006937,0.020499,0.012217,0.008904,0.006005,0.0088,0.006005,0.0264,0.014701,0.005591,0.003831,0.013045,0.036702,-0.253165,-0.194444,-0.188811,22.335384,24.774939,18.995163,13.780807,21.108055,12.47382,28.080321,16.404323,11.415405,19.95,-76.44,35.3,7.05,26.2,13.19,19.0,20.6,13.0
3,9,2009 Full report - PDF 7MB - African Bank.pdf,2009,35,African Bank Ltd (O),1,1,208,Annual Report 2009 Our vision is to enable \...,78638,431,493,1514,1188,669,681,660,667,2298,1214,573,414,1288,3057,0.005481,0.006269,0.019253,0.015107,0.008507,0.00866,0.008393,0.008482,0.029223,0.015438,0.007287,0.005265,0.016379,0.038874,-0.120651,0.008889,0.005275,22.626269,23.106608,19.648376,14.259305,21.809318,12.28197,29.32429,19.620378,11.996173,24.43,3.23,27.4,3.57,32.5,12.96,21.0,22.1,13.0
4,105,Deutsche_Bank_Annual_Report_2010.pdf,2010,14,Deutsche Bank AG,0,1,492,Deutsche Bank Annual Review 2010\nDelivering ...,208200,1307,1599,4893,2930,2380,2087,2352,2028,6897,3933,1934,1252,3394,9876,0.006278,0.00768,0.023501,0.014073,0.011431,0.010024,0.011297,0.009741,0.033127,0.01889,0.009289,0.006013,0.016302,0.047435,-0.250927,-0.065592,-0.073973,19.652102,26.895525,17.395007,13.350417,20.067945,14.463211,24.504868,19.587362,10.697824,18.66,0.22,24.5,2.69,27.5,15.1,12.333333,20.8,25.0


In [None]:
v_train_df.shape

(408, 302)

In [None]:
v_train_df.head()

Unnamed: 0,docID,docName,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,61,"Bidvest+Bank+Annual+Report,+year+ended+30+June...",0.191777,0.161816,0.144421,0.132216,0.166842,0.233829,0.196052,0.190809,0.121299,0.227302,0.203542,0.167896,0.252534,0.143162,0.194686,0.203265,0.190982,0.211462,0.195348,0.1275,0.168901,0.243312,0.151238,0.152022,0.142041,0.206186,0.169021,0.160461,0.231797,0.147674,0.214293,0.125821,0.220107,0.210299,0.156557,0.149751,0.139114,0.154882,...,0.228837,0.202706,0.188161,0.201624,0.143296,0.296675,0.272596,0.195369,0.224696,0.127119,0.217826,0.239603,0.284792,0.216601,0.155582,0.18148,0.149991,0.18099,0.143638,0.232813,0.185281,0.16119,0.156808,0.173983,0.179388,0.176798,0.18762,0.330431,0.208312,0.161082,0.186878,0.205552,0.228185,0.203008,0.199824,0.162008,0.200258,0.280399,0.142442,0.212514
1,8,1517467421-ABILIAR2012completelowres.pdf,0.191239,0.162572,0.144809,0.132045,0.164991,0.237354,0.194326,0.19535,0.121275,0.224375,0.203834,0.16831,0.253955,0.14308,0.191781,0.206774,0.190971,0.214363,0.196834,0.126838,0.168963,0.251025,0.149943,0.144868,0.143471,0.209541,0.169246,0.161339,0.231595,0.145298,0.217629,0.125653,0.21904,0.21031,0.15139,0.147987,0.13811,0.156699,...,0.228965,0.198955,0.181276,0.199865,0.143902,0.301025,0.271191,0.198345,0.225908,0.12692,0.218967,0.243505,0.285161,0.214774,0.155499,0.18203,0.148522,0.178499,0.145982,0.231386,0.188477,0.159418,0.156813,0.169909,0.181269,0.175787,0.186829,0.332428,0.20776,0.163545,0.184686,0.207373,0.228012,0.200844,0.202024,0.161048,0.201456,0.282423,0.140054,0.213528
2,357,VBS-2015-Annual-Report.pdf,0.190485,0.159764,0.141394,0.130219,0.16248,0.233673,0.19654,0.193405,0.122075,0.222628,0.203943,0.167958,0.254025,0.140755,0.187685,0.205689,0.187575,0.209416,0.195412,0.126434,0.168469,0.245732,0.150006,0.14168,0.142457,0.207432,0.166948,0.157431,0.229592,0.14423,0.213189,0.125758,0.217393,0.208268,0.151744,0.145809,0.137564,0.152403,...,0.228506,0.196807,0.179757,0.197284,0.14225,0.29192,0.269615,0.191736,0.221272,0.126849,0.213702,0.242102,0.283614,0.212701,0.152521,0.179952,0.147075,0.176719,0.144289,0.226975,0.183997,0.154134,0.15632,0.168142,0.175599,0.171371,0.184843,0.320669,0.20731,0.158572,0.181058,0.203409,0.226922,0.200327,0.199609,0.15975,0.201444,0.275655,0.140733,0.207932
3,9,2009 Full report - PDF 7MB - African Bank.pdf,0.190341,0.160885,0.143307,0.131487,0.164779,0.23568,0.194731,0.193003,0.120737,0.22378,0.204683,0.169114,0.252623,0.142501,0.189925,0.207455,0.189336,0.214472,0.196552,0.126018,0.168535,0.247818,0.150454,0.146372,0.14277,0.207642,0.169227,0.159023,0.231848,0.143711,0.215434,0.125136,0.219062,0.210043,0.152834,0.146645,0.137945,0.156018,...,0.229502,0.199161,0.180218,0.197688,0.143975,0.29876,0.27141,0.196879,0.224106,0.126227,0.218425,0.24158,0.28371,0.213636,0.154983,0.182248,0.146818,0.177767,0.144259,0.230141,0.185838,0.157296,0.156463,0.168208,0.179295,0.175822,0.185594,0.331419,0.207777,0.162301,0.183069,0.203839,0.228435,0.200506,0.201715,0.160497,0.202217,0.279966,0.140311,0.213056
4,105,Deutsche_Bank_Annual_Report_2010.pdf,0.189991,0.162528,0.14613,0.129473,0.163746,0.235346,0.194781,0.192743,0.121191,0.222235,0.203043,0.167373,0.251169,0.142696,0.191387,0.206928,0.190112,0.21398,0.197384,0.126323,0.167319,0.247992,0.14883,0.144205,0.142598,0.205548,0.167716,0.160464,0.230551,0.144382,0.216359,0.12591,0.21707,0.21059,0.150987,0.145894,0.13845,0.155953,...,0.229505,0.198302,0.180521,0.198075,0.143784,0.298408,0.271208,0.19632,0.2237,0.125642,0.21911,0.24192,0.286225,0.211527,0.153963,0.183133,0.147883,0.176989,0.144869,0.230738,0.184332,0.157187,0.154989,0.168679,0.178624,0.175481,0.185014,0.331681,0.208405,0.162799,0.18438,0.204585,0.227573,0.200315,0.201259,0.160676,0.201214,0.279002,0.140543,0.212114


In [None]:
#Keep only the vector columns: drop the first two columns of v_train_df
#(docID and docName in the frame displayed above), leaving columns 0..299
v_train=v_train_df.iloc[:,2:]

In [None]:
v_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,0.191777,0.161816,0.144421,0.132216,0.166842,0.233829,0.196052,0.190809,0.121299,0.227302,0.203542,0.167896,0.252534,0.143162,0.194686,0.203265,0.190982,0.211462,0.195348,0.1275,0.168901,0.243312,0.151238,0.152022,0.142041,0.206186,0.169021,0.160461,0.231797,0.147674,0.214293,0.125821,0.220107,0.210299,0.156557,0.149751,0.139114,0.154882,0.160397,0.161308,...,0.228837,0.202706,0.188161,0.201624,0.143296,0.296675,0.272596,0.195369,0.224696,0.127119,0.217826,0.239603,0.284792,0.216601,0.155582,0.18148,0.149991,0.18099,0.143638,0.232813,0.185281,0.16119,0.156808,0.173983,0.179388,0.176798,0.18762,0.330431,0.208312,0.161082,0.186878,0.205552,0.228185,0.203008,0.199824,0.162008,0.200258,0.280399,0.142442,0.212514
1,0.191239,0.162572,0.144809,0.132045,0.164991,0.237354,0.194326,0.19535,0.121275,0.224375,0.203834,0.16831,0.253955,0.14308,0.191781,0.206774,0.190971,0.214363,0.196834,0.126838,0.168963,0.251025,0.149943,0.144868,0.143471,0.209541,0.169246,0.161339,0.231595,0.145298,0.217629,0.125653,0.21904,0.21031,0.15139,0.147987,0.13811,0.156699,0.160591,0.158421,...,0.228965,0.198955,0.181276,0.199865,0.143902,0.301025,0.271191,0.198345,0.225908,0.12692,0.218967,0.243505,0.285161,0.214774,0.155499,0.18203,0.148522,0.178499,0.145982,0.231386,0.188477,0.159418,0.156813,0.169909,0.181269,0.175787,0.186829,0.332428,0.20776,0.163545,0.184686,0.207373,0.228012,0.200844,0.202024,0.161048,0.201456,0.282423,0.140054,0.213528
2,0.190485,0.159764,0.141394,0.130219,0.16248,0.233673,0.19654,0.193405,0.122075,0.222628,0.203943,0.167958,0.254025,0.140755,0.187685,0.205689,0.187575,0.209416,0.195412,0.126434,0.168469,0.245732,0.150006,0.14168,0.142457,0.207432,0.166948,0.157431,0.229592,0.14423,0.213189,0.125758,0.217393,0.208268,0.151744,0.145809,0.137564,0.152403,0.160185,0.157077,...,0.228506,0.196807,0.179757,0.197284,0.14225,0.29192,0.269615,0.191736,0.221272,0.126849,0.213702,0.242102,0.283614,0.212701,0.152521,0.179952,0.147075,0.176719,0.144289,0.226975,0.183997,0.154134,0.15632,0.168142,0.175599,0.171371,0.184843,0.320669,0.20731,0.158572,0.181058,0.203409,0.226922,0.200327,0.199609,0.15975,0.201444,0.275655,0.140733,0.207932
3,0.190341,0.160885,0.143307,0.131487,0.164779,0.23568,0.194731,0.193003,0.120737,0.22378,0.204683,0.169114,0.252623,0.142501,0.189925,0.207455,0.189336,0.214472,0.196552,0.126018,0.168535,0.247818,0.150454,0.146372,0.14277,0.207642,0.169227,0.159023,0.231848,0.143711,0.215434,0.125136,0.219062,0.210043,0.152834,0.146645,0.137945,0.156018,0.160741,0.156931,...,0.229502,0.199161,0.180218,0.197688,0.143975,0.29876,0.27141,0.196879,0.224106,0.126227,0.218425,0.24158,0.28371,0.213636,0.154983,0.182248,0.146818,0.177767,0.144259,0.230141,0.185838,0.157296,0.156463,0.168208,0.179295,0.175822,0.185594,0.331419,0.207777,0.162301,0.183069,0.203839,0.228435,0.200506,0.201715,0.160497,0.202217,0.279966,0.140311,0.213056
4,0.189991,0.162528,0.14613,0.129473,0.163746,0.235346,0.194781,0.192743,0.121191,0.222235,0.203043,0.167373,0.251169,0.142696,0.191387,0.206928,0.190112,0.21398,0.197384,0.126323,0.167319,0.247992,0.14883,0.144205,0.142598,0.205548,0.167716,0.160464,0.230551,0.144382,0.216359,0.12591,0.21707,0.21059,0.150987,0.145894,0.13845,0.155953,0.159355,0.15608,...,0.229505,0.198302,0.180521,0.198075,0.143784,0.298408,0.271208,0.19632,0.2237,0.125642,0.21911,0.24192,0.286225,0.211527,0.153963,0.183133,0.147883,0.176989,0.144869,0.230738,0.184332,0.157187,0.154989,0.168679,0.178624,0.175481,0.185014,0.331681,0.208405,0.162799,0.18438,0.204585,0.227573,0.200315,0.201259,0.160676,0.201214,0.279002,0.140543,0.212114


In [None]:
x_test.head()

Unnamed: 0,docID,docName,year,coID,coName,risk,pageRatio,pages,text,wordCount,forward1,forward2,LMneg,LMpos,H6neg,H6pos,H8neg,H8pos,uncert,causal,causalM,causalM50,perf,strat,forward1_scaled,forward2_scaled,LMneg_scaled,LMpos_scaled,H6neg_scaled,H6pos_scaled,H8neg_scaled,H8pos_scaled,uncert_scaled,causal_scaled,causalM_scaled,causalM50_scaled,perf_scaled,strat_scaled,LMtone,H6tone,H8tone,r_gf,r_f,r_fk,r_dc,r_ari,r_cl,r_lw,r_sm,r_sp,ts_gf,ts_f,ts_fk,ts_dc,ts_ari,ts_cl,ts_lw,ts_sm,ts_ts
0,234,JP Morgan Chase and Co - annualreport-2018.pdf,2018,16,JP Morgan Chase and Co,0,1,336,ANNUAL REPORT 2018 Financial Highlights As o...,191552,1594,1851,5096,3517,2378,2560,2334,2511,7367,4374,2014,1393,3100,8513,0.008322,0.009663,0.026604,0.018361,0.012414,0.013365,0.012185,0.013109,0.03846,0.022835,0.010514,0.007272,0.016184,0.044442,-0.183328,0.036857,0.036533,20.484894,30.409332,16.923379,12.923495,19.511867,13.722717,24.058009,19.719031,10.450005,18.53,17.44,22.0,2.64,27.6,14.87,60.0,20.3,22.0
1,310,1587728295-SBSA2019AnnualReport.pdf,2019,11,Standard Bank of S A Ltd,0,1,220,The Standard Bank of South Africa\nANNUAL REP...,105926,565,683,2175,1268,835,733,828,720,2835,1824,747,524,1525,3861,0.005334,0.006448,0.020533,0.011971,0.007883,0.00692,0.007817,0.006797,0.026764,0.01722,0.007052,0.004947,0.014397,0.03645,-0.263433,-0.065051,-0.069767,23.04556,26.918005,19.337492,14.671988,21.902405,11.836795,29.364855,21.224474,12.274554,25.32,8.85,27.4,3.5,33.8,12.72,79.0,22.4,13.0
2,165,HSBC - 190219-annual-report-and-accounts-2018.pdf,2018,15,HSBC Bank Plc - Johannesburg Branch,0,1,322,HSBC Holdings plc Annual Report and Accounts ...,232672,1784,2040,5972,3604,2943,2611,2924,2514,7748,4339,2082,1507,2873,11695,0.007667,0.008768,0.025667,0.01549,0.012649,0.011222,0.012567,0.010805,0.0333,0.018649,0.008948,0.006477,0.012348,0.050264,-0.247285,-0.059777,-0.075395,22.026397,29.992855,18.168042,13.998006,21.480299,13.190182,27.557775,16.847623,11.538784,20.01,13.48,23.5,2.78,28.6,13.83,16.75,20.7,21.0
3,177,ICICI Bank annual-report-fy2018 (2017).pdf,2018,32,ICICI Bank Ltd,0,1,300,Partnering a\nDYnaMiC\ninDia AnnuAl RePoRt 20...,135580,646,712,2428,2142,1152,1282,1146,1215,3446,1845,759,541,1345,4777,0.004765,0.005252,0.017908,0.015799,0.008497,0.009456,0.008453,0.008961,0.025417,0.013608,0.005598,0.00399,0.00992,0.035234,-0.062582,0.05341,0.029225,17.314725,39.957036,14.347024,12.998978,16.823238,13.457871,20.29998,25.668521,9.832169,19.73,6.04,24.3,2.81,29.5,15.28,73.0,20.7,21.0
4,250,NASDAQ_MBWM_2019.pdf,2019,22,Mercantile Bank Ltd,0,1,148,2019 ANNUAL REPORT MISSION STATEMENT mercbank...,62406,673,752,1362,1106,503,805,493,796,2221,1591,611,393,993,2232,0.010784,0.01205,0.021825,0.017723,0.00806,0.012899,0.0079,0.012755,0.03559,0.025494,0.009791,0.006297,0.015912,0.035766,-0.103728,0.230887,0.235066,22.121562,24.70533,18.60225,12.970222,22.264555,14.936493,27.684991,17.429995,11.187792,62.6,-169.65,75.2,8.26,83.2,16.33,12.5,36.4,13.0


In [None]:
v_test_df.shape

(48, 302)

In [None]:
v_test_df.head()

Unnamed: 0,docID,docName,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,260,2019 Nedbank Limited Consolidated Annual Finan...,0.189233,0.160553,0.14075,0.127744,0.16079,0.231156,0.195433,0.192758,0.122052,0.222613,0.203063,0.167416,0.251042,0.139897,0.184509,0.20366,0.178858,0.207383,0.192403,0.124811,0.166196,0.243133,0.149098,0.142551,0.141445,0.205325,0.165879,0.15641,0.226751,0.138611,0.208759,0.122279,0.214914,0.207749,0.150998,0.146041,0.135225,0.149907,...,0.225381,0.194683,0.181177,0.194937,0.140602,0.287013,0.268699,0.188492,0.216694,0.126144,0.212357,0.24071,0.281145,0.209576,0.152087,0.179332,0.145914,0.176391,0.14294,0.219768,0.182843,0.149797,0.153776,0.166186,0.173077,0.164155,0.183409,0.316639,0.198147,0.158035,0.176996,0.198932,0.227148,0.200922,0.197576,0.160763,0.19907,0.271785,0.13786,0.200167
1,62,"Bidvest+Bank+Annual+Report,+year+ended+30+June...",0.191028,0.161305,0.143904,0.131768,0.16633,0.233531,0.194874,0.191469,0.121484,0.226579,0.203202,0.167535,0.252362,0.142343,0.19265,0.202776,0.19059,0.210703,0.194532,0.126015,0.168547,0.242677,0.151232,0.149021,0.140999,0.206607,0.168171,0.160876,0.230429,0.14689,0.214323,0.125419,0.219385,0.210169,0.156212,0.148357,0.137713,0.15437,...,0.228522,0.202654,0.186692,0.200523,0.142908,0.295372,0.27076,0.194811,0.223667,0.126789,0.216614,0.238754,0.282564,0.216656,0.15476,0.181406,0.149333,0.180143,0.143581,0.232313,0.185354,0.16045,0.156889,0.172519,0.178411,0.175643,0.187656,0.327966,0.207068,0.160071,0.18659,0.206425,0.227517,0.20181,0.199419,0.161607,0.199316,0.280159,0.141977,0.212451
2,37,Bank of Baroda Annualreport2018-19.pdf,0.190392,0.161771,0.141825,0.132243,0.16734,0.237252,0.192064,0.1918,0.122625,0.224887,0.206022,0.169639,0.252726,0.144816,0.1961,0.207495,0.194478,0.217485,0.197398,0.126584,0.164579,0.248085,0.151193,0.1448,0.140859,0.206966,0.171886,0.160327,0.229782,0.147229,0.216045,0.126516,0.218023,0.212205,0.15116,0.147552,0.135098,0.155155,...,0.232656,0.201784,0.182783,0.202268,0.144714,0.302228,0.271792,0.199583,0.226993,0.12687,0.221943,0.242478,0.288794,0.216114,0.155111,0.183609,0.149416,0.179047,0.144584,0.236109,0.185643,0.163289,0.157874,0.170109,0.179802,0.178804,0.186281,0.335902,0.213746,0.160371,0.185425,0.204654,0.231148,0.201017,0.201699,0.16036,0.201813,0.283721,0.140957,0.217333
3,36,Bank of Baroda Annualreport2017-18.pdf,0.191006,0.162062,0.141864,0.131883,0.167369,0.23708,0.193078,0.192356,0.12209,0.225279,0.206109,0.169704,0.252611,0.144967,0.196231,0.206391,0.19435,0.216997,0.197396,0.126422,0.165148,0.248124,0.150888,0.146172,0.14145,0.207352,0.171673,0.159851,0.230205,0.147207,0.215951,0.126385,0.218039,0.211832,0.151195,0.147084,0.136118,0.155119,...,0.232154,0.201327,0.182782,0.202406,0.144897,0.301943,0.272302,0.19975,0.227339,0.126948,0.221438,0.242879,0.28947,0.216036,0.154798,0.183018,0.14953,0.178765,0.14459,0.235972,0.185895,0.163162,0.157321,0.170794,0.179808,0.178858,0.185711,0.335619,0.213844,0.160677,0.185937,0.204685,0.230998,0.201802,0.201952,0.160547,0.202447,0.283942,0.140776,0.217386
4,127,firstrand-annual-report 2019.pdf,0.189662,0.162745,0.14309,0.126345,0.164656,0.232242,0.196038,0.192741,0.121531,0.224337,0.204242,0.167768,0.249478,0.143169,0.190419,0.20309,0.184787,0.210668,0.194485,0.124934,0.167869,0.244648,0.149888,0.145855,0.1425,0.20522,0.167333,0.157825,0.226987,0.142314,0.211701,0.124719,0.214262,0.209921,0.152945,0.145662,0.136555,0.151064,...,0.228045,0.199627,0.182766,0.197384,0.142822,0.292372,0.268413,0.193261,0.219412,0.125507,0.216024,0.241259,0.285425,0.21098,0.152252,0.180893,0.146572,0.176608,0.144067,0.225599,0.183165,0.156064,0.155464,0.171006,0.175951,0.171557,0.183003,0.323535,0.203835,0.159013,0.182614,0.201917,0.228005,0.202448,0.200078,0.160557,0.202461,0.27524,0.140996,0.207561


In [None]:
#Re-order the vector rows so they follow x_test: a left merge on docID keeps one
#output row per x_test row, in x_test's key order, so the feature matrix derived
#from this frame lines up row-for-row with x_test (and presumably y_test).
#validate='many_to_one' makes pandas raise MergeError if a docID occurs more than
#once in v_test_df, instead of silently duplicating test rows and shifting the
#following rows out of alignment with the labels.
v_test_df=x_test[['docID']].merge(v_test_df,how='left',on='docID',validate='many_to_one')

In [None]:
#Re-check dimensions after the merge: a left merge on unique docIDs should leave the shape unchanged.
v_test_df.shape

(48, 302)

In [None]:
#Preview again: the row order now follows x_test instead of the frame's original order.
v_test_df.head()

Unnamed: 0,docID,docName,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,234,JP Morgan Chase and Co - annualreport-2018.pdf,0.191096,0.161322,0.143267,0.128438,0.163227,0.234918,0.193991,0.193451,0.1217,0.223419,0.203377,0.168357,0.25214,0.141544,0.189672,0.204029,0.184271,0.212695,0.197598,0.125477,0.165373,0.247101,0.148708,0.143937,0.142363,0.205616,0.168541,0.158155,0.228488,0.139668,0.214095,0.122449,0.214292,0.209419,0.148408,0.14636,0.136192,0.153237,...,0.228218,0.195404,0.182157,0.198277,0.142627,0.29288,0.269809,0.194706,0.220687,0.125819,0.216991,0.242003,0.284727,0.210754,0.153296,0.181708,0.148119,0.177153,0.144393,0.224415,0.184855,0.154676,0.15393,0.16703,0.176426,0.170846,0.182664,0.324576,0.204761,0.160917,0.179853,0.199726,0.229409,0.20164,0.199138,0.161852,0.200551,0.276393,0.139705,0.207788
1,310,1587728295-SBSA2019AnnualReport.pdf,0.189031,0.161584,0.142531,0.128193,0.161347,0.232009,0.194956,0.193997,0.120542,0.221858,0.204409,0.168147,0.251468,0.139115,0.184999,0.204505,0.181556,0.208161,0.194285,0.12529,0.167141,0.243237,0.149546,0.142143,0.142496,0.206924,0.166961,0.15824,0.227388,0.141236,0.208822,0.123637,0.216018,0.209527,0.150241,0.145082,0.136085,0.149104,...,0.226179,0.194286,0.17916,0.195393,0.141829,0.287177,0.266901,0.189632,0.218324,0.126235,0.211334,0.241994,0.28089,0.210257,0.152017,0.180336,0.147056,0.176218,0.142761,0.221117,0.184557,0.150469,0.154457,0.166146,0.174566,0.167457,0.183026,0.314428,0.202703,0.159233,0.179221,0.201854,0.227232,0.200774,0.198518,0.1604,0.200909,0.272673,0.138949,0.202532
2,165,HSBC - 190219-annual-report-and-accounts-2018.pdf,0.189407,0.16315,0.142553,0.12907,0.16365,0.233428,0.193398,0.194353,0.122341,0.222945,0.203491,0.167939,0.251542,0.141308,0.191635,0.207214,0.187191,0.210943,0.195531,0.126773,0.165746,0.246915,0.149229,0.141573,0.14244,0.206014,0.168795,0.159895,0.228245,0.143145,0.211155,0.124473,0.215216,0.21103,0.149174,0.147365,0.135342,0.150206,...,0.228585,0.195932,0.182049,0.199193,0.143443,0.294663,0.269885,0.192683,0.221516,0.126355,0.216203,0.244269,0.286512,0.211479,0.153096,0.182049,0.148358,0.177741,0.144327,0.225284,0.183957,0.15464,0.154428,0.168397,0.175585,0.169296,0.184226,0.322837,0.207258,0.158884,0.181468,0.202913,0.229292,0.20097,0.200142,0.160671,0.199557,0.276459,0.140053,0.205891
3,177,ICICI Bank annual-report-fy2018 (2017).pdf,0.188064,0.161658,0.139949,0.130881,0.164668,0.233431,0.193269,0.1933,0.123067,0.223717,0.204142,0.168615,0.251249,0.142499,0.189034,0.205234,0.1883,0.211038,0.19571,0.12457,0.166605,0.244596,0.150266,0.141176,0.141917,0.207184,0.168759,0.158794,0.227814,0.144569,0.210098,0.124848,0.21624,0.209378,0.150563,0.144254,0.134209,0.150101,...,0.228777,0.19832,0.17803,0.197543,0.143026,0.294737,0.270023,0.193538,0.222093,0.126354,0.215807,0.242351,0.284296,0.211926,0.153187,0.181747,0.146752,0.176099,0.143995,0.226253,0.183691,0.155501,0.154923,0.16805,0.1761,0.170339,0.182706,0.322835,0.207045,0.157639,0.180877,0.203359,0.228865,0.201094,0.200329,0.159709,0.200217,0.27823,0.139151,0.207546
4,250,NASDAQ_MBWM_2019.pdf,0.189961,0.160049,0.142307,0.12863,0.161355,0.234571,0.196472,0.191813,0.122662,0.222245,0.202816,0.167631,0.252119,0.141844,0.187534,0.201451,0.183859,0.211592,0.19811,0.124226,0.166105,0.245141,0.148911,0.144237,0.142037,0.20443,0.166445,0.157724,0.230831,0.139253,0.213972,0.121647,0.216697,0.207287,0.15155,0.14436,0.137125,0.153745,...,0.226974,0.196165,0.178442,0.196463,0.140906,0.29238,0.271592,0.194851,0.220666,0.127294,0.21704,0.239725,0.283998,0.208682,0.153087,0.180273,0.147912,0.176148,0.144786,0.225524,0.182149,0.152178,0.153145,0.165845,0.176354,0.172478,0.182422,0.326603,0.203263,0.160375,0.179965,0.198327,0.228237,0.199487,0.19815,0.162527,0.200186,0.276295,0.138696,0.207735


In [None]:
#Keep only the numbered vector columns as the test feature matrix. The two
#identifier columns are dropped by name rather than by position (iloc[:,2:]),
#so the result no longer depends on docID/docName being the first two columns.
v_test=v_test_df.drop(columns=['docID','docName'])

In [None]:
#Preview the feature matrix: only the numbered vector columns remain.
v_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,0.191096,0.161322,0.143267,0.128438,0.163227,0.234918,0.193991,0.193451,0.1217,0.223419,0.203377,0.168357,0.25214,0.141544,0.189672,0.204029,0.184271,0.212695,0.197598,0.125477,0.165373,0.247101,0.148708,0.143937,0.142363,0.205616,0.168541,0.158155,0.228488,0.139668,0.214095,0.122449,0.214292,0.209419,0.148408,0.14636,0.136192,0.153237,0.158303,0.156037,...,0.228218,0.195404,0.182157,0.198277,0.142627,0.29288,0.269809,0.194706,0.220687,0.125819,0.216991,0.242003,0.284727,0.210754,0.153296,0.181708,0.148119,0.177153,0.144393,0.224415,0.184855,0.154676,0.15393,0.16703,0.176426,0.170846,0.182664,0.324576,0.204761,0.160917,0.179853,0.199726,0.229409,0.20164,0.199138,0.161852,0.200551,0.276393,0.139705,0.207788
1,0.189031,0.161584,0.142531,0.128193,0.161347,0.232009,0.194956,0.193997,0.120542,0.221858,0.204409,0.168147,0.251468,0.139115,0.184999,0.204505,0.181556,0.208161,0.194285,0.12529,0.167141,0.243237,0.149546,0.142143,0.142496,0.206924,0.166961,0.15824,0.227388,0.141236,0.208822,0.123637,0.216018,0.209527,0.150241,0.145082,0.136085,0.149104,0.16008,0.154698,...,0.226179,0.194286,0.17916,0.195393,0.141829,0.287177,0.266901,0.189632,0.218324,0.126235,0.211334,0.241994,0.28089,0.210257,0.152017,0.180336,0.147056,0.176218,0.142761,0.221117,0.184557,0.150469,0.154457,0.166146,0.174566,0.167457,0.183026,0.314428,0.202703,0.159233,0.179221,0.201854,0.227232,0.200774,0.198518,0.1604,0.200909,0.272673,0.138949,0.202532
2,0.189407,0.16315,0.142553,0.12907,0.16365,0.233428,0.193398,0.194353,0.122341,0.222945,0.203491,0.167939,0.251542,0.141308,0.191635,0.207214,0.187191,0.210943,0.195531,0.126773,0.165746,0.246915,0.149229,0.141573,0.14244,0.206014,0.168795,0.159895,0.228245,0.143145,0.211155,0.124473,0.215216,0.21103,0.149174,0.147365,0.135342,0.150206,0.160066,0.156762,...,0.228585,0.195932,0.182049,0.199193,0.143443,0.294663,0.269885,0.192683,0.221516,0.126355,0.216203,0.244269,0.286512,0.211479,0.153096,0.182049,0.148358,0.177741,0.144327,0.225284,0.183957,0.15464,0.154428,0.168397,0.175585,0.169296,0.184226,0.322837,0.207258,0.158884,0.181468,0.202913,0.229292,0.20097,0.200142,0.160671,0.199557,0.276459,0.140053,0.205891
3,0.188064,0.161658,0.139949,0.130881,0.164668,0.233431,0.193269,0.1933,0.123067,0.223717,0.204142,0.168615,0.251249,0.142499,0.189034,0.205234,0.1883,0.211038,0.19571,0.12457,0.166605,0.244596,0.150266,0.141176,0.141917,0.207184,0.168759,0.158794,0.227814,0.144569,0.210098,0.124848,0.21624,0.209378,0.150563,0.144254,0.134209,0.150101,0.160477,0.156886,...,0.228777,0.19832,0.17803,0.197543,0.143026,0.294737,0.270023,0.193538,0.222093,0.126354,0.215807,0.242351,0.284296,0.211926,0.153187,0.181747,0.146752,0.176099,0.143995,0.226253,0.183691,0.155501,0.154923,0.16805,0.1761,0.170339,0.182706,0.322835,0.207045,0.157639,0.180877,0.203359,0.228865,0.201094,0.200329,0.159709,0.200217,0.27823,0.139151,0.207546
4,0.189961,0.160049,0.142307,0.12863,0.161355,0.234571,0.196472,0.191813,0.122662,0.222245,0.202816,0.167631,0.252119,0.141844,0.187534,0.201451,0.183859,0.211592,0.19811,0.124226,0.166105,0.245141,0.148911,0.144237,0.142037,0.20443,0.166445,0.157724,0.230831,0.139253,0.213972,0.121647,0.216697,0.207287,0.15155,0.14436,0.137125,0.153745,0.158463,0.15588,...,0.226974,0.196165,0.178442,0.196463,0.140906,0.29238,0.271592,0.194851,0.220666,0.127294,0.21704,0.239725,0.283998,0.208682,0.153087,0.180273,0.147912,0.176148,0.144786,0.225524,0.182149,0.152178,0.153145,0.165845,0.176354,0.172478,0.182422,0.326603,0.203263,0.160375,0.179965,0.198327,0.228237,0.199487,0.19815,0.162527,0.200186,0.276295,0.138696,0.207735


### 7.3.1 Current

In [None]:
#Run configuration: all classifiers except SVMa, feature set 'x', 'norm' transform,
#'wv_sg_mean' document vectors, Min_word_count 4-8 and Embedding_size 200-500
#(20 combinations per classifier, 11 iterations in the log below).
#The saved log shows this is a multi-hour run (5.82h elapsed after iteration 9 of 11).
#NOTE(review): the word-count/embedding grid is not passed as an argument, so funcLoop
#presumably reads it from notebook globals - confirm they match this configuration
#before re-running.

funcLoop(docToken_df,docTokenPS_df,docTokenLS_df,docTokenSS_df,docTokenWL_df,resample,clfs,features,transforms,f1_df,x_test,y_test)

writeCSV True
writeF1 True
Iteration 1 of 11 | Training 1 of 20 | 22/11/2020 23:38:29
50:50,LR,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.9719 (+/- 0.0416)|Test F1: 0.5000|2s
Iteration 1 of 11 | Training 2 of 20 | 22/11/2020 23:38:31
50:50,LR,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9769 (+/- 0.0324)|Test F1: 0.0000|1s
Iteration 1 of 11 | Training 3 of 20 | 22/11/2020 23:38:32
50:50,LR,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.9855 (+/- 0.0278)|Test F1: 0.5000|1s
Iteration 1 of 11 | Training 4 of 20 | 22/11/2020 23:38:34
50:50,LR,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.9823 (+/- 0.0296)|Test F1: 0.0000|1s
Iteration 1 of 11 | Training 5 of 20 | 22/11/2020 23:38:36
50:50,LR,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.9537 (+/- 0.0451)|Test F1: 0.5000|1s
Iteration 1 of 11 | Training 6 of 20 | 22/11/2020 23:38:37
50:50,LR,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:



50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9409 (+/- 0.4041)|Test F1: 0.0000|18s
Iteration 3 of 11 | Training 3 of 20 | 22/11/2020 23:39:52
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.9934 (+/- 0.0259)|Test F1: 0.6667|18s
Iteration 3 of 11 | Training 4 of 20 | 22/11/2020 23:40:10
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.9946 (+/- 0.0164)|Test F1: 0.0000|19s
Iteration 3 of 11 | Training 5 of 20 | 22/11/2020 23:40:30
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.9949 (+/- 0.0218)|Test F1: 0.0000|18s
Iteration 3 of 11 | Training 6 of 20 | 22/11/2020 23:40:49
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.9953 (+/- 0.0213)|Test F1: 0.0000|18s
Iteration 3 of 11 | Training 7 of 20 | 22/11/2020 23:41:07
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.9831 (+/- 0.1321)|Test F1: 0.5000|19s
Iteration 3 of 



50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.9969 (+/- 0.0134)|Test F1: 0.0000|17s
Iteration 3 of 11 | Training 10 of 20 | 22/11/2020 23:42:04
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.9957 (+/- 0.0140)|Test F1: 0.6667|18s
Iteration 3 of 11 | Training 11 of 20 | 22/11/2020 23:42:23
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.9440 (+/- 0.3988)|Test F1: 0.0000|19s
Iteration 3 of 11 | Training 12 of 20 | 22/11/2020 23:42:43
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.9828 (+/- 0.1483)|Test F1: 0.0000|20s
Iteration 3 of 11 | Training 13 of 20 | 22/11/2020 23:43:03
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.9941 (+/- 0.0201)|Test F1: 0.0000|16s
Iteration 3 of 11 | Training 14 of 20 | 22/11/2020 23:43:20
50:50,MLP8,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.9971 (+/- 0.0104)|Test F1: 0.0800|16s
Iteration 



50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.2376 (+/- 0.6364)|Test F1: 0.0800|112s
Iteration 4 of 11 | Training 2 of 20 | 22/11/2020 23:47:21
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.3462 (+/- 0.6678)|Test F1: 0.0000|145s
Iteration 4 of 11 | Training 3 of 20 | 22/11/2020 23:49:46
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.4236 (+/- 0.6390)|Test F1: 0.0800|192s
Iteration 4 of 11 | Training 4 of 20 | 22/11/2020 23:52:58
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.4194 (+/- 0.6336)|Test F1: 0.0000|237s
Iteration 4 of 11 | Training 5 of 20 | 22/11/2020 23:56:55
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.3932 (+/- 0.6459)|Test F1: 0.0800|104s
Iteration 4 of 11 | Training 6 of 20 | 22/11/2020 23:58:40
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.2587 (+/- 0.6366)|Test F1: 0.0800|148s
Ite



50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.3556 (+/- 0.6318)|Test F1: 0.0000|230s
Iteration 4 of 11 | Training 13 of 20 | 23/11/2020 00:19:16
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.3973 (+/- 0.6525)|Test F1: 0.0800|104s
Iteration 4 of 11 | Training 14 of 20 | 23/11/2020 00:21:01
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.3376 (+/- 0.6507)|Test F1: 0.0800|150s
Iteration 4 of 11 | Training 15 of 20 | 23/11/2020 00:23:31
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:400|Train F1: 0.3428 (+/- 0.6611)|Test F1: 0.0000|190s
Iteration 4 of 11 | Training 16 of 20 | 23/11/2020 00:26:42




50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.2111 (+/- 0.6176)|Test F1: 0.0000|234s
Iteration 4 of 11 | Training 17 of 20 | 23/11/2020 00:30:36
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.4192 (+/- 0.6323)|Test F1: 0.0000|108s
Iteration 4 of 11 | Training 18 of 20 | 23/11/2020 00:32:24
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.3935 (+/- 0.6457)|Test F1: 0.0000|145s
Iteration 4 of 11 | Training 19 of 20 | 23/11/2020 00:34:50
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.3687 (+/- 0.6574)|Test F1: 0.0000|184s
Iteration 4 of 11 | Training 20 of 20 | 23/11/2020 00:37:55
50:50,CNN1a,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.2617 (+/- 0.6448)|Test F1: 0.0800|225s
Iteration 4 of 11 | Duration 56.21m | 1.05h since start
Iteration 5 of 11 | Training 1 of 20 | 23/11/2020 00:41:40
50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:4,Embedding_siz



50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.7299 (+/- 0.5171)|Test F1: 0.1143|247s
Iteration 5 of 11 | Training 9 of 20 | 23/11/2020 01:04:30
50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.8331 (+/- 0.1273)|Test F1: 0.0000|113s
Iteration 5 of 11 | Training 10 of 20 | 23/11/2020 01:06:24
50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.5862 (+/- 0.7608)|Test F1: 0.0000|153s
Iteration 5 of 11 | Training 11 of 20 | 23/11/2020 01:08:58
50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.7552 (+/- 0.6070)|Test F1: 0.0000|193s
Iteration 5 of 11 | Training 12 of 20 | 23/11/2020 01:12:11




50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.7133 (+/- 0.5221)|Test F1: 0.1250|241s
Iteration 5 of 11 | Training 13 of 20 | 23/11/2020 01:16:13
50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.8171 (+/- 0.2550)|Test F1: 0.0000|110s
Iteration 5 of 11 | Training 14 of 20 | 23/11/2020 01:18:03
50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.7404 (+/- 0.5184)|Test F1: 0.2500|150s
Iteration 5 of 11 | Training 15 of 20 | 23/11/2020 01:20:34
50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:400|Train F1: 0.7381 (+/- 0.5901)|Test F1: 0.4000|201s
Iteration 5 of 11 | Training 16 of 20 | 23/11/2020 01:23:56
50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.5712 (+/- 0.7283)|Test F1: 0.1290|232s
Iteration 5 of 11 | Training 17 of 20 | 23/11/2020 01:27:49
50:50,CNN1b,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.7963 (+/- 0.3524)|Test F1: 0.0000|109



50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.6037 (+/- 0.6324)|Test F1: 0.1600|212s
Iteration 6 of 11 | Training 4 of 20 | 23/11/2020 01:47:17
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.7271 (+/- 0.4857)|Test F1: 0.2857|248s
Iteration 6 of 11 | Training 5 of 20 | 23/11/2020 01:51:26
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.7853 (+/- 0.3551)|Test F1: 0.0000|118s
Iteration 6 of 11 | Training 6 of 20 | 23/11/2020 01:53:25




50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.7047 (+/- 0.4570)|Test F1: 0.0000|165s
Iteration 6 of 11 | Training 7 of 20 | 23/11/2020 01:56:10
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.5940 (+/- 0.6889)|Test F1: 0.0889|206s
Iteration 6 of 11 | Training 8 of 20 | 23/11/2020 01:59:37
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.6632 (+/- 0.7170)|Test F1: 0.2500|250s
Iteration 6 of 11 | Training 9 of 20 | 23/11/2020 02:03:48
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.7984 (+/- 0.3501)|Test F1: 0.0000|113s
Iteration 6 of 11 | Training 10 of 20 | 23/11/2020 02:05:41
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.6146 (+/- 0.6573)|Test F1: 0.1818|157s
Iteration 6 of 11 | Training 11 of 20 | 23/11/2020 02:08:19
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.6756 (+/- 0.6246)|Test F1: 0.0000|203s
I



50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:400|Train F1: 0.6581 (+/- 0.5763)|Test F1: 0.0000|210s
Iteration 6 of 11 | Training 16 of 20 | 23/11/2020 02:23:57
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.5390 (+/- 0.6667)|Test F1: 0.1538|246s
Iteration 6 of 11 | Training 17 of 20 | 23/11/2020 02:28:04
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.8130 (+/- 0.1376)|Test F1: 0.0000|114s
Iteration 6 of 11 | Training 18 of 20 | 23/11/2020 02:29:58
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.7832 (+/- 0.2302)|Test F1: 0.2222|161s
Iteration 6 of 11 | Training 19 of 20 | 23/11/2020 02:32:40




50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.6740 (+/- 0.6262)|Test F1: 0.0000|211s
Iteration 6 of 11 | Training 20 of 20 | 23/11/2020 02:36:12
50:50,CNN1c,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.6903 (+/- 0.4168)|Test F1: 0.0000|248s
Iteration 6 of 11 | Duration 61.18m | 3.03h since start
Iteration 7 of 11 | Training 1 of 20 | 23/11/2020 02:40:20
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.8853 (+/- 0.1369)|Test F1: 0.5000|100s
Iteration 7 of 11 | Training 2 of 20 | 23/11/2020 02:42:01
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.7359 (+/- 0.4105)|Test F1: 0.0000|145s
Iteration 7 of 11 | Training 3 of 20 | 23/11/2020 02:44:26
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.8377 (+/- 0.3917)|Test F1: 0.0000|181s
Iteration 7 of 11 | Training 4 of 20 | 23/11/2020 02:47:28
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:5



50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.8388 (+/- 0.2841)|Test F1: 0.2000|183s
Iteration 7 of 11 | Training 12 of 20 | 23/11/2020 03:09:05
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.7856 (+/- 0.3801)|Test F1: 0.2500|217s
Iteration 7 of 11 | Training 13 of 20 | 23/11/2020 03:12:43
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.8686 (+/- 0.1554)|Test F1: 0.4444|104s
Iteration 7 of 11 | Training 14 of 20 | 23/11/2020 03:14:27
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.8152 (+/- 0.1637)|Test F1: 0.1429|140s
Iteration 7 of 11 | Training 15 of 20 | 23/11/2020 03:16:48




50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:400|Train F1: 0.8545 (+/- 0.1912)|Test F1: 0.2222|184s
Iteration 7 of 11 | Training 16 of 20 | 23/11/2020 03:19:52
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.6810 (+/- 0.6150)|Test F1: 0.0000|218s
Iteration 7 of 11 | Training 17 of 20 | 23/11/2020 03:23:30
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.8580 (+/- 0.1208)|Test F1: 0.0000|103s
Iteration 7 of 11 | Training 18 of 20 | 23/11/2020 03:25:14
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.8354 (+/- 0.3480)|Test F1: 0.2500|141s
Iteration 7 of 11 | Training 19 of 20 | 23/11/2020 03:27:35
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.6227 (+/- 0.6212)|Test F1: 0.3333|181s
Iteration 7 of 11 | Training 20 of 20 | 23/11/2020 03:30:36
50:50,CNN1d,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.7793 (+/- 0.4069)|Test F1: 0.5000|218



50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.2594 (+/- 0.6373)|Test F1: 0.0000|199s
Iteration 8 of 11 | Training 3 of 20 | 23/11/2020 03:40:04
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.3362 (+/- 0.6484)|Test F1: 0.1176|228s
Iteration 8 of 11 | Training 4 of 20 | 23/11/2020 03:43:52
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.3867 (+/- 0.6362)|Test F1: 0.0816|274s
Iteration 8 of 11 | Training 5 of 20 | 23/11/2020 03:48:27




50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.4018 (+/- 0.7005)|Test F1: 0.0000|152s
Iteration 8 of 11 | Training 6 of 20 | 23/11/2020 03:50:59




50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.2824 (+/- 0.6396)|Test F1: 0.0816|193s
Iteration 8 of 11 | Training 7 of 20 | 23/11/2020 03:54:13
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.3908 (+/- 0.6405)|Test F1: 0.0000|226s
Iteration 8 of 11 | Training 8 of 20 | 23/11/2020 03:57:59




50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.3295 (+/- 0.6478)|Test F1: 0.0816|274s
Iteration 8 of 11 | Training 9 of 20 | 23/11/2020 04:02:33
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.3832 (+/- 0.6362)|Test F1: 0.0426|160s
Iteration 8 of 11 | Training 10 of 20 | 23/11/2020 04:05:14
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.3434 (+/- 0.6660)|Test F1: 0.0870|200s
Iteration 8 of 11 | Training 11 of 20 | 23/11/2020 04:08:35
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.2648 (+/- 0.6511)|Test F1: 0.0000|237s
Iteration 8 of 11 | Training 12 of 20 | 23/11/2020 04:12:32




50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.3756 (+/- 0.6381)|Test F1: 0.0000|299s
Iteration 8 of 11 | Training 13 of 20 | 23/11/2020 04:17:32
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.3010 (+/- 0.6433)|Test F1: 0.0800|154s
Iteration 8 of 11 | Training 14 of 20 | 23/11/2020 04:20:06
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.3317 (+/- 0.6385)|Test F1: 0.0000|204s
Iteration 8 of 11 | Training 15 of 20 | 23/11/2020 04:23:30




50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:400|Train F1: 0.3291 (+/- 0.6929)|Test F1: 0.0000|246s
Iteration 8 of 11 | Training 16 of 20 | 23/11/2020 04:27:37
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.4140 (+/- 0.6044)|Test F1: 0.0800|285s
Iteration 8 of 11 | Training 17 of 20 | 23/11/2020 04:32:23
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.2318 (+/- 0.6190)|Test F1: 0.0800|161s
Iteration 8 of 11 | Training 18 of 20 | 23/11/2020 04:35:04




50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.3323 (+/- 0.7052)|Test F1: 0.0800|207s
Iteration 8 of 11 | Training 19 of 20 | 23/11/2020 04:38:32




50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.4192 (+/- 0.6156)|Test F1: 0.0851|257s
Iteration 8 of 11 | Training 20 of 20 | 23/11/2020 04:42:49
50:50,CNN2,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.3676 (+/- 0.6412)|Test F1: 0.1053|295s
Iteration 8 of 11 | Duration 73.49m | 5.15h since start
Iteration 9 of 11 | Training 1 of 20 | 23/11/2020 04:47:44
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.8767 (+/- 0.0966)|Test F1: 0.2857|83s
Iteration 9 of 11 | Training 2 of 20 | 23/11/2020 04:49:08
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.6593 (+/- 0.6348)|Test F1: 0.0000|108s
Iteration 9 of 11 | Training 3 of 20 | 23/11/2020 04:50:56
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.7469 (+/- 0.5807)|Test F1: 0.1667|132s
Iteration 9 of 11 | Training 4 of 20 | 23/11/2020 04:53:09




50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.7015 (+/- 0.5828)|Test F1: 0.3333|159s
Iteration 9 of 11 | Training 5 of 20 | 23/11/2020 04:55:48




50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.8124 (+/- 0.3518)|Test F1: 0.0000|87s
Iteration 9 of 11 | Training 6 of 20 | 23/11/2020 04:57:15
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.7885 (+/- 0.4323)|Test F1: 0.3333|106s
Iteration 9 of 11 | Training 7 of 20 | 23/11/2020 04:59:02
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.8230 (+/- 0.3310)|Test F1: 0.2500|132s
Iteration 9 of 11 | Training 8 of 20 | 23/11/2020 05:01:15
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.7794 (+/- 0.4505)|Test F1: 0.5000|156s
Iteration 9 of 11 | Training 9 of 20 | 23/11/2020 05:03:51
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.8552 (+/- 0.1600)|Test F1: 0.0000|86s
Iteration 9 of 11 | Training 10 of 20 | 23/11/2020 05:05:18
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.7556 (+/- 0.5558)|Test F1: 0.1667|110s
Iteration 



50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.6402 (+/- 0.6928)|Test F1: 0.3333|160s
Iteration 9 of 11 | Training 17 of 20 | 23/11/2020 05:19:58




50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.8392 (+/- 0.1429)|Test F1: 0.0000|88s
Iteration 9 of 11 | Training 18 of 20 | 23/11/2020 05:21:26
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.8796 (+/- 0.1343)|Test F1: 0.2500|108s
Iteration 9 of 11 | Training 19 of 20 | 23/11/2020 05:23:15
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.7138 (+/- 0.5396)|Test F1: 0.0976|124s
Iteration 9 of 11 | Training 20 of 20 | 23/11/2020 05:25:19
50:50,CNN3,x,norm,wv_sg_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.7665 (+/- 0.5143)|Test F1: 0.6667|148s
Iteration 9 of 11 | Duration 40.07m | 5.82h since start
Iteration 10 of 11 | Training 1 of 20 | 23/11/2020 05:27:48




50:50,LSTM1,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.7014 (+/- 0.3433)|Test F1: 0.0000|131s
Iteration 10 of 11 | Training 2 of 20 | 23/11/2020 05:30:00
50:50,LSTM1,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.4188 (+/- 0.6016)|Test F1: 0.0000|123s
Iteration 10 of 11 | Training 3 of 20 | 23/11/2020 05:32:03




50:50,LSTM1,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.4890 (+/- 0.5986)|Test F1: 0.1111|129s
Iteration 10 of 11 | Training 4 of 20 | 23/11/2020 05:34:13
50:50,LSTM1,x,norm,wv_sg_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.5416 (+/- 0.5910)|Test F1: 0.1250|127s
Iteration 10 of 11 | Training 5 of 20 | 23/11/2020 05:36:21




50:50,LSTM1,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.6226 (+/- 0.5776)|Test F1: 0.0000|130s
Iteration 10 of 11 | Training 6 of 20 | 23/11/2020 05:38:32




50:50,LSTM1,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.5226 (+/- 0.5999)|Test F1: 0.0000|128s
Iteration 10 of 11 | Training 7 of 20 | 23/11/2020 05:40:40
50:50,LSTM1,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.5377 (+/- 0.6220)|Test F1: 0.0000|125s
Iteration 10 of 11 | Training 8 of 20 | 23/11/2020 05:42:46




50:50,LSTM1,x,norm,wv_sg_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.5238 (+/- 0.5482)|Test F1: 0.0833|130s
Iteration 10 of 11 | Training 9 of 20 | 23/11/2020 05:44:57




50:50,LSTM1,x,norm,wv_sg_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.5795 (+/- 0.6227)|Test F1: 0.0000|131s
Iteration 10 of 11 | Training 10 of 20 | 23/11/2020 05:47:08


In [None]:
#Debug probe: echo the writeCSV control flag. In the saved output this cell raised
#NameError, i.e. writeCSV was not defined in the kernel when it ran
#(presumably after a runtime restart - re-run the "#1 Variables" cells first).
writeCSV

NameError: ignored

In [None]:
#Save the F1 results frame as CSV at pathFigures+F1filename (the reports/figures folder).
#NOTE(review): written unconditionally - the writeF1 flag is not checked here; confirm this is intended.
f1_df.to_csv(pathFigures+F1filename)

### 7.3.2 Previous

In [None]:
#Previous run: classifiers (clfs), x, Word2vec CBOW mean (wv_cbow_mean),
#min_word_count 4-8, embedding size 200, norm transform.
#The configuration is whatever resample/clfs/features/transforms hold when this cell runs.
funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

writeCSV True
writeF1 True
Iteration 1 of 12 | Training 1 of 5 | 22/11/2020 21:50:40
50:50,LR,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.9949 (+/- 0.0161)|Test F1: 1.0000|2s
Iteration 1 of 12 | Training 2 of 5 | 22/11/2020 21:50:42
50:50,LR,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.9949 (+/- 0.0161)|Test F1: 1.0000|0s
Iteration 1 of 12 | Training 3 of 5 | 22/11/2020 21:50:43
50:50,LR,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.9949 (+/- 0.0161)|Test F1: 0.6667|0s
Iteration 1 of 12 | Training 4 of 5 | 22/11/2020 21:50:44
50:50,LR,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.9965 (+/- 0.0152)|Test F1: 1.0000|0s
Iteration 1 of 12 | Training 5 of 5 | 22/11/2020 21:50:45
50:50,LR,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.9686 (+/- 0.0341)|Test F1: 0.5000|0s
Iteration 1 of 12 | Duration 0.10m | 0.00h since start
Iteration 2 of 12 | Training 1 of 5 | 22/11/2020 21:50:46




50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.3139 (+/- 0.6548)|Test F1: 0.0800|111s
Iteration 5 of 12 | Training 2 of 5 | 22/11/2020 21:53:44
50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.2864 (+/- 0.6473)|Test F1: 0.0000|104s
Iteration 5 of 12 | Training 3 of 5 | 22/11/2020 21:55:29
50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.3174 (+/- 0.6631)|Test F1: 0.0800|103s
Iteration 5 of 12 | Training 4 of 5 | 22/11/2020 21:57:12
50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.3141 (+/- 0.6554)|Test F1: 0.0800|105s
Iteration 5 of 12 | Training 5 of 5 | 22/11/2020 21:58:57
50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.3687 (+/- 0.6553)|Test F1: 0.0800|102s
Iteration 5 of 12 | Duration 8.79m | 0.17h since start
Iteration 6 of 12 | Training 1 of 5 | 22/11/2020 22:00:40
50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_s



50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.8009 (+/- 0.1369)|Test F1: 0.2222|119s
Iteration 7 of 12 | Duration 9.45m | 0.47h since start
Iteration 8 of 12 | Training 1 of 5 | 22/11/2020 22:18:54
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.9913 (+/- 0.0285)|Test F1: 1.0000|103s
Iteration 8 of 12 | Training 2 of 5 | 22/11/2020 22:20:38
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.9869 (+/- 0.0318)|Test F1: 0.0000|100s
Iteration 8 of 12 | Training 3 of 5 | 22/11/2020 22:22:19
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.9914 (+/- 0.0291)|Test F1: 0.6667|99s
Iteration 8 of 12 | Training 4 of 5 | 22/11/2020 22:23:58
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.9896 (+/- 0.0287)|Test F1: 0.6667|99s
Iteration 8 of 12 | Training 5 of 5 | 22/11/2020 22:25:38
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_siz



50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.3661 (+/- 0.7241)|Test F1: 0.2857|151s
Iteration 9 of 12 | Training 3 of 5 | 22/11/2020 22:32:17




50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.5455 (+/- 0.6317)|Test F1: 0.0000|150s
Iteration 9 of 12 | Training 4 of 5 | 22/11/2020 22:34:47
50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.7126 (+/- 0.3854)|Test F1: 0.0000|147s
Iteration 9 of 12 | Training 5 of 5 | 22/11/2020 22:37:15




50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.3333 (+/- 0.7013)|Test F1: 0.1429|153s
Iteration 9 of 12 | Duration 12.52m | 0.82h since start
Iteration 10 of 12 | Training 1 of 5 | 22/11/2020 22:39:48




50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.9960 (+/- 0.0154)|Test F1: 0.0000|80s
Iteration 10 of 12 | Training 2 of 5 | 22/11/2020 22:41:09
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.9949 (+/- 0.0161)|Test F1: 0.6667|76s
Iteration 10 of 12 | Training 3 of 5 | 22/11/2020 22:42:26
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.9916 (+/- 0.0413)|Test F1: 1.0000|77s
Iteration 10 of 12 | Training 4 of 5 | 22/11/2020 22:43:43
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.9824 (+/- 0.0807)|Test F1: 0.0000|76s
Iteration 10 of 12 | Training 5 of 5 | 22/11/2020 22:45:00
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.8343 (+/- 0.1025)|Test F1: 0.2857|77s
Iteration 10 of 12 | Duration 6.48m | 0.93h since start
Iteration 11 of 12 | Training 1 of 5 | 22/11/2020 22:46:17




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.9364 (+/- 0.1103)|Test F1: 0.6667|128s
Iteration 11 of 12 | Training 2 of 5 | 22/11/2020 22:48:25
50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.9514 (+/- 0.0965)|Test F1: 0.5000|123s
Iteration 11 of 12 | Training 3 of 5 | 22/11/2020 22:50:29




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.9222 (+/- 0.1204)|Test F1: 0.6667|128s
Iteration 11 of 12 | Training 4 of 5 | 22/11/2020 22:52:37
50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.9214 (+/- 0.1048)|Test F1: 0.8000|123s
Iteration 11 of 12 | Training 5 of 5 | 22/11/2020 22:54:41




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.7652 (+/- 0.1531)|Test F1: 0.0000|128s
Iteration 11 of 12 | Duration 10.54m | 1.10h since start
Iteration 12 of 12 | Training 1 of 5 | 22/11/2020 22:56:50




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:200|Train F1: 0.9579 (+/- 0.1018)|Test F1: 0.6667|128s
Iteration 12 of 12 | Training 2 of 5 | 22/11/2020 22:58:58
50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:200|Train F1: 0.9791 (+/- 0.0456)|Test F1: 0.6667|126s
Iteration 12 of 12 | Training 3 of 5 | 22/11/2020 23:01:04




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:200|Train F1: 0.9623 (+/- 0.1040)|Test F1: 0.6667|129s
Iteration 12 of 12 | Training 4 of 5 | 22/11/2020 23:03:14




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:200|Train F1: 0.9803 (+/- 0.0510)|Test F1: 0.6667|131s
Iteration 12 of 12 | Training 5 of 5 | 22/11/2020 23:05:26
50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:200|Train F1: 0.7816 (+/- 0.1169)|Test F1: 0.0000|122s
Iteration 12 of 12 | Duration 10.63m | 1.28h since start


In [None]:
#Previous run: classifiers (clfs), x, tfidf with n=2-6 (logged as nTo) and
#FeatureMax 60, norm transform.
#The configuration is whatever resample/clfs/features/transforms hold when this cell runs.
funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

writeCSV True
writeF1 True
Iteration 1 of 12 | Training 1 of 5 | 22/11/2020 19:41:05
50:50,LR,x,norm,tfidf,nTo:2,FeatureMax:60|Train F1: 0.9904 (+/- 0.0184)|Test F1: 1.0000|2s
Iteration 1 of 12 | Training 2 of 5 | 22/11/2020 19:41:07
50:50,LR,x,norm,tfidf,nTo:3,FeatureMax:60|Train F1: 0.9904 (+/- 0.0184)|Test F1: 1.0000|0s
Iteration 1 of 12 | Training 3 of 5 | 22/11/2020 19:41:07
50:50,LR,x,norm,tfidf,nTo:4,FeatureMax:60|Train F1: 0.9904 (+/- 0.0184)|Test F1: 1.0000|0s
Iteration 1 of 12 | Training 4 of 5 | 22/11/2020 19:41:08
50:50,LR,x,norm,tfidf,nTo:5,FeatureMax:60|Train F1: 0.9904 (+/- 0.0184)|Test F1: 1.0000|0s
Iteration 1 of 12 | Training 5 of 5 | 22/11/2020 19:41:08
50:50,LR,x,norm,tfidf,nTo:6,FeatureMax:60|Train F1: 0.9904 (+/- 0.0184)|Test F1: 1.0000|0s
Iteration 1 of 12 | Duration 0.08m | 0.00h since start
Iteration 2 of 12 | Training 1 of 5 | 22/11/2020 19:41:09
50:50,SVMa,x,norm,tfidf,nTo:2,FeatureMax:60|Train F1: 0.9922 (+/- 0.0174)|Test F1: 0.8000|0s
Iteration 2 of 12 | Tr



50:50,MLP8,x,norm,tfidf,nTo:3,FeatureMax:60|Train F1: 0.9942 (+/- 0.0171)|Test F1: 0.0000|8s
Iteration 4 of 12 | Training 3 of 5 | 22/11/2020 19:41:29
50:50,MLP8,x,norm,tfidf,nTo:4,FeatureMax:60|Train F1: 0.9956 (+/- 0.0133)|Test F1: 1.0000|8s
Iteration 4 of 12 | Training 4 of 5 | 22/11/2020 19:41:38
50:50,MLP8,x,norm,tfidf,nTo:5,FeatureMax:60|Train F1: 0.9951 (+/- 0.0136)|Test F1: 1.0000|7s
Iteration 4 of 12 | Training 5 of 5 | 22/11/2020 19:41:46
50:50,MLP8,x,norm,tfidf,nTo:6,FeatureMax:60|Train F1: 0.9942 (+/- 0.0156)|Test F1: 0.6667|7s
Iteration 4 of 12 | Duration 0.70m | 0.01h since start
Iteration 5 of 12 | Training 1 of 5 | 22/11/2020 19:41:54




Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
50:50,CNN1a,x,norm,tfidf,nTo:2,FeatureMax:60|Train F1: 0.4263 (+/- 0.6434)|Test F1: 0.0000|61s
Iteration 5 of 12 | Training 2 of 5 | 22/11/2020 19:42:55
50:50,CNN1a,x,norm,tfidf,nTo:3,FeatureMax:60|Train F1: 0.3904 (+/- 0.6418)|Test F1: 0.0000|51s
Iteration 5 of 12 | Training 3 of 5 | 22/11/2020 19:43:47
50:50,CNN1a,x,norm,tfidf,nTo:4,FeatureMax:60|Train F1: 0.2936 (+/- 0.6662)|Test F1: 0.0000|51s
Iteration 5 of 12 | Training 4 of 5 | 22/11/2020 19:44:38
50:50,CNN1a,x,norm,tfidf,nTo:5,FeatureMax:60|Train F1: 0.3449 (+/- 0.6683)|Test F1: 0.0800|50s
Iteration 5 of 12 | Training 5 of 5 | 22/11/2020 19:45:29
50:50,CNN1a,x,norm,tfidf,nTo:6,FeatureMax:60|Train F1:



50:50,CNN1d,x,norm,tfidf,nTo:2,FeatureMax:60|Train F1: 0.9933 (+/- 0.0155)|Test F1: 1.0000|50s
Iteration 8 of 12 | Training 2 of 5 | 22/11/2020 19:55:42
50:50,CNN1d,x,norm,tfidf,nTo:3,FeatureMax:60|Train F1: 0.9916 (+/- 0.0171)|Test F1: 1.0000|48s
Iteration 8 of 12 | Training 3 of 5 | 22/11/2020 19:56:30
50:50,CNN1d,x,norm,tfidf,nTo:4,FeatureMax:60|Train F1: 0.9918 (+/- 0.0177)|Test F1: 1.0000|49s
Iteration 8 of 12 | Training 4 of 5 | 22/11/2020 19:57:19
50:50,CNN1d,x,norm,tfidf,nTo:5,FeatureMax:60|Train F1: 0.9909 (+/- 0.0188)|Test F1: 1.0000|46s
Iteration 8 of 12 | Training 5 of 5 | 22/11/2020 19:58:06
50:50,CNN1d,x,norm,tfidf,nTo:6,FeatureMax:60|Train F1: 0.9922 (+/- 0.0169)|Test F1: 1.0000|47s
Iteration 8 of 12 | Duration 4.03m | 0.30h since start
Iteration 9 of 12 | Training 1 of 5 | 22/11/2020 19:58:53
50:50,CNN2,x,norm,tfidf,nTo:2,FeatureMax:60|Train F1: 0.7403 (+/- 0.3740)|Test F1: 0.0000|92s
Iteration 9 of 12 | Training 2 of 5 | 22/11/2020 20:00:25
50:50,CNN2,x,norm,tfidf,nTo:



50:50,CNN2,x,norm,tfidf,nTo:4,FeatureMax:60|Train F1: 0.7545 (+/- 0.1974)|Test F1: 0.1905|97s
Iteration 9 of 12 | Training 4 of 5 | 22/11/2020 20:03:36
50:50,CNN2,x,norm,tfidf,nTo:5,FeatureMax:60|Train F1: 0.7023 (+/- 0.3860)|Test F1: 0.0000|90s
Iteration 9 of 12 | Training 5 of 5 | 22/11/2020 20:05:07
50:50,CNN2,x,norm,tfidf,nTo:6,FeatureMax:60|Train F1: 0.7100 (+/- 0.3887)|Test F1: 0.4000|91s
Iteration 9 of 12 | Duration 7.77m | 0.43h since start
Iteration 10 of 12 | Training 1 of 5 | 22/11/2020 20:06:39




50:50,CNN3,x,norm,tfidf,nTo:2,FeatureMax:60|Train F1: 0.9811 (+/- 0.0797)|Test F1: 0.0000|46s
Iteration 10 of 12 | Training 2 of 5 | 22/11/2020 20:07:25
50:50,CNN3,x,norm,tfidf,nTo:3,FeatureMax:60|Train F1: 0.9938 (+/- 0.0197)|Test F1: 0.0000|40s
Iteration 10 of 12 | Training 3 of 5 | 22/11/2020 20:08:05
50:50,CNN3,x,norm,tfidf,nTo:4,FeatureMax:60|Train F1: 0.9958 (+/- 0.0231)|Test F1: 0.6667|40s
Iteration 10 of 12 | Training 4 of 5 | 22/11/2020 20:08:46




50:50,CNN3,x,norm,tfidf,nTo:5,FeatureMax:60|Train F1: 0.9953 (+/- 0.0232)|Test F1: 0.0000|44s
Iteration 10 of 12 | Training 5 of 5 | 22/11/2020 20:09:30
50:50,CNN3,x,norm,tfidf,nTo:6,FeatureMax:60|Train F1: 0.9928 (+/- 0.0246)|Test F1: 0.0000|40s
Iteration 10 of 12 | Duration 3.52m | 0.48h since start
Iteration 11 of 12 | Training 1 of 5 | 22/11/2020 20:10:10




50:50,LSTM1,x,norm,tfidf,nTo:2,FeatureMax:60|Train F1: 0.8856 (+/- 0.0976)|Test F1: 0.4000|120s
Iteration 11 of 12 | Training 2 of 5 | 22/11/2020 20:12:11




50:50,LSTM1,x,norm,tfidf,nTo:3,FeatureMax:60|Train F1: 0.9056 (+/- 0.1286)|Test F1: 0.4000|122s
Iteration 11 of 12 | Training 3 of 5 | 22/11/2020 20:14:14




50:50,LSTM1,x,norm,tfidf,nTo:4,FeatureMax:60|Train F1: 0.8981 (+/- 0.1205)|Test F1: 0.4000|127s
Iteration 11 of 12 | Training 4 of 5 | 22/11/2020 20:16:22
50:50,LSTM1,x,norm,tfidf,nTo:5,FeatureMax:60|Train F1: 0.8950 (+/- 0.1039)|Test F1: 0.4000|128s
Iteration 11 of 12 | Training 5 of 5 | 22/11/2020 20:18:30




50:50,LSTM1,x,norm,tfidf,nTo:6,FeatureMax:60|Train F1: 0.8941 (+/- 0.1161)|Test F1: 0.4000|129s
Iteration 11 of 12 | Duration 10.49m | 0.66h since start
Iteration 12 of 12 | Training 1 of 5 | 22/11/2020 20:20:39
50:50,LSTM2,x,norm,tfidf,nTo:2,FeatureMax:60|Train F1: 0.8900 (+/- 0.1057)|Test F1: 0.4000|120s
Iteration 12 of 12 | Training 2 of 5 | 22/11/2020 20:22:40




50:50,LSTM2,x,norm,tfidf,nTo:3,FeatureMax:60|Train F1: 0.8898 (+/- 0.1086)|Test F1: 0.4000|125s
Iteration 12 of 12 | Training 3 of 5 | 22/11/2020 20:24:45




50:50,LSTM2,x,norm,tfidf,nTo:4,FeatureMax:60|Train F1: 0.8937 (+/- 0.1131)|Test F1: 0.4000|124s
Iteration 12 of 12 | Training 4 of 5 | 22/11/2020 20:26:50




50:50,LSTM2,x,norm,tfidf,nTo:5,FeatureMax:60|Train F1: 0.8873 (+/- 0.1039)|Test F1: 0.4000|124s
Iteration 12 of 12 | Training 5 of 5 | 22/11/2020 20:28:55




50:50,LSTM2,x,norm,tfidf,nTo:6,FeatureMax:60|Train F1: 0.8896 (+/- 0.1081)|Test F1: 0.4000|123s
Iteration 12 of 12 | Duration 10.32m | 0.83h since start


In [None]:
#Previous run: resampling 00 and 20 (logged as 20:80); classifiers LR, SVMa, SVMs; x;
#tfidf with n=1 (logged as nTo) and FeatureMax 20-100; norm transform.
#The configuration is whatever resample/clfs/features/transforms hold when this cell runs.
funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

writeCSV True
writeF1 True
Iteration 1 of 6 | Training 1 of 5 | 22/11/2020 18:20:10
20:80,LR,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.1199 (+/- 0.2620)|Test F1: 0.0000|18s
Iteration 1 of 6 | Training 2 of 5 | 22/11/2020 18:20:10
20:80,LR,x,norm,tfidf,nTo:1,FeatureMax:40 in df|Train F1: 0.9784 (+/- 0.0603)|Test F1: 1.0000|35s
Iteration 1 of 6 | Training 3 of 5 | 22/11/2020 18:20:10
20:80,LR,x,norm,tfidf,nTo:1,FeatureMax:60 in df|Train F1: 0.9821 (+/- 0.0603)|Test F1: 0.6667|17s
Iteration 1 of 6 | Training 4 of 5 | 22/11/2020 18:20:10
20:80,LR,x,norm,tfidf,nTo:1,FeatureMax:80 in df|Train F1: 0.9857 (+/- 0.0528)|Test F1: 0.6667|17s
Iteration 1 of 6 | Training 5 of 5 | 22/11/2020 18:20:10
20:80,LR,x,norm,tfidf,nTo:1,FeatureMax:100 in df|Train F1: 0.9831 (+/- 0.0562)|Test F1: 1.0000|17s
Iteration 1 of 6 | Duration 0.00m | 0.00h since start
Iteration 2 of 6 | Training 1 of 5 | 22/11/2020 18:20:10
20:80,SVMa,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.0000 (+/- 0.0000)|Test F

In [None]:
#Previous run: resampling 00, classifier LR, x, tfidf with n=1 (logged as nTo)
#and FeatureMax 40, norm transform.
#The configuration is whatever resample/clfs/features/transforms hold when this cell runs.
funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

writeCSV True
writeF1 True
No training roots to delete
No test roots to delete
Iteration 1 of 1 | Training 1 of 1 | 22/11/2020 18:08:04
...vectorizing data
...saved vectorized file
...cross validating on training data
00,LR,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.0000 (+/- 0.0000)|Test F1: 0.0000|16s
Iteration 1 of 1 | Duration 0.28m | 0.00h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete


In [None]:
#Previous run: resampling 20:80, classifier LR, x, tfidf with n=1 (logged as nTo)
#and FeatureMax 40, norm transform.
#The configuration is whatever resample/clfs/features/transforms hold when this cell runs.
funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

writeCSV True
writeF1 True
No training roots to delete
No test roots to delete
Iteration 1 of 1 | Training 1 of 1 | 22/11/2020 17:54:37
...vectorizing data
...saved vectorized file
...cross validating on training data
20:80,LR,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9784 (+/- 0.0603)|Test F1: 1.0000|35s
Iteration 1 of 1 | Duration 0.60m | 0.01h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete


In [None]:
#Verification run: classifier LSTM1, x, tfidf with n=1 (logged as nTo) and
#FeatureMax 20-100, norm transform.
#The configuration is whatever resample/clfs/features/transforms hold when this cell runs.
funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

writeCSV False
writeF1 False
No training roots to delete
No test roots to delete
Iteration 1 of 1 | Training 1 of 5 | 22/11/2020 13:22:11
...loaded vectorized file
...cross validating on training data




50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.7145 (+/- 0.3134)|Test F1: 0.4000|119s
Iteration 1 of 1 | Training 2 of 5 | 22/11/2020 13:24:11
...loaded vectorized file
...cross validating on training data




50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.8743 (+/- 0.0774)|Test F1: 0.3333|116s
Iteration 1 of 1 | Training 3 of 5 | 22/11/2020 13:26:07
...loaded vectorized file
...cross validating on training data
50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.8855 (+/- 0.0971)|Test F1: 0.5000|115s
Iteration 1 of 1 | Training 4 of 5 | 22/11/2020 13:28:02
...loaded vectorized file
...cross validating on training data




50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.8999 (+/- 0.0998)|Test F1: 0.6667|123s
Iteration 1 of 1 | Training 5 of 5 | 22/11/2020 13:30:06
...loaded vectorized file
...cross validating on training data
50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9122 (+/- 0.1159)|Test F1: 0.6667|114s
Iteration 1 of 1 | Duration 9.84m | 0.16h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete


In [None]:
#Previous run: all CNN classifiers except CNN3, x, tfidf with n=1 (logged as nTo)
#and FeatureMax 20-40, norm transform.
#The configuration is whatever resample/clfs/features/transforms hold when this cell runs.
funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

writeCSV False
writeF1 False
No training roots to delete
No test roots to delete
Iteration 1 of 5 | Training 1 of 2 | 22/11/2020 11:28:00
...loaded vectorized file
...cross validating on training data
50:50,CNN1a,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.2383 (+/- 0.6378)|Test F1: 0.0000|30s
Iteration 1 of 5 | Training 2 of 2 | 22/11/2020 11:28:30
...loaded vectorized file
...cross validating on training data
50:50,CNN1a,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.2432 (+/- 0.6544)|Test F1: 0.0000|39s
Iteration 1 of 5 | Duration 1.16m | 0.02h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete
Iteration 2 of 5 | Training 1 of 2 | 22/11/2020 11:29:09
...loaded vectorized file
...cross validating on training data
50:50,CNN1b,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9787 (+/- 0.0375)|Test F1: 0.5714|30s
Iteration 2 of 5 | Training 2 of 2 | 22/11/2020 11:29:40
...loaded vectorize



50:50,CNN1b,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9854 (+/- 0.0254)|Test F1: 0.6667|41s
Iteration 2 of 5 | Duration 1.20m | 0.04h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete
Iteration 3 of 5 | Training 1 of 2 | 22/11/2020 11:30:21
...loaded vectorized file
...cross validating on training data
50:50,CNN1c,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9747 (+/- 0.0462)|Test F1: 0.5000|34s
Iteration 3 of 5 | Training 2 of 2 | 22/11/2020 11:30:56
...loaded vectorized file
...cross validating on training data




50:50,CNN1c,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9839 (+/- 0.0314)|Test F1: 0.8000|43s
Iteration 3 of 5 | Duration 1.30m | 0.06h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete
Iteration 4 of 5 | Training 1 of 2 | 22/11/2020 11:31:39
...loaded vectorized file
...cross validating on training data
50:50,CNN1d,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9796 (+/- 0.0260)|Test F1: 0.5000|30s
Iteration 4 of 5 | Training 2 of 2 | 22/11/2020 11:32:09
...loaded vectorized file
...cross validating on training data
50:50,CNN1d,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9912 (+/- 0.0176)|Test F1: 0.6667|37s
Iteration 4 of 5 | Duration 1.13m | 0.08h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete
Iteration 5 of 5 | Training 1 of 2 | 22/11/2020 11:32:47
...loaded vectorized f



50:50,CNN2,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.3954 (+/- 0.7241)|Test F1: 0.0000|83s
Iteration 5 of 5 | Duration 2.61m | 0.12h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete


In [None]:
#Previous run: all CNN classifiers, x, tfidf with n=1 (logged as nTo) and
#FeatureMax 20, norm transform.
#The configuration is whatever resample/clfs/features/transforms hold when this cell runs.
#NOTE(review): the saved output of this cell ends in a ValueError during the CNN1b iteration.
funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

writeCSV False
writeF1 False
No training roots to delete
No test roots to delete
Iteration 1 of 6 | Training 1 of 1 | 22/11/2020 11:20:08
...loaded vectorized file
...cross validating on training data
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
50:50,CNN1a,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.3471 (+/- 0.6702)|Test F1: 0.0000|45s
Iteration 1 of 6 | Duration 0.75m | 0.01h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete
Iteration 2 of 6 | Training 1 of 1 | 22/11/2020 11:20:53
...loaded vectorized file
...cross validating on training data
50:50,CNN1b,x,norm,tfidf,nTo:1,FeatureMax:20|Train F

ValueError: ignored

In [None]:
#Previous run: all classifiers, x, tfidf with n=2-6 (logged as nTo) and
#FeatureMax 40, norm transform.
#The configuration is whatever resample/clfs/features/transforms hold when this cell runs.
funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

writeCSV True
writeF1 True
Iteration 1 of 12 | Training 1 of 5 | 21/11/2020 13:16:07
50:50,LR,x,norm,tfidf,nTo:2,FeatureMax:40|Train F1: 0.9919 (+/- 0.0188)|Test F1: 0.8000|9s
Iteration 1 of 12 | Training 2 of 5 | 21/11/2020 13:16:16
50:50,LR,x,norm,tfidf,nTo:3,FeatureMax:40|Train F1: 0.9919 (+/- 0.0188)|Test F1: 0.8000|1s
Iteration 1 of 12 | Training 3 of 5 | 21/11/2020 13:16:18
50:50,LR,x,norm,tfidf,nTo:4,FeatureMax:40|Train F1: 0.9919 (+/- 0.0188)|Test F1: 0.8000|1s
Iteration 1 of 12 | Training 4 of 5 | 21/11/2020 13:16:19
50:50,LR,x,norm,tfidf,nTo:5,FeatureMax:40|Train F1: 0.9919 (+/- 0.0188)|Test F1: 0.8000|1s
Iteration 1 of 12 | Training 5 of 5 | 21/11/2020 13:16:21
50:50,LR,x,norm,tfidf,nTo:6,FeatureMax:40|Train F1: 0.9919 (+/- 0.0188)|Test F1: 0.8000|1s
Iteration 1 of 12 | Duration 0.25m | 0.00h since start
Iteration 2 of 12 | Training 1 of 5 | 21/11/2020 13:16:22
50:50,SVMa,x,norm,tfidf,nTo:2,FeatureMax:40|Train F1: 0.9930 (+/- 0.0177)|Test F1: 0.8000|0s
Iteration 2 of 12 | Tr



Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
50:50,LSTM1,x,norm,tfidf,nTo:2,FeatureMax:40|Train F1: 0.8780 (+/- 0.1099)|Test F1: 0.4000|119s
Iteration 5 of 12 | Training 2 of 5 | 21/11/2020 13:19:11




50:50,LSTM1,x,norm,tfidf,nTo:3,FeatureMax:40|Train F1: 0.8752 (+/- 0.1279)|Test F1: 0.2857|116s
Iteration 5 of 12 | Training 3 of 5 | 21/11/2020 13:21:07
50:50,LSTM1,x,norm,tfidf,nTo:4,FeatureMax:40|Train F1: 0.8776 (+/- 0.0849)|Test F1: 0.4000|113s
Iteration 5 of 12 | Training 4 of 5 | 21/11/2020 13:23:00




50:50,LSTM1,x,norm,tfidf,nTo:5,FeatureMax:40|Train F1: 0.8901 (+/- 0.1061)|Test F1: 0.3333|118s
Iteration 5 of 12 | Training 5 of 5 | 21/11/2020 13:24:58
50:50,LSTM1,x,norm,tfidf,nTo:6,FeatureMax:40|Train F1: 0.8875 (+/- 0.0926)|Test F1: 0.2857|112s
Iteration 5 of 12 | Duration 9.66m | 0.18h since start
Iteration 6 of 12 | Training 1 of 5 | 21/11/2020 13:26:51




50:50,LSTM2,x,norm,tfidf,nTo:2,FeatureMax:40|Train F1: 0.8774 (+/- 0.1227)|Test F1: 0.3333|118s
Iteration 6 of 12 | Training 2 of 5 | 21/11/2020 13:28:49
50:50,LSTM2,x,norm,tfidf,nTo:3,FeatureMax:40|Train F1: 0.8801 (+/- 0.1105)|Test F1: 0.4000|112s
Iteration 6 of 12 | Training 3 of 5 | 21/11/2020 13:30:41




50:50,LSTM2,x,norm,tfidf,nTo:4,FeatureMax:40|Train F1: 0.8856 (+/- 0.1125)|Test F1: 0.4000|120s
Iteration 6 of 12 | Training 4 of 5 | 21/11/2020 13:32:41
50:50,LSTM2,x,norm,tfidf,nTo:5,FeatureMax:40|Train F1: 0.8848 (+/- 0.1135)|Test F1: 0.4000|120s
Iteration 6 of 12 | Training 5 of 5 | 21/11/2020 13:34:41




50:50,LSTM2,x,norm,tfidf,nTo:6,FeatureMax:40|Train F1: 0.8805 (+/- 0.1241)|Test F1: 0.3333|124s
Iteration 6 of 12 | Duration 9.92m | 0.34h since start
Iteration 7 of 12 | Training 1 of 5 | 21/11/2020 13:36:45
50:50,CNN1a,x,norm,tfidf,nTo:2,FeatureMax:40|Train F1: 0.2610 (+/- 0.6417)|Test F1: 0.0800|40s
Iteration 7 of 12 | Training 2 of 5 | 21/11/2020 13:37:26
50:50,CNN1a,x,norm,tfidf,nTo:3,FeatureMax:40|Train F1: 0.3660 (+/- 0.6514)|Test F1: 0.0800|39s
Iteration 7 of 12 | Training 3 of 5 | 21/11/2020 13:38:06
50:50,CNN1a,x,norm,tfidf,nTo:4,FeatureMax:40|Train F1: 0.2316 (+/- 0.6214)|Test F1: 0.0800|40s
Iteration 7 of 12 | Training 4 of 5 | 21/11/2020 13:38:46
50:50,CNN1a,x,norm,tfidf,nTo:5,FeatureMax:40|Train F1: 0.2597 (+/- 0.6411)|Test F1: 0.0800|40s
Iteration 7 of 12 | Training 5 of 5 | 21/11/2020 13:39:26
50:50,CNN1a,x,norm,tfidf,nTo:6,FeatureMax:40|Train F1: 0.2875 (+/- 0.6534)|Test F1: 0.0000|39s
Iteration 7 of 12 | Duration 3.34m | 0.40h since start
Iteration 8 of 12 | Training 



50:50,CNN1c,x,norm,tfidf,nTo:3,FeatureMax:40|Train F1: 0.9895 (+/- 0.0210)|Test F1: 1.0000|45s
Iteration 9 of 12 | Training 3 of 5 | 21/11/2020 13:44:54
50:50,CNN1c,x,norm,tfidf,nTo:4,FeatureMax:40|Train F1: 0.9863 (+/- 0.0222)|Test F1: 1.0000|42s
Iteration 9 of 12 | Training 4 of 5 | 21/11/2020 13:45:37
50:50,CNN1c,x,norm,tfidf,nTo:5,FeatureMax:40|Train F1: 0.9890 (+/- 0.0215)|Test F1: 1.0000|42s
Iteration 9 of 12 | Training 5 of 5 | 21/11/2020 13:46:19




50:50,CNN1c,x,norm,tfidf,nTo:6,FeatureMax:40|Train F1: 0.9912 (+/- 0.0177)|Test F1: 1.0000|46s
Iteration 9 of 12 | Duration 3.63m | 0.52h since start
Iteration 10 of 12 | Training 1 of 5 | 21/11/2020 13:47:05
50:50,CNN1d,x,norm,tfidf,nTo:2,FeatureMax:40|Train F1: 0.9923 (+/- 0.0146)|Test F1: 0.4000|38s
Iteration 10 of 12 | Training 2 of 5 | 21/11/2020 13:47:43
50:50,CNN1d,x,norm,tfidf,nTo:3,FeatureMax:40|Train F1: 0.9944 (+/- 0.0133)|Test F1: 0.6667|38s
Iteration 10 of 12 | Training 3 of 5 | 21/11/2020 13:48:22
50:50,CNN1d,x,norm,tfidf,nTo:4,FeatureMax:40|Train F1: 0.9932 (+/- 0.0137)|Test F1: 0.5000|39s
Iteration 10 of 12 | Training 4 of 5 | 21/11/2020 13:49:01
50:50,CNN1d,x,norm,tfidf,nTo:5,FeatureMax:40|Train F1: 0.9940 (+/- 0.0144)|Test F1: 0.6667|38s
Iteration 10 of 12 | Training 5 of 5 | 21/11/2020 13:49:40
50:50,CNN1d,x,norm,tfidf,nTo:6,FeatureMax:40|Train F1: 0.9943 (+/- 0.0136)|Test F1: 0.6667|38s
Iteration 10 of 12 | Duration 3.23m | 0.57h since start
Iteration 11 of 12 | Tra



50:50,CNN2,x,norm,tfidf,nTo:3,FeatureMax:40|Train F1: 0.5506 (+/- 0.7030)|Test F1: 0.0000|88s
Iteration 11 of 12 | Training 3 of 5 | 21/11/2020 13:53:09
50:50,CNN2,x,norm,tfidf,nTo:4,FeatureMax:40|Train F1: 0.5007 (+/- 0.7250)|Test F1: 0.0000|84s
Iteration 11 of 12 | Training 4 of 5 | 21/11/2020 13:54:34
50:50,CNN2,x,norm,tfidf,nTo:5,FeatureMax:40|Train F1: 0.4866 (+/- 0.6904)|Test F1: 0.1053|83s
Iteration 11 of 12 | Training 5 of 5 | 21/11/2020 13:55:58




50:50,CNN2,x,norm,tfidf,nTo:6,FeatureMax:40|Train F1: 0.5368 (+/- 0.7504)|Test F1: 0.0000|88s
Iteration 11 of 12 | Duration 7.13m | 0.69h since start
Iteration 12 of 12 | Training 1 of 5 | 21/11/2020 13:57:26
50:50,CNN3,x,norm,tfidf,nTo:2,FeatureMax:40|Train F1: 0.9835 (+/- 0.0837)|Test F1: 0.8000|34s
Iteration 12 of 12 | Training 2 of 5 | 21/11/2020 13:58:01
50:50,CNN3,x,norm,tfidf,nTo:3,FeatureMax:40|Train F1: 0.9928 (+/- 0.0179)|Test F1: 1.0000|35s
Iteration 12 of 12 | Training 3 of 5 | 21/11/2020 13:58:36
50:50,CNN3,x,norm,tfidf,nTo:4,FeatureMax:40|Train F1: 0.9922 (+/- 0.0199)|Test F1: 0.8000|35s
Iteration 12 of 12 | Training 4 of 5 | 21/11/2020 13:59:12
50:50,CNN3,x,norm,tfidf,nTo:5,FeatureMax:40|Train F1: 0.9933 (+/- 0.0164)|Test F1: 0.8000|35s
Iteration 12 of 12 | Training 5 of 5 | 21/11/2020 13:59:48
50:50,CNN3,x,norm,tfidf,nTo:6,FeatureMax:40|Train F1: 0.9928 (+/- 0.0179)|Test F1: 1.0000|35s
Iteration 12 of 12 | Duration 2.95m | 0.74h since start


In [None]:
# Word2vec sweep: every classifier except LR and SVMa, base feature set (x),
# norm transform, CBOW mean embeddings (cb mean), over all min_count values
# and all embedding sizes.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

writeCSV True
writeF1 True
Iteration 1 of 10 | Training 1 of 15 | 21/11/2020 00:54:49
50:50,SVMs,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9978 (+/- 0.0101)|Test F1: 0.6667|3s
Iteration 1 of 10 | Training 2 of 15 | 21/11/2020 00:54:52
50:50,SVMs,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:400|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|1s
Iteration 1 of 10 | Training 3 of 15 | 21/11/2020 00:54:54
50:50,SVMs,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:500|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|1s
Iteration 1 of 10 | Training 4 of 15 | 21/11/2020 00:54:55
50:50,SVMs,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:300|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|1s
Iteration 1 of 10 | Training 5 of 15 | 21/11/2020 00:54:57
50:50,SVMs,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:400|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|1s
Iteration 1 of 10 | Training 6 of 15 | 21/11/2020 00:54:58
50:50,SVMs,x,norm,wv_cbow_mean,Min_word



Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9735 (+/- 0.0593)|Test F1: 0.6667|136s
Iteration 3 of 10 | Training 2 of 15 | 21/11/2020 01:01:06




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.9746 (+/- 0.0821)|Test F1: 0.6667|128s
Iteration 3 of 10 | Training 3 of 15 | 21/11/2020 01:03:15
50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.9922 (+/- 0.0191)|Test F1: 0.6667|123s
Iteration 3 of 10 | Training 4 of 15 | 21/11/2020 01:05:18




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.9840 (+/- 0.0594)|Test F1: 0.6667|128s
Iteration 3 of 10 | Training 5 of 15 | 21/11/2020 01:07:27




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.9849 (+/- 0.0312)|Test F1: 0.6667|132s
Iteration 3 of 10 | Training 6 of 15 | 21/11/2020 01:09:39
50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.9829 (+/- 0.0558)|Test F1: 0.6667|123s
Iteration 3 of 10 | Training 7 of 15 | 21/11/2020 01:11:43




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.9779 (+/- 0.0644)|Test F1: 0.6667|128s
Iteration 3 of 10 | Training 8 of 15 | 21/11/2020 01:13:51
50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.9860 (+/- 0.0303)|Test F1: 0.8000|123s
Iteration 3 of 10 | Training 9 of 15 | 21/11/2020 01:15:55




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.9874 (+/- 0.0292)|Test F1: 0.6667|129s
Iteration 3 of 10 | Training 10 of 15 | 21/11/2020 01:18:04




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.9789 (+/- 0.0638)|Test F1: 0.6667|131s
Iteration 3 of 10 | Training 11 of 15 | 21/11/2020 01:20:16
50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:400|Train F1: 0.7859 (+/- 0.1100)|Test F1: 0.0000|124s
Iteration 3 of 10 | Training 12 of 15 | 21/11/2020 01:22:21




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.7777 (+/- 0.1233)|Test F1: 0.0000|131s
Iteration 3 of 10 | Training 13 of 15 | 21/11/2020 01:24:33




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.5204 (+/- 0.5248)|Test F1: 0.0800|133s
Iteration 3 of 10 | Training 14 of 15 | 21/11/2020 01:26:46
50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.7758 (+/- 0.1267)|Test F1: 0.0000|129s
Iteration 3 of 10 | Training 15 of 15 | 21/11/2020 01:28:55




50:50,LSTM1,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.7656 (+/- 0.1078)|Test F1: 0.0000|134s
Iteration 3 of 10 | Duration 32.31m | 0.61h since start
Iteration 4 of 10 | Training 1 of 15 | 21/11/2020 01:31:09
50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9864 (+/- 0.0280)|Test F1: 0.6667|126s
Iteration 4 of 10 | Training 2 of 15 | 21/11/2020 01:33:16




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.9940 (+/- 0.0162)|Test F1: 0.8000|132s
Iteration 4 of 10 | Training 3 of 15 | 21/11/2020 01:35:28




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.9942 (+/- 0.0154)|Test F1: 0.6667|132s
Iteration 4 of 10 | Training 4 of 15 | 21/11/2020 01:37:41
50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.9929 (+/- 0.0184)|Test F1: 0.6667|130s
Iteration 4 of 10 | Training 5 of 15 | 21/11/2020 01:39:52




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.9912 (+/- 0.0279)|Test F1: 0.6667|135s
Iteration 4 of 10 | Training 6 of 15 | 21/11/2020 01:42:08




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.9903 (+/- 0.0327)|Test F1: 0.6667|135s
Iteration 4 of 10 | Training 7 of 15 | 21/11/2020 01:44:23
50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.9879 (+/- 0.0406)|Test F1: 0.6667|129s
Iteration 4 of 10 | Training 8 of 15 | 21/11/2020 01:46:33




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.9870 (+/- 0.0328)|Test F1: 0.6667|131s
Iteration 4 of 10 | Training 9 of 15 | 21/11/2020 01:48:44
50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.9930 (+/- 0.0186)|Test F1: 0.6667|131s
Iteration 4 of 10 | Training 10 of 15 | 21/11/2020 01:50:56




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.9861 (+/- 0.0271)|Test F1: 0.6667|133s
Iteration 4 of 10 | Training 11 of 15 | 21/11/2020 01:53:10




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:400|Train F1: 0.7517 (+/- 0.3338)|Test F1: 0.0000|138s
Iteration 4 of 10 | Training 12 of 15 | 21/11/2020 01:55:28
50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.7552 (+/- 0.3249)|Test F1: 0.1111|134s
Iteration 4 of 10 | Training 13 of 15 | 21/11/2020 01:57:42




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.4659 (+/- 0.5848)|Test F1: 0.0800|135s
Iteration 4 of 10 | Training 14 of 15 | 21/11/2020 01:59:57




50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.7566 (+/- 0.1418)|Test F1: 0.0000|139s
Iteration 4 of 10 | Training 15 of 15 | 21/11/2020 02:02:17
50:50,LSTM2,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.7327 (+/- 0.3199)|Test F1: 0.0833|129s
Iteration 4 of 10 | Duration 33.29m | 1.16h since start
Iteration 5 of 10 | Training 1 of 15 | 21/11/2020 02:04:26




50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.4152 (+/- 0.6265)|Test F1: 0.0000|154s
Iteration 5 of 10 | Training 2 of 15 | 21/11/2020 02:07:01




50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.3424 (+/- 0.6609)|Test F1: 0.0800|196s
Iteration 5 of 10 | Training 3 of 15 | 21/11/2020 02:10:17
50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.4760 (+/- 0.5988)|Test F1: 0.0000|239s
Iteration 5 of 10 | Training 4 of 15 | 21/11/2020 02:14:16
50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.3895 (+/- 0.6403)|Test F1: 0.0800|154s
Iteration 5 of 10 | Training 5 of 15 | 21/11/2020 02:16:51
50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.5034 (+/- 0.5720)|Test F1: 0.0000|192s
Iteration 5 of 10 | Training 6 of 15 | 21/11/2020 02:20:04
50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.2853 (+/- 0.6461)|Test F1: 0.0800|234s
Iteration 5 of 10 | Training 7 of 15 | 21/11/2020 02:23:59
50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.4225 (+/- 0.6383)|Test F1: 0.0



50:50,CNN1a,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.3096 (+/- 0.6472)|Test F1: 0.0800|236s
Iteration 5 of 10 | Duration 48.43m | 1.97h since start
Iteration 6 of 10 | Training 1 of 15 | 21/11/2020 02:52:52
50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9958 (+/- 0.0142)|Test F1: 0.6667|152s
Iteration 6 of 10 | Training 2 of 15 | 21/11/2020 02:55:25
50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.9973 (+/- 0.0108)|Test F1: 0.0000|194s
Iteration 6 of 10 | Training 3 of 15 | 21/11/2020 02:58:39




50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.9969 (+/- 0.0112)|Test F1: 1.0000|238s
Iteration 6 of 10 | Training 4 of 15 | 21/11/2020 03:02:38
50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.9964 (+/- 0.0116)|Test F1: 0.6667|150s
Iteration 6 of 10 | Training 5 of 15 | 21/11/2020 03:05:09
50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.9963 (+/- 0.0178)|Test F1: 0.6667|192s
Iteration 6 of 10 | Training 6 of 15 | 21/11/2020 03:08:22
50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.9973 (+/- 0.0108)|Test F1: 0.6667|235s
Iteration 6 of 10 | Training 7 of 15 | 21/11/2020 03:12:17
50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.9973 (+/- 0.0108)|Test F1: 1.0000|150s
Iteration 6 of 10 | Training 8 of 15 | 21/11/2020 03:14:48
50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.9968 (+/- 0.0135)|Test F1: 0.6



50:50,CNN1b,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.8178 (+/- 0.1190)|Test F1: 0.0000|236s
Iteration 6 of 10 | Duration 48.37m | 2.77h since start
Iteration 7 of 10 | Training 1 of 15 | 21/11/2020 03:41:14
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9951 (+/- 0.0175)|Test F1: 0.6667|163s
Iteration 7 of 10 | Training 2 of 15 | 21/11/2020 03:43:58
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.9968 (+/- 0.0135)|Test F1: 0.6667|208s
Iteration 7 of 10 | Training 3 of 15 | 21/11/2020 03:47:27
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.9968 (+/- 0.0135)|Test F1: 0.6667|252s
Iteration 7 of 10 | Training 4 of 15 | 21/11/2020 03:51:40
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.9955 (+/- 0.0162)|Test F1: 1.0000|166s
Iteration 7 of 10 | Training 5 of 15 | 21/11/2020 03:54:26




50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.9973 (+/- 0.0108)|Test F1: 0.6667|213s
Iteration 7 of 10 | Training 6 of 15 | 21/11/2020 03:57:59
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.9963 (+/- 0.0178)|Test F1: 0.0000|254s
Iteration 7 of 10 | Training 7 of 15 | 21/11/2020 04:02:14
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.9868 (+/- 0.0929)|Test F1: 0.6667|163s
Iteration 7 of 10 | Training 8 of 15 | 21/11/2020 04:04:57
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.9962 (+/- 0.0150)|Test F1: 0.6667|205s
Iteration 7 of 10 | Training 9 of 15 | 21/11/2020 04:08:23
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.9953 (+/- 0.0196)|Test F1: 1.0000|251s
Iteration 7 of 10 | Training 10 of 15 | 21/11/2020 04:12:34
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.9973 (+/- 0.0108)|Test F1: 1.



50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.8105 (+/- 0.1259)|Test F1: 0.0000|255s
Iteration 7 of 10 | Training 13 of 15 | 21/11/2020 04:22:59
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.4935 (+/- 0.5587)|Test F1: 0.0000|162s
Iteration 7 of 10 | Training 14 of 15 | 21/11/2020 04:25:41
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.8101 (+/- 0.0973)|Test F1: 0.0000|206s
Iteration 7 of 10 | Training 15 of 15 | 21/11/2020 04:29:08
50:50,CNN1c,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.7823 (+/- 0.3550)|Test F1: 0.1111|252s
Iteration 7 of 10 | Duration 52.10m | 3.64h since start
Iteration 8 of 10 | Training 1 of 15 | 21/11/2020 04:33:20
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9951 (+/- 0.0213)|Test F1: 0.0000|142s
Iteration 8 of 10 | Training 2 of 15 | 21/11/2020 04:35:42
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:4,Em



50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.9959 (+/- 0.0140)|Test F1: 0.0000|223s
Iteration 8 of 10 | Training 4 of 15 | 21/11/2020 04:42:26
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.9955 (+/- 0.0155)|Test F1: 0.6667|142s
Iteration 8 of 10 | Training 5 of 15 | 21/11/2020 04:44:49
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.9968 (+/- 0.0113)|Test F1: 0.0000|182s
Iteration 8 of 10 | Training 6 of 15 | 21/11/2020 04:47:51
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.9958 (+/- 0.0180)|Test F1: 0.0000|220s
Iteration 8 of 10 | Training 7 of 15 | 21/11/2020 04:51:31
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.9939 (+/- 0.0229)|Test F1: 0.0000|143s
Iteration 8 of 10 | Training 8 of 15 | 21/11/2020 04:53:55
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.9968 (+/- 0.0113)|Test F1: 0.0



50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.7886 (+/- 0.1470)|Test F1: 0.3333|224s
Iteration 8 of 10 | Training 13 of 15 | 21/11/2020 05:09:57
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.2517 (+/- 0.6184)|Test F1: 0.0000|142s
Iteration 8 of 10 | Training 14 of 15 | 21/11/2020 05:12:19
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.8152 (+/- 0.1364)|Test F1: 0.5000|183s
Iteration 8 of 10 | Training 15 of 15 | 21/11/2020 05:15:22
50:50,CNN1d,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.7933 (+/- 0.2325)|Test F1: 0.2500|220s
Iteration 8 of 10 | Duration 45.71m | 4.40h since start
Iteration 9 of 10 | Training 1 of 15 | 21/11/2020 05:19:03
50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.4490 (+/- 0.6116)|Test F1: 0.3333|190s
Iteration 9 of 10 | Training 2 of 15 | 21/11/2020 05:22:14
50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:4,Embe



50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.7203 (+/- 0.5176)|Test F1: 0.1111|275s
Iteration 9 of 10 | Training 4 of 15 | 21/11/2020 05:30:43
50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.5583 (+/- 0.6501)|Test F1: 0.0541|190s
Iteration 9 of 10 | Training 5 of 15 | 21/11/2020 05:33:54




50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:400|Train F1: 0.6448 (+/- 0.5499)|Test F1: 0.2857|236s
Iteration 9 of 10 | Training 6 of 15 | 21/11/2020 05:37:50




50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.5950 (+/- 0.6669)|Test F1: 0.2105|276s
Iteration 9 of 10 | Training 7 of 15 | 21/11/2020 05:42:27
50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.5570 (+/- 0.6725)|Test F1: 0.2500|192s
Iteration 9 of 10 | Training 8 of 15 | 21/11/2020 05:45:40




50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.5247 (+/- 0.7240)|Test F1: 0.2000|234s
Iteration 9 of 10 | Training 9 of 15 | 21/11/2020 05:49:34
50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.6192 (+/- 0.5289)|Test F1: 0.1818|274s
Iteration 9 of 10 | Training 10 of 15 | 21/11/2020 05:54:09




50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.3932 (+/- 0.7440)|Test F1: 0.0000|199s
Iteration 9 of 10 | Training 11 of 15 | 21/11/2020 05:57:28




50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:400|Train F1: 0.3198 (+/- 0.6323)|Test F1: 0.0541|237s
Iteration 9 of 10 | Training 12 of 15 | 21/11/2020 06:01:25
50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.3128 (+/- 0.6545)|Test F1: 0.0000|273s
Iteration 9 of 10 | Training 13 of 15 | 21/11/2020 06:05:59
50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.4404 (+/- 0.6078)|Test F1: 0.0800|191s
Iteration 9 of 10 | Training 14 of 15 | 21/11/2020 06:09:10




50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.4208 (+/- 0.6357)|Test F1: 0.0800|240s
Iteration 9 of 10 | Training 15 of 15 | 21/11/2020 06:13:11




50:50,CNN2,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.4851 (+/- 0.5674)|Test F1: 0.0800|284s
Iteration 9 of 10 | Duration 58.89m | 5.39h since start
Iteration 10 of 10 | Training 1 of 15 | 21/11/2020 06:17:56
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:300|Train F1: 0.9955 (+/- 0.0177)|Test F1: 1.0000|102s
Iteration 10 of 10 | Training 2 of 15 | 21/11/2020 06:19:39
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:400|Train F1: 0.9937 (+/- 0.0214)|Test F1: 0.0000|124s
Iteration 10 of 10 | Training 3 of 15 | 21/11/2020 06:21:43
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:4,Embedding_size:500|Train F1: 0.9964 (+/- 0.0117)|Test F1: 0.0000|147s
Iteration 10 of 10 | Training 4 of 15 | 21/11/2020 06:24:11
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:300|Train F1: 0.9836 (+/- 0.0990)|Test F1: 1.0000|101s
Iteration 10 of 10 | Training 5 of 15 | 21/11/2020 06:25:53
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:5,Embedd



50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:5,Embedding_size:500|Train F1: 0.9939 (+/- 0.0225)|Test F1: 0.6667|149s
Iteration 10 of 10 | Training 7 of 15 | 21/11/2020 06:30:27
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:300|Train F1: 0.9929 (+/- 0.0247)|Test F1: 1.0000|102s
Iteration 10 of 10 | Training 8 of 15 | 21/11/2020 06:32:10
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:400|Train F1: 0.9697 (+/- 0.1442)|Test F1: 0.0000|126s
Iteration 10 of 10 | Training 9 of 15 | 21/11/2020 06:34:17
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:6,Embedding_size:500|Train F1: 0.9904 (+/- 0.0396)|Test F1: 1.0000|146s
Iteration 10 of 10 | Training 10 of 15 | 21/11/2020 06:36:43
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:300|Train F1: 0.9950 (+/- 0.0283)|Test F1: 0.6667|102s
Iteration 10 of 10 | Training 11 of 15 | 21/11/2020 06:38:26
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:400|Train F1: 0.8222 (+/- 0.1265)|Test F1: 0.



50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:7,Embedding_size:500|Train F1: 0.8136 (+/- 0.1022)|Test F1: 0.0000|150s
Iteration 10 of 10 | Training 13 of 15 | 21/11/2020 06:43:01
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:300|Train F1: 0.4647 (+/- 0.5828)|Test F1: 0.0800|101s
Iteration 10 of 10 | Training 14 of 15 | 21/11/2020 06:44:43
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:400|Train F1: 0.8229 (+/- 0.1128)|Test F1: 0.1176|124s
Iteration 10 of 10 | Training 15 of 15 | 21/11/2020 06:46:48
50:50,CNN3,x,norm,wv_cbow_mean,Min_word_count:8,Embedding_size:500|Train F1: 0.7889 (+/- 0.3421)|Test F1: 0.0000|149s
Iteration 10 of 10 | Duration 31.35m | 5.91h since start


In [None]:
# Recorded result from an earlier SVMs run (original note: "SVMs 1-3, 20-100"):
#   50:50,SVMs,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|9s
#
# This run: all classifiers, TF-IDF with nTo 1 and FeatureMax 20.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

No training roots to delete
No test roots to delete
Iteration 1 of 5 | Training 1 of 1 | 17/11/2020 02:40:59
50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.9763 (+/- 0.0294)|Test F1: 0.5714|1s
Iteration 1 of 5 | Duration 0.00m | 0.00h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete
Iteration 2 of 5 | Training 1 of 1 | 17/11/2020 02:40:59
...loaded vectorized file
...cross validating
50:50,SVMa,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9499 (+/- 0.0674)|Test F1: 0.4444|1s
Iteration 2 of 5 | Duration 0.02m | 0.00h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete
Iteration 3 of 5 | Training 1 of 1 | 17/11/2020 02:41:00
50:50,SVMs,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|9s
Iteration 3 of 5 | Duration 0.00m | 0.00h sinc

In [None]:
# All classifiers, TF-IDF with nTo 1 and FeatureMax 20, plus three appended
# RI features (r_sm, ts_f, ts_lw).
# NOTE(review): "RI" presumably stands for readability indices - confirm.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

No training roots to delete
No test roots to delete
Iteration 1 of 5 | Training 1 of 1 | 17/11/2020 02:49:12
...appended r_sm
...appended ts_f
...appended ts_lw
...loaded vectorized file
...cross validating
50:50,LR,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9762 (+/- 0.0362)|Test F1: 0.6667|1s
Iteration 1 of 5 | Duration 0.02m | 0.00h since start
No training XC to delete
No test XC to delete
No training roots to delete
No test roots to delete
Iteration 2 of 5 | Training 1 of 1 | 17/11/2020 02:49:14
...appended r_sm
...appended ts_f
...appended ts_lw
...loaded vectorized file
...cross validating
50:50,SVMa,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9556 (+/- 0.0305)|Test F1: 0.5000|0s
Iteration 2 of 5 | Duration 0.00m | 0.00h since start
No training XC to delete
No test XC to delete
No training roots to delete
No test roots to delete
Iteration 3 of 5 | Training 1 of 1 | 17/11/2020 02:49:14
...appended r_sm
...appended ts_f
...appended ts_lw
...loaded vectorized fi

In [None]:
# Switch the printError code-control flag off for the cells run after this one.
# NOTE(review): presumably silences the verbose diagnostic messages - confirm where the flag is read.
printError = False

In [None]:
# All classifiers, TF-IDF with nTo 1 and FeatureMax 20, plus the extra
# features selected by RFECV on the 40-feature TF-IDF run and the next 4.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

Iteration 1 of 5 | Training 1 of 1 | 17/11/2020 02:59:53
50:50,LR,x+r8+rC+rH+c0+cR+r2+rF,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9788 (+/- 0.0308)|Test F1: 0.6667|1s
Iteration 1 of 5 | Duration 0.02m | 0.00h since start
Iteration 2 of 5 | Training 1 of 1 | 17/11/2020 02:59:54
50:50,SVMa,x+r8+rC+rH+c0+cR+r2+rF,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9535 (+/- 0.0242)|Test F1: 0.5000|0s
Iteration 2 of 5 | Duration 0.00m | 0.00h since start
Iteration 3 of 5 | Training 1 of 1 | 17/11/2020 02:59:54
50:50,SVMs,x+r8+rC+rH+c0+cR+r2+rF,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|0s
Iteration 3 of 5 | Duration 0.00m | 0.00h since start
Iteration 4 of 5 | Training 1 of 1 | 17/11/2020 02:59:54
50:50,RF20,x+r8+rC+rH+c0+cR+r2+rF,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.0000|1s
Iteration 4 of 5 | Duration 0.03m | 0.00h since start
Iteration 5 of 5 | Training 1 of 1 | 17/11/2020 02:59:56
50:50,ET20,x+r8+rC+rH+c0+cR+r2+rF,norm,tfidf,

In [None]:
# All classifiers, TF-IDF with nTo 1 and FeatureMax 20, plus the extra
# features selected by ANOVA.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

Iteration 1 of 5 | Training 1 of 1 | 17/11/2020 03:09:44
50:50,LR,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9785 (+/- 0.0173)|Test F1: 0.5714|1s
Iteration 1 of 5 | Duration 0.02m | 0.00h since start
Iteration 2 of 5 | Training 1 of 1 | 17/11/2020 03:09:45
50:50,SVMa,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9448 (+/- 0.0349)|Test F1: 0.5000|0s
Iteration 2 of 5 | Duration 0.00m | 0.00h since start
Iteration 3 of 5 | Training 1 of 1 | 17/11/2020 03:09:45
50:50,SVMs,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|0s
Iteration 3 of 5 | Duration 0.00m | 0.00h since start
Iteration 4 of 5 | Training 1 of 1 | 17/11/2020 03:09:45
50:50,RF20,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.0000|1s
Iteration 4 of 5 | Duration 0.03m | 0.00h since start
Iteration 5 of 5 | Training 1 of 1 | 17/11/2020 03:09:47
50:50,ET20,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 1.0000 (+/- 0.0

In [None]:
# All classifiers, TF-IDF with nTo 1 and FeatureMax 30, run on the base
# feature set and on each of the feature selections from the earlier runs.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

Iteration 1 of 60 | Training 1 of 1 | 17/11/2020 03:40:15
50:50,LR,x,raw,tfidf,nTo:1,FeatureMax:30|Train F1: 0.9881 (+/- 0.0258)|Test F1: 0.5714|1s
Iteration 1 of 60 | Duration 0.02m | 0.00h since start
Iteration 2 of 60 | Training 1 of 1 | 17/11/2020 03:40:16
50:50,LR,x+r8+rC+rH,raw,tfidf,nTo:1,FeatureMax:30|Train F1: 0.9717 (+/- 0.0381)|Test F1: 0.6667|0s
Iteration 2 of 60 | Duration 0.00m | 0.00h since start
Iteration 3 of 60 | Training 1 of 1 | 17/11/2020 03:40:16
50:50,LR,x+r8+rC+rH+c0+cR+r2+rF,raw,tfidf,nTo:1,FeatureMax:30|Train F1: 0.4244 (+/- 0.1343)|Test F1: 0.1111|0s
Iteration 3 of 60 | Duration 0.00m | 0.00h since start
Iteration 4 of 60 | Training 1 of 1 | 17/11/2020 03:40:16
50:50,LR,x+cJ+rG+cV+c0,raw,tfidf,nTo:1,FeatureMax:30|Train F1: 0.3062 (+/- 0.3113)|Test F1: 0.1538|0s
Iteration 4 of 60 | Duration 0.00m | 0.00h since start
Iteration 5 of 60 | Training 1 of 1 | 17/11/2020 03:40:17
50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:30 in df|Train F1: 0.9928 (+/- 0.0191)|Test F1: 0

In [None]:
# All classifiers, TF-IDF with nTo 1 and FeatureMax 40, run on the base
# feature set and on each of the feature selections from the earlier runs.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

Iteration 1 of 60 | Training 1 of 1 | 17/11/2020 03:43:06
50:50,LR,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9903 (+/- 0.0097)|Test F1: 0.8000|0s
Iteration 1 of 60 | Duration 0.00m | 0.00h since start
Iteration 2 of 60 | Training 1 of 1 | 17/11/2020 03:43:06
50:50,LR,x+r8+rC+rH,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9837 (+/- 0.0452)|Test F1: 0.8000|0s
Iteration 2 of 60 | Duration 0.00m | 0.00h since start
Iteration 3 of 60 | Training 1 of 1 | 17/11/2020 03:43:06
50:50,LR,x+r8+rC+rH+c0+cR+r2+rF,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.4243 (+/- 0.0535)|Test F1: 0.1111|0s
Iteration 3 of 60 | Duration 0.00m | 0.00h since start
Iteration 4 of 60 | Training 1 of 1 | 17/11/2020 03:43:06
50:50,LR,x+cJ+rG+cV+c0,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.2866 (+/- 0.2904)|Test F1: 0.1667|0s
Iteration 4 of 60 | Duration 0.00m | 0.00h since start
Iteration 5 of 60 | Training 1 of 1 | 17/11/2020 03:43:07
50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9952 (+/- 0.0118)|Test F1: 1.0000|

In [None]:
# MLP only, TF-IDF with n=1 (nTo 1), FeatureMax swept from 10 to 40.
# Recorded results for FeatureMax 40 under two cross-validation schemes:
#   5-fold CV: 50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9952 (+/- 0.0190)|Test F1: 0.6667|2s
#   5-times 5-fold CV: 50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9945 (+/- 0.0156)|Test F1: 1.0000|8s
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

No training roots to delete
No test roots to delete
Iteration 1 of 1 | Training 1 of 4 | 17/11/2020 12:06:20
...loaded vectorized file
...cross validating




50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:10|Train F1: 0.9772 (+/- 0.0573)|Test F1: 0.5000|15s
Iteration 1 of 1 | Training 2 of 4 | 17/11/2020 12:06:35
...loaded vectorized file
...cross validating
50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9926 (+/- 0.0172)|Test F1: 0.5000|10s
Iteration 1 of 1 | Training 3 of 4 | 17/11/2020 12:06:46
...loaded vectorized file
...cross validating
50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:30|Train F1: 0.9880 (+/- 0.0481)|Test F1: 0.8000|8s
Iteration 1 of 1 | Training 4 of 4 | 17/11/2020 12:06:55
...loaded vectorized file
...cross validating
50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9945 (+/- 0.0156)|Test F1: 1.0000|8s
Iteration 1 of 1 | Duration 0.72m | 0.01h since start
No training XC to delete
No test XC to delete
No training XR to delete
No test XR to delete
No training roots to delete
No test roots to delete


In [None]:
# MLP only, TF-IDF with nTo 1 and FeatureMax 40, run on each of the feature
# selections from the earlier runs.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

Iteration 1 of 3 | Training 1 of 1 | 17/11/2020 15:38:43
50:50,MLP8,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9946 (+/- 0.0122)|Test F1: 0.6667|9s
Iteration 1 of 3 | Duration 0.16m | 0.00h since start
Iteration 2 of 3 | Training 1 of 1 | 17/11/2020 15:38:53
50:50,MLP8,x+r8+rC+rH+c0+cR+r2+rF,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9951 (+/- 0.0120)|Test F1: 1.0000|7s
Iteration 2 of 3 | Duration 0.13m | 0.00h since start
Iteration 3 of 3 | Training 1 of 1 | 17/11/2020 15:39:01
50:50,MLP8,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9942 (+/- 0.0143)|Test F1: 0.6667|7s
Iteration 3 of 3 | Duration 0.13m | 0.01h since start


In [None]:
# MLP only, TF-IDF with nTo 1 and FeatureMax 40, base feature set.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

Iteration 1 of 1 | Training 1 of 1 | 17/11/2020 21:14:08
50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9950 (+/- 0.0163)|Test F1: 1.0000|22s
Iteration 1 of 1 | Duration 0.38m | 0.01h since start


In [None]:
# CNN1a only, TF-IDF with nTo 1 and FeatureMax 40, base feature set.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

Iteration 1 of 1 | Training 1 of 1 | 18/11/2020 00:10:46
50:50,CNN1a,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.1854 (+/- 0.5953)|Test F1: 0.0000|42s
Iteration 1 of 1 | Duration 0.70m | 0.01h since start


In [None]:
# Every classifier except MLP8, TF-IDF with nTo 1 and FeatureMax 40,
# base feature set.
# Relies on resample/clfs/features/transforms being set in earlier cells.
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test
)

Iteration 1 of 10 | Training 1 of 1 | 17/11/2020 23:54:28
50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9951 (+/- 0.0173)|Test F1: 1.0000|1s
Iteration 1 of 10 | Duration 0.02m | 0.00h since start
Iteration 2 of 10 | Training 1 of 1 | 17/11/2020 23:54:30
50:50,SVMa,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9951 (+/- 0.0173)|Test F1: 1.0000|0s
Iteration 2 of 10 | Duration 0.00m | 0.00h since start
Iteration 3 of 10 | Training 1 of 1 | 17/11/2020 23:54:30
50:50,SVMs,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|0s
Iteration 3 of 10 | Duration 0.00m | 0.00h since start
Iteration 4 of 10 | Training 1 of 1 | 17/11/2020 23:54:30




50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.8535 (+/- 0.1503)|Test F1: 0.5714|120s
Iteration 4 of 10 | Duration 2.01m | 0.03h since start
Iteration 5 of 10 | Training 1 of 1 | 17/11/2020 23:56:31




50:50,LSTM2,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.8990 (+/- 0.0992)|Test F1: 0.3333|119s
Iteration 5 of 10 | Duration 1.99m | 0.07h since start
Iteration 6 of 10 | Training 1 of 1 | 17/11/2020 23:58:30
50:50,CNN1b,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9902 (+/- 0.0199)|Test F1: 1.0000|40s
Iteration 6 of 10 | Duration 0.67m | 0.08h since start
Iteration 7 of 10 | Training 1 of 1 | 17/11/2020 23:59:10
50:50,CNN1c,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9870 (+/- 0.0208)|Test F1: 1.0000|41s
Iteration 7 of 10 | Duration 0.68m | 0.09h since start
Iteration 8 of 10 | Training 1 of 1 | 17/11/2020 23:59:51
50:50,CNN1d,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9920 (+/- 0.0235)|Test F1: 0.6667|37s
Iteration 8 of 10 | Duration 0.63m | 0.10h since start
Iteration 9 of 10 | Training 1 of 1 | 18/11/2020 00:00:29
50:50,CNN2,x,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.6307 (+/- 0.4402)|Test F1: 0.1053|81s
Iteration 9 of 10 | Duration 1.36m | 0.12h since start
Iteration 10 of 10

In [None]:
# Experiment: every LSTM and CNN classifier, nTo = 1, FeatureMax = 60
# (labels as they are printed in the funcLoop log).
# The experiment grid is carried by resample / clfs / features / transforms,
# which are set outside this cell; this cell only launches the run.

funcLoop(
    # token frames: base plus PS / LS / SS / WL variants
    # (presumably stemmed / lemmatised versions - TODO confirm)
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    # experiment grid
    resample,
    clfs,
    features,
    transforms,
    # F1 results frame and held-out test data
    f1_df,
    x_test,
    y_test,
)

Iteration 1 of 7 | Training 1 of 1 | 17/11/2020 23:36:44
50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.8781 (+/- 0.0836)|Test F1: 0.5000|116s
Iteration 1 of 7 | Duration 1.94m | 0.03h since start
Iteration 2 of 7 | Training 1 of 1 | 17/11/2020 23:38:41




50:50,LSTM2,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.8874 (+/- 0.0824)|Test F1: 0.5000|119s
Iteration 2 of 7 | Duration 2.00m | 0.07h since start
Iteration 3 of 7 | Training 1 of 1 | 17/11/2020 23:40:41
50:50,CNN1b,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9952 (+/- 0.0183)|Test F1: 1.0000|44s
Iteration 3 of 7 | Duration 0.75m | 0.08h since start
Iteration 4 of 7 | Training 1 of 1 | 17/11/2020 23:41:26
50:50,CNN1c,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9953 (+/- 0.0147)|Test F1: 1.0000|48s
Iteration 4 of 7 | Duration 0.80m | 0.09h since start
Iteration 5 of 7 | Training 1 of 1 | 17/11/2020 23:42:14
50:50,CNN1d,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9904 (+/- 0.0224)|Test F1: 1.0000|44s
Iteration 5 of 7 | Duration 0.74m | 0.10h since start
Iteration 6 of 7 | Training 1 of 1 | 17/11/2020 23:42:58
50:50,CNN2,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.7463 (+/- 0.2449)|Test F1: 0.1176|86s
Iteration 6 of 7 | Duration 1.44m | 0.13h since start
Iteration 7 of 7 | Training

In [None]:
# Experiment: all classifiers, nTo = 1, FeatureMax swept over 60-100
# (the funcLoop log reports FeatureMax values 60, 80 and 100).
# The experiment grid is carried by resample / clfs / features / transforms,
# which are set outside this cell; this cell only launches the run.

funcLoop(
    # token frames: base plus PS / LS / SS / WL variants
    # (presumably stemmed / lemmatised versions - TODO confirm)
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    # experiment grid
    resample,
    clfs,
    features,
    transforms,
    # F1 results frame and held-out test data
    f1_df,
    x_test,
    y_test,
)

Iteration 1 of 12 | Training 1 of 3 | 18/11/2020 00:18:41
50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9947 (+/- 0.0119)|Test F1: 1.0000|1s
Iteration 1 of 12 | Training 2 of 3 | 18/11/2020 00:18:43
50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9957 (+/- 0.0115)|Test F1: 1.0000|1s
Iteration 1 of 12 | Training 3 of 3 | 18/11/2020 00:18:44
50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9966 (+/- 0.0131)|Test F1: 1.0000|1s
Iteration 1 of 12 | Duration 0.07m | 0.00h since start
Iteration 2 of 12 | Training 1 of 3 | 18/11/2020 00:18:45
50:50,SVMa,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9971 (+/- 0.0102)|Test F1: 1.0000|0s
Iteration 2 of 12 | Training 2 of 3 | 18/11/2020 00:18:46
50:50,SVMa,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9976 (+/- 0.0095)|Test F1: 1.0000|0s
Iteration 2 of 12 | Training 3 of 3 | 18/11/2020 00:18:46
50:50,SVMa,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9976 (+/- 0.0095)|Test F1: 1.0000|0s
Iteration 2 of 12 | Duration 0.02m | 0.00h 



50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9958 (+/- 0.0113)|Test F1: 0.6667|8s
Iteration 4 of 12 | Training 3 of 3 | 18/11/2020 00:19:04
50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9967 (+/- 0.0106)|Test F1: 1.0000|8s
Iteration 4 of 12 | Duration 0.43m | 0.01h since start
Iteration 5 of 12 | Training 1 of 3 | 18/11/2020 00:19:13




50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.8957 (+/- 0.1084)|Test F1: 0.5000|123s
Iteration 5 of 12 | Training 2 of 3 | 18/11/2020 00:21:17




50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.8883 (+/- 0.0851)|Test F1: 0.6667|121s
Iteration 5 of 12 | Training 3 of 3 | 18/11/2020 00:23:18
50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.8880 (+/- 0.0634)|Test F1: 0.6667|117s
Iteration 5 of 12 | Duration 6.05m | 0.11h since start
Iteration 6 of 12 | Training 1 of 3 | 18/11/2020 00:25:16




50:50,LSTM2,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.8786 (+/- 0.0775)|Test F1: 0.5000|120s
Iteration 6 of 12 | Training 2 of 3 | 18/11/2020 00:27:16
50:50,LSTM2,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.8968 (+/- 0.0841)|Test F1: 0.6667|116s
Iteration 6 of 12 | Training 3 of 3 | 18/11/2020 00:29:13




50:50,LSTM2,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9138 (+/- 0.0997)|Test F1: 1.0000|124s
Iteration 6 of 12 | Duration 6.01m | 0.21h since start
Iteration 7 of 12 | Training 1 of 3 | 18/11/2020 00:31:17
50:50,CNN1a,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.5331 (+/- 0.5356)|Test F1: 0.0000|45s
Iteration 7 of 12 | Training 2 of 3 | 18/11/2020 00:32:03
50:50,CNN1a,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.4300 (+/- 0.6466)|Test F1: 0.0800|53s
Iteration 7 of 12 | Training 3 of 3 | 18/11/2020 00:32:56
50:50,CNN1a,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.3970 (+/- 0.6498)|Test F1: 0.0000|60s
Iteration 7 of 12 | Duration 2.67m | 0.25h since start
Iteration 8 of 12 | Training 1 of 3 | 18/11/2020 00:33:57
50:50,CNN1b,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9972 (+/- 0.0101)|Test F1: 1.0000|46s
Iteration 8 of 12 | Training 2 of 3 | 18/11/2020 00:34:44
50:50,CNN1b,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9962 (+/- 0.0132)|Test F1: 1.0000|52s
Iteration 8 of 12 | Trainin



50:50,CNN1d,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9922 (+/- 0.0155)|Test F1: 1.0000|54s
Iteration 10 of 12 | Training 3 of 3 | 18/11/2020 00:41:11




50:50,CNN1d,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9961 (+/- 0.0135)|Test F1: 0.0000|61s
Iteration 10 of 12 | Duration 2.71m | 0.39h since start
Iteration 11 of 12 | Training 1 of 3 | 18/11/2020 00:42:13
50:50,CNN2,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.7576 (+/- 0.1532)|Test F1: 0.1379|88s
Iteration 11 of 12 | Training 2 of 3 | 18/11/2020 00:43:41
50:50,CNN2,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.7134 (+/- 0.1778)|Test F1: 0.2353|97s
Iteration 11 of 12 | Training 3 of 3 | 18/11/2020 00:45:18




50:50,CNN2,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.8210 (+/- 0.3202)|Test F1: 0.4000|109s
Iteration 11 of 12 | Duration 4.91m | 0.47h since start
Iteration 12 of 12 | Training 1 of 3 | 18/11/2020 00:47:07
50:50,CNN3,x,norm,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9991 (+/- 0.0063)|Test F1: 0.0000|38s
Iteration 12 of 12 | Training 2 of 3 | 18/11/2020 00:47:46
50:50,CNN3,x,norm,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9882 (+/- 0.0669)|Test F1: 1.0000|42s
Iteration 12 of 12 | Training 3 of 3 | 18/11/2020 00:48:29
50:50,CNN3,x,norm,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9976 (+/- 0.0095)|Test F1: 1.0000|47s
Iteration 12 of 12 | Duration 2.15m | 0.51h since start


In [None]:
# Experiment: all classifiers, nTo = 1, FeatureMax = 40, feature set "X+L".
# In the funcLoop log the feature column reads e.g. x+r8+rC+rH,
# x+r8+rC+rH+c0+cR+rF and x+cJ+rG+cV+c0, i.e. "x" combined with extra columns.
# NOTE(review): "L" presumably stands for the additional (non-token) feature
# columns - confirm against the cell that builds `features`.

funcLoop(
    # token frames: base plus PS / LS / SS / WL variants
    # (presumably stemmed / lemmatised versions - TODO confirm)
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    # experiment grid
    resample,
    clfs,
    features,
    transforms,
    # F1 results frame and held-out test data
    f1_df,
    x_test,
    y_test,
)

Iteration 1 of 36 | Training 1 of 1 | 18/11/2020 01:15:22
50:50,LR,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9948 (+/- 0.0146)|Test F1: 1.0000|1s
Iteration 1 of 36 | Duration 0.03m | 0.00h since start
Iteration 2 of 36 | Training 1 of 1 | 18/11/2020 01:15:24
50:50,LR,x+r8+rC+rH+c0+cR+rF,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9948 (+/- 0.0146)|Test F1: 1.0000|0s
Iteration 2 of 36 | Duration 0.00m | 0.00h since start
Iteration 3 of 36 | Training 1 of 1 | 18/11/2020 01:15:24
50:50,LR,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9948 (+/- 0.0146)|Test F1: 1.0000|0s
Iteration 3 of 36 | Duration 0.00m | 0.00h since start
Iteration 4 of 36 | Training 1 of 1 | 18/11/2020 01:15:24
50:50,SVMa,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9948 (+/- 0.0146)|Test F1: 1.0000|0s
Iteration 4 of 36 | Duration 0.00m | 0.00h since start
Iteration 5 of 36 | Training 1 of 1 | 18/11/2020 01:15:24
50:50,SVMa,x+r8+rC+rH+c0+cR+rF,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.99



50:50,LSTM1,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.8987 (+/- 0.1118)|Test F1: 0.5714|122s
Iteration 13 of 36 | Duration 2.04m | 0.04h since start
Iteration 14 of 36 | Training 1 of 1 | 18/11/2020 01:17:37




50:50,LSTM1,x+r8+rC+rH+c0+cR+rF,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9111 (+/- 0.1025)|Test F1: 0.8000|118s
Iteration 14 of 36 | Duration 1.97m | 0.07h since start
Iteration 15 of 36 | Training 1 of 1 | 18/11/2020 01:19:35
50:50,LSTM1,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.8748 (+/- 0.1260)|Test F1: 0.5714|119s
Iteration 15 of 36 | Duration 1.98m | 0.10h since start
Iteration 16 of 36 | Training 1 of 1 | 18/11/2020 01:21:34




50:50,LSTM2,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9461 (+/- 0.0875)|Test F1: 0.8000|121s
Iteration 16 of 36 | Duration 2.03m | 0.14h since start
Iteration 17 of 36 | Training 1 of 1 | 18/11/2020 01:23:36
50:50,LSTM2,x+r8+rC+rH+c0+cR+rF,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9357 (+/- 0.1037)|Test F1: 0.8000|115s
Iteration 17 of 36 | Duration 1.93m | 0.17h since start
Iteration 18 of 36 | Training 1 of 1 | 18/11/2020 01:25:32




50:50,LSTM2,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9071 (+/- 0.1388)|Test F1: 0.5000|123s
Iteration 18 of 36 | Duration 2.05m | 0.20h since start
Iteration 19 of 36 | Training 1 of 1 | 18/11/2020 01:27:35
50:50,CNN1a,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.4496 (+/- 0.6208)|Test F1: 0.0800|39s
Iteration 19 of 36 | Duration 0.66m | 0.21h since start
Iteration 20 of 36 | Training 1 of 1 | 18/11/2020 01:28:14
50:50,CNN1a,x+r8+rC+rH+c0+cR+rF,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.3642 (+/- 0.6479)|Test F1: 0.0000|41s
Iteration 20 of 36 | Duration 0.70m | 0.23h since start
Iteration 21 of 36 | Training 1 of 1 | 18/11/2020 01:28:56
50:50,CNN1a,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.2883 (+/- 0.6523)|Test F1: 0.0000|40s
Iteration 21 of 36 | Duration 0.67m | 0.24h since start
Iteration 22 of 36 | Training 1 of 1 | 18/11/2020 01:29:36
50:50,CNN1b,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9913 (+/- 0.0282)|Test F1: 1.0000|40s
Iter



50:50,CNN2,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.5420 (+/- 0.5582)|Test F1: 0.0000|86s
Iteration 31 of 36 | Duration 1.44m | 0.37h since start
Iteration 32 of 36 | Training 1 of 1 | 18/11/2020 01:37:23




50:50,CNN2,x+r8+rC+rH+c0+cR+rF,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.4381 (+/- 0.6564)|Test F1: 0.0769|86s
Iteration 32 of 36 | Duration 1.45m | 0.39h since start
Iteration 33 of 36 | Training 1 of 1 | 18/11/2020 01:38:50
50:50,CNN2,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.5235 (+/- 0.5565)|Test F1: 0.0000|82s
Iteration 33 of 36 | Duration 1.37m | 0.41h since start
Iteration 34 of 36 | Training 1 of 1 | 18/11/2020 01:40:12
50:50,CNN3,x+r8+rC+rH,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9984 (+/- 0.0085)|Test F1: 1.0000|38s
Iteration 34 of 36 | Duration 0.65m | 0.42h since start
Iteration 35 of 36 | Training 1 of 1 | 18/11/2020 01:40:51
50:50,CNN3,x+r8+rC+rH+c0+cR+rF,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9990 (+/- 0.0070)|Test F1: 1.0000|37s
Iteration 35 of 36 | Duration 0.63m | 0.44h since start
Iteration 36 of 36 | Training 1 of 1 | 18/11/2020 01:41:28
50:50,CNN3,x+cJ+rG+cV+c0,norm,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9979 (+/- 0.0097)|Test F1: 1.0000|36s
I

In [None]:
# Experiment: all classifiers, nTo = 1, FeatureMax = 20.
# Known limitation: CNN3 is too big for this setting.
# NOTE(review): the recorded output of this cell stops with a ValueError;
# presumably that is the CNN3 failure meant here - confirm, and drop CNN3 from
# `clfs` before re-running with FeatureMax = 20.

funcLoop(
    # token frames: base plus PS / LS / SS / WL variants
    # (presumably stemmed / lemmatised versions - TODO confirm)
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    # experiment grid
    resample,
    clfs,
    features,
    transforms,
    # F1 results frame and held-out test data
    f1_df,
    x_test,
    y_test,
)

Iteration 1 of 12 | Training 1 of 1 | 18/11/2020 01:48:29
50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9780 (+/- 0.0255)|Test F1: 0.5714|2s
Iteration 1 of 12 | Duration 0.04m | 0.00h since start
Iteration 2 of 12 | Training 1 of 1 | 18/11/2020 01:48:31
50:50,SVMa,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9480 (+/- 0.0503)|Test F1: 0.4444|0s
Iteration 2 of 12 | Duration 0.00m | 0.00h since start
Iteration 3 of 12 | Training 1 of 1 | 18/11/2020 01:48:32
50:50,SVMs,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|0s
Iteration 3 of 12 | Duration 0.00m | 0.00h since start
Iteration 4 of 12 | Training 1 of 1 | 18/11/2020 01:48:32
50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.9926 (+/- 0.0172)|Test F1: 0.5000|10s
Iteration 4 of 12 | Duration 0.00m | 0.00h since start
Iteration 5 of 12 | Training 1 of 1 | 18/11/2020 01:48:32




50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.8030 (+/- 0.3826)|Test F1: 0.4000|121s
Iteration 5 of 12 | Duration 2.03m | 0.03h since start
Iteration 6 of 12 | Training 1 of 1 | 18/11/2020 01:50:34




50:50,LSTM2,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.8509 (+/- 0.1472)|Test F1: 0.2000|124s
Iteration 6 of 12 | Duration 2.08m | 0.07h since start
Iteration 7 of 12 | Training 1 of 1 | 18/11/2020 01:52:39
50:50,CNN1a,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.2604 (+/- 0.6399)|Test F1: 0.0800|30s
Iteration 7 of 12 | Duration 0.51m | 0.08h since start
Iteration 8 of 12 | Training 1 of 1 | 18/11/2020 01:53:09
50:50,CNN1b,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9778 (+/- 0.0336)|Test F1: 0.5000|31s
Iteration 8 of 12 | Duration 0.52m | 0.09h since start
Iteration 9 of 12 | Training 1 of 1 | 18/11/2020 01:53:41
50:50,CNN1c,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9735 (+/- 0.0389)|Test F1: 0.5000|32s
Iteration 9 of 12 | Duration 0.55m | 0.10h since start
Iteration 10 of 12 | Training 1 of 1 | 18/11/2020 01:54:13
50:50,CNN1d,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9809 (+/- 0.0282)|Test F1: 0.8000|30s
Iteration 10 of 12 | Duration 0.52m | 0.10h since start
Iteration 11 of

ValueError: ignored

In [None]:
# Experiment: all classifiers except CNN3, nTo range "1-7", FeatureMax = 20,
# crossed with every scaling option (logged as raw / norm / std) and both
# vectorisers (logged as tf / tfidf).
# NOTE(review): the funcLoop log shows nTo values 1 through 6 ("Training x of 6"),
# so the upper bound 7 is presumably exclusive - confirm.

funcLoop(
    # token frames: base plus PS / LS / SS / WL variants
    # (presumably stemmed / lemmatised versions - TODO confirm)
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    # experiment grid
    resample,
    clfs,
    features,
    transforms,
    # F1 results frame and held-out test data
    f1_df,
    x_test,
    y_test,
)

Iteration 1 of 66 | Training 1 of 6 | 18/11/2020 08:28:37
50:50,LR,x,raw,tf,nTo:1,FeatureMax:20|Train F1: 0.9856 (+/- 0.0282)|Test F1: 0.1667|10s
Iteration 1 of 66 | Training 2 of 6 | 18/11/2020 08:28:47
50:50,LR,x,raw,tf,nTo:2,FeatureMax:20|Train F1: 0.9856 (+/- 0.0282)|Test F1: 0.1667|1s
Iteration 1 of 66 | Training 3 of 6 | 18/11/2020 08:28:49
50:50,LR,x,raw,tf,nTo:3,FeatureMax:20|Train F1: 0.9856 (+/- 0.0282)|Test F1: 0.1667|1s
Iteration 1 of 66 | Training 4 of 6 | 18/11/2020 08:28:50
50:50,LR,x,raw,tf,nTo:4,FeatureMax:20|Train F1: 0.9856 (+/- 0.0282)|Test F1: 0.1667|1s
Iteration 1 of 66 | Training 5 of 6 | 18/11/2020 08:28:51
50:50,LR,x,raw,tf,nTo:5,FeatureMax:20|Train F1: 0.9856 (+/- 0.0282)|Test F1: 0.1667|1s
Iteration 1 of 66 | Training 6 of 6 | 18/11/2020 08:28:53
50:50,LR,x,raw,tf,nTo:6,FeatureMax:20|Train F1: 0.9856 (+/- 0.0282)|Test F1: 0.1667|1s
Iteration 1 of 66 | Duration 0.29m | 0.00h since start
Iteration 2 of 66 | Training 1 of 6 | 18/11/2020 08:28:54
50:50,LR,x,raw,t



50:50,MLP8,x,raw,tf,nTo:1,FeatureMax:20|Train F1: 0.8729 (+/- 0.4486)|Test F1: 0.2857|10s
Iteration 19 of 66 | Training 2 of 6 | 18/11/2020 08:29:35
50:50,MLP8,x,raw,tf,nTo:2,FeatureMax:20|Train F1: 0.8933 (+/- 0.4076)|Test F1: 0.2000|12s
Iteration 19 of 66 | Training 3 of 6 | 18/11/2020 08:29:47
50:50,MLP8,x,raw,tf,nTo:3,FeatureMax:20|Train F1: 0.8751 (+/- 0.3554)|Test F1: 0.1000|9s
Iteration 19 of 66 | Training 4 of 6 | 18/11/2020 08:29:57
50:50,MLP8,x,raw,tf,nTo:4,FeatureMax:20|Train F1: 0.9241 (+/- 0.1698)|Test F1: 0.0000|12s
Iteration 19 of 66 | Training 5 of 6 | 18/11/2020 08:30:10
50:50,MLP8,x,raw,tf,nTo:5,FeatureMax:20|Train F1: 0.9466 (+/- 0.1051)|Test F1: 0.1429|11s
Iteration 19 of 66 | Training 6 of 6 | 18/11/2020 08:30:21
50:50,MLP8,x,raw,tf,nTo:6,FeatureMax:20|Train F1: 0.8881 (+/- 0.3386)|Test F1: 0.0000|10s
Iteration 19 of 66 | Duration 1.14m | 0.03h since start
Iteration 20 of 66 | Training 1 of 6 | 18/11/2020 08:30:32
50:50,MLP8,x,raw,tfidf,nTo:1,FeatureMax:20|Train F1



50:50,MLP8,x,raw,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9869 (+/- 0.0278)|Test F1: 0.3333|14s
Iteration 20 of 66 | Training 3 of 6 | 18/11/2020 08:31:02




50:50,MLP8,x,raw,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9856 (+/- 0.0255)|Test F1: 0.5000|14s
Iteration 20 of 66 | Training 4 of 6 | 18/11/2020 08:31:16




50:50,MLP8,x,raw,tfidf,nTo:4,FeatureMax:20|Train F1: 0.9863 (+/- 0.0235)|Test F1: 0.5000|14s
Iteration 20 of 66 | Training 5 of 6 | 18/11/2020 08:31:31




50:50,MLP8,x,raw,tfidf,nTo:5,FeatureMax:20|Train F1: 0.9859 (+/- 0.0204)|Test F1: 0.5000|14s
Iteration 20 of 66 | Training 6 of 6 | 18/11/2020 08:31:45




50:50,MLP8,x,raw,tfidf,nTo:6,FeatureMax:20|Train F1: 0.9465 (+/- 0.3877)|Test F1: 0.0000|14s
Iteration 20 of 66 | Duration 1.46m | 0.06h since start
Iteration 21 of 66 | Training 1 of 6 | 18/11/2020 08:31:59




50:50,MLP8,x,norm,tf,nTo:1,FeatureMax:20|Train F1: 0.9887 (+/- 0.0237)|Test F1: 0.0000|15s
Iteration 21 of 66 | Training 2 of 6 | 18/11/2020 08:32:15




50:50,MLP8,x,norm,tf,nTo:2,FeatureMax:20|Train F1: 0.9873 (+/- 0.0268)|Test F1: 0.6667|15s
Iteration 21 of 66 | Training 3 of 6 | 18/11/2020 08:32:31




50:50,MLP8,x,norm,tf,nTo:3,FeatureMax:20|Train F1: 0.9856 (+/- 0.0307)|Test F1: 0.8000|15s
Iteration 21 of 66 | Training 4 of 6 | 18/11/2020 08:32:46




50:50,MLP8,x,norm,tf,nTo:4,FeatureMax:20|Train F1: 0.9869 (+/- 0.0290)|Test F1: 0.8000|15s
Iteration 21 of 66 | Training 5 of 6 | 18/11/2020 08:33:02




50:50,MLP8,x,norm,tf,nTo:5,FeatureMax:20|Train F1: 0.9837 (+/- 0.0353)|Test F1: 0.6667|15s
Iteration 21 of 66 | Training 6 of 6 | 18/11/2020 08:33:18




50:50,MLP8,x,norm,tf,nTo:6,FeatureMax:20|Train F1: 0.9881 (+/- 0.0279)|Test F1: 0.6667|15s
Iteration 21 of 66 | Duration 1.56m | 0.08h since start
Iteration 22 of 66 | Training 1 of 6 | 18/11/2020 08:33:33
50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.9926 (+/- 0.0172)|Test F1: 0.5000|10s
Iteration 22 of 66 | Training 2 of 6 | 18/11/2020 08:33:33
50:50,MLP8,x,norm,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9890 (+/- 0.0234)|Test F1: 0.6667|11s
Iteration 22 of 66 | Training 3 of 6 | 18/11/2020 08:33:45
50:50,MLP8,x,norm,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9917 (+/- 0.0194)|Test F1: 0.5000|11s
Iteration 22 of 66 | Training 4 of 6 | 18/11/2020 08:33:57
50:50,MLP8,x,norm,tfidf,nTo:4,FeatureMax:20|Train F1: 0.9879 (+/- 0.0254)|Test F1: 0.5000|11s
Iteration 22 of 66 | Training 5 of 6 | 18/11/2020 08:34:08
50:50,MLP8,x,norm,tfidf,nTo:5,FeatureMax:20|Train F1: 0.9936 (+/- 0.0189)|Test F1: 0.5000|12s
Iteration 22 of 66 | Training 6 of 6 | 18/11/2020 08:34:20
50:50,MLP8,x,norm,tfidf



50:50,MLP8,x,std,tf,nTo:1,FeatureMax:20|Train F1: 0.9904 (+/- 0.0194)|Test F1: 0.0000|13s
Iteration 23 of 66 | Training 2 of 6 | 18/11/2020 08:34:46
50:50,MLP8,x,std,tf,nTo:2,FeatureMax:20|Train F1: 0.9909 (+/- 0.0194)|Test F1: 0.5000|13s
Iteration 23 of 66 | Training 3 of 6 | 18/11/2020 08:34:59
50:50,MLP8,x,std,tf,nTo:3,FeatureMax:20|Train F1: 0.9899 (+/- 0.0249)|Test F1: 0.0000|14s
Iteration 23 of 66 | Training 4 of 6 | 18/11/2020 08:35:13




50:50,MLP8,x,std,tf,nTo:4,FeatureMax:20|Train F1: 0.9926 (+/- 0.0218)|Test F1: 0.0000|13s
Iteration 23 of 66 | Training 5 of 6 | 18/11/2020 08:35:26
50:50,MLP8,x,std,tf,nTo:5,FeatureMax:20|Train F1: 0.9934 (+/- 0.0198)|Test F1: 0.0000|13s
Iteration 23 of 66 | Training 6 of 6 | 18/11/2020 08:35:40
50:50,MLP8,x,std,tf,nTo:6,FeatureMax:20|Train F1: 0.9928 (+/- 0.0195)|Test F1: 0.0000|13s
Iteration 23 of 66 | Duration 1.36m | 0.12h since start
Iteration 24 of 66 | Training 1 of 6 | 18/11/2020 08:35:54
50:50,MLP8,x,std,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9916 (+/- 0.0176)|Test F1: 0.6667|8s
Iteration 24 of 66 | Training 2 of 6 | 18/11/2020 08:36:02
50:50,MLP8,x,std,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9941 (+/- 0.0188)|Test F1: 0.0000|9s
Iteration 24 of 66 | Training 3 of 6 | 18/11/2020 08:36:11
50:50,MLP8,x,std,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9943 (+/- 0.0197)|Test F1: 0.6667|8s
Iteration 24 of 66 | Training 4 of 6 | 18/11/2020 08:36:20
50:50,MLP8,x,std,tfidf,nTo:4,FeatureMax:20|T



Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
50:50,LSTM1,x,raw,tf,nTo:1,FeatureMax:20|Train F1: 0.9834 (+/- 0.0400)|Test F1: 0.6667|122s
Iteration 25 of 66 | Training 2 of 6 | 18/11/2020 08:38:49
50:50,LSTM1,x,raw,tf,nTo:2,FeatureMax:20|Train F1: 0.9839 (+/- 0.0373)|Test F1: 0.0000|118s
Iteration 25 of 66 | Training 3 of 6 | 18/11/2020 08:40:48
50:50,LSTM1,x,raw,tf,nTo:3,FeatureMax:20|Train F1: 0.9868 (+/- 0.0261)|Test F1: 0.0000|110s
Iteration 25 of 66 | Training 4 of 6 | 18/11/2020 08:42:39
50:50,LSTM1,x,raw,tf,nTo:4,FeatureMax:20|Train F1: 0.9859 (+/- 0.0290)|Test F1: 0.6667|118s
Iteration 25 of 66 | Training 5 of 6 | 18/11/2020 08:44:38
50:50,LSTM1,x,raw,tf,nTo:5,FeatureMax:20|Train F1: 0.9771 (+/-



50:50,LSTM1,x,norm,tf,nTo:2,FeatureMax:20|Train F1: 0.7142 (+/- 0.2715)|Test F1: 0.2000|123s
Iteration 27 of 66 | Training 3 of 6 | 18/11/2020 09:04:43
50:50,LSTM1,x,norm,tf,nTo:3,FeatureMax:20|Train F1: 0.6747 (+/- 0.2577)|Test F1: 0.1905|119s
Iteration 27 of 66 | Training 4 of 6 | 18/11/2020 09:06:43




50:50,LSTM1,x,norm,tf,nTo:4,FeatureMax:20|Train F1: 0.6807 (+/- 0.2933)|Test F1: 0.1481|124s
Iteration 27 of 66 | Training 5 of 6 | 18/11/2020 09:08:48




50:50,LSTM1,x,norm,tf,nTo:5,FeatureMax:20|Train F1: 0.7264 (+/- 0.3532)|Test F1: 0.1111|126s
Iteration 27 of 66 | Training 6 of 6 | 18/11/2020 09:10:54
50:50,LSTM1,x,norm,tf,nTo:6,FeatureMax:20|Train F1: 0.7282 (+/- 0.3902)|Test F1: 0.1818|115s
Iteration 27 of 66 | Duration 12.12m | 0.74h since start
Iteration 28 of 66 | Training 1 of 6 | 18/11/2020 09:12:50
50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.8030 (+/- 0.3826)|Test F1: 0.4000|121s
Iteration 28 of 66 | Training 2 of 6 | 18/11/2020 09:12:50




50:50,LSTM1,x,norm,tfidf,nTo:2,FeatureMax:20|Train F1: 0.6968 (+/- 0.1550)|Test F1: 0.1026|124s
Iteration 28 of 66 | Training 3 of 6 | 18/11/2020 09:14:54
50:50,LSTM1,x,norm,tfidf,nTo:3,FeatureMax:20|Train F1: 0.7156 (+/- 0.1621)|Test F1: 0.1053|120s
Iteration 28 of 66 | Training 4 of 6 | 18/11/2020 09:16:55




50:50,LSTM1,x,norm,tfidf,nTo:4,FeatureMax:20|Train F1: 0.7270 (+/- 0.1791)|Test F1: 0.1212|130s
Iteration 28 of 66 | Training 5 of 6 | 18/11/2020 09:19:06
50:50,LSTM1,x,norm,tfidf,nTo:5,FeatureMax:20|Train F1: 0.7212 (+/- 0.1813)|Test F1: 0.0851|121s
Iteration 28 of 66 | Training 6 of 6 | 18/11/2020 09:21:07




50:50,LSTM1,x,norm,tfidf,nTo:6,FeatureMax:20|Train F1: 0.7167 (+/- 0.1870)|Test F1: 0.1212|126s
Iteration 28 of 66 | Duration 10.41m | 0.91h since start
Iteration 29 of 66 | Training 1 of 6 | 18/11/2020 09:23:14
50:50,LSTM1,x,std,tf,nTo:1,FeatureMax:20|Train F1: 0.8221 (+/- 0.1031)|Test F1: 0.2857|120s
Iteration 29 of 66 | Training 2 of 6 | 18/11/2020 09:25:15




50:50,LSTM1,x,std,tf,nTo:2,FeatureMax:20|Train F1: 0.8098 (+/- 0.0969)|Test F1: 0.2857|126s
Iteration 29 of 66 | Training 3 of 6 | 18/11/2020 09:27:21
50:50,LSTM1,x,std,tf,nTo:3,FeatureMax:20|Train F1: 0.8110 (+/- 0.1194)|Test F1: 0.2222|124s
Iteration 29 of 66 | Training 4 of 6 | 18/11/2020 09:29:26




50:50,LSTM1,x,std,tf,nTo:4,FeatureMax:20|Train F1: 0.8194 (+/- 0.0809)|Test F1: 0.2222|126s
Iteration 29 of 66 | Training 5 of 6 | 18/11/2020 09:31:32




50:50,LSTM1,x,std,tf,nTo:5,FeatureMax:20|Train F1: 0.8198 (+/- 0.0805)|Test F1: 0.1818|127s
Iteration 29 of 66 | Training 6 of 6 | 18/11/2020 09:33:39
50:50,LSTM1,x,std,tf,nTo:6,FeatureMax:20|Train F1: 0.8263 (+/- 0.0977)|Test F1: 0.2222|115s
Iteration 29 of 66 | Duration 12.35m | 1.12h since start
Iteration 30 of 66 | Training 1 of 6 | 18/11/2020 09:35:35




50:50,LSTM1,x,std,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9357 (+/- 0.0585)|Test F1: 0.2500|124s
Iteration 30 of 66 | Training 2 of 6 | 18/11/2020 09:37:40
50:50,LSTM1,x,std,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9320 (+/- 0.0623)|Test F1: 0.3636|120s
Iteration 30 of 66 | Training 3 of 6 | 18/11/2020 09:39:40




50:50,LSTM1,x,std,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9352 (+/- 0.0586)|Test F1: 0.2000|124s
Iteration 30 of 66 | Training 4 of 6 | 18/11/2020 09:41:45
50:50,LSTM1,x,std,tfidf,nTo:4,FeatureMax:20|Train F1: 0.9351 (+/- 0.0555)|Test F1: 0.4444|119s
Iteration 30 of 66 | Training 5 of 6 | 18/11/2020 09:43:45




50:50,LSTM1,x,std,tfidf,nTo:5,FeatureMax:20|Train F1: 0.9327 (+/- 0.0558)|Test F1: 0.4444|125s
Iteration 30 of 66 | Training 6 of 6 | 18/11/2020 09:45:50
50:50,LSTM1,x,std,tfidf,nTo:6,FeatureMax:20|Train F1: 0.9338 (+/- 0.0613)|Test F1: 0.3636|120s
Iteration 30 of 66 | Duration 12.26m | 1.32h since start
Iteration 31 of 66 | Training 1 of 6 | 18/11/2020 09:47:51




50:50,LSTM2,x,raw,tf,nTo:1,FeatureMax:20|Train F1: 0.9956 (+/- 0.0168)|Test F1: 0.0000|129s
Iteration 31 of 66 | Training 2 of 6 | 18/11/2020 09:50:00
50:50,LSTM2,x,raw,tf,nTo:2,FeatureMax:20|Train F1: 0.9948 (+/- 0.0182)|Test F1: 0.6667|122s
Iteration 31 of 66 | Training 3 of 6 | 18/11/2020 09:52:03
50:50,LSTM2,x,raw,tf,nTo:3,FeatureMax:20|Train F1: 0.9949 (+/- 0.0177)|Test F1: 0.0000|125s
Iteration 31 of 66 | Training 4 of 6 | 18/11/2020 09:54:08
50:50,LSTM2,x,raw,tf,nTo:4,FeatureMax:20|Train F1: 0.9959 (+/- 0.0148)|Test F1: 0.6667|124s
Iteration 31 of 66 | Training 5 of 6 | 18/11/2020 09:56:12
50:50,LSTM2,x,raw,tf,nTo:5,FeatureMax:20|Train F1: 0.9961 (+/- 0.0146)|Test F1: 0.6667|122s
Iteration 31 of 66 | Training 6 of 6 | 18/11/2020 09:58:15
50:50,LSTM2,x,raw,tf,nTo:6,FeatureMax:20|Train F1: 0.9971 (+/- 0.0123)|Test F1: 0.6667|130s
Iteration 31 of 66 | Duration 12.58m | 1.53h since start
Iteration 32 of 66 | Training 1 of 6 | 18/11/2020 10:00:25
50:50,LSTM2,x,raw,tfidf,nTo:1,Feature



50:50,LSTM2,x,norm,tf,nTo:2,FeatureMax:20|Train F1: 0.7432 (+/- 0.3018)|Test F1: 0.1905|126s
Iteration 33 of 66 | Training 3 of 6 | 18/11/2020 10:17:03




50:50,LSTM2,x,norm,tf,nTo:3,FeatureMax:20|Train F1: 0.7567 (+/- 0.1711)|Test F1: 0.1176|127s
Iteration 33 of 66 | Training 4 of 6 | 18/11/2020 10:19:11




50:50,LSTM2,x,norm,tf,nTo:4,FeatureMax:20|Train F1: 0.7514 (+/- 0.1925)|Test F1: 0.2500|125s
Iteration 33 of 66 | Training 5 of 6 | 18/11/2020 10:21:17




50:50,LSTM2,x,norm,tf,nTo:5,FeatureMax:20|Train F1: 0.6723 (+/- 0.4506)|Test F1: 0.1818|127s
Iteration 33 of 66 | Training 6 of 6 | 18/11/2020 10:23:24
50:50,LSTM2,x,norm,tf,nTo:6,FeatureMax:20|Train F1: 0.7276 (+/- 0.2712)|Test F1: 0.1538|124s
Iteration 33 of 66 | Duration 12.55m | 1.95h since start
Iteration 34 of 66 | Training 1 of 6 | 18/11/2020 10:25:28
50:50,LSTM2,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.8509 (+/- 0.1472)|Test F1: 0.2000|124s
Iteration 34 of 66 | Training 2 of 6 | 18/11/2020 10:25:28




50:50,LSTM2,x,norm,tfidf,nTo:2,FeatureMax:20|Train F1: 0.7350 (+/- 0.1472)|Test F1: 0.1600|127s
Iteration 34 of 66 | Training 3 of 6 | 18/11/2020 10:27:36
50:50,LSTM2,x,norm,tfidf,nTo:3,FeatureMax:20|Train F1: 0.7450 (+/- 0.1853)|Test F1: 0.1250|124s
Iteration 34 of 66 | Training 4 of 6 | 18/11/2020 10:29:40




50:50,LSTM2,x,norm,tfidf,nTo:4,FeatureMax:20|Train F1: 0.7344 (+/- 0.1537)|Test F1: 0.1600|126s
Iteration 34 of 66 | Training 5 of 6 | 18/11/2020 10:31:46
50:50,LSTM2,x,norm,tfidf,nTo:5,FeatureMax:20|Train F1: 0.7366 (+/- 0.1851)|Test F1: 0.1818|119s
Iteration 34 of 66 | Training 6 of 6 | 18/11/2020 10:33:46




50:50,LSTM2,x,norm,tfidf,nTo:6,FeatureMax:20|Train F1: 0.7317 (+/- 0.1684)|Test F1: 0.1538|126s
Iteration 34 of 66 | Duration 10.41m | 2.12h since start
Iteration 35 of 66 | Training 1 of 6 | 18/11/2020 10:35:53
50:50,LSTM2,x,std,tf,nTo:1,FeatureMax:20|Train F1: 0.8204 (+/- 0.0920)|Test F1: 0.1818|121s
Iteration 35 of 66 | Training 2 of 6 | 18/11/2020 10:37:55




50:50,LSTM2,x,std,tf,nTo:2,FeatureMax:20|Train F1: 0.8216 (+/- 0.0878)|Test F1: 0.2222|130s
Iteration 35 of 66 | Training 3 of 6 | 18/11/2020 10:40:05




50:50,LSTM2,x,std,tf,nTo:3,FeatureMax:20|Train F1: 0.8131 (+/- 0.0917)|Test F1: 0.2857|125s
Iteration 35 of 66 | Training 4 of 6 | 18/11/2020 10:42:11




50:50,LSTM2,x,std,tf,nTo:4,FeatureMax:20|Train F1: 0.8171 (+/- 0.1242)|Test F1: 0.2500|121s
Iteration 35 of 66 | Training 5 of 6 | 18/11/2020 10:44:13




50:50,LSTM2,x,std,tf,nTo:5,FeatureMax:20|Train F1: 0.8053 (+/- 0.1025)|Test F1: 0.2222|121s
Iteration 35 of 66 | Training 6 of 6 | 18/11/2020 10:46:14




50:50,LSTM2,x,std,tf,nTo:6,FeatureMax:20|Train F1: 0.8182 (+/- 0.0855)|Test F1: 0.1818|123s
Iteration 35 of 66 | Duration 12.42m | 2.33h since start
Iteration 36 of 66 | Training 1 of 6 | 18/11/2020 10:48:18




50:50,LSTM2,x,std,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9370 (+/- 0.0627)|Test F1: 0.4444|132s
Iteration 36 of 66 | Training 2 of 6 | 18/11/2020 10:50:30
50:50,LSTM2,x,std,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9376 (+/- 0.0637)|Test F1: 0.5714|119s
Iteration 36 of 66 | Training 3 of 6 | 18/11/2020 10:52:30




50:50,LSTM2,x,std,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9375 (+/- 0.0615)|Test F1: 0.4444|126s
Iteration 36 of 66 | Training 4 of 6 | 18/11/2020 10:54:37
50:50,LSTM2,x,std,tfidf,nTo:4,FeatureMax:20|Train F1: 0.9395 (+/- 0.0589)|Test F1: 0.4444|118s
Iteration 36 of 66 | Training 5 of 6 | 18/11/2020 10:56:35




50:50,LSTM2,x,std,tfidf,nTo:5,FeatureMax:20|Train F1: 0.9419 (+/- 0.0569)|Test F1: 0.4444|126s
Iteration 36 of 66 | Training 6 of 6 | 18/11/2020 10:58:42
50:50,LSTM2,x,std,tfidf,nTo:6,FeatureMax:20|Train F1: 0.9317 (+/- 0.0615)|Test F1: 0.3636|123s
Iteration 36 of 66 | Duration 12.46m | 2.54h since start
Iteration 37 of 66 | Training 1 of 6 | 18/11/2020 11:00:45




50:50,CNN1a,x,raw,tf,nTo:1,FeatureMax:20|Train F1: 0.8404 (+/- 0.1306)|Test F1: 0.2857|39s
Iteration 37 of 66 | Training 2 of 6 | 18/11/2020 11:01:24
50:50,CNN1a,x,raw,tf,nTo:2,FeatureMax:20|Train F1: 0.8429 (+/- 0.1251)|Test F1: 0.2857|33s
Iteration 37 of 66 | Training 3 of 6 | 18/11/2020 11:01:57
50:50,CNN1a,x,raw,tf,nTo:3,FeatureMax:20|Train F1: 0.8392 (+/- 0.1284)|Test F1: 0.2857|32s
Iteration 37 of 66 | Training 4 of 6 | 18/11/2020 11:02:30
50:50,CNN1a,x,raw,tf,nTo:4,FeatureMax:20|Train F1: 0.8295 (+/- 0.1317)|Test F1: 0.2222|33s
Iteration 37 of 66 | Training 5 of 6 | 18/11/2020 11:03:04
50:50,CNN1a,x,raw,tf,nTo:5,FeatureMax:20|Train F1: 0.8422 (+/- 0.1323)|Test F1: 0.1818|32s
Iteration 37 of 66 | Training 6 of 6 | 18/11/2020 11:03:36
50:50,CNN1a,x,raw,tf,nTo:6,FeatureMax:20|Train F1: 0.8421 (+/- 0.1239)|Test F1: 0.2857|32s
Iteration 37 of 66 | Duration 3.39m | 2.59h since start
Iteration 38 of 66 | Training 1 of 6 | 18/11/2020 11:04:09
50:50,CNN1a,x,raw,tfidf,nTo:1,FeatureMax:20|



50:50,CNN1a,x,norm,tf,nTo:6,FeatureMax:20|Train F1: 0.3948 (+/- 0.6488)|Test F1: 0.0800|35s
Iteration 39 of 66 | Duration 3.31m | 2.70h since start
Iteration 40 of 66 | Training 1 of 6 | 18/11/2020 11:10:41
50:50,CNN1a,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.2604 (+/- 0.6399)|Test F1: 0.0800|30s
Iteration 40 of 66 | Training 2 of 6 | 18/11/2020 11:10:41




50:50,CNN1a,x,norm,tfidf,nTo:2,FeatureMax:20|Train F1: 0.4417 (+/- 0.6087)|Test F1: 0.0000|35s
Iteration 40 of 66 | Training 3 of 6 | 18/11/2020 11:11:16
50:50,CNN1a,x,norm,tfidf,nTo:3,FeatureMax:20|Train F1: 0.4516 (+/- 0.6248)|Test F1: 0.0000|31s
Iteration 40 of 66 | Training 4 of 6 | 18/11/2020 11:11:48
50:50,CNN1a,x,norm,tfidf,nTo:4,FeatureMax:20|Train F1: 0.4264 (+/- 0.6424)|Test F1: 0.0000|31s
Iteration 40 of 66 | Training 5 of 6 | 18/11/2020 11:12:20
50:50,CNN1a,x,norm,tfidf,nTo:5,FeatureMax:20|Train F1: 0.3677 (+/- 0.6559)|Test F1: 0.0800|32s
Iteration 40 of 66 | Training 6 of 6 | 18/11/2020 11:12:52
50:50,CNN1a,x,norm,tfidf,nTo:6,FeatureMax:20|Train F1: 0.3409 (+/- 0.6580)|Test F1: 0.0000|33s
Iteration 40 of 66 | Duration 2.76m | 2.75h since start
Iteration 41 of 66 | Training 1 of 6 | 18/11/2020 11:13:26
50:50,CNN1a,x,std,tf,nTo:1,FeatureMax:20|Train F1: 0.3985 (+/- 0.6544)|Test F1: 0.0800|32s
Iteration 41 of 66 | Training 2 of 6 | 18/11/2020 11:13:59
50:50,CNN1a,x,std,tf,nTo



50:50,CNN1b,x,raw,tf,nTo:1,FeatureMax:20|Train F1: 0.9900 (+/- 0.0384)|Test F1: 0.5000|38s
Iteration 43 of 66 | Training 2 of 6 | 18/11/2020 11:20:41
50:50,CNN1b,x,raw,tf,nTo:2,FeatureMax:20|Train F1: 0.9911 (+/- 0.0237)|Test F1: 0.5000|32s
Iteration 43 of 66 | Training 3 of 6 | 18/11/2020 11:21:14
50:50,CNN1b,x,raw,tf,nTo:3,FeatureMax:20|Train F1: 0.9922 (+/- 0.0190)|Test F1: 0.6667|32s
Iteration 43 of 66 | Training 4 of 6 | 18/11/2020 11:21:46
50:50,CNN1b,x,raw,tf,nTo:4,FeatureMax:20|Train F1: 0.9918 (+/- 0.0190)|Test F1: 0.4000|33s
Iteration 43 of 66 | Training 5 of 6 | 18/11/2020 11:22:19
50:50,CNN1b,x,raw,tf,nTo:5,FeatureMax:20|Train F1: 0.9927 (+/- 0.0191)|Test F1: 0.6667|32s
Iteration 43 of 66 | Training 6 of 6 | 18/11/2020 11:22:52
50:50,CNN1b,x,raw,tf,nTo:6,FeatureMax:20|Train F1: 0.9888 (+/- 0.0391)|Test F1: 0.4000|33s
Iteration 43 of 66 | Duration 3.39m | 2.91h since start
Iteration 44 of 66 | Training 1 of 6 | 18/11/2020 11:23:26
50:50,CNN1b,x,raw,tfidf,nTo:1,FeatureMax:20|



50:50,CNN1b,x,norm,tf,nTo:4,FeatureMax:20|Train F1: 0.8526 (+/- 0.0888)|Test F1: 0.2222|35s
Iteration 45 of 66 | Training 5 of 6 | 18/11/2020 11:29:01
50:50,CNN1b,x,norm,tf,nTo:5,FeatureMax:20|Train F1: 0.8506 (+/- 0.0919)|Test F1: 0.2857|33s
Iteration 45 of 66 | Training 6 of 6 | 18/11/2020 11:29:34




50:50,CNN1b,x,norm,tf,nTo:6,FeatureMax:20|Train F1: 0.8555 (+/- 0.0835)|Test F1: 0.3333|36s
Iteration 45 of 66 | Duration 3.41m | 3.03h since start
Iteration 46 of 66 | Training 1 of 6 | 18/11/2020 11:30:10
50:50,CNN1b,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.9778 (+/- 0.0336)|Test F1: 0.5000|31s
Iteration 46 of 66 | Training 2 of 6 | 18/11/2020 11:30:10
50:50,CNN1b,x,norm,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9785 (+/- 0.0309)|Test F1: 0.5000|37s
Iteration 46 of 66 | Training 3 of 6 | 18/11/2020 11:30:48
50:50,CNN1b,x,norm,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9770 (+/- 0.0295)|Test F1: 0.5000|33s
Iteration 46 of 66 | Training 4 of 6 | 18/11/2020 11:31:22
50:50,CNN1b,x,norm,tfidf,nTo:4,FeatureMax:20|Train F1: 0.9751 (+/- 0.0274)|Test F1: 0.4444|33s
Iteration 46 of 66 | Training 5 of 6 | 18/11/2020 11:31:55
50:50,CNN1b,x,norm,tfidf,nTo:5,FeatureMax:20|Train F1: 0.9775 (+/- 0.0376)|Test F1: 0.5714|32s
Iteration 46 of 66 | Training 6 of 6 | 18/11/2020 11:32:28
50:50,CNN1b,x,nor



50:50,CNN1b,x,std,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9926 (+/- 0.0212)|Test F1: 0.6667|35s
Iteration 48 of 66 | Training 4 of 6 | 18/11/2020 11:37:59
50:50,CNN1b,x,std,tfidf,nTo:4,FeatureMax:20|Train F1: 0.9916 (+/- 0.0229)|Test F1: 0.0000|32s
Iteration 48 of 66 | Training 5 of 6 | 18/11/2020 11:38:32
50:50,CNN1b,x,std,tfidf,nTo:5,FeatureMax:20|Train F1: 0.9930 (+/- 0.0228)|Test F1: 0.6667|32s
Iteration 48 of 66 | Training 6 of 6 | 18/11/2020 11:39:04
50:50,CNN1b,x,std,tfidf,nTo:6,FeatureMax:20|Train F1: 0.9916 (+/- 0.0229)|Test F1: 0.6667|32s
Iteration 48 of 66 | Duration 3.30m | 3.18h since start
Iteration 49 of 66 | Training 1 of 6 | 18/11/2020 11:39:36
50:50,CNN1c,x,raw,tf,nTo:1,FeatureMax:20|Train F1: 0.9833 (+/- 0.0361)|Test F1: 0.4000|35s
Iteration 49 of 66 | Training 2 of 6 | 18/11/2020 11:40:12




50:50,CNN1c,x,raw,tf,nTo:2,FeatureMax:20|Train F1: 0.9903 (+/- 0.0242)|Test F1: 0.6667|39s
Iteration 49 of 66 | Training 3 of 6 | 18/11/2020 11:40:51
50:50,CNN1c,x,raw,tf,nTo:3,FeatureMax:20|Train F1: 0.9891 (+/- 0.0252)|Test F1: 0.4000|34s
Iteration 49 of 66 | Training 4 of 6 | 18/11/2020 11:41:26
50:50,CNN1c,x,raw,tf,nTo:4,FeatureMax:20|Train F1: 0.9884 (+/- 0.0365)|Test F1: 0.3333|34s
Iteration 49 of 66 | Training 5 of 6 | 18/11/2020 11:42:00
50:50,CNN1c,x,raw,tf,nTo:5,FeatureMax:20|Train F1: 0.9907 (+/- 0.0230)|Test F1: 0.5000|33s
Iteration 49 of 66 | Training 6 of 6 | 18/11/2020 11:42:34
50:50,CNN1c,x,raw,tf,nTo:6,FeatureMax:20|Train F1: 0.9853 (+/- 0.0314)|Test F1: 0.4444|34s
Iteration 49 of 66 | Duration 3.54m | 3.24h since start
Iteration 50 of 66 | Training 1 of 6 | 18/11/2020 11:43:09
50:50,CNN1c,x,raw,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9477 (+/- 0.0662)|Test F1: 0.4000|34s
Iteration 50 of 66 | Training 2 of 6 | 18/11/2020 11:43:43
50:50,CNN1c,x,raw,tfidf,nTo:2,FeatureMax:



50:50,CNN1c,x,norm,tf,nTo:4,FeatureMax:20|Train F1: 0.8480 (+/- 0.0901)|Test F1: 0.2222|38s
Iteration 51 of 66 | Training 5 of 6 | 18/11/2020 11:49:00
50:50,CNN1c,x,norm,tf,nTo:5,FeatureMax:20|Train F1: 0.8487 (+/- 0.1043)|Test F1: 0.2000|34s
Iteration 51 of 66 | Training 6 of 6 | 18/11/2020 11:49:35
50:50,CNN1c,x,norm,tf,nTo:6,FeatureMax:20|Train F1: 0.8567 (+/- 0.0960)|Test F1: 0.2500|34s
Iteration 51 of 66 | Duration 3.49m | 3.36h since start
Iteration 52 of 66 | Training 1 of 6 | 18/11/2020 11:50:09
50:50,CNN1c,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.9735 (+/- 0.0389)|Test F1: 0.5000|32s
Iteration 52 of 66 | Training 2 of 6 | 18/11/2020 11:50:09
50:50,CNN1c,x,norm,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9762 (+/- 0.0283)|Test F1: 0.5000|36s
Iteration 52 of 66 | Training 3 of 6 | 18/11/2020 11:50:45
50:50,CNN1c,x,norm,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9769 (+/- 0.0284)|Test F1: 0.5000|33s
Iteration 52 of 66 | Training 4 of 6 | 18/11/2020 11:51:19
50:50,CNN1c,x,norm,tfid



50:50,CNN1c,x,std,tf,nTo:4,FeatureMax:20|Train F1: 0.9661 (+/- 0.0439)|Test F1: 0.4000|37s
Iteration 53 of 66 | Training 5 of 6 | 18/11/2020 11:55:22
50:50,CNN1c,x,std,tf,nTo:5,FeatureMax:20|Train F1: 0.9604 (+/- 0.0808)|Test F1: 0.3333|34s
Iteration 53 of 66 | Training 6 of 6 | 18/11/2020 11:55:56
50:50,CNN1c,x,std,tf,nTo:6,FeatureMax:20|Train F1: 0.9671 (+/- 0.0438)|Test F1: 0.5000|33s
Iteration 53 of 66 | Duration 3.45m | 3.46h since start
Iteration 54 of 66 | Training 1 of 6 | 18/11/2020 11:56:30
50:50,CNN1c,x,std,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9917 (+/- 0.0234)|Test F1: 0.6667|34s
Iteration 54 of 66 | Training 2 of 6 | 18/11/2020 11:57:05




50:50,CNN1c,x,std,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9898 (+/- 0.0265)|Test F1: 0.5000|38s
Iteration 54 of 66 | Training 3 of 6 | 18/11/2020 11:57:44
50:50,CNN1c,x,std,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9923 (+/- 0.0230)|Test F1: 0.5000|34s
Iteration 54 of 66 | Training 4 of 6 | 18/11/2020 11:58:18
50:50,CNN1c,x,std,tfidf,nTo:4,FeatureMax:20|Train F1: 0.9917 (+/- 0.0225)|Test F1: 0.6667|33s
Iteration 54 of 66 | Training 5 of 6 | 18/11/2020 11:58:51
50:50,CNN1c,x,std,tfidf,nTo:5,FeatureMax:20|Train F1: 0.9925 (+/- 0.0213)|Test F1: 0.6667|33s
Iteration 54 of 66 | Training 6 of 6 | 18/11/2020 11:59:25
50:50,CNN1c,x,std,tfidf,nTo:6,FeatureMax:20|Train F1: 0.9921 (+/- 0.0230)|Test F1: 0.6667|35s
Iteration 54 of 66 | Duration 3.50m | 3.52h since start
Iteration 55 of 66 | Training 1 of 6 | 18/11/2020 12:00:00
50:50,CNN1d,x,raw,tf,nTo:1,FeatureMax:20|Train F1: 0.9173 (+/- 0.1280)|Test F1: 0.3636|32s
Iteration 55 of 66 | Training 2 of 6 | 18/11/2020 12:00:33
50:50,CNN1d,x,raw,tf,nTo:2,Fe



50:50,CNN1d,x,raw,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9444 (+/- 0.0498)|Test F1: 0.3077|36s
Iteration 56 of 66 | Training 2 of 6 | 18/11/2020 12:03:54
50:50,CNN1d,x,raw,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9470 (+/- 0.0460)|Test F1: 0.2500|32s
Iteration 56 of 66 | Training 3 of 6 | 18/11/2020 12:04:26
50:50,CNN1d,x,raw,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9419 (+/- 0.0574)|Test F1: 0.2500|32s
Iteration 56 of 66 | Training 4 of 6 | 18/11/2020 12:04:59
50:50,CNN1d,x,raw,tfidf,nTo:4,FeatureMax:20|Train F1: 0.9426 (+/- 0.0652)|Test F1: 0.2500|32s
Iteration 56 of 66 | Training 5 of 6 | 18/11/2020 12:05:32
50:50,CNN1d,x,raw,tfidf,nTo:5,FeatureMax:20|Train F1: 0.9434 (+/- 0.0457)|Test F1: 0.2222|32s
Iteration 56 of 66 | Training 6 of 6 | 18/11/2020 12:06:04
50:50,CNN1d,x,raw,tfidf,nTo:6,FeatureMax:20|Train F1: 0.9348 (+/- 0.0759)|Test F1: 0.4000|32s
Iteration 56 of 66 | Duration 3.32m | 3.63h since start
Iteration 57 of 66 | Training 1 of 6 | 18/11/2020 12:06:36




50:50,CNN1d,x,norm,tf,nTo:1,FeatureMax:20|Train F1: 0.9370 (+/- 0.0855)|Test F1: 0.3636|35s
Iteration 57 of 66 | Training 2 of 6 | 18/11/2020 12:07:12
50:50,CNN1d,x,norm,tf,nTo:2,FeatureMax:20|Train F1: 0.9363 (+/- 0.0728)|Test F1: 0.3077|32s
Iteration 57 of 66 | Training 3 of 6 | 18/11/2020 12:07:44
50:50,CNN1d,x,norm,tf,nTo:3,FeatureMax:20|Train F1: 0.9431 (+/- 0.0795)|Test F1: 0.4000|32s
Iteration 57 of 66 | Training 4 of 6 | 18/11/2020 12:08:16
50:50,CNN1d,x,norm,tf,nTo:4,FeatureMax:20|Train F1: 0.9251 (+/- 0.0995)|Test F1: 0.5000|33s
Iteration 57 of 66 | Training 5 of 6 | 18/11/2020 12:08:49
50:50,CNN1d,x,norm,tf,nTo:5,FeatureMax:20|Train F1: 0.9287 (+/- 0.0866)|Test F1: 0.2857|32s
Iteration 57 of 66 | Training 6 of 6 | 18/11/2020 12:09:22
50:50,CNN1d,x,norm,tf,nTo:6,FeatureMax:20|Train F1: 0.9215 (+/- 0.1152)|Test F1: 0.5000|32s
Iteration 57 of 66 | Duration 3.30m | 3.69h since start
Iteration 58 of 66 | Training 1 of 6 | 18/11/2020 12:09:54
50:50,CNN1d,x,norm,tfidf,nTo:1,Feature



50:50,CNN1d,x,std,tf,nTo:2,FeatureMax:20|Train F1: 0.9491 (+/- 0.0796)|Test F1: 0.5000|35s
Iteration 59 of 66 | Training 3 of 6 | 18/11/2020 12:13:48
50:50,CNN1d,x,std,tf,nTo:3,FeatureMax:20|Train F1: 0.9501 (+/- 0.0731)|Test F1: 0.3636|31s
Iteration 59 of 66 | Training 4 of 6 | 18/11/2020 12:14:20
50:50,CNN1d,x,std,tf,nTo:4,FeatureMax:20|Train F1: 0.9542 (+/- 0.0474)|Test F1: 0.5000|32s
Iteration 59 of 66 | Training 5 of 6 | 18/11/2020 12:14:52
50:50,CNN1d,x,std,tf,nTo:5,FeatureMax:20|Train F1: 0.9551 (+/- 0.0609)|Test F1: 0.5000|31s
Iteration 59 of 66 | Training 6 of 6 | 18/11/2020 12:15:24
50:50,CNN1d,x,std,tf,nTo:6,FeatureMax:20|Train F1: 0.9547 (+/- 0.0468)|Test F1: 0.5714|32s
Iteration 59 of 66 | Duration 3.29m | 3.79h since start
Iteration 60 of 66 | Training 1 of 6 | 18/11/2020 12:15:56




50:50,CNN1d,x,std,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9868 (+/- 0.0265)|Test F1: 0.5000|35s
Iteration 60 of 66 | Training 2 of 6 | 18/11/2020 12:16:32
50:50,CNN1d,x,std,tfidf,nTo:2,FeatureMax:20|Train F1: 0.9832 (+/- 0.0322)|Test F1: 0.2500|32s
Iteration 60 of 66 | Training 3 of 6 | 18/11/2020 12:17:05
50:50,CNN1d,x,std,tfidf,nTo:3,FeatureMax:20|Train F1: 0.9853 (+/- 0.0261)|Test F1: 0.2857|34s
Iteration 60 of 66 | Training 4 of 6 | 18/11/2020 12:17:40
50:50,CNN1d,x,std,tfidf,nTo:4,FeatureMax:20|Train F1: 0.9857 (+/- 0.0244)|Test F1: 0.2500|32s
Iteration 60 of 66 | Training 5 of 6 | 18/11/2020 12:18:12
50:50,CNN1d,x,std,tfidf,nTo:5,FeatureMax:20|Train F1: 0.9862 (+/- 0.0228)|Test F1: 0.2500|33s
Iteration 60 of 66 | Training 6 of 6 | 18/11/2020 12:18:45
50:50,CNN1d,x,std,tfidf,nTo:6,FeatureMax:20|Train F1: 0.9872 (+/- 0.0297)|Test F1: 0.2500|32s
Iteration 60 of 66 | Duration 3.36m | 3.84h since start
Iteration 61 of 66 | Training 1 of 6 | 18/11/2020 12:19:18




50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:20|Train F1: 0.9693 (+/- 0.0642)|Test F1: 0.6667|83s
Iteration 61 of 66 | Training 2 of 6 | 18/11/2020 12:20:41
50:50,CNN2,x,raw,tf,nTo:2,FeatureMax:20|Train F1: 0.9638 (+/- 0.0703)|Test F1: 0.5000|83s
Iteration 61 of 66 | Training 3 of 6 | 18/11/2020 12:22:05
50:50,CNN2,x,raw,tf,nTo:3,FeatureMax:20|Train F1: 0.9522 (+/- 0.0653)|Test F1: 0.6667|78s
Iteration 61 of 66 | Training 4 of 6 | 18/11/2020 12:23:23
50:50,CNN2,x,raw,tf,nTo:4,FeatureMax:20|Train F1: 0.9640 (+/- 0.0711)|Test F1: 0.6667|78s
Iteration 61 of 66 | Training 5 of 6 | 18/11/2020 12:24:42
50:50,CNN2,x,raw,tf,nTo:5,FeatureMax:20|Train F1: 0.9711 (+/- 0.0613)|Test F1: 0.5000|79s
Iteration 61 of 66 | Training 6 of 6 | 18/11/2020 12:26:02
50:50,CNN2,x,raw,tf,nTo:6,FeatureMax:20|Train F1: 0.9656 (+/- 0.0655)|Test F1: 0.1818|80s
Iteration 61 of 66 | Duration 8.06m | 3.98h since start
Iteration 62 of 66 | Training 1 of 6 | 18/11/2020 12:27:22
50:50,CNN2,x,raw,tfidf,nTo:1,FeatureMax:20|Train F



50:50,CNN2,x,norm,tf,nTo:2,FeatureMax:20|Train F1: 0.5255 (+/- 0.3439)|Test F1: 0.1111|79s
Iteration 63 of 66 | Training 3 of 6 | 18/11/2020 12:38:00




50:50,CNN2,x,norm,tf,nTo:3,FeatureMax:20|Train F1: 0.5523 (+/- 0.3542)|Test F1: 0.1176|81s
Iteration 63 of 66 | Training 4 of 6 | 18/11/2020 12:39:21
50:50,CNN2,x,norm,tf,nTo:4,FeatureMax:20|Train F1: 0.5189 (+/- 0.4178)|Test F1: 0.1333|75s
Iteration 63 of 66 | Training 5 of 6 | 18/11/2020 12:40:37




50:50,CNN2,x,norm,tf,nTo:5,FeatureMax:20|Train F1: 0.5402 (+/- 0.4002)|Test F1: 0.1538|81s
Iteration 63 of 66 | Training 6 of 6 | 18/11/2020 12:41:59




50:50,CNN2,x,norm,tf,nTo:6,FeatureMax:20|Train F1: 0.5142 (+/- 0.4055)|Test F1: 0.1818|78s
Iteration 63 of 66 | Duration 7.88m | 4.24h since start
Iteration 64 of 66 | Training 1 of 6 | 18/11/2020 12:43:17
50:50,CNN2,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.5312 (+/- 0.6309)|Test F1: 0.0000|74s
Iteration 64 of 66 | Training 2 of 6 | 18/11/2020 12:43:17
50:50,CNN2,x,norm,tfidf,nTo:2,FeatureMax:20|Train F1: 0.6385 (+/- 0.2905)|Test F1: 0.0000|76s
Iteration 64 of 66 | Training 3 of 6 | 18/11/2020 12:44:33
50:50,CNN2,x,norm,tfidf,nTo:3,FeatureMax:20|Train F1: 0.6656 (+/- 0.3083)|Test F1: 0.0800|77s
Iteration 64 of 66 | Training 4 of 6 | 18/11/2020 12:45:51




50:50,CNN2,x,norm,tfidf,nTo:4,FeatureMax:20|Train F1: 0.6930 (+/- 0.1367)|Test F1: 0.0588|82s
Iteration 64 of 66 | Training 5 of 6 | 18/11/2020 12:47:13
50:50,CNN2,x,norm,tfidf,nTo:5,FeatureMax:20|Train F1: 0.6650 (+/- 0.3089)|Test F1: 0.0000|77s
Iteration 64 of 66 | Training 6 of 6 | 18/11/2020 12:48:30
50:50,CNN2,x,norm,tfidf,nTo:6,FeatureMax:20|Train F1: 0.6912 (+/- 0.1578)|Test F1: 0.0000|77s
Iteration 64 of 66 | Duration 6.52m | 4.35h since start
Iteration 65 of 66 | Training 1 of 6 | 18/11/2020 12:49:48




50:50,CNN2,x,std,tf,nTo:1,FeatureMax:20|Train F1: 0.7317 (+/- 0.2515)|Test F1: 0.2000|80s
Iteration 65 of 66 | Training 2 of 6 | 18/11/2020 12:51:09
50:50,CNN2,x,std,tf,nTo:2,FeatureMax:20|Train F1: 0.7501 (+/- 0.1524)|Test F1: 0.1818|80s
Iteration 65 of 66 | Training 3 of 6 | 18/11/2020 12:52:29
50:50,CNN2,x,std,tf,nTo:3,FeatureMax:20|Train F1: 0.7466 (+/- 0.1699)|Test F1: 0.2105|78s
Iteration 65 of 66 | Training 4 of 6 | 18/11/2020 12:53:47




50:50,CNN2,x,std,tf,nTo:4,FeatureMax:20|Train F1: 0.7685 (+/- 0.1429)|Test F1: 0.1905|79s
Iteration 65 of 66 | Training 5 of 6 | 18/11/2020 12:55:07




50:50,CNN2,x,std,tf,nTo:5,FeatureMax:20|Train F1: 0.7387 (+/- 0.1600)|Test F1: 0.0000|78s
Iteration 65 of 66 | Training 6 of 6 | 18/11/2020 12:56:25
50:50,CNN2,x,std,tf,nTo:6,FeatureMax:20|Train F1: 0.7478 (+/- 0.1398)|Test F1: 0.2105|77s
Iteration 65 of 66 | Duration 7.91m | 4.48h since start
Iteration 66 of 66 | Training 1 of 6 | 18/11/2020 12:57:43




50:50,CNN2,x,std,tfidf,nTo:1,FeatureMax:20|Train F1: 0.8276 (+/- 0.1315)|Test F1: 0.2500|81s
Iteration 66 of 66 | Training 2 of 6 | 18/11/2020 12:59:04




50:50,CNN2,x,std,tfidf,nTo:2,FeatureMax:20|Train F1: 0.8307 (+/- 0.1412)|Test F1: 0.3333|79s
Iteration 66 of 66 | Training 3 of 6 | 18/11/2020 13:00:23
50:50,CNN2,x,std,tfidf,nTo:3,FeatureMax:20|Train F1: 0.8457 (+/- 0.1597)|Test F1: 0.4000|76s
Iteration 66 of 66 | Training 4 of 6 | 18/11/2020 13:01:39
50:50,CNN2,x,std,tfidf,nTo:4,FeatureMax:20|Train F1: 0.8334 (+/- 0.1509)|Test F1: 0.1818|79s
Iteration 66 of 66 | Training 5 of 6 | 18/11/2020 13:02:59




50:50,CNN2,x,std,tfidf,nTo:5,FeatureMax:20|Train F1: 0.8482 (+/- 0.1336)|Test F1: 0.2353|81s
Iteration 66 of 66 | Training 6 of 6 | 18/11/2020 13:04:21
50:50,CNN2,x,std,tfidf,nTo:6,FeatureMax:20|Train F1: 0.8335 (+/- 0.1604)|Test F1: 0.1538|76s
Iteration 66 of 66 | Duration 7.90m | 4.62h since start


In [None]:
# Experiment: all classifiers; "1-7" (presumably the nTo setting -- TODO confirm);
# FeatureMax 20-100; scaling variants; both TF and TF-IDF weighting.
# The grid that actually runs is whatever resample/clfs/features/transforms hold
# in the kernel when this cell is executed.
# NOTE(review): the saved output below reports only 12 iterations
# (norm, tfidf, nTo:1, FeatureMax:20), so this description may not match the
# configuration that produced that output -- confirm before relying on it.

funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

Iteration 1 of 12 | Training 1 of 1 | 18/11/2020 01:48:29
50:50,LR,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9780 (+/- 0.0255)|Test F1: 0.5714|2s
Iteration 1 of 12 | Duration 0.04m | 0.00h since start
Iteration 2 of 12 | Training 1 of 1 | 18/11/2020 01:48:31
50:50,SVMa,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.9480 (+/- 0.0503)|Test F1: 0.4444|0s
Iteration 2 of 12 | Duration 0.00m | 0.00h since start
Iteration 3 of 12 | Training 1 of 1 | 18/11/2020 01:48:32
50:50,SVMs,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|0s
Iteration 3 of 12 | Duration 0.00m | 0.00h since start
Iteration 4 of 12 | Training 1 of 1 | 18/11/2020 01:48:32
50:50,MLP8,x,norm,tfidf,nTo:1,FeatureMax:20 in df|Train F1: 0.9926 (+/- 0.0172)|Test F1: 0.5000|10s
Iteration 4 of 12 | Duration 0.00m | 0.00h since start
Iteration 5 of 12 | Training 1 of 1 | 18/11/2020 01:48:32




50:50,LSTM1,x,norm,tfidf,nTo:1,FeatureMax:20|Train F1: 0.8030 (+/- 0.3826)|Test F1: 0.4000|121s
Iteration 5 of 12 | Duration 2.03m | 0.03h since start
Iteration 6 of 12 | Training 1 of 1 | 18/11/2020 01:50:34




In [None]:
# Experiment: all classifiers; "1" (logged as nTo:1); FeatureMax 40;
# raw and std scaling; TF-IDF weighting only.
# The grid that actually runs is whatever resample/clfs/features/transforms hold
# in the kernel when this cell is executed.

funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

Iteration 1 of 24 | Training 1 of 1 | 18/11/2020 14:20:01
50:50,LR,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9910 (+/- 0.0203)|Test F1: 0.8000|3s
Iteration 1 of 24 | Duration 0.06m | 0.00h since start
Iteration 2 of 24 | Training 1 of 1 | 18/11/2020 14:20:04
50:50,LR,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9950 (+/- 0.0123)|Test F1: 0.6667|0s
Iteration 2 of 24 | Duration 0.00m | 0.00h since start
Iteration 3 of 24 | Training 1 of 1 | 18/11/2020 14:20:04
50:50,SVMa,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.8938 (+/- 0.0495)|Test F1: 0.5000|0s
Iteration 3 of 24 | Duration 0.01m | 0.00h since start
Iteration 4 of 24 | Training 1 of 1 | 18/11/2020 14:20:04
50:50,SVMa,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|0s
Iteration 4 of 24 | Duration 0.00m | 0.00h since start
Iteration 5 of 24 | Training 1 of 1 | 18/11/2020 14:20:05
50:50,SVMs,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 1.0000 (+/- 0.0000)|Test F1: 0.6667|0s
Iteration 5 of 24 | Duration 0.00m



50:50,LSTM1,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.8036 (+/- 0.3805)|Test F1: 0.5000|129s
Iteration 9 of 24 | Duration 2.16m | 0.04h since start
Iteration 10 of 24 | Training 1 of 1 | 18/11/2020 14:22:35




50:50,LSTM1,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9837 (+/- 0.0271)|Test F1: 1.0000|126s
Iteration 10 of 24 | Duration 2.11m | 0.08h since start
Iteration 11 of 24 | Training 1 of 1 | 18/11/2020 14:24:42
50:50,LSTM2,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.8356 (+/- 0.2424)|Test F1: 0.2857|123s
Iteration 11 of 24 | Duration 2.05m | 0.11h since start
Iteration 12 of 24 | Training 1 of 1 | 18/11/2020 14:26:45




50:50,LSTM2,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9854 (+/- 0.0251)|Test F1: 1.0000|129s
Iteration 12 of 24 | Duration 2.16m | 0.15h since start
Iteration 13 of 24 | Training 1 of 1 | 18/11/2020 14:28:55
50:50,CNN1a,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.2392 (+/- 0.6394)|Test F1: 0.0800|41s
Iteration 13 of 24 | Duration 0.68m | 0.16h since start
Iteration 14 of 24 | Training 1 of 1 | 18/11/2020 14:29:36
50:50,CNN1a,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.5038 (+/- 0.5903)|Test F1: 0.0000|40s
Iteration 14 of 24 | Duration 0.68m | 0.17h since start
Iteration 15 of 24 | Training 1 of 1 | 18/11/2020 14:30:17
50:50,CNN1b,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9482 (+/- 0.1039)|Test F1: 1.0000|42s
Iteration 15 of 24 | Duration 0.71m | 0.18h since start
Iteration 16 of 24 | Training 1 of 1 | 18/11/2020 14:30:59
50:50,CNN1b,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9955 (+/- 0.0121)|Test F1: 1.0000|41s
Iteration 16 of 24 | Duration 0.69m | 0.19h since start
Iteration 17 



50:50,CNN2,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.8861 (+/- 0.1303)|Test F1: 0.5000|92s
Iteration 22 of 24 | Duration 1.54m | 0.29h since start
Iteration 23 of 24 | Training 1 of 1 | 18/11/2020 14:37:30
50:50,CNN3,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9631 (+/- 0.0897)|Test F1: 0.8000|36s
Iteration 23 of 24 | Duration 0.60m | 0.30h since start
Iteration 24 of 24 | Training 1 of 1 | 18/11/2020 14:38:06
50:50,CNN3,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9989 (+/- 0.0075)|Test F1: 0.0000|36s
Iteration 24 of 24 | Duration 0.60m | 0.31h since start


In [None]:
# Experiment: all classifiers; "x+l" feature sets (logged as e.g. x+r8+rC+rH --
# presumably x combined with additional feature columns, TODO confirm what "l"
# stands for); "1" (logged as nTo:1); FeatureMax 40; raw and std scaling;
# TF-IDF weighting only.
# The grid that actually runs is whatever resample/clfs/features/transforms hold
# in the kernel when this cell is executed.

funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

Iteration 1 of 72 | Training 1 of 1 | 18/11/2020 14:47:37
50:50,LR,x+r8+rC+rH,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9894 (+/- 0.0328)|Test F1: 0.8000|1s
Iteration 1 of 72 | Duration 0.03m | 0.00h since start
Iteration 2 of 72 | Training 1 of 1 | 18/11/2020 14:47:39
50:50,LR,x+r8+rC+rH+c0+cR+rF,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.4560 (+/- 0.2312)|Test F1: 0.1111|0s
Iteration 2 of 72 | Duration 0.01m | 0.00h since start
Iteration 3 of 72 | Training 1 of 1 | 18/11/2020 14:47:39
50:50,LR,x+cJ+rG+cV+c0,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.3527 (+/- 0.3531)|Test F1: 0.1667|0s
Iteration 3 of 72 | Duration 0.00m | 0.00h since start
Iteration 4 of 72 | Training 1 of 1 | 18/11/2020 14:47:40
50:50,LR,x+r8+rC+rH,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9945 (+/- 0.0188)|Test F1: 0.5000|0s
Iteration 4 of 72 | Duration 0.00m | 0.00h since start
Iteration 5 of 72 | Training 1 of 1 | 18/11/2020 14:47:40
50:50,LR,x+r8+rC+rH+c0+cR+rF,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9945 (+/- 0



50:50,MLP8,x+r8+rC+rH,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9721 (+/- 0.0628)|Test F1: 0.0000|16s
Iteration 19 of 72 | Duration 0.28m | 0.01h since start
Iteration 20 of 72 | Training 1 of 1 | 18/11/2020 14:48:01
50:50,MLP8,x+r8+rC+rH+c0+cR+rF,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.3052 (+/- 0.5824)|Test F1: 0.0000|2s
Iteration 20 of 72 | Duration 0.04m | 0.01h since start
Iteration 21 of 72 | Training 1 of 1 | 18/11/2020 14:48:04
50:50,MLP8,x+cJ+rG+cV+c0,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.3390 (+/- 0.6600)|Test F1: 0.0800|2s
Iteration 21 of 72 | Duration 0.04m | 0.01h since start
Iteration 22 of 72 | Training 1 of 1 | 18/11/2020 14:48:06
50:50,MLP8,x+r8+rC+rH,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9952 (+/- 0.0185)|Test F1: 0.5000|8s
Iteration 22 of 72 | Duration 0.14m | 0.01h since start
Iteration 23 of 72 | Training 1 of 1 | 18/11/2020 14:48:14
50:50,MLP8,x+r8+rC+rH+c0+cR+rF,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9959 (+/- 0.0174)|Test F1: 0.6667|7s
Iteration 23 



50:50,LSTM1,x+r8+rC+rH,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.7637 (+/- 0.1548)|Test F1: 0.0000|131s
Iteration 25 of 72 | Duration 2.20m | 0.05h since start
Iteration 26 of 72 | Training 1 of 1 | 18/11/2020 14:50:41
50:50,LSTM1,x+r8+rC+rH+c0+cR+rF,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.3355 (+/- 0.6521)|Test F1: 0.0800|126s
Iteration 26 of 72 | Duration 2.11m | 0.09h since start
Iteration 27 of 72 | Training 1 of 1 | 18/11/2020 14:52:48
50:50,LSTM1,x+cJ+rG+cV+c0,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.3589 (+/- 0.6427)|Test F1: 0.0800|130s
Iteration 27 of 72 | Duration 2.17m | 0.12h since start
Iteration 28 of 72 | Training 1 of 1 | 18/11/2020 14:54:58




50:50,LSTM1,x+r8+rC+rH,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9811 (+/- 0.0439)|Test F1: 1.0000|131s
Iteration 28 of 72 | Duration 2.19m | 0.16h since start
Iteration 29 of 72 | Training 1 of 1 | 18/11/2020 14:57:10
50:50,LSTM1,x+r8+rC+rH+c0+cR+rF,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9811 (+/- 0.0390)|Test F1: 1.0000|125s
Iteration 29 of 72 | Duration 2.10m | 0.19h since start
Iteration 30 of 72 | Training 1 of 1 | 18/11/2020 14:59:16




50:50,LSTM1,x+cJ+rG+cV+c0,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9776 (+/- 0.0454)|Test F1: 1.0000|132s
Iteration 30 of 72 | Duration 2.20m | 0.23h since start
Iteration 31 of 72 | Training 1 of 1 | 18/11/2020 15:01:28
50:50,LSTM2,x+r8+rC+rH,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.7999 (+/- 0.1213)|Test F1: 0.0000|122s
Iteration 31 of 72 | Duration 2.04m | 0.26h since start
Iteration 32 of 72 | Training 1 of 1 | 18/11/2020 15:03:30


In [None]:
# Experiment: all classifiers; feature set "x" only; "1" (logged as nTo:1);
# FeatureMax 40-100 (logged as 40, 60, 80, 100); all scalings (raw, norm, std);
# TF weighting only.
# The grid that actually runs is whatever resample/clfs/features/transforms hold
# in the kernel when this cell is executed.

funcLoop(
    docToken_df,
    docTokenPS_df,
    docTokenLS_df,
    docTokenSS_df,
    docTokenWL_df,
    resample,
    clfs,
    features,
    transforms,
    f1_df,
    x_test,
    y_test,
)

Iteration 1 of 36 | Training 1 of 4 | 18/11/2020 18:11:50
50:50,LR,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.9922 (+/- 0.0197)|Test F1: 0.3333|1s
Iteration 1 of 36 | Training 2 of 4 | 18/11/2020 18:11:52
50:50,LR,x,raw,tf,nTo:1,FeatureMax:60|Train F1: 0.9923 (+/- 0.0168)|Test F1: 0.4000|2s
Iteration 1 of 36 | Training 3 of 4 | 18/11/2020 18:11:55
50:50,LR,x,raw,tf,nTo:1,FeatureMax:80|Train F1: 0.9926 (+/- 0.0186)|Test F1: 0.6667|1s
Iteration 1 of 36 | Training 4 of 4 | 18/11/2020 18:11:56
50:50,LR,x,raw,tf,nTo:1,FeatureMax:100|Train F1: 0.9920 (+/- 0.0188)|Test F1: 0.6667|1s
Iteration 1 of 36 | Duration 0.13m | 0.00h since start
Iteration 2 of 36 | Training 1 of 4 | 18/11/2020 18:11:58
50:50,LR,x,norm,tf,nTo:1,FeatureMax:40|Train F1: 0.8886 (+/- 0.0825)|Test F1: 0.4000|0s
Iteration 2 of 36 | Training 2 of 4 | 18/11/2020 18:11:58
50:50,LR,x,norm,tf,nTo:1,FeatureMax:60|Train F1: 0.9037 (+/- 0.1281)|Test F1: 0.4000|0s
Iteration 2 of 36 | Training 3 of 4 | 18/11/2020 18:11:59
50:50,LR,x,nor



50:50,MLP8,x,norm,tf,nTo:1,FeatureMax:40|Train F1: 0.9940 (+/- 0.0160)|Test F1: 1.0000|15s
Iteration 11 of 36 | Training 2 of 4 | 18/11/2020 18:13:07
50:50,MLP8,x,norm,tf,nTo:1,FeatureMax:60|Train F1: 0.9947 (+/- 0.0190)|Test F1: 1.0000|12s
Iteration 11 of 36 | Training 3 of 4 | 18/11/2020 18:13:20
50:50,MLP8,x,norm,tf,nTo:1,FeatureMax:80|Train F1: 0.9928 (+/- 0.0190)|Test F1: 1.0000|12s
Iteration 11 of 36 | Training 4 of 4 | 18/11/2020 18:13:33
50:50,MLP8,x,norm,tf,nTo:1,FeatureMax:100|Train F1: 0.9942 (+/- 0.0157)|Test F1: 1.0000|12s
Iteration 11 of 36 | Duration 0.91m | 0.03h since start
Iteration 12 of 36 | Training 1 of 4 | 18/11/2020 18:13:45
50:50,MLP8,x,std,tf,nTo:1,FeatureMax:40|Train F1: 0.9941 (+/- 0.0213)|Test F1: 1.0000|12s
Iteration 12 of 36 | Training 2 of 4 | 18/11/2020 18:13:57
50:50,MLP8,x,std,tf,nTo:1,FeatureMax:60|Train F1: 0.9951 (+/- 0.0158)|Test F1: 0.6667|10s
Iteration 12 of 36 | Training 3 of 4 | 18/11/2020 18:14:08
50:50,MLP8,x,std,tf,nTo:1,FeatureMax:80|Train



50:50,LSTM1,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.9986 (+/- 0.0079)|Test F1: 0.0000|134s
Iteration 13 of 36 | Training 2 of 4 | 18/11/2020 18:16:44
50:50,LSTM1,x,raw,tf,nTo:1,FeatureMax:60|Train F1: 0.9986 (+/- 0.0078)|Test F1: 0.0000|136s
Iteration 13 of 36 | Training 3 of 4 | 18/11/2020 18:19:00
50:50,LSTM1,x,raw,tf,nTo:1,FeatureMax:80|Train F1: 0.9877 (+/- 0.0803)|Test F1: 0.6667|129s
Iteration 13 of 36 | Training 4 of 4 | 18/11/2020 18:21:10
50:50,LSTM1,x,raw,tf,nTo:1,FeatureMax:100|Train F1: 0.9977 (+/- 0.0130)|Test F1: 0.6667|130s
Iteration 13 of 36 | Duration 8.84m | 0.19h since start
Iteration 14 of 36 | Training 1 of 4 | 18/11/2020 18:23:20
50:50,LSTM1,x,norm,tf,nTo:1,FeatureMax:40|Train F1: 0.8751 (+/- 0.0944)|Test F1: 0.4000|122s
Iteration 14 of 36 | Training 2 of 4 | 18/11/2020 18:25:22




50:50,LSTM1,x,norm,tf,nTo:1,FeatureMax:60|Train F1: 0.8719 (+/- 0.0915)|Test F1: 0.4000|133s
Iteration 14 of 36 | Training 3 of 4 | 18/11/2020 18:27:36




50:50,LSTM1,x,norm,tf,nTo:1,FeatureMax:80|Train F1: 0.8761 (+/- 0.0993)|Test F1: 0.5000|129s
Iteration 14 of 36 | Training 4 of 4 | 18/11/2020 18:29:46
50:50,LSTM1,x,norm,tf,nTo:1,FeatureMax:100|Train F1: 0.8833 (+/- 0.0912)|Test F1: 0.6667|123s
Iteration 14 of 36 | Duration 8.48m | 0.33h since start
Iteration 15 of 36 | Training 1 of 4 | 18/11/2020 18:31:49




50:50,LSTM1,x,std,tf,nTo:1,FeatureMax:40|Train F1: 0.8838 (+/- 0.0830)|Test F1: 0.0000|129s
Iteration 15 of 36 | Training 2 of 4 | 18/11/2020 18:33:59
50:50,LSTM1,x,std,tf,nTo:1,FeatureMax:60|Train F1: 0.8870 (+/- 0.0827)|Test F1: 0.0000|121s
Iteration 15 of 36 | Training 3 of 4 | 18/11/2020 18:36:00




50:50,LSTM1,x,std,tf,nTo:1,FeatureMax:80|Train F1: 0.8924 (+/- 0.0752)|Test F1: 0.6667|129s
Iteration 15 of 36 | Training 4 of 4 | 18/11/2020 18:38:09
50:50,LSTM1,x,std,tf,nTo:1,FeatureMax:100|Train F1: 0.8924 (+/- 0.0838)|Test F1: 0.0000|119s
Iteration 15 of 36 | Duration 8.33m | 0.47h since start
Iteration 16 of 36 | Training 1 of 4 | 18/11/2020 18:40:08




50:50,LSTM2,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.9995 (+/- 0.0050)|Test F1: 0.6667|127s
Iteration 16 of 36 | Training 2 of 4 | 18/11/2020 18:42:16
50:50,LSTM2,x,raw,tf,nTo:1,FeatureMax:60|Train F1: 0.9990 (+/- 0.0069)|Test F1: 0.6667|124s
Iteration 16 of 36 | Training 3 of 4 | 18/11/2020 18:44:20
50:50,LSTM2,x,raw,tf,nTo:1,FeatureMax:80|Train F1: 0.9996 (+/- 0.0044)|Test F1: 0.6667|125s
Iteration 16 of 36 | Training 4 of 4 | 18/11/2020 18:46:26
50:50,LSTM2,x,raw,tf,nTo:1,FeatureMax:100|Train F1: 0.9985 (+/- 0.0081)|Test F1: 0.6667|128s
Iteration 16 of 36 | Duration 8.45m | 0.61h since start
Iteration 17 of 36 | Training 1 of 4 | 18/11/2020 18:48:35
50:50,LSTM2,x,norm,tf,nTo:1,FeatureMax:40|Train F1: 0.8715 (+/- 0.0985)|Test F1: 0.3333|118s
Iteration 17 of 36 | Training 2 of 4 | 18/11/2020 18:50:33




50:50,LSTM2,x,norm,tf,nTo:1,FeatureMax:60|Train F1: 0.8719 (+/- 0.1008)|Test F1: 0.4000|128s
Iteration 17 of 36 | Training 3 of 4 | 18/11/2020 18:52:41
50:50,LSTM2,x,norm,tf,nTo:1,FeatureMax:80|Train F1: 0.8781 (+/- 0.0880)|Test F1: 0.5000|119s
Iteration 17 of 36 | Training 4 of 4 | 18/11/2020 18:54:41




50:50,LSTM2,x,norm,tf,nTo:1,FeatureMax:100|Train F1: 0.8871 (+/- 0.0892)|Test F1: 0.0000|130s
Iteration 17 of 36 | Duration 8.26m | 0.75h since start
Iteration 18 of 36 | Training 1 of 4 | 18/11/2020 18:56:51
50:50,LSTM2,x,std,tf,nTo:1,FeatureMax:40|Train F1: 0.8818 (+/- 0.0830)|Test F1: 0.0000|124s
Iteration 18 of 36 | Training 2 of 4 | 18/11/2020 18:58:55




50:50,LSTM2,x,std,tf,nTo:1,FeatureMax:60|Train F1: 0.8856 (+/- 0.0869)|Test F1: 0.6667|130s
Iteration 18 of 36 | Training 3 of 4 | 18/11/2020 19:01:05




50:50,LSTM2,x,std,tf,nTo:1,FeatureMax:80|Train F1: 0.8987 (+/- 0.0952)|Test F1: 0.6667|127s
Iteration 18 of 36 | Training 4 of 4 | 18/11/2020 19:03:13




50:50,LSTM2,x,std,tf,nTo:1,FeatureMax:100|Train F1: 0.9079 (+/- 0.0992)|Test F1: 0.6667|125s
Iteration 18 of 36 | Duration 8.47m | 0.89h since start
Iteration 19 of 36 | Training 1 of 4 | 18/11/2020 19:05:19
50:50,CNN1a,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.8213 (+/- 0.3548)|Test F1: 0.2222|41s
Iteration 19 of 36 | Training 2 of 4 | 18/11/2020 19:06:00
50:50,CNN1a,x,raw,tf,nTo:1,FeatureMax:60|Train F1: 0.8488 (+/- 0.1425)|Test F1: 0.6667|49s
Iteration 19 of 36 | Training 3 of 4 | 18/11/2020 19:06:50
50:50,CNN1a,x,raw,tf,nTo:1,FeatureMax:80|Train F1: 0.6653 (+/- 0.6318)|Test F1: 0.5000|63s
Iteration 19 of 36 | Training 4 of 4 | 18/11/2020 19:07:53
50:50,CNN1a,x,raw,tf,nTo:1,FeatureMax:100|Train F1: 0.6674 (+/- 0.6269)|Test F1: 0.6667|66s
Iteration 19 of 36 | Duration 3.68m | 0.95h since start
Iteration 20 of 36 | Training 1 of 4 | 18/11/2020 19:09:00
50:50,CNN1a,x,norm,tf,nTo:1,FeatureMax:40|Train F1: 0.3174 (+/- 0.6647)|Test F1: 0.0800|42s
Iteration 20 of 36 | Training 2 of 4 | 18/1



50:50,CNN1a,x,std,tf,nTo:1,FeatureMax:40|Train F1: 0.3351 (+/- 0.6455)|Test F1: 0.0800|45s
Iteration 21 of 36 | Training 2 of 4 | 18/11/2020 19:13:26
50:50,CNN1a,x,std,tf,nTo:1,FeatureMax:60|Train F1: 0.2385 (+/- 0.6368)|Test F1: 0.0800|50s
Iteration 21 of 36 | Training 3 of 4 | 18/11/2020 19:14:16
50:50,CNN1a,x,std,tf,nTo:1,FeatureMax:80|Train F1: 0.3362 (+/- 0.6631)|Test F1: 0.0000|58s
Iteration 21 of 36 | Training 4 of 4 | 18/11/2020 19:15:14
50:50,CNN1a,x,std,tf,nTo:1,FeatureMax:100|Train F1: 0.2939 (+/- 0.6703)|Test F1: 0.0800|66s
Iteration 21 of 36 | Duration 3.68m | 1.08h since start
Iteration 22 of 36 | Training 1 of 4 | 18/11/2020 19:16:21
50:50,CNN1b,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.9951 (+/- 0.0157)|Test F1: 0.6667|42s
Iteration 22 of 36 | Training 2 of 4 | 18/11/2020 19:17:04
50:50,CNN1b,x,raw,tf,nTo:1,FeatureMax:60|Train F1: 0.9932 (+/- 0.0205)|Test F1: 1.0000|52s
Iteration 22 of 36 | Training 3 of 4 | 18/11/2020 19:17:57
50:50,CNN1b,x,raw,tf,nTo:1,FeatureMax:80|Tr



50:50,CNN1b,x,norm,tf,nTo:1,FeatureMax:100|Train F1: 0.9912 (+/- 0.0465)|Test F1: 1.0000|71s
Iteration 23 of 36 | Duration 3.71m | 1.20h since start
Iteration 24 of 36 | Training 1 of 4 | 18/11/2020 19:23:50
50:50,CNN1b,x,std,tf,nTo:1,FeatureMax:40|Train F1: 0.9768 (+/- 0.0821)|Test F1: 1.0000|42s
Iteration 24 of 36 | Training 2 of 4 | 18/11/2020 19:24:33
50:50,CNN1b,x,std,tf,nTo:1,FeatureMax:60|Train F1: 0.9924 (+/- 0.0274)|Test F1: 1.0000|50s
Iteration 24 of 36 | Training 3 of 4 | 18/11/2020 19:25:23
50:50,CNN1b,x,std,tf,nTo:1,FeatureMax:80|Train F1: 0.9780 (+/- 0.0561)|Test F1: 0.6667|58s
Iteration 24 of 36 | Training 4 of 4 | 18/11/2020 19:26:22
50:50,CNN1b,x,std,tf,nTo:1,FeatureMax:100|Train F1: 0.9935 (+/- 0.0205)|Test F1: 0.6667|67s
Iteration 24 of 36 | Duration 3.66m | 1.26h since start
Iteration 25 of 36 | Training 1 of 4 | 18/11/2020 19:27:29
50:50,CNN1c,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.9872 (+/- 0.0503)|Test F1: 0.5714|49s
Iteration 25 of 36 | Training 2 of 4 | 18/11



50:50,CNN1c,x,std,tf,nTo:1,FeatureMax:80|Train F1: 0.9738 (+/- 0.0878)|Test F1: 1.0000|64s
Iteration 27 of 36 | Training 4 of 4 | 18/11/2020 19:38:02
50:50,CNN1c,x,std,tf,nTo:1,FeatureMax:100|Train F1: 0.9928 (+/- 0.0202)|Test F1: 0.6667|72s
Iteration 27 of 36 | Duration 3.92m | 1.46h since start
Iteration 28 of 36 | Training 1 of 4 | 18/11/2020 19:39:15




50:50,CNN1d,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.9861 (+/- 0.0251)|Test F1: 0.5714|43s
Iteration 28 of 36 | Training 2 of 4 | 18/11/2020 19:39:58
50:50,CNN1d,x,raw,tf,nTo:1,FeatureMax:60|Train F1: 0.9783 (+/- 0.0627)|Test F1: 0.6667|48s
Iteration 28 of 36 | Training 3 of 4 | 18/11/2020 19:40:47
50:50,CNN1d,x,raw,tf,nTo:1,FeatureMax:80|Train F1: 0.9753 (+/- 0.0767)|Test F1: 0.5000|55s
Iteration 28 of 36 | Training 4 of 4 | 18/11/2020 19:41:42
50:50,CNN1d,x,raw,tf,nTo:1,FeatureMax:100|Train F1: 0.9852 (+/- 0.0394)|Test F1: 0.6667|63s
Iteration 28 of 36 | Duration 3.50m | 1.52h since start
Iteration 29 of 36 | Training 1 of 4 | 18/11/2020 19:42:45
50:50,CNN1d,x,norm,tf,nTo:1,FeatureMax:40|Train F1: 0.9686 (+/- 0.0868)|Test F1: 0.6667|40s
Iteration 29 of 36 | Training 2 of 4 | 18/11/2020 19:43:25
50:50,CNN1d,x,norm,tf,nTo:1,FeatureMax:60|Train F1: 0.9880 (+/- 0.0215)|Test F1: 1.0000|47s
Iteration 29 of 36 | Training 3 of 4 | 18/11/2020 19:44:13
50:50,CNN1d,x,norm,tf,nTo:1,FeatureMax:80



50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.9654 (+/- 0.0655)|Test F1: 0.6667|86s
Iteration 31 of 36 | Training 2 of 4 | 18/11/2020 19:51:05
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:60|Train F1: 0.9864 (+/- 0.0371)|Test F1: 0.0000|93s
Iteration 31 of 36 | Training 3 of 4 | 18/11/2020 19:52:39
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:80|Train F1: 0.9450 (+/- 0.1130)|Test F1: 0.6667|98s
Iteration 31 of 36 | Training 4 of 4 | 18/11/2020 19:54:17
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:100|Train F1: 0.9734 (+/- 0.0526)|Test F1: 0.6667|110s
Iteration 31 of 36 | Duration 6.48m | 1.74h since start
Iteration 32 of 36 | Training 1 of 4 | 18/11/2020 19:56:08




50:50,CNN2,x,norm,tf,nTo:1,FeatureMax:40|Train F1: 0.4530 (+/- 0.6997)|Test F1: 0.0000|85s
Iteration 32 of 36 | Training 2 of 4 | 18/11/2020 19:57:33
50:50,CNN2,x,norm,tf,nTo:1,FeatureMax:60|Train F1: 0.6290 (+/- 0.6412)|Test F1: 0.1176|90s
Iteration 32 of 36 | Training 3 of 4 | 18/11/2020 19:59:04


In [None]:
# Grid run for CNN2 and CNN3: x, nTo 1, FeatureMax 40-100, all scales, TF
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test,
)

Iteration 1 of 6 | Training 1 of 4 | 18/11/2020 21:27:04
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:40 in df|Train F1: 0.9654 (+/- 0.0655)|Test F1: 0.6667|86s
Iteration 1 of 6 | Training 2 of 4 | 18/11/2020 21:27:04
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:60 in df|Train F1: 0.9864 (+/- 0.0371)|Test F1: 0.0000|93s
Iteration 1 of 6 | Training 3 of 4 | 18/11/2020 21:27:04
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:80 in df|Train F1: 0.9450 (+/- 0.1130)|Test F1: 0.6667|98s
Iteration 1 of 6 | Training 4 of 4 | 18/11/2020 21:27:04
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:100 in df|Train F1: 0.9734 (+/- 0.0526)|Test F1: 0.6667|110s
Iteration 1 of 6 | Duration 0.00m | 0.00h since start
Iteration 2 of 6 | Training 1 of 4 | 18/11/2020 21:27:04
50:50,CNN2,x,norm,tf,nTo:1,FeatureMax:40|Train F1: 0.6168 (+/- 0.5152)|Test F1: 0.0000|91s
Iteration 2 of 6 | Training 2 of 4 | 18/11/2020 21:28:36
50:50,CNN2,x,norm,tf,nTo:1,FeatureMax:60|Train F1: 0.7520 (+/- 0.2105)|Test F1: 0.0000|95s
Iteration 2 of 6 | Training 3 of 4 |



50:50,CNN2,x,norm,tf,nTo:1,FeatureMax:100|Train F1: 0.6781 (+/- 0.3744)|Test F1: 0.1667|117s
Iteration 2 of 6 | Duration 6.80m | 0.11h since start
Iteration 3 of 6 | Training 1 of 4 | 18/11/2020 21:33:53
50:50,CNN2,x,std,tf,nTo:1,FeatureMax:40|Train F1: 0.7540 (+/- 0.1148)|Test F1: 0.0000|86s
Iteration 3 of 6 | Training 2 of 4 | 18/11/2020 21:35:20
50:50,CNN2,x,std,tf,nTo:1,FeatureMax:60|Train F1: 0.7600 (+/- 0.1334)|Test F1: 0.2222|95s
Iteration 3 of 6 | Training 3 of 4 | 18/11/2020 21:36:55




50:50,CNN2,x,std,tf,nTo:1,FeatureMax:80|Train F1: 0.7575 (+/- 0.1570)|Test F1: 0.0000|111s
Iteration 3 of 6 | Training 4 of 4 | 18/11/2020 21:38:47
50:50,CNN2,x,std,tf,nTo:1,FeatureMax:100|Train F1: 0.7320 (+/- 0.1568)|Test F1: 0.0000|112s
Iteration 3 of 6 | Duration 6.77m | 0.23h since start
Iteration 4 of 6 | Training 1 of 4 | 18/11/2020 21:40:39
50:50,CNN3,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.9986 (+/- 0.0077)|Test F1: 0.6667|37s
Iteration 4 of 6 | Training 2 of 4 | 18/11/2020 21:41:16
50:50,CNN3,x,raw,tf,nTo:1,FeatureMax:60|Train F1: 0.9910 (+/- 0.0484)|Test F1: 0.6667|41s
Iteration 4 of 6 | Training 3 of 4 | 18/11/2020 21:41:57
50:50,CNN3,x,raw,tf,nTo:1,FeatureMax:80|Train F1: 0.9936 (+/- 0.0437)|Test F1: 0.6667|45s
Iteration 4 of 6 | Training 4 of 4 | 18/11/2020 21:42:43
50:50,CNN3,x,raw,tf,nTo:1,FeatureMax:100|Train F1: 0.9972 (+/- 0.0101)|Test F1: 1.0000|50s
Iteration 4 of 6 | Duration 2.91m | 0.27h since start
Iteration 5 of 6 | Training 1 of 4 | 18/11/2020 21:43:33
50:50,



50:50,CNN3,x,norm,tf,nTo:1,FeatureMax:60|Train F1: 0.8920 (+/- 0.0689)|Test F1: 0.5000|44s
Iteration 5 of 6 | Training 3 of 4 | 18/11/2020 21:44:55
50:50,CNN3,x,norm,tf,nTo:1,FeatureMax:80|Train F1: 0.8902 (+/- 0.0719)|Test F1: 0.5000|45s
Iteration 5 of 6 | Training 4 of 4 | 18/11/2020 21:45:41




50:50,CNN3,x,norm,tf,nTo:1,FeatureMax:100|Train F1: 0.8916 (+/- 0.0695)|Test F1: 0.5000|53s
Iteration 5 of 6 | Duration 3.01m | 0.32h since start
Iteration 6 of 6 | Training 1 of 4 | 18/11/2020 21:46:34
50:50,CNN3,x,std,tf,nTo:1,FeatureMax:40|Train F1: 0.9799 (+/- 0.0756)|Test F1: 0.0000|36s
Iteration 6 of 6 | Training 2 of 4 | 18/11/2020 21:47:10
50:50,CNN3,x,std,tf,nTo:1,FeatureMax:60|Train F1: 0.9789 (+/- 0.0764)|Test F1: 0.0000|41s
Iteration 6 of 6 | Training 3 of 4 | 18/11/2020 21:47:52
50:50,CNN3,x,std,tf,nTo:1,FeatureMax:80|Train F1: 0.9804 (+/- 0.0651)|Test F1: 0.0000|45s
Iteration 6 of 6 | Training 4 of 4 | 18/11/2020 21:48:38
50:50,CNN3,x,std,tf,nTo:1,FeatureMax:100|Train F1: 0.9784 (+/- 0.0875)|Test F1: 1.0000|50s
Iteration 6 of 6 | Duration 2.91m | 0.37h since start


In [None]:
# Grid run for CNN2 and CNN3: x, nTo 1, FeatureMax 40-100, all scales, TF
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test,
)

Iteration 1 of 6 | Training 1 of 4 | 19/11/2020 11:53:18
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:40 in df|Train F1: 0.9654 (+/- 0.0655)|Test F1: 0.6667|86s
Iteration 1 of 6 | Training 2 of 4 | 19/11/2020 11:53:18
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:60 in df|Train F1: 0.9864 (+/- 0.0371)|Test F1: 0.0000|93s
Iteration 1 of 6 | Training 3 of 4 | 19/11/2020 11:53:18
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:80 in df|Train F1: 0.9450 (+/- 0.1130)|Test F1: 0.6667|98s
Iteration 1 of 6 | Training 4 of 4 | 19/11/2020 11:53:18
50:50,CNN2,x,raw,tf,nTo:1,FeatureMax:100 in df|Train F1: 0.9734 (+/- 0.0526)|Test F1: 0.6667|110s
Iteration 1 of 6 | Duration 0.00m | 0.00h since start
Iteration 2 of 6 | Training 1 of 4 | 19/11/2020 11:53:18
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classificat



50:50,CNN2,x,norm,tf,nTo:1,FeatureMax:100|Train F1: 0.7368 (+/- 0.1949)|Test F1: 0.1667|111s
Iteration 2 of 6 | Duration 6.48m | 0.11h since start
Iteration 3 of 6 | Training 1 of 4 | 19/11/2020 11:59:47
50:50,CNN2,x,std,tf,nTo:1,FeatureMax:40|Train F1: 0.7531 (+/- 0.1286)|Test F1: 0.0000|79s
Iteration 3 of 6 | Training 2 of 4 | 19/11/2020 12:01:07
50:50,CNN2,x,std,tf,nTo:1,FeatureMax:60|Train F1: 0.7480 (+/- 0.2541)|Test F1: 0.0000|90s
Iteration 3 of 6 | Training 3 of 4 | 19/11/2020 12:02:37




50:50,CNN2,x,std,tf,nTo:1,FeatureMax:80|Train F1: 0.7146 (+/- 0.2184)|Test F1: 0.0000|102s
Iteration 3 of 6 | Training 4 of 4 | 19/11/2020 12:04:20
50:50,CNN2,x,std,tf,nTo:1,FeatureMax:100|Train F1: 0.7165 (+/- 0.2521)|Test F1: 0.0000|105s
Iteration 3 of 6 | Duration 6.31m | 0.21h since start
Iteration 4 of 6 | Training 1 of 4 | 19/11/2020 12:06:06
50:50,CNN3,x,raw,tf,nTo:1,FeatureMax:40|Train F1: 0.9971 (+/- 0.0143)|Test F1: 0.6667|36s
Iteration 4 of 6 | Training 2 of 4 | 19/11/2020 12:06:42
50:50,CNN3,x,raw,tf,nTo:1,FeatureMax:60|Train F1: 0.9963 (+/- 0.0165)|Test F1: 0.6667|40s
Iteration 4 of 6 | Training 3 of 4 | 19/11/2020 12:07:22
50:50,CNN3,x,raw,tf,nTo:1,FeatureMax:80|Train F1: 0.9961 (+/- 0.0115)|Test F1: 0.8000|44s
Iteration 4 of 6 | Training 4 of 4 | 19/11/2020 12:08:07
50:50,CNN3,x,raw,tf,nTo:1,FeatureMax:100|Train F1: 0.9966 (+/- 0.0136)|Test F1: 1.0000|49s
Iteration 4 of 6 | Duration 2.84m | 0.26h since start
Iteration 5 of 6 | Training 1 of 4 | 19/11/2020 12:08:56
50:50,



50:50,CNN3,x,norm,tf,nTo:1,FeatureMax:100|Train F1: 0.8884 (+/- 0.0695)|Test F1: 0.5000|54s
Iteration 5 of 6 | Duration 3.03m | 0.31h since start
Iteration 6 of 6 | Training 1 of 4 | 19/11/2020 12:11:58
50:50,CNN3,x,std,tf,nTo:1,FeatureMax:40|Train F1: 0.9792 (+/- 0.0764)|Test F1: 0.0000|36s
Iteration 6 of 6 | Training 2 of 4 | 19/11/2020 12:12:34
50:50,CNN3,x,std,tf,nTo:1,FeatureMax:60|Train F1: 0.9741 (+/- 0.0806)|Test F1: 0.0000|41s
Iteration 6 of 6 | Training 3 of 4 | 19/11/2020 12:13:16
50:50,CNN3,x,std,tf,nTo:1,FeatureMax:80|Train F1: 0.9709 (+/- 0.0778)|Test F1: 0.0000|46s
Iteration 6 of 6 | Training 4 of 4 | 19/11/2020 12:14:02
50:50,CNN3,x,std,tf,nTo:1,FeatureMax:100|Train F1: 0.9880 (+/- 0.0500)|Test F1: 0.0000|50s
Iteration 6 of 6 | Duration 2.92m | 0.36h since start


In [None]:
# Grid run for CNN2 and CNN3: x, nTo 1, FeatureMax 40-100, raw and std scales, TFIDF
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test,
)

Iteration 1 of 4 | Training 1 of 4 | 19/11/2020 13:04:06
50:50,CNN2,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.4640 (+/- 0.6573)|Test F1: 0.0000|90s
Iteration 1 of 4 | Training 2 of 4 | 19/11/2020 13:05:36
50:50,CNN2,x,raw,tfidf,nTo:1,FeatureMax:60|Train F1: 0.5778 (+/- 0.6521)|Test F1: 0.0000|98s
Iteration 1 of 4 | Training 3 of 4 | 19/11/2020 13:07:15
50:50,CNN2,x,raw,tfidf,nTo:1,FeatureMax:80|Train F1: 0.4889 (+/- 0.6712)|Test F1: 0.0000|107s
Iteration 1 of 4 | Training 4 of 4 | 19/11/2020 13:09:02




In [None]:
# Grid run for every classifier except CNN2 and CNN3:
# x, nTo 1, FeatureMax 40-100, raw and std scales, TFIDF
funcLoop(
    docToken_df, docTokenPS_df, docTokenLS_df, docTokenSS_df, docTokenWL_df,
    resample, clfs, features, transforms,
    f1_df, x_test, y_test,
)

Iteration 1 of 20 | Training 1 of 4 | 19/11/2020 15:43:27
50:50,LR,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9886 (+/- 0.0230)|Test F1: 0.8000|1s
Iteration 1 of 20 | Training 2 of 4 | 19/11/2020 15:43:28
50:50,LR,x,raw,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9905 (+/- 0.0192)|Test F1: 0.8000|0s
Iteration 1 of 20 | Training 3 of 4 | 19/11/2020 15:43:29
50:50,LR,x,raw,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9919 (+/- 0.0178)|Test F1: 0.8000|0s
Iteration 1 of 20 | Training 4 of 4 | 19/11/2020 15:43:29
50:50,LR,x,raw,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9905 (+/- 0.0192)|Test F1: 0.8000|0s
Iteration 1 of 20 | Duration 0.04m | 0.00h since start
Iteration 2 of 20 | Training 1 of 4 | 19/11/2020 15:43:29
50:50,LR,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9950 (+/- 0.0123)|Test F1: 0.6667|0s
Iteration 2 of 20 | Training 2 of 4 | 19/11/2020 15:43:29
50:50,LR,x,std,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9940 (+/- 0.0126)|Test F1: 0.6667|0s
Iteration 2 of 20 | Training 3 of 4 | 19/11/2020 15:43:3



50:50,LSTM1,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.7548 (+/- 0.1920)|Test F1: 0.1379|127s
Iteration 9 of 20 | Training 2 of 4 | 19/11/2020 15:47:00
50:50,LSTM1,x,raw,tfidf,nTo:1,FeatureMax:60|Train F1: 0.7865 (+/- 0.2109)|Test F1: 0.1143|125s
Iteration 9 of 20 | Training 3 of 4 | 19/11/2020 15:49:05
50:50,LSTM1,x,raw,tfidf,nTo:1,FeatureMax:80|Train F1: 0.7871 (+/- 0.2091)|Test F1: 0.1081|122s
Iteration 9 of 20 | Training 4 of 4 | 19/11/2020 15:51:07
50:50,LSTM1,x,raw,tfidf,nTo:1,FeatureMax:100|Train F1: 0.7645 (+/- 0.1733)|Test F1: 0.2105|126s
Iteration 9 of 20 | Duration 8.36m | 0.16h since start
Iteration 10 of 20 | Training 1 of 4 | 19/11/2020 15:53:14
50:50,LSTM1,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9795 (+/- 0.0254)|Test F1: 1.0000|121s
Iteration 10 of 20 | Training 2 of 4 | 19/11/2020 15:55:16




50:50,LSTM1,x,std,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9860 (+/- 0.0190)|Test F1: 1.0000|126s
Iteration 10 of 20 | Training 3 of 4 | 19/11/2020 15:57:22
50:50,LSTM1,x,std,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9904 (+/- 0.0168)|Test F1: 1.0000|121s
Iteration 10 of 20 | Training 4 of 4 | 19/11/2020 15:59:24




50:50,LSTM1,x,std,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9914 (+/- 0.0131)|Test F1: 1.0000|125s
Iteration 10 of 20 | Duration 8.26m | 0.30h since start
Iteration 11 of 20 | Training 1 of 4 | 19/11/2020 16:01:30
50:50,LSTM2,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.7784 (+/- 0.1722)|Test F1: 0.2222|122s
Iteration 11 of 20 | Training 2 of 4 | 19/11/2020 16:03:32
50:50,LSTM2,x,raw,tfidf,nTo:1,FeatureMax:60|Train F1: 0.8109 (+/- 0.1948)|Test F1: 0.2222|122s
Iteration 11 of 20 | Training 3 of 4 | 19/11/2020 16:05:35
50:50,LSTM2,x,raw,tfidf,nTo:1,FeatureMax:80|Train F1: 0.7924 (+/- 0.1671)|Test F1: 0.2500|125s
Iteration 11 of 20 | Training 4 of 4 | 19/11/2020 16:07:41
50:50,LSTM2,x,raw,tfidf,nTo:1,FeatureMax:100|Train F1: 0.8079 (+/- 0.1906)|Test F1: 0.4000|119s
Iteration 11 of 20 | Duration 8.17m | 0.44h since start
Iteration 12 of 20 | Training 1 of 4 | 19/11/2020 16:09:40




50:50,LSTM2,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9819 (+/- 0.0237)|Test F1: 1.0000|125s
Iteration 12 of 20 | Training 2 of 4 | 19/11/2020 16:11:45




50:50,LSTM2,x,std,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9881 (+/- 0.0238)|Test F1: 1.0000|122s
Iteration 12 of 20 | Training 3 of 4 | 19/11/2020 16:13:48




50:50,LSTM2,x,std,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9909 (+/- 0.0158)|Test F1: 1.0000|123s
Iteration 12 of 20 | Training 4 of 4 | 19/11/2020 16:15:51




50:50,LSTM2,x,std,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9919 (+/- 0.0131)|Test F1: 1.0000|123s
Iteration 12 of 20 | Duration 8.24m | 0.57h since start
Iteration 13 of 20 | Training 1 of 4 | 19/11/2020 16:17:55
50:50,CNN1a,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.4750 (+/- 0.5963)|Test F1: 0.0800|44s
Iteration 13 of 20 | Training 2 of 4 | 19/11/2020 16:18:39
50:50,CNN1a,x,raw,tfidf,nTo:1,FeatureMax:60|Train F1: 0.3643 (+/- 0.6484)|Test F1: 0.0800|51s
Iteration 13 of 20 | Training 3 of 4 | 19/11/2020 16:19:31
50:50,CNN1a,x,raw,tfidf,nTo:1,FeatureMax:80|Train F1: 0.3643 (+/- 0.6485)|Test F1: 0.0800|58s
Iteration 13 of 20 | Training 4 of 4 | 19/11/2020 16:20:29
50:50,CNN1a,x,raw,tfidf,nTo:1,FeatureMax:100|Train F1: 0.4420 (+/- 0.6094)|Test F1: 0.0800|65s
Iteration 13 of 20 | Duration 3.67m | 0.64h since start
Iteration 14 of 20 | Training 1 of 4 | 19/11/2020 16:21:35
50:50,CNN1a,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.4907 (+/- 0.6179)|Test F1: 0.2000|42s
Iteration 14 of 20 | Train



50:50,CNN1b,x,std,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9950 (+/- 0.0122)|Test F1: 1.0000|65s
Iteration 16 of 20 | Training 4 of 4 | 19/11/2020 16:31:24
50:50,CNN1b,x,std,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9950 (+/- 0.0123)|Test F1: 1.0000|67s
Iteration 16 of 20 | Duration 3.73m | 0.82h since start
Iteration 17 of 20 | Training 1 of 4 | 19/11/2020 16:32:32
50:50,CNN1c,x,raw,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9398 (+/- 0.1059)|Test F1: 0.8000|44s
Iteration 17 of 20 | Training 2 of 4 | 19/11/2020 16:33:16
50:50,CNN1c,x,raw,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9866 (+/- 0.0230)|Test F1: 1.0000|52s
Iteration 17 of 20 | Training 3 of 4 | 19/11/2020 16:34:09
50:50,CNN1c,x,raw,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9736 (+/- 0.0967)|Test F1: 1.0000|62s
Iteration 17 of 20 | Training 4 of 4 | 19/11/2020 16:35:12
50:50,CNN1c,x,raw,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9889 (+/- 0.0264)|Test F1: 1.0000|71s
Iteration 17 of 20 | Duration 3.86m | 0.88h since start
Iteration 18 of 20 | Traini



50:50,CNN1d,x,raw,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9928 (+/- 0.0160)|Test F1: 1.0000|65s
Iteration 19 of 20 | Duration 3.52m | 1.01h since start
Iteration 20 of 20 | Training 1 of 4 | 19/11/2020 16:43:46
50:50,CNN1d,x,std,tfidf,nTo:1,FeatureMax:40|Train F1: 0.9895 (+/- 0.0279)|Test F1: 0.5714|38s
Iteration 20 of 20 | Training 2 of 4 | 19/11/2020 16:44:25
50:50,CNN1d,x,std,tfidf,nTo:1,FeatureMax:60|Train F1: 0.9930 (+/- 0.0171)|Test F1: 1.0000|46s
Iteration 20 of 20 | Training 3 of 4 | 19/11/2020 16:45:11




50:50,CNN1d,x,std,tfidf,nTo:1,FeatureMax:80|Train F1: 0.9921 (+/- 0.0179)|Test F1: 1.0000|56s
Iteration 20 of 20 | Training 4 of 4 | 19/11/2020 16:46:08
50:50,CNN1d,x,std,tfidf,nTo:1,FeatureMax:100|Train F1: 0.9931 (+/- 0.0140)|Test F1: 1.0000|61s
Iteration 20 of 20 | Duration 3.39m | 1.06h since start


#8 Create and compile modern models

In [None]:
# Code handed to getDocTrain when building the training frame
# (presumably the 50:50 resampling ratio seen in the run logs — TODO confirm)
r_code = "50"

In [None]:
# Training documents selected by getDocTrain for the chosen r_code
docTrain_df = getDocTrain(docText_df, r_code)

In [None]:
#Shuffle rows.
#A fixed random_state keeps the shuffled order identical between runs. This matters
#because Keras' validation_split holds out the LAST fraction of the rows it is given,
#so an unseeded shuffle changes the train/validation split on every execution.
x_train=docTrain_df.sample(frac=1,random_state=42).reset_index(drop=True)
#Target labels, in the same (shuffled) row order as x_train
y_train=x_train.risk

In [None]:
#Load the pre-computed training vectors from the interim data folder.
#The location is built from pathData (set with the other paths under "#1 Variables")
#rather than a hardcoded absolute Colab path, so the cell follows the notebook's
#path configuration. The first CSV column is used as the index.
v_train=pd.read_csv(pathData+'interim/50_x_tfidf_1_40_v_train.csv', index_col=[0])

In [None]:
# Left-join the loaded vectors onto the docIDs of the randomised training data
# NOTE(review): only docID is taken from x_train here; a docName column, if present,
# would come from the loaded CSV — confirm
v_train_df = x_train[["docID"]].merge(v_train, on="docID", how="left")

In [None]:
# Keep everything from the third column onwards (drops the first two columns by position;
# presumably docID and docName — TODO confirm against the CSV layout)
v_train = v_train_df.iloc[:, 2:]

## LSTM

In [None]:
# LSTM input is 3-D (samples, timestep, features): insert a single-timestep axis
v_train_stf = np.expand_dims(np.array(v_train), axis=1)

In [None]:
# Targets reshaped to (samples, 1, 1) to go with the 3-D LSTM input
y_train_stf = np.array(y_train).reshape(-1, 1, 1)

In [None]:
# LSTM model; input has shape (samples, steps, features)
model = createLSTM()

In [None]:
# Train
#
# Recorded result for the earlier call without `epochs` (kept commented out):
#   loss: 0.6911 - f1: 0.6831 - val_loss: 0.6880 - val_f1: 0.7371
#   history=model.fit(v_train_stf,y_train_stf,validation_split=0.2)
#
# Recorded result with epochs=5:
#   loss: 0.5856 - f1: 0.8330 - val_loss: 0.5405 - val_f1: 0.8184
history = model.fit(
    v_train_stf,
    y_train_stf,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## CNN

In [None]:
# CNN input is 3-D (samples, features, dimensions): append a trailing axis of size 1
v_train_sfd = np.expand_dims(np.array(v_train), axis=-1)

In [None]:
#Targets reshaped to (samples,1,1).
#This definition has to stay active: the createCNN1d, createCNN2 and createCNN3
#training cells further down pass y_train_sfd to model.fit, and they fail with a
#NameError on a fresh kernel if this line is commented out.
y_train_sfd=np.array(y_train).reshape(y_train.shape[0],1,1)

In [None]:
# Fresh CNN1a model for the training cell that follows
model = createCNN1a()

In [None]:
# Sigmoid
#
# Recorded result for the earlier call without `epochs` (kept commented out):
#   loss: 0.7052 - f1: 0.4396 - val_loss: 0.7523 - val_f1: 0.6478
#   history=model.fit(v_train_sfd,y_train,validation_split=0.2)
#
# Recorded result with epochs=5:
#   loss: 0.0125 - f1: 0.9969 - val_loss: 0.0037 - val_f1: 1.0000
history = model.fit(
    v_train_sfd,
    y_train,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Fresh CNN1b model for the training cell that follows
model = createCNN1b()

In [None]:
# Relu
#
# Recorded result for the earlier call without `epochs` (kept commented out):
#   loss: 0.6835 - f1: 0.7323 - val_loss: 0.6606 - val_f1: 0.7819
#   history=model.fit(v_train_sfd,y_train,validation_split=0.2)
#
# Recorded result with epochs=5:
#   loss: 0.0486 - f1: 0.9926 - val_loss: 0.0298 - val_f1: 0.9892
history = model.fit(
    v_train_sfd,
    y_train,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Validation F1 logged for each epoch of the most recent fit
history.history["val_f1"]

[0.0,
 0.8934442400932312,
 0.8422034382820129,
 0.895297110080719,
 0.9639375805854797]

In [None]:
# Wrap createCNN1b as a scikit-learn style estimator (verbose training output)
K_CNN = KerasClassifier(
    build_fn=createCNN1b,
    epochs=5,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# 5-fold cross-validated F1 of the wrapped CNN1b, run as a single job
# (alternative: n_jobs=threads)
cross_val_score(
    K_CNN,
    v_train_sfd,
    y_train,
    cv=5,
    scoring="f1",
    n_jobs=1,
)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


array([0.84210526, 0.71875   , 0.87179487, 0.8372093 , 0.91566265])

In [None]:
# Same createCNN1b wrapper as before, but with training output silenced
K_CNN = KerasClassifier(
    build_fn=createCNN1b,
    epochs=5,
    validation_split=0.2,
    verbose=0,
)

In [None]:
# 5-fold cross-validated F1 of the wrapped CNN1b, using `threads` parallel jobs
cross_val_score(
    K_CNN,
    v_train_sfd,
    y_train,
    cv=5,
    scoring="f1",
    n_jobs=threads,
)

array([0.83116883, 0.8988764 , 0.94382022, 0.96      , 0.93617021])

In [None]:
# Fresh CNN1d model for the training cell that follows
model = createCNN1d()

In [None]:
# No activation
#
# Earlier call without `epochs` (kept commented out, no result recorded):
#   history=model.fit(v_train_sfd,y_train_sfd,validation_split=0.2)
#
# Recorded result with epochs=5:
#   loss: 0.0508 - f1: 0.9909 - val_loss: 0.0897 - val_f1: 0.9792
history = model.fit(
    v_train_sfd,
    y_train_sfd,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Fresh CNN2 model for the training cell that follows
model = createCNN2()

In [None]:
# CNN with LSTM
#
# Earlier call without `epochs` (kept commented out, no result recorded):
#   history=model.fit(v_train_sfd,y_train_sfd,validation_split=0.2)
#
# Recorded result with epochs=5:
#   loss: 0.6203 - f1: 0.6128 - val_loss: 0.6301 - val_f1: 0.6572
history = model.fit(
    v_train_sfd,
    y_train_sfd,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# CNN3: reducing pooling sizes
model = createCNN3()

In [None]:
# CNN with LSTM
#
# Earlier call without `epochs` (kept commented out, no result recorded):
#   history=model.fit(v_train_sfd,y_train_sfd,validation_split=0.2)
#
# Recorded result with epochs=5:
#   loss: 0.6203 - f1: 0.6128 - val_loss: 0.6301 - val_f1: 0.6572
# NOTE(review): these figures are identical to the ones noted in the createCNN2
# training cell; they may be a stale copy rather than this model's result — confirm
history = model.fit(
    v_train_sfd,
    y_train_sfd,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
