In [2]:
import pandas as pd
import numpy as np
import copy

#make the columns as wide as possible so we can see all the text
pd.set_option('display.max_colwidth', None)

In [3]:
original_training = pd.read_csv("mediaeval-2015-trainingset.txt", delimiter = "\t")
original_training.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,263046056240115712,¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake
1,262995061304852481,"@milenagimon: Miren a Sandy en NY! Tremenda imagen del huracán. Parece el ""Día de la Independencia 2"" http://t.co/41jUweux REAL! RT.",192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a la película Día de la Independencia #ID4 #Sandy http://t.co/PTdAXABZ",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
3,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
4,263018881839411200,My fave place in the world #nyc #hurricane #sandy #statueofliberty 🗽 http://t.co/Ex61doZk,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake


In [4]:
#repeat the same process for the testing dataset
original_testing = pd.read_csv("mediaeval-2015-testset.txt", delimiter = "\t")
original_testing.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,578854927457349632,kereeen RT @Shyman33: Eclipse from ISS.... http://t.co/je2hcFpVfN,70824972,eclipse_01,peay_s,Fri Mar 20 09:45:43 +0000 2015,fake
1,578874632670953472,Absolutely beautiful! RT @Shyman33: Eclipse from ISS.... http://t.co/oqwtTL0ThS,344707006,eclipse_01,JaredUcanChange,Fri Mar 20 11:04:02 +0000 2015,fake
2,578891261353984000,“@Shyman33: Eclipse from ISS.... http://t.co/C0VfboScRj” 우주에서본 3.20 일식 Wow! amazing!,224839607,eclipse_01,tpjp1231,Fri Mar 20 12:10:06 +0000 2015,fake
3,578846612312748032,Eclipse from ISS.... http://t.co/En87OtvsU6,134543073,eclipse_01,Shyman33,Fri Mar 20 09:12:41 +0000 2015,fake
4,578975333841551360,@ebonfigli: Éclipse vue de l'ISS... Autre chose... http://t.co/yNBN7c4O51\n\nLa création divine n'a pas de limite 😍,1150728872,eclipse_01,Epimethee_,Fri Mar 20 17:44:11 +0000 2015,fake


In [5]:
# BOOKMARK 1

#we can see that the dataset is skewed towards fake and humor tweets
original_training.label.value_counts(normalize = True) * 100

fake     47.222806
real     34.468025
humor    18.309169
Name: label, dtype: float64

In [6]:
# BOOKMARK 2

#(6742 + 2614) - 4921 = 4435 additional real entries needed to make the dataset balanced
original_training.label.value_counts()

fake     6742
real     4921
humor    2614
Name: label, dtype: int64

In [7]:
# BOOKMARK 3

#there are no non null values to being with we can see
original_training.info()
print("\n")
original_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     14277 non-null  int64 
 1   tweetText   14277 non-null  object
 2   userId      14277 non-null  int64 
 3   imageId(s)  14277 non-null  object
 4   username    14277 non-null  object
 5   timestamp   14277 non-null  object
 6   label       14277 non-null  object
dtypes: int64(2), object(5)
memory usage: 780.9+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     3755 non-null   int64 
 1   tweetText   3755 non-null   object
 2   userId      3755 non-null   int64 
 3   imageId(s)  3755 non-null   object
 4   username    3755 non-null   object
 5   timestamp   3755 non-null   object
 6   label       3755 non-null   object
dtypes: int6

In [8]:
# BOOKMARK 4

#drop all columns apart from the text and the label as none of the other data appears to be useful
original_training = original_training.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)

#Do the same for the testing set
original_testing = original_testing.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)

original_training.head()

Unnamed: 0,tweetText,label
0,¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN,fake
1,"@milenagimon: Miren a Sandy en NY! Tremenda imagen del huracán. Parece el ""Día de la Independencia 2"" http://t.co/41jUweux REAL! RT.",fake
2,"Buena la foto del Huracán Sandy, me recuerda a la película Día de la Independencia #ID4 #Sandy http://t.co/PTdAXABZ",fake
3,Scary shit #hurricane #NY http://t.co/e4JLBUfH,fake
4,My fave place in the world #nyc #hurricane #sandy #statueofliberty 🗽 http://t.co/Ex61doZk,fake


In [9]:
# BOOKMARK 5 

#we can see that not all the posts are in English
original_training[:10]

Unnamed: 0,tweetText,label
0,¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN,fake
1,"@milenagimon: Miren a Sandy en NY! Tremenda imagen del huracán. Parece el ""Día de la Independencia 2"" http://t.co/41jUweux REAL! RT.",fake
2,"Buena la foto del Huracán Sandy, me recuerda a la película Día de la Independencia #ID4 #Sandy http://t.co/PTdAXABZ",fake
3,Scary shit #hurricane #NY http://t.co/e4JLBUfH,fake
4,My fave place in the world #nyc #hurricane #sandy #statueofliberty 🗽 http://t.co/Ex61doZk,fake
5,42nd #time #square #NYC #subway #hurricane http://t.co/daX5YY7X,fake
6,Just in time for #halloween a photo of #hurricane #sandy #frankenstorm http://t.co/xquKB4VN,fake
7,Crazy pic of #Hurricane #Sandy prayers go out to family and friends on the East Coast http://t.co/c4sceiMt,fake
8,#sandy #newyork #hurricane #statueofliberty #USA http://t.co/iQfEbO1E,fake
9,#nyc #hurricane http://t.co/Gv3QxZlq,fake


In [10]:
# BOOKMARK 6

#add a column to store the language, initially empty before langdetect populates it
original_training["lang"] = np.nan
original_testing["lang"] = np.nan
original_training.head()

Unnamed: 0,tweetText,label,lang
0,¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN,fake,
1,"@milenagimon: Miren a Sandy en NY! Tremenda imagen del huracán. Parece el ""Día de la Independencia 2"" http://t.co/41jUweux REAL! RT.",fake,
2,"Buena la foto del Huracán Sandy, me recuerda a la película Día de la Independencia #ID4 #Sandy http://t.co/PTdAXABZ",fake,
3,Scary shit #hurricane #NY http://t.co/e4JLBUfH,fake,
4,My fave place in the world #nyc #hurricane #sandy #statueofliberty 🗽 http://t.co/Ex61doZk,fake,


In [11]:
import langdetect as l
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import nltk.stem as st
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\George\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\George\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
#view the languages supported by the stemming algorithm
st.SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [13]:
# BOOKMARK 7 TweetHandler class

#responsible for parsing tweets
class TweetHandler:
    
    def __init__(self):
        snowball_langs = list(st.SnowballStemmer.languages)
        #some languages are supported by stemming but NOT supported by language specific tokenizing,
        #only the tokens that are in this set are supported by language specific tokenizing
        self.tokenizer_langs = {"da", "nl", "en", "fi", "fr", "de", "it", "pt", "ru", "es", "sv"}
        langdetect_langs = ["ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "illegal", "pt", "ro", "ru", "es", "sv"]
        #a dictionary to map the corresponding snowball and langdetect properties
        self.lang_dict = dict(zip(langdetect_langs, snowball_langs))
        #declare some custom stop words
        self.custom_stops = ["http","nhttp","https"]

    #takes a tweet, detects its language, removes any stop words in the language, tokenizes and stems
    #specific to the detected language and returns the simplified tokens paired with the language
    def parse_tweet(self, tweet):
        
        try:
            lang_prediction = l.detect(tweet)
            #the nltk name for the predicted language
            nltkprop = self.lang_dict[lang_prediction]
        except:
            #assume english stopwords and stemming if the language cannot be detected
            lang_prediction = "unknown"
            nltkprop = "english"
            
        # if the language is not supported by the tokenizer (including unkown) then assume tokenizing in English, however stemming
        # and stopwords may still be supported in the language that does not support language specific tokenization
        # e.g. arabic, hungarian, romanian so tokenize with the english
        # version of the algorithm if this is the case and use the stemming and stopwords specific to 
        # the language if this is available even if the tokenization algorithm isnt
        # use a python ternary expression to do this
        tokens = nltk.word_tokenize(tweet, language = nltkprop if lang_prediction in self.tokenizer_langs else "english")
        
        #stop words specific to the language
        stop_words = set(stopwords.words(nltkprop))
        
        #stemming algorithm specific to the language detected
        stemmer = st.SnowballStemmer(nltkprop)
        
        # store all tokens to be output as a concatenated string here so that this string
        # can later be fed to a CountVectorizer or TfIDFVectorizer , filter out any unwanted tokens 
        # and don't add them 
        filtered_tokens = ""
        
        for tok in tokens:
            
            #remove any hashtags
            if tok[0] == '#':
                tok = tok[1:]
                
            #discard non alphanumeric strings containing symbols or pure digits, or stop words
            if (not tok.isalnum()) or tok.isdigit() or (tok in stop_words) or tok in self.custom_stops:
                continue;
            
            #carry out stemming specific to the language detected
            filtered_tokens += " " + stemmer.stem(tok)
        
        #comment these out when you do not need to check if it works anymore
        #print("original tokens:", tokens,"\n")
        #print("filtered tokens:", filtered_tokens,"\n")
        
        return filtered_tokens, lang_prediction

In [14]:
# BOOKMARK 8, transform the dataset from a dataset of tweets into a dataset of labelled tokens in concatenated
# string form, along with the detected language

def transform_data(arg):

    #copy the instance given so we don't change the original instance and can keep it in memory and reuse it 
    #if necessary
    dataset = copy.deepcopy(arg)
    th = TweetHandler()
    num_rows = dataset.label.size
    
    #the tweet text will be transformed into tokens so rename the column appropriately
    dataset = dataset.rename(columns = {"tweetText" : "tokens"})
    
    for i in range(num_rows):

        tweet = dataset["tokens"][i]
        label = dataset["label"][i]

        #disregard the humour information for now, map humor and fake to a single class
        if ("humor" in label) or ("fake" in label):
            label = 1
        else:
            label = 0
            
        #for testing
        #print("The old value of the row is:",dataset.loc[i],"\n")
        
        tokens, lang = th.parse_tweet(tweet)
        
        #replace the row with the simplified tokens, the mapped labels and the detected language
        dataset.loc[i] = tokens, label, lang
        
        #for testing
        #print("The new value of the row is:",dataset.loc[i],"\n\n")
    
    return dataset

In [15]:
#transform the data
simplified_training = transform_data(original_training)
simplified_testing = transform_data(original_testing)

In [1]:
simplified_testing

NameError: name 'simplified_testing' is not defined

In [15]:
# BOOKMARK 9

#get an idea of how many of each language there are, we can see that it is predominantly english
simplified_training.lang.value_counts(normalize = True) * 100

en         76.619738
es          9.056524
unknown     7.508580
fr          1.505919
pt          1.141696
de          0.910555
it          0.735449
nl          0.595363
ar          0.546333
ru          0.427261
sv          0.308188
no          0.266162
da          0.189115
fi          0.091056
hu          0.049030
ro          0.049030
Name: lang, dtype: float64

In [16]:
simplified_training.label.value_counts()

1    9356
0    4921
Name: label, dtype: int64

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import *

In [16]:
#class to convert the sparse matrix produced by the tfidf vectorizer into a dense matrix in order for it 
#to be able to be used with different algorithms in a pipeline that require a dense matrix and not a sparse matrix
class Densifier():

    def fit(self, X, y=None, **kwargs):
        return self

    def transform(self, X, y=None, **kwargs):
        return X.todense()

In [17]:
# Given a pipeline and some parameters, perform a grid search with these
def grid_search(pipeline, params):
    
    #make sure we are focusing on maximizing the f1 score and not a different metric, add some verbosity so we can
    #see the progress of the grid search to get an idea of how much time it is taking
    
    #make n_jobs -1 so that all cores that are available cores are used which should hopefully make it quicker
    
    clf = GridSearchCV(pipeline, params, scoring = "f1", verbose = 6, n_jobs = -1)
    
    #grid_search does not need to take the training data as an argument, we can always assume that simplified_training
    #is ready in memory so any algorithm will only ever need to use this to grid search therefore we can implicitly
    #reference it like this without having to declare it as an argument to the function
    
    #make sure the labels are treated as ints and not objects
    clf.fit(simplified_training.tokens, simplified_training.label.astype("int"))
    return clf

In [18]:
# Simple function to report the scores of the model tuned with the optimal parameters found in grid search
# by making predictions of the testing data using this newly found optimal model and calculating the 
# accuracy vs the true labels of the testing data
def report_scores(grid_search_result):
    
    print("---- RESULTS ----","\n")
    print("The algorithm being optimised was:", grid_search_result.estimator.steps[-1])
    print("The best parameters found were:", grid_search_result.best_params_)
    y_test_true = simplified_testing.label.astype("int")
    y_test_predictions = grid_search_result.predict(simplified_testing.tokens).astype("int")
    print("Score report:\n")
    #prevent the chance of any of the lists being treated as objects
    print(classification_report(y_test_true.astype('int'), y_test_predictions.astype('int')))
    
    #TODO DELETE THESE AFTER, JUST FOR TESTING
    print("f1 score:", f1_score(y_test_true, y_test_predictions))
    print("precision:", precision_score(y_test_true, y_test_predictions))
    print("recall:", recall_score(y_test_true, y_test_predictions))

In [17]:
#Start with the multinomial naive bayes algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

lr_pipelineV1 = Pipeline([
    ('cv', CountVectorizer()), 
    ('lr', LogisticRegression())
])

lr_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    'lr__C' : [0.1, 1, 10, 100]
}

lrResult_V1 = grid_search(lr_pipelineV1, lr_paramsV1)
report_scores(lrResult_V1)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [27]:
#Start with the multinomial naive bayes algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

lr_pipelineV1 = Pipeline([
    ('cv', CountVectorizer()), 
    ('lr', LogisticRegression())
])

lr_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,8)],
    'cv__max_features' : [1000 * i for i in range (4,11)],
    'lr__C' : [0.1, 1, 10]
}

lrResult_V1 = grid_search(lr_pipelineV1, lr_paramsV1)
report_scores(lrResult_V1)

Fitting 5 folds for each of 147 candidates, totalling 735 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   49.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 735 out of 735 | elapsed:  2.1min finished


---- RESULTS ---- 

The algorithm being optimised was: ('lr', LogisticRegression())
The best parameters found were: {'cv__max_features': 10000, 'cv__ngram_range': (1, 1), 'lr__C': 1}
Score report:

              precision    recall  f1-score   support

           0       0.47      0.78      0.59      1209
           1       0.85      0.58      0.69      2546

    accuracy                           0.65      3755
   macro avg       0.66      0.68      0.64      3755
weighted avg       0.72      0.65      0.66      3755



In [28]:
#Start with the multinomial naive bayes algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

lr_pipelineV3 = Pipeline([
    ('cv', CountVectorizer()), 
    ('lr', LogisticRegression())
])

lr_paramsV3 = {
    'cv__ngram_range' : [(1,x) for x in range (1,8)],
    'cv__max_features' : [1000 * i for i in range (9,16)],
    'lr__C' : [0.1, 1, 10]
}

lrResult_V3 = grid_search(lr_pipelineV3, lr_paramsV3)
report_scores(lrResult_V3)

Fitting 5 folds for each of 147 candidates, totalling 735 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   52.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 735 out of 735 | elapsed:  2.3min finished


---- RESULTS ---- 

The algorithm being optimised was: ('lr', LogisticRegression())
The best parameters found were: {'cv__max_features': 11000, 'cv__ngram_range': (1, 1), 'lr__C': 1}
Score report:

              precision    recall  f1-score   support

           0       0.58      0.77      0.66      1209
           1       0.87      0.74      0.80      2546

    accuracy                           0.75      3755
   macro avg       0.73      0.75      0.73      3755
weighted avg       0.78      0.75      0.75      3755



In [69]:
#Start with the multinomial naive bayes algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

lr_pipelineV4 = Pipeline([
    ('cv', CountVectorizer()), 
    ('lr', LogisticRegression())
])

lr_paramsV4 = {
    'cv__ngram_range' : [(1,x) for x in range (1,15)],
    'cv__max_features' : [1000 * i for i in range (10,13)],
    'lr__C' : [0.1, 1, 10]
}

lrResult_V4 = grid_search(lr_pipelineV4, lr_paramsV4)
report_scores(lrResult_V4)

Fitting 5 folds for each of 126 candidates, totalling 630 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 630 out of 630 | elapsed:  2.6min finished


---- RESULTS ---- 

The algorithm being optimised was: ('lr', LogisticRegression())
The best parameters found were: {'cv__max_features': 11000, 'cv__ngram_range': (1, 1), 'lr__C': 1}
Score report:

              precision    recall  f1-score   support

           0       0.58      0.77      0.66      1209
           1       0.87      0.73      0.80      2546

    accuracy                           0.75      3755
   macro avg       0.73      0.75      0.73      3755
weighted avg       0.78      0.75      0.75      3755

f1 score: 0.7970149253731343
precision: 0.8717350746268657
recall: 0.7340926944226237


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#Start with the multinomial naive bayes algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

lrcv_pipelineV1 = Pipeline([
    ('cv', CountVectorizer()), 
    ('dd', Densifier()),
    ('ss', StandardScaler()),
    ('lr', LogisticRegressionCV())
])

lrcv_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,5)],
    'cv__max_features' : [1000 * i for i in range (1,8)],
    'lr__Cs' : [1, 10, 100]
}

lrcvResult_V1 = grid_search(lrcv_pipelineV1, lrcv_paramsV1)
report_scores(lrcvResult_V1)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [68]:
#Start with the multinomial naive bayes algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNB_pipelineV1 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    'nb__alpha' : [i for i in range (1,4)]
}

mNBResult_V1 = grid_search(mNB_pipelineV1, mNB_paramsV1)
report_scores(mNBResult_V1)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   35.2s


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 5000, 'cv__ngram_range': (1, 1), 'nb__alpha': 3}
Score report:

              precision    recall  f1-score   support

           0       0.37      0.84      0.51      1209
           1       0.81      0.31      0.45      2546

    accuracy                           0.48      3755
   macro avg       0.59      0.58      0.48      3755
weighted avg       0.67      0.48      0.47      3755

f1 score: 0.45073612684031705
precision: 0.8073022312373225
recall: 0.3126472898664572


[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:   44.0s finished


In [35]:
report_scores(mNBResult_V1)

---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 5000, 'cv__ngram_range': (1, 1), 'nb__alpha': 3}
Score report:

              precision    recall  f1-score   support

           0       0.37      0.84      0.51      1209
           1       0.81      0.31      0.45      2546

    accuracy                           0.48      3755
   macro avg       0.59      0.58      0.48      3755
weighted avg       0.67      0.48      0.47      3755

f1 score: 0.45073612684031705
precision: 0.8073022312373225
recall: 0.3126472898664572


In [28]:
# Second iteration with the multinomial naive bayes algorithm,
# trying ngrams from (1,1) to (1,9), 4000 max features in the vocabulary to 10000 max features
# in increments of 1000, and try values of 2-5 for alpha

mNB_pipelineV2 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV2 = {
    'cv__ngram_range' : [(1,x) for x in range (1,10)],
    'cv__max_features' : [1000 * i for i in range (4,11)],
    'nb__alpha' : [i for i in range (2,6)]
}

mNBResult_V2 = grid_search(mNB_pipelineV2, mNB_paramsV2)
report_scores(mNBResult_V2)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 1260 out of 1260 | elapsed:  3.3min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 10000, 'cv__ngram_range': (1, 7), 'nb__alpha': 3}
Score report:

              precision    recall  f1-score   support

           0       0.66      0.79      0.72      1209
           1       0.89      0.81      0.85      2546

    accuracy                           0.80      3755
   macro avg       0.77      0.80      0.78      3755
weighted avg       0.82      0.80      0.80      3755



In [62]:
cv = CountVectorizer(max_features = 18000, ngram_range = (1,10))
training = cv.fit_transform(simplified_training.tokens)#.todense()
testing = cv.fit_transform(simplified_testing.tokens)#.todense()
clf = MultinomialNB(alpha = 4).fit(training, simplified_training.label.astype("int"))
f1_score(clf.predict(testing), simplified_testing.label.astype("int"))

0.5800405268490375

In [66]:
pipeline = Pipeline([
    ("cv", CountVectorizer(max_features = 10000, ngram_range = (1,7))), 
     ("nb", MultinomialNB(alpha = 3))
])

pipeline.fit(simplified_training.tokens, simplified_training.label.astype("int"))
report_scores(pipeline)

---- RESULTS ---- 

Score report:

              precision    recall  f1-score   support

           0       0.66      0.79      0.72      1209
           1       0.89      0.81      0.85      2546

    accuracy                           0.80      3755
   macro avg       0.77      0.80      0.78      3755
weighted avg       0.82      0.80      0.81      3755

f1 score: 0.846201358863496
precision: 0.889225443530939
recall: 0.8071484681853889


In [36]:
# Third iteration with the multinomial naive bayes algorithm,
# trying ngrams from (1,6) to (1,10), 9000 max features in the vocabulary up to 14000 max features
# in increments of 1000, and try values of 2-4 for alpha

mNB_pipelineV3 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV3 = {
    'cv__ngram_range' : [(1,x) for x in range (6,11)],
    'cv__max_features' : [1000 * i for i in range (9,15)],
    'nb__alpha' : [i for i in range (2,5)]
}

mNBResult_V3 = grid_search(mNB_pipelineV3, mNB_paramsV3)
report_scores(mNBResult_V3)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  1.8min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 14000, 'cv__ngram_range': (1, 8), 'nb__alpha': 4}
Score report:

              precision    recall  f1-score   support

           0       0.70      0.79      0.74      1209
           1       0.89      0.84      0.87      2546

    accuracy                           0.82      3755
   macro avg       0.80      0.81      0.80      3755
weighted avg       0.83      0.82      0.83      3755

f1 score: 0.8652367462565763
precision: 0.8923205342237062
recall: 0.8397486252945797


In [24]:
from sklearn.metrics import f1_score

def calc_f1(grid_search_result):
    y_test_true = simplified_testing.label.astype("int")
    y_test_predictions = grid_search_result.predict(simplified_testing.tokens).astype("int")
    score = f1_score(y_test_true, y_test_predictions)
    print(score)

In [25]:
calc_f1(mNBResult_V3)

0.8667205169628432


In [45]:
# Fourth iteration with the multinomial naive bayes algorithm,
# trying ngrams from (1,5) to (1,13), 13000 max features in the vocabulary up to 18000 max features
# in increments of 1000, and try values of 1-6 for alpha

mNB_pipelineV4 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV4 = {
    'cv__ngram_range' : [(1,x) for x in range (5,13)],
    'cv__max_features' : [1000 * i for i in range (13,19)],
    'nb__alpha' : [i for i in range (1,7)]
}

mNBResult_V4 = grid_search(mNB_pipelineV4, mNB_paramsV4)
report_scores(mNBResult_V4)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   41.0s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  5.5min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 18000, 'cv__ngram_range': (1, 10), 'nb__alpha': 4}
Score report:

              precision    recall  f1-score   support

           0       0.70      0.78      0.74      1209
           1       0.89      0.84      0.87      2546

    accuracy                           0.82      3755
   macro avg       0.80      0.81      0.80      3755
weighted avg       0.83      0.82      0.83      3755



In [31]:
# Fifth iteration with the multinomial naive bayes algorithm,
# trying ngrams from (1,8) to (1,12), 17000 max features in the vocabulary up to 22000 max features
# in increments of 1000, and try values of 3-6 for alpha

mNB_pipelineV5 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV5 = {
    'cv__ngram_range' : [(1,x) for x in range (8,13)],
    'cv__max_features' : [1000 * i for i in range (17,23)],
    'nb__alpha' : [i for i in range (3,7)]
}

mNBResult_V5 = grid_search(mNB_pipelineV5, mNB_paramsV5)
report_scores(mNBResult_V5)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.5min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 22000, 'cv__ngram_range': (1, 11), 'nb__alpha': 4}
Score report:

              precision    recall  f1-score   support

           0       0.64      0.78      0.71      1209
           1       0.89      0.79      0.84      2546

    accuracy                           0.79      3755
   macro avg       0.76      0.79      0.77      3755
weighted avg       0.81      0.79      0.79      3755



In [44]:
# Sixth iteration with the multinomial naive bayes algorithm,
# trying ngrams from (1,10) to (1,12), 21000 max features in the vocabulary up to 24000 max features
# in increments of 1000, and try values of 2-5 for alpha

mNB_pipelineV6 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV6 = {
    'cv__ngram_range' : [(1,x) for x in range (10,13)],
    'cv__max_features' : [1000 * i for i in range (21,25)],
    'nb__alpha' : [i for i in range (2,6)]
}

#we appear to have found a plateau, as the results are largely the same as the last search
mNBResult_V6 = grid_search(mNB_pipelineV6, mNB_paramsV6)
report_scores(mNBResult_V6)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   49.9s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.1min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 23000, 'cv__ngram_range': (1, 11), 'nb__alpha': 4}
Score report:

              precision    recall  f1-score   support

           0       0.64      0.78      0.71      1209
           1       0.89      0.79      0.84      2546

    accuracy                           0.79      3755
   macro avg       0.76      0.79      0.77      3755
weighted avg       0.81      0.79      0.79      3755



In [30]:
# Seventh iteration with the multinomial naive bayes algorithm, search around our "optimal" we have found and make it more
# precise by searching in increments of 0.5 for alpha and 500 for max features to see if this tunes it to be even better.
# trying ngrams from (1,8) to (1,12), 17000 max features in the vocabulary up to 22000 max features
# in increments of 500, and try values of 2-7 for alpha in increments of 0.5

# {'cv__max_features': 18000, 'cv__ngram_range': (1, 10), 'nb__alpha': 4}

mNB_pipelineV7 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV7 = {
    'cv__ngram_range' : [(1,x) for x in range (8,13)],
    'cv__max_features' : [500 * i for i in range (2 * 20,2 * 26)],
    'nb__alpha' : [0.5 * i for i in range (2 * 2,7 * 2)]
}

mNBResult_V7 = grid_search(mNB_pipelineV7, mNB_paramsV7)
report_scores(mNBResult_V7)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:  9.9min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 25000, 'cv__ngram_range': (1, 10), 'nb__alpha': 3.5}
Score report:

              precision    recall  f1-score   support

           0       0.51      0.79      0.62      1209
           1       0.86      0.64      0.74      2546

    accuracy                           0.69      3755
   macro avg       0.69      0.72      0.68      3755
weighted avg       0.75      0.69      0.70      3755



In [26]:
# TRY WITHOUTTTT SPECIFYING MAX FEATURES

#Start with the multinomial naive bayes algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNB_pipelineV8 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV8 = {
    'cv__ngram_range' : [(1,x) for x in range (5,12)],
    'nb__alpha' : [0.25 * i for i in range (4 * 1,4 * 6)]
}

mNBResult_V8 = grid_search(mNB_pipelineV8, mNB_paramsV8)
report_scores(mNBResult_V8)

Fitting 5 folds for each of 140 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:  2.8min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__ngram_range': (1, 11), 'nb__alpha': 2.25}
Score report:

              precision    recall  f1-score   support

           0       0.50      0.79      0.61      1209
           1       0.86      0.63      0.73      2546

    accuracy                           0.68      3755
   macro avg       0.68      0.71      0.67      3755
weighted avg       0.75      0.68      0.69      3755



In [27]:
# TRY WITHOUTTTT SPECIFYING MAX FEATURES

#Start with the multinomial naive bayes algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNB_pipelineV9 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV9 = {
    'cv__ngram_range' : [(1,x) for x in range (1,7)],
    'nb__alpha' : [0.25 * i for i in range (4 * 1,4 * 4)]
}

mNBResult_V9 = grid_search(mNB_pipelineV9, mNB_paramsV9)
report_scores(mNBResult_V9)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   45.4s finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__ngram_range': (1, 5), 'nb__alpha': 2.5}
Score report:

              precision    recall  f1-score   support

           0       0.50      0.79      0.61      1209
           1       0.86      0.63      0.73      2546

    accuracy                           0.68      3755
   macro avg       0.68      0.71      0.67      3755
weighted avg       0.75      0.68      0.69      3755



In [72]:
mNB_pipelineV10 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', MultinomialNB())
])

mNB_paramsV10 = {
    'cv__ngram_range' : [(1,15) for x in range (1,14)],
    'cv__max_features' : [1000 * i for i in range (15, 26)],
    'nb__alpha' : [1 * i for i in range (1, 7)]
}

mNBResult_V10 = grid_search(mNB_pipelineV10, mNB_paramsV10)
report_scores(mNBResult_V10)

Fitting 5 folds for each of 858 candidates, totalling 4290 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   52.8s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elap

---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 24000, 'cv__ngram_range': (1, 15), 'nb__alpha': 3}
Score report:

              precision    recall  f1-score   support

           0       0.51      0.79      0.62      1209
           1       0.87      0.65      0.74      2546

    accuracy                           0.69      3755
   macro avg       0.69      0.72      0.68      3755
weighted avg       0.75      0.69      0.70      3755

f1 score: 0.7405077510671759
precision: 0.8650918635170604
recall: 0.6472898664571878


In [25]:
# Now try the ComplementNB algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

cNB_pipelineV1 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', ComplementNB())
])

cNB_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    'nb__alpha' : [i for i in range (1,4)]
}

cNBResult_V1 = grid_search(cNB_pipelineV1, cNB_paramsV1)
report_scores(cNBResult_V1)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:   43.3s finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__max_features': 5000, 'cv__ngram_range': (1, 5), 'nb__alpha': 1}
Score report:

              precision    recall  f1-score   support

           0       0.40      0.87      0.55      1209
           1       0.86      0.40      0.54      2546

    accuracy                           0.55      3755
   macro avg       0.63      0.63      0.55      3755
weighted avg       0.71      0.55      0.55      3755



In [26]:
# Second iteration with the complement naive bayes algorithm, performance was poor and the best parameters
# found were at the maxima of the ranges specified, so try again with larger ranges.
# trying ngrams from (1,4) to (1,9), 4000 max features in the vocabulary up to 9000 max features
# in increments of 1000, and try values of 1-3 for alpha in increments of 1

cNB_pipelineV2 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', ComplementNB())
])

cNB_paramsV2 = {
    'cv__ngram_range' : [(1,x) for x in range (4,10)],
    'cv__max_features' : [1000 * i for i in range (4,10)],
    'nb__alpha' : [i for i in range (1,4)]
}

cNBResult_V2 = grid_search(cNB_pipelineV2, cNB_paramsV2)
report_scores(cNBResult_V2)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  1.9min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__max_features': 9000, 'cv__ngram_range': (1, 9), 'nb__alpha': 3}
Score report:

              precision    recall  f1-score   support

           0       0.49      0.85      0.62      1209
           1       0.89      0.57      0.70      2546

    accuracy                           0.66      3755
   macro avg       0.69      0.71      0.66      3755
weighted avg       0.76      0.66      0.67      3755



In [27]:
# Third iteration with the complement naive bayes algorithm, performance is still quite poor and the best parameters
# found were at the maxima of the ranges specified, so try again with larger ranges.
# trying ngrams from (1,4) to (1,9), 4000 max features in the vocabulary up to 9000 max features
# in increments of 1000, and try values of 1-3 for alpha in increments of 1

cNB_pipelineV3 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', ComplementNB())
])

cNB_paramsV3 = {
    'cv__ngram_range' : [(1,x) for x in range (8,15)],
    'cv__max_features' : [1000 * i for i in range (8,15)],
    'nb__alpha' : [i for i in range (2,6)]
}

cNBResult_V3 = grid_search(cNB_pipelineV3, cNB_paramsV3)
report_scores(cNBResult_V3)

Fitting 5 folds for each of 196 candidates, totalling 980 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 980 out of 980 | elapsed:  4.1min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__max_features': 14000, 'cv__ngram_range': (1, 11), 'nb__alpha': 2}
Score report:

              precision    recall  f1-score   support

           0       0.49      0.85      0.62      1209
           1       0.89      0.58      0.70      2546

    accuracy                           0.66      3755
   macro avg       0.69      0.71      0.66      3755
weighted avg       0.76      0.66      0.67      3755



In [28]:
# Third iteration with the complement naive bayes algorithm, performance is still quite poor and the best parameters
# found were at the maxima of the ranges specified, so try again with larger ranges.
# trying ngrams from (1,4) to (1,9), 4000 max features in the vocabulary up to 9000 max features
# in increments of 1000, and try values of 1-3 for alpha in increments of 1

cNB_pipelineV4 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', ComplementNB())
])

cNB_paramsV4 = {
    'cv__ngram_range' : [(1,x) for x in range (10,15)],
    'cv__max_features' : [1000 * i for i in range (13,19)],
    'nb__alpha' : [i for i in range (1,5)]
}

cNBResult_V4 = grid_search(cNB_pipelineV4, cNB_paramsV4)
report_scores(cNBResult_V4)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   49.4s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.6min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__max_features': 18000, 'cv__ngram_range': (1, 10), 'nb__alpha': 2}
Score report:

              precision    recall  f1-score   support

           0       0.49      0.85      0.62      1209
           1       0.89      0.58      0.70      2546

    accuracy                           0.67      3755
   macro avg       0.69      0.71      0.66      3755
weighted avg       0.76      0.67      0.68      3755



In [29]:
# Third iteration with the complement naive bayes algorithm, performance is still quite poor and the best parameters
# found were at the maxima of the ranges specified, so try again with larger ranges.
# trying ngrams from (1,4) to (1,9), 4000 max features in the vocabulary up to 9000 max features
# in increments of 1000, and try values of 1-3 for alpha in increments of 1

cNB_pipelineV5 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', ComplementNB())
])

cNB_paramsV5 = {
    'cv__ngram_range' : [(1,x) for x in range (7,15)],
    'cv__max_features' : [1000 * i for i in range (17,25)],
    'nb__alpha' : [i for i in range (1,5)]
}

cNBResult_V5 = grid_search(cNB_pipelineV5, cNB_paramsV5)
report_scores(cNBResult_V5)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1280 out of 1280 | elapsed:  5.3min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__max_features': 23000, 'cv__ngram_range': (1, 13), 'nb__alpha': 2}
Score report:

              precision    recall  f1-score   support

           0       0.41      0.85      0.55      1209
           1       0.85      0.41      0.55      2546

    accuracy                           0.55      3755
   macro avg       0.63      0.63      0.55      3755
weighted avg       0.71      0.55      0.55      3755



In [None]:


# Third iteration with the complement naive bayes algorithm, performance is still quite poor and the best parameters
# found were at the maxima of the ranges specified, so try again with larger ranges.
# trying ngrams from (1,4) to (1,9), 4000 max features in the vocabulary up to 9000 max features
# in increments of 1000, and try values of 1-3 for alpha in increments of 1

cNB_pipelineV5 = Pipeline([
    ('cv', CountVectorizer()), 
    ('nb', ComplementNB())
])

cNB_paramsV5 = {
    'cv__ngram_range' : [(1,x) for x in range (7,15)],
    'cv__max_features' : [1000 * i for i in range (17,25)],
    'nb__alpha' : [i for i in range (1,5)]
}

cNBResult_V5 = grid_search(cNB_pipelineV5, cNB_paramsV5)
report_scores(cNBResult_V5)

In [52]:
# Now try the BernoulliNB algorithm commonly used for text classification,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

bNB_pipelineV1 = Pipeline([
    ('cv', CountVectorizer(binary = True)), #We are looking at occurences rather than counts with the Bernoulli Naive Bayes algorithm
    ('nb', BernoulliNB())
])

bNB_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    'nb__alpha' : [i for i in range (1,4)]
}

bNBResult_V1 = grid_search(bNB_pipelineV1, bNB_paramsV1)
report_scores(bNBResult_V1)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:   41.2s finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', BernoulliNB())
The best parameters found were: {'cv__max_features': 5000, 'cv__ngram_range': (1, 2), 'nb__alpha': 2}
Score report:

              precision    recall  f1-score   support

           0       0.72      0.71      0.72      1209
           1       0.86      0.87      0.87      2546

    accuracy                           0.82      3755
   macro avg       0.79      0.79      0.79      3755
weighted avg       0.82      0.82      0.82      3755



In [53]:
# It performs surprisingly excellent even with very limited parameters! Now try a second
# search with ngrams from (1,1) to (1,9), 4000 max features in the vocabulary to 10000 max features
# in increments of 1000, and try values of 1-4 for alpha

bNB_pipelineV2 = Pipeline([
    ('cv', CountVectorizer(binary = True)), #We are looking at occurences rather than counts with the Bernoulli Naive Bayes algorithm
    ('nb', BernoulliNB())
])

bNB_paramsV2 = {
    'cv__ngram_range' : [(1,x) for x in range (1,10)],
    'cv__max_features' : [1000 * i for i in range (4,11)],
    'nb__alpha' : [i for i in range (1,5)]
}

bNBResult_V2 = grid_search(bNB_pipelineV2, bNB_paramsV2)
report_scores(bNBResult_V2)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   43.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1260 out of 1260 | elapsed:  3.4min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', BernoulliNB())
The best parameters found were: {'cv__max_features': 9000, 'cv__ngram_range': (1, 2), 'nb__alpha': 1}
Score report:

              precision    recall  f1-score   support

           0       0.41      0.73      0.53      1209
           1       0.80      0.50      0.62      2546

    accuracy                           0.58      3755
   macro avg       0.61      0.62      0.57      3755
weighted avg       0.67      0.58      0.59      3755



In [42]:
# Now try a support vector machine classifier, called LinearSVC in ScikitLearn. 
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

svc_pipelineV1 = Pipeline([
    ('cv', CountVectorizer()), #We are looking at occurences rather than counts with the Bernoulli Naive Bayes algorithm
    ('svc', LinearSVC())
])

svc_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    'svc__C' : [1.0, 10.0, 100.0, 1000.0]
}

svcResult_V1 = grid_search(svc_pipelineV1, svc_paramsV1)
report_scores(svcResult_V1)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.3min finished


---- RESULTS ---- 

The algorithm being optimised was: ('svc', LinearSVC())
The best parameters found were: {'cv__max_features': 3000, 'cv__ngram_range': (1, 1), 'svc__C': 1.0}
Score report:

              precision    recall  f1-score   support

           0       0.67      0.81      0.73      1209
           1       0.90      0.81      0.85      2546

    accuracy                           0.81      3755
   macro avg       0.78      0.81      0.79      3755
weighted avg       0.82      0.81      0.81      3755



In [43]:
# Now try a support vector machine classifier, called LinearSVC in ScikitLearn. 
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

svc_pipelineV2 = Pipeline([
    ('cv', CountVectorizer()), #We are looking at occurences rather than counts with the Bernoulli Naive Bayes algorithm
    ('svc', LinearSVC())
])

svc_paramsV2 = {
    'cv__ngram_range' : [(1,x) for x in range (1,10)],
    'cv__max_features' : [1000 * i for i in range (1,10)],
    'svc__C' : [1.0, 10.0, 100.0]
}

svcResult_V2 = grid_search(svc_pipelineV2, svc_paramsV2)
report_scores(svcResult_V2)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   59.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed:  4.2min finished


---- RESULTS ---- 

The algorithm being optimised was: ('svc', LinearSVC())
The best parameters found were: {'cv__max_features': 8000, 'cv__ngram_range': (1, 1), 'svc__C': 1.0}
Score report:

              precision    recall  f1-score   support

           0       0.61      0.78      0.69      1209
           1       0.88      0.77      0.82      2546

    accuracy                           0.77      3755
   macro avg       0.75      0.77      0.75      3755
weighted avg       0.79      0.77      0.78      3755



In [44]:
# Now try the RandomForest algorithm,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

# do this overnight :/ hehe

rf_pipelineV1 = Pipeline([
    ('cv', CountVectorizer()), #We are looking at occurences rather than counts with the Bernoulli Naive Bayes algorithm
    ('rf', RandomForestClassifier())
])

rf_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    'rf__n_estimators' : [100 * x for x in range (1,6)]
    #'rf__max_depth' : [25, 50, 75, None],
    #'rf__max_features' : ["auto", "sqrt"],
    #'rf__min_samples_leaf' : [1,2,4],
    #'rf_min_samlpes_split' : [2, 5, 10],
    #'rf__bootstrap' : [True, False]
}

rfResult_V1 = grid_search(rf_pipelineV1, rf_paramsV1)
report_scores(rfResult_V1)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed: 26.7min
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed: 28.4min finished


---- RESULTS ---- 

The algorithm being optimised was: ('rf', RandomForestClassifier())
The best parameters found were: {'cv__max_features': 5000, 'cv__ngram_range': (1, 2), 'rf__n_estimators': 300}
Score report:

              precision    recall  f1-score   support

           0       0.33      0.07      0.11      1209
           1       0.68      0.93      0.79      2546

    accuracy                           0.65      3755
   macro avg       0.50      0.50      0.45      3755
weighted avg       0.56      0.65      0.57      3755



In [23]:
# DO THIS OVERNIGHT!!

# Now try the RandomForest algorithm,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

# do this overnight :/ hehe

rf_pipelineV2 = Pipeline([
    ('cv', CountVectorizer()), #We are looking at occurences rather than counts with the Bernoulli Naive Bayes algorithm
    ('rf', RandomForestClassifier())
])

rf_paramsV2 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    'rf__n_estimators' : [100 * x for x in range (1,6)],
    'rf__max_depth' : [25, 50, 75, None],
    'rf__max_features' : ["auto", "sqrt"],
    'rf__min_samples_leaf' : [1,2,4],
    'rf__bootstrap' : [True, False]
}

rfResult_V2 = grid_search(rf_pipelineV2, rf_paramsV2)
report_scores(rfResult_V2)

Fitting 5 folds for each of 6000 candidates, totalling 30000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed: 22.6min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elap

---- RESULTS ---- 

The algorithm being optimised was: ('rf', RandomForestClassifier())
The best parameters found were: {'cv__max_features': 4000, 'cv__ngram_range': (1, 1), 'rf__bootstrap': True, 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__n_estimators': 100}
Score report:

              precision    recall  f1-score   support

           0       0.35      0.03      0.05      1209
           1       0.68      0.97      0.80      2546

    accuracy                           0.67      3755
   macro avg       0.51      0.50      0.43      3755
weighted avg       0.57      0.67      0.56      3755



In [24]:
# DO THIS OVERNIGHT!!

# Now try the RandomForest algorithm,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

# do this overnight :/ hehe

rf_pipelineV3 = Pipeline([
    ('cv', TfidfVectorizer()), #We are looking at occurences rather than counts with the Bernoulli Naive Bayes algorithm
    ('rf', RandomForestClassifier())
])

rf_paramsV3 = {
    'cv__ngram_range' : [(1,x) for x in range (1,10)],
    'rf__n_estimators' : [100 * x for x in range (1,6)],
    'rf__max_depth' : [25, 50, 75, None],
    'rf__max_features' : ["auto", "sqrt"],
    'rf__min_samples_leaf' : [1,2,4],
    'rf__bootstrap' : [True, False]
}

rfResult_V3 = grid_search(rf_pipelineV3, rf_paramsV3)
report_scores(rfResult_V3)

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 35.0min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed: 38.6min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 56.9min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 63.2min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elap

---- RESULTS ---- 

The algorithm being optimised was: ('rf', RandomForestClassifier())
The best parameters found were: {'cv__ngram_range': (1, 9), 'rf__bootstrap': False, 'rf__max_depth': None, 'rf__max_features': 'auto', 'rf__min_samples_leaf': 2, 'rf__n_estimators': 300}
Score report:

              precision    recall  f1-score   support

           0       0.32      0.05      0.08      1209
           1       0.68      0.95      0.79      2546

    accuracy                           0.66      3755
   macro avg       0.50      0.50      0.44      3755
weighted avg       0.56      0.66      0.56      3755



In [55]:
# Now do the multinomial naive bayes algorithm with a tfidf vectorizer instead of a countvectorizer,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNBt_pipelineV1 = Pipeline([
    ('cv', TfidfVectorizer()),
    ('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', MultinomialNB())
])

mNBt_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    'nb__alpha' : [i for i in range (1,4)]
}

mNBtResult_V1 = grid_search(mNBt_pipelineV1, mNBt_paramsV1)
report_scores(mNBtResult_V1)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   47.4s
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:  1.1min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 5000, 'cv__ngram_range': (1, 1), 'nb__alpha': 1}
Score report:

              precision    recall  f1-score   support

           0       0.42      0.78      0.55      1209
           1       0.82      0.49      0.62      2546

    accuracy                           0.58      3755
   macro avg       0.62      0.64      0.58      3755
weighted avg       0.69      0.58      0.59      3755



In [56]:
# Seems to be performing initially better already. Now try a wider parameter search:,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNBt_pipelineV2 = Pipeline([
    ('cv', TfidfVectorizer()),
    ('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', MultinomialNB())
])

mNBt_paramsV2 = {
    'cv__ngram_range' : [(1,x) for x in range (1,11)],
    'cv__max_features' : [1000 * i for i in range (4,10)],
    'nb__alpha' : [i for i in range (1,4)]
}

mNBtResult_V2 = grid_search(mNBt_pipelineV2, mNBt_paramsV2)
report_scores(mNBtResult_V2)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  4.5min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 9000, 'cv__ngram_range': (1, 2), 'nb__alpha': 1}
Score report:

              precision    recall  f1-score   support

           0       0.42      0.78      0.55      1209
           1       0.82      0.49      0.62      2546

    accuracy                           0.58      3755
   macro avg       0.62      0.63      0.58      3755
weighted avg       0.69      0.58      0.59      3755



In [57]:
# Seems to be performing initially better already. Now try a wider parameter search:,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNBt_pipelineV3 = Pipeline([
    ('cv', TfidfVectorizer()),
    ('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', MultinomialNB())
])

mNBt_paramsV3 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (8,15)],
    'nb__alpha' : [i for i in range (1,3)]
}

mNBtResult_V3 = grid_search(mNBt_pipelineV3, mNBt_paramsV3)
report_scores(mNBtResult_V3)

Fitting 5 folds for each of 70 candidates, totalling 350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   57.7s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:  2.0min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 14000, 'cv__ngram_range': (1, 2), 'nb__alpha': 1}
Score report:

              precision    recall  f1-score   support

           0       0.45      0.77      0.56      1209
           1       0.83      0.55      0.66      2546

    accuracy                           0.62      3755
   macro avg       0.64      0.66      0.61      3755
weighted avg       0.71      0.62      0.63      3755



In [58]:
# Seems to be performing initially better already. Now try a wider parameter search:,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNBt_pipelineV4 = Pipeline([
    ('cv', TfidfVectorizer()),
    ('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', MultinomialNB())
])

mNBt_paramsV4 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (13,20)],
    'nb__alpha' : [i for i in range (1,4)]
}

mNBtResult_V4 = grid_search(mNBt_pipelineV4, mNBt_paramsV4)
report_scores(mNBtResult_V4)

Fitting 5 folds for each of 105 candidates, totalling 525 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 525 out of 525 | elapsed:  4.2min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 18000, 'cv__ngram_range': (1, 2), 'nb__alpha': 1}
Score report:

              precision    recall  f1-score   support

           0       0.55      0.76      0.64      1209
           1       0.86      0.71      0.78      2546

    accuracy                           0.72      3755
   macro avg       0.71      0.74      0.71      3755
weighted avg       0.76      0.72      0.73      3755



In [59]:
# Seems to be performing initially better already. Now try a wider parameter search:,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNBt_pipelineV5 = Pipeline([
    ('cv', TfidfVectorizer()),
    ('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', MultinomialNB())
])

mNBt_paramsV5 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (17,24)],
    'nb__alpha' : [i for i in range (1,3)]
}

mNBtResult_V5 = grid_search(mNBt_pipelineV5, mNBt_paramsV5)
report_scores(mNBtResult_V5)

Fitting 5 folds for each of 70 candidates, totalling 350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   52.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:  5.0min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 22000, 'cv__ngram_range': (1, 4), 'nb__alpha': 1}
Score report:

              precision    recall  f1-score   support

           0       0.56      0.74      0.64      1209
           1       0.86      0.72      0.78      2546

    accuracy                           0.73      3755
   macro avg       0.71      0.73      0.71      3755
weighted avg       0.76      0.73      0.74      3755



In [61]:
# Seems to be performing initially better already. Now try a wider parameter search:,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNBt_pipelineV6 = Pipeline([
    ('cv', TfidfVectorizer()),
    ('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', MultinomialNB())
])

mNBt_paramsV6 = {
    'cv__ngram_range' : [(1,x) for x in range (3,11)],
    'cv__max_features' : [1000 * i for i in range (21,27)],
    'nb__alpha' : [i for i in range (1,3)]
}

mNBtResult_V6 = grid_search(mNBt_pipelineV6, mNBt_paramsV6)
report_scores(mNBtResult_V6)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   55.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 12.4min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__max_features': 22000, 'cv__ngram_range': (1, 4), 'nb__alpha': 1}
Score report:

              precision    recall  f1-score   support

           0       0.56      0.74      0.64      1209
           1       0.86      0.72      0.78      2546

    accuracy                           0.73      3755
   macro avg       0.71      0.73      0.71      3755
weighted avg       0.76      0.73      0.74      3755



In [65]:
# Seems to be performing initially better already. Now try a wider parameter search:,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

mNBt_pipelineV7 = Pipeline([
    ('cv', TfidfVectorizer()),
    #('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', MultinomialNB())
])

mNBt_paramsV7 = {
    'cv__ngram_range' : [(1,x) for x in range (9,20)],
    'cv__norm' : ["l1", "l2"],
    'nb__alpha' : [0.25 * i for i in range (0,6)]
}

mNBtResult_V7 = grid_search(mNBt_pipelineV7, mNBt_paramsV7)
report_scores(mNBtResult_V7)

Fitting 5 folds for each of 132 candidates, totalling 660 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 660 out of 660 | elapsed:  3.1min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', MultinomialNB())
The best parameters found were: {'cv__ngram_range': (1, 9), 'cv__norm': 'l2', 'nb__alpha': 0.25}
Score report:

              precision    recall  f1-score   support

           0       0.73      0.73      0.73      1209
           1       0.87      0.87      0.87      2546

    accuracy                           0.83      3755
   macro avg       0.80      0.80      0.80      3755
weighted avg       0.83      0.83      0.83      3755



In [62]:
# Now do the multinomial naive bayes algorithm with a tfidf vectorizer instead of a countvectorizer,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

cNBt_pipelineV1 = Pipeline([
    ('cv', TfidfVectorizer()),
    ('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', ComplementNB())
])

cNBt_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    'nb__alpha' : [i for i in range (1,4)]
}

cNBtResult_V1 = grid_search(cNBt_pipelineV1, cNBt_paramsV1)
report_scores(cNBtResult_V1)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:  1.1min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__max_features': 5000, 'cv__ngram_range': (1, 5), 'nb__alpha': 3}
Score report:

              precision    recall  f1-score   support

           0       0.39      0.85      0.53      1209
           1       0.84      0.36      0.51      2546

    accuracy                           0.52      3755
   macro avg       0.61      0.61      0.52      3755
weighted avg       0.69      0.52      0.51      3755



In [63]:
# Now do the multinomial naive bayes algorithm with a tfidf vectorizer instead of a countvectorizer,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

cNBt_pipelineV2 = Pipeline([
    ('cv', TfidfVectorizer()),
    ('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', ComplementNB())
])

cNBt_paramsV2 = {
    'cv__ngram_range' : [(1,x) for x in range (4,11)],
    'cv__max_features' : [1000 * i for i in range (4,11)],
    'nb__alpha' : [i for i in range (2,5)]
}

cNBtResult_V2 = grid_search(cNBt_pipelineV2, cNBt_paramsV2)
report_scores(cNBtResult_V2)

Fitting 5 folds for each of 147 candidates, totalling 735 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 735 out of 735 | elapsed:  4.3min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__max_features': 10000, 'cv__ngram_range': (1, 7), 'nb__alpha': 3}
Score report:

              precision    recall  f1-score   support

           0       0.51      0.84      0.63      1209
           1       0.89      0.61      0.72      2546

    accuracy                           0.68      3755
   macro avg       0.70      0.72      0.68      3755
weighted avg       0.76      0.68      0.69      3755



In [64]:
# Now do the multinomial naive bayes algorithm with a tfidf vectorizer instead of a countvectorizer,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

cNBt_pipelineV3 = Pipeline([
    ('cv', TfidfVectorizer()),
    ('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', ComplementNB())
])

cNBt_paramsV3 = {
    'cv__ngram_range' : [(1,x) for x in range (6,14)],
    'cv__max_features' : [1000 * i for i in range (9,15)],
    'nb__alpha' : [i for i in range (1,5)]
}

cNBtResult_V3 = grid_search(cNBt_pipelineV3, cNBt_paramsV3)
report_scores(cNBtResult_V3)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  7.8min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__max_features': 13000, 'cv__ngram_range': (1, 11), 'nb__alpha': 3}
Score report:

              precision    recall  f1-score   support

           0       0.66      0.83      0.74      1209
           1       0.91      0.80      0.85      2546

    accuracy                           0.81      3755
   macro avg       0.78      0.81      0.79      3755
weighted avg       0.83      0.81      0.81      3755



In [67]:
# TRY WITHOUTTT SPECIFYING MAX FEATURES

# Seems to be performing initially better already. Now try a wider parameter search:,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

# (its not actually version 7 im just too lazy to think of another name yet)

cNBt_pipelineV4 = Pipeline([
    ('cv', TfidfVectorizer()),
    #('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', ComplementNB())
])

cNBt_paramsV4 = {
    'cv__ngram_range' : [(1,x) for x in range (5,15)],
    'cv__norm' : ["l1", "l2"],
    'nb__alpha' : [0.25 * i for i in range (0,6)]
}

cNBtResult_V4 = grid_search(cNBt_pipelineV4, cNBt_paramsV4)
report_scores(cNBtResult_V4)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.5min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__ngram_range': (1, 13), 'cv__norm': 'l2', 'nb__alpha': 1.0}
Score report:

              precision    recall  f1-score   support

           0       0.67      0.80      0.73      1209
           1       0.89      0.81      0.85      2546

    accuracy                           0.81      3755
   macro avg       0.78      0.80      0.79      3755
weighted avg       0.82      0.81      0.81      3755



In [28]:
# TRY WITHOUTTT SPECIFYING MAX FEATURES

# Seems to be performing initially better already. Now try a wider parameter search:,
# trying ngrams from (1,1) to (1,5), 1000 max features in the vocabulary to 5000 max features
# in increments of 1000, and try values of 1, 2 and 3 for alpha

# (its not actually version 7 im just too lazy to think of another name yet)

cNBt_pipelineV5 = Pipeline([
    ('cv', TfidfVectorizer()),
    #('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('nb', ComplementNB())
])

cNBt_paramsV5 = {
    'cv__ngram_range' : [(1,x) for x in range (11,18)],
    'cv__norm' : ["l1", "l2"],
    'nb__alpha' : [0.25 * i for i in range (1,4 * 3)]
}

cNBtResult_V5 = grid_search(cNBt_pipelineV5, cNBt_paramsV5)
report_scores(cNBtResult_V5)

Fitting 5 folds for each of 154 candidates, totalling 770 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 770 out of 770 | elapsed:  3.6min finished


---- RESULTS ---- 

The algorithm being optimised was: ('nb', ComplementNB())
The best parameters found were: {'cv__ngram_range': (1, 15), 'cv__norm': 'l2', 'nb__alpha': 1.0}
Score report:

              precision    recall  f1-score   support

           0       0.67      0.80      0.73      1209
           1       0.89      0.81      0.85      2546

    accuracy                           0.81      3755
   macro avg       0.78      0.81      0.79      3755
weighted avg       0.82      0.81      0.81      3755



In [151]:
from sklearn.svm import *
from sklearn.linear_model import *

In [None]:
psvc_pipelineV1 = Pipeline([
    ('cv', CountVectorizer()),
    #('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('psvc', SVC())
])

psvc_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (11,18)],
    'cv__norm' : ["l1", "l2"],
    'psvc__alpha' : [0.25 * i for i in range (1,4 * 3)]
}

psvcResult_V1 = grid_search(psvc_pipelineV1, psvc_paramsV1)
report_scores(psvcResult_V1)

In [152]:
sgd_pipelineV1 = Pipeline([
    ('cv', CountVectorizer()),
    #('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('sgd', SGDClassifier(n_jobs = -1, penalty = "elasticnet"))
])

sgd_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    'cv__max_features' : [1000 * i for i in range (1,6)],
    "sgd__alpha" : [10 ** x for x in range (-4, 1)],
    "sgd__l1_ratio" : [0, 0.25, 0.5, 0.75, 1]
    
}

sgdResult_V1 = grid_search(sgd_pipelineV1, sgd_paramsV1)
report_scores(sgdResult_V1)

Fitting 5 folds for each of 625 candidates, totalling 3125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 3125 out of 3125 | ela

---- RESULTS ---- 

The algorithm being optimised was: ('sgd', SGDClassifier(n_jobs=-1, penalty='elasticnet'))
The best parameters found were: {'cv__max_features': 5000, 'cv__ngram_range': (1, 1), 'sgd__alpha': 0.0001, 'sgd__l1_ratio': 0.75}
Score report:

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1209
           1       0.89      0.89      0.89      2546

    accuracy                           0.86      3755
   macro avg       0.83      0.84      0.84      3755
weighted avg       0.86      0.86      0.86      3755

f1 score: 0.8937340404635631
precision: 0.8939096267190569
recall: 0.8935585231736056


In [154]:
sgd_pipelineV2 = Pipeline([
    ('cv', CountVectorizer()),
    #('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('sgd', SGDClassifier(n_jobs = -1, penalty = "elasticnet"))
])

sgd_paramsV2 = {
    'cv__ngram_range' : [(1,x) for x in range (1,7)],
    'cv__max_features' : [1000 * i for i in range (4,11)],
    "sgd__alpha" : [10 ** x for x in range (-5, -2)],
    "sgd__l1_ratio" : [0, 0.25, 0.5, 0.75, 1]
    
}

sgdResult_V2 = grid_search(sgd_pipelineV2, sgd_paramsV2)
report_scores(sgdResult_V2)

Fitting 5 folds for each of 630 candidates, totalling 3150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 3150 out of 3150 | ela

---- RESULTS ---- 

The algorithm being optimised was: ('sgd', SGDClassifier(n_jobs=-1, penalty='elasticnet'))
The best parameters found were: {'cv__max_features': 7000, 'cv__ngram_range': (1, 1), 'sgd__alpha': 0.0001, 'sgd__l1_ratio': 0.5}
Score report:

              precision    recall  f1-score   support

           0       0.73      0.75      0.74      1209
           1       0.88      0.87      0.87      2546

    accuracy                           0.83      3755
   macro avg       0.81      0.81      0.81      3755
weighted avg       0.83      0.83      0.83      3755

f1 score: 0.8743320799525034
precision: 0.8811328280813722
recall: 0.8676355066771406


In [155]:
sgdt_pipelineV1 = Pipeline([
    ('cv', TfidfVectorizer()),
    #('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('sgd', SGDClassifier(n_jobs = -1, penalty = "elasticnet"))
])

sgdt_paramsV1 = {
    'cv__ngram_range' : [(1,x) for x in range (1,6)],
    #'cv__max_features' : [1000 * i for i in range (1,6)],
    "sgd__alpha" : [10 ** x for x in range (-5, -2)],
    "sgd__l1_ratio" : [0, 0.25, 0.5, 0.75, 1]
    
}

sgdtResult_V1 = grid_search(sgdt_pipelineV1, sgdt_paramsV1)
report_scores(sgdtResult_V1)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:   46.3s finished


---- RESULTS ---- 

The algorithm being optimised was: ('sgd', SGDClassifier(n_jobs=-1, penalty='elasticnet'))
The best parameters found were: {'cv__ngram_range': (1, 1), 'sgd__alpha': 0.0001, 'sgd__l1_ratio': 0.5}
Score report:

              precision    recall  f1-score   support

           0       0.72      0.71      0.72      1209
           1       0.86      0.87      0.87      2546

    accuracy                           0.82      3755
   macro avg       0.79      0.79      0.79      3755
weighted avg       0.82      0.82      0.82      3755

f1 score: 0.8660399529964747
precision: 0.863671875
recall: 0.868421052631579


In [115]:
#MultinomialNB with optimal parameters confusion matrix 

mnb_CM_pipe = Pipeline([
    ("cv", CountVectorizer(max_features = 14000, ngram_range = (1,8))), 
     ("nb", MultinomialNB(alpha = 4))
])

mnb_CM_pipe.fit(simplified_training.tokens, simplified_training.label.astype("int"))
tup = simplified_testing.label.astype("int"), mnb_CM_pipe.predict(simplified_testing.tokens) 
print(classification_report(tup[0], tup[1]))
print("CONFUSION MATRIX:\n\n", confusion_matrix(tup[0], tup[1]))

              precision    recall  f1-score   support

           0       0.70      0.79      0.74      1209
           1       0.89      0.84      0.87      2546

    accuracy                           0.82      3755
   macro avg       0.80      0.81      0.80      3755
weighted avg       0.83      0.82      0.83      3755

CONFUSION MATRIX:

 [[ 951  258]
 [ 408 2138]]


In [117]:
#ComplementNB with optimal parameters confusion matrix 

cnb_CM_pipe = Pipeline([
    ("cv", CountVectorizer(max_features = 14000, ngram_range = (1,11))), 
     ("nb", ComplementNB(alpha = 2))
])

cnb_CM_pipe.fit(simplified_training.tokens, simplified_training.label.astype("int"))
tup = simplified_testing.label.astype("int"), cnb_CM_pipe.predict(simplified_testing.tokens) 
print(classification_report(tup[0], tup[1]))
print("CONFUSION MATRIX:\n\n", confusion_matrix(tup[0], tup[1]))

              precision    recall  f1-score   support

           0       0.49      0.85      0.62      1209
           1       0.89      0.58      0.70      2546

    accuracy                           0.66      3755
   macro avg       0.69      0.71      0.66      3755
weighted avg       0.76      0.66      0.67      3755

CONFUSION MATRIX:

 [[1022  187]
 [1074 1472]]


In [118]:
#BernoulliNB with optimal parameters confusion matrix 

bnb_CM_pipe = Pipeline([
    ("cv", CountVectorizer(max_features = 5000, ngram_range = (1,2))), 
     ("nb", BernoulliNB(alpha = 2))
])

bnb_CM_pipe.fit(simplified_training.tokens, simplified_training.label.astype("int"))
tup = simplified_testing.label.astype("int"), bnb_CM_pipe.predict(simplified_testing.tokens) 
print(classification_report(tup[0], tup[1]))
print("CONFUSION MATRIX:\n\n", confusion_matrix(tup[0], tup[1]))

              precision    recall  f1-score   support

           0       0.72      0.71      0.72      1209
           1       0.86      0.87      0.87      2546

    accuracy                           0.82      3755
   macro avg       0.79      0.79      0.79      3755
weighted avg       0.82      0.82      0.82      3755

CONFUSION MATRIX:

 [[ 863  346]
 [ 331 2215]]


In [122]:
#LinearSVC with optimal parameters confusion matrix 

svc_CM_pipe = Pipeline([
    ("cv", CountVectorizer(max_features = 3000, ngram_range = (1,1))), 
     ("nb", LinearSVC(C = 1))
])

svc_CM_pipe.fit(simplified_training.tokens, simplified_training.label.astype("int"))
tup = simplified_testing.label.astype("int"), svc_CM_pipe.predict(simplified_testing.tokens) 
print(classification_report(tup[0], tup[1]))
print("CONFUSION MATRIX:\n\n", confusion_matrix(tup[0], tup[1]))

              precision    recall  f1-score   support

           0       0.65      0.82      0.73      1209
           1       0.90      0.79      0.84      2546

    accuracy                           0.80      3755
   macro avg       0.78      0.81      0.79      3755
weighted avg       0.82      0.80      0.81      3755

CONFUSION MATRIX:

 [[ 996  213]
 [ 534 2012]]


In [123]:
#Random Forest with optimal parameters confusion matrix 

rf_CM_pipe = Pipeline([
    ("cv", CountVectorizer(max_features = 5000, ngram_range = (1,2))), 
     ("nb", RandomForestClassifier(n_estimators = 300))
])

rf_CM_pipe.fit(simplified_training.tokens, simplified_training.label.astype("int"))
tup = simplified_testing.label.astype("int"), rf_CM_pipe.predict(simplified_testing.tokens) 
print(classification_report(tup[0], tup[1]))
print("CONFUSION MATRIX:\n\n", confusion_matrix(tup[0], tup[1]))

              precision    recall  f1-score   support

           0       0.36      0.06      0.11      1209
           1       0.68      0.94      0.79      2546

    accuracy                           0.66      3755
   macro avg       0.52      0.50      0.45      3755
weighted avg       0.58      0.66      0.57      3755

CONFUSION MATRIX:

 [[  78 1131]
 [ 141 2405]]


In [127]:
#Logistic Regression with optimal parameters confusion matrix 

lr_CM_pipe = Pipeline([
    ("cv", CountVectorizer(max_features = 11000, ngram_range = (1,1))), 
    ("nb", LogisticRegression(C = 1))
])

lr_CM_pipe.fit(simplified_training.tokens, simplified_training.label.astype("int"))
tup = simplified_testing.label.astype("int"), lr_CM_pipe.predict(simplified_testing.tokens) 
print(classification_report(tup[0], tup[1]))
print("CONFUSION MATRIX:\n\n", confusion_matrix(tup[0], tup[1]))

              precision    recall  f1-score   support

           0       0.58      0.77      0.66      1209
           1       0.87      0.73      0.80      2546

    accuracy                           0.75      3755
   macro avg       0.73      0.75      0.73      3755
weighted avg       0.78      0.75      0.75      3755

CONFUSION MATRIX:

 [[ 934  275]
 [ 677 1869]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [129]:
#Multinomial Naive Bayes using TfidfVectorizer with optimal parameters confusion matrix

mnbt_CM_pipe = Pipeline([
    ("cv", TfidfVectorizer(ngram_range = (1,9))), 
    ("nb", MultinomialNB(alpha = 0.25))
])

mnbt_CM_pipe.fit(simplified_training.tokens, simplified_training.label.astype("int"))
tup = simplified_testing.label.astype("int"), mnbt_CM_pipe.predict(simplified_testing.tokens) 
print(classification_report(tup[0], tup[1]))
print("CONFUSION MATRIX:\n\n", confusion_matrix(tup[0], tup[1]))

              precision    recall  f1-score   support

           0       0.73      0.73      0.73      1209
           1       0.87      0.87      0.87      2546

    accuracy                           0.82      3755
   macro avg       0.80      0.80      0.80      3755
weighted avg       0.82      0.82      0.82      3755

CONFUSION MATRIX:

 [[ 881  328]
 [ 330 2216]]


In [130]:
#Multinomial Naive Bayes using TfidfVectorizer with optimal parameters confusion matrix

cnbt_CM_pipe = Pipeline([
    ("cv", TfidfVectorizer(ngram_range = (1,15))), 
    ("nb", ComplementNB(alpha = 1))
])

cnbt_CM_pipe.fit(simplified_training.tokens, simplified_training.label.astype("int"))
tup = simplified_testing.label.astype("int"), cnbt_CM_pipe.predict(simplified_testing.tokens) 
print(classification_report(tup[0], tup[1]))
print("CONFUSION MATRIX:\n\n", confusion_matrix(tup[0], tup[1]))

              precision    recall  f1-score   support

           0       0.67      0.80      0.73      1209
           1       0.89      0.81      0.85      2546

    accuracy                           0.81      3755
   macro avg       0.78      0.81      0.79      3755
weighted avg       0.82      0.81      0.81      3755

CONFUSION MATRIX:

 [[ 964  245]
 [ 472 2074]]


In [19]:
from sklearn.ensemble import *

In [31]:
# NOW LETS TRY SOME ENSEMBLE LEARNING!

# FIRST LETS TRY Random Forest, LinearSVC, LogisticRegression and the Multinomial NB using tfidf

svc_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,1), max_features = 23000)),
    ("svc", LinearSVC(C = 1))
])

bnb_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,2), max_features = 5000)),
    ("bnb", BernoulliNB(alpha = 2))
])

mnb_pipe = Pipeline([
    ("tf", TfidfVectorizer(ngram_range = (1,9))),
    ("mnb", MultinomialNB(alpha = 0.25))
])

sgd_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,1), max_features = 5000)), 
    ("sgdcf", SGDClassifier(alpha = 0.0001, l1_ratio = 0.75, penalty = "elasticnet", random_state = 1))
])

rf_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,1), max_features = 18000)),
    ("rf", RandomForestClassifier(n_estimators = 800, max_depth = None, max_features = "auto", bootstrap = True, random_state = 1, n_jobs = -1))
])

vcf1 = VotingClassifier(estimators = [
    ("rf", rf_pipe),
    ("svc", svc_pipe),
    #("bnb", bnb_pipe), #this one is is more opposite to random forest so comment out lsvc for now
    #("lr", lr_pipe),
    ("mnbt", mnb_pipe), #use the naive bayes classifier with tfidf as it has equal precision and recall
    ("sgd", sgd_pipe)
], voting = "hard")

#vcf1.fit(simplified_training.tokens, simplified_training.label.astype("int"))
#print(classification_report(simplified_testing.label.astype("int"), vcf1.predict(simplified_testing.tokens)))

In [32]:
def powerset(s):
    x = len(s)
    output = []
    for i in range(1 << x):
        subset = [s[j] for j in range(x) if (i & (1 << j))]
        #we are only interested in pairs so disregard singletons
        if(len(subset) > 1):
            output.append(subset)
    return output

In [33]:
group = powerset([svc_pipe, bnb_pipe, mnb_pipe, sgd_pipe, rf_pipe])
group

[[Pipeline(steps=[('cv', CountVectorizer(max_features=23000)),
                  ('svc', LinearSVC(C=1))]),
  Pipeline(steps=[('cv', CountVectorizer(max_features=5000, ngram_range=(1, 2))),
                  ('bnb', BernoulliNB(alpha=2))])],
 [Pipeline(steps=[('cv', CountVectorizer(max_features=23000)),
                  ('svc', LinearSVC(C=1))]),
  Pipeline(steps=[('tf', TfidfVectorizer(ngram_range=(1, 9))),
                  ('mnb', MultinomialNB(alpha=0.25))])],
 [Pipeline(steps=[('cv', CountVectorizer(max_features=5000, ngram_range=(1, 2))),
                  ('bnb', BernoulliNB(alpha=2))]),
  Pipeline(steps=[('tf', TfidfVectorizer(ngram_range=(1, 9))),
                  ('mnb', MultinomialNB(alpha=0.25))])],
 [Pipeline(steps=[('cv', CountVectorizer(max_features=23000)),
                  ('svc', LinearSVC(C=1))]),
  Pipeline(steps=[('cv', CountVectorizer(max_features=5000, ngram_range=(1, 2))),
                  ('bnb', BernoulliNB(alpha=2))]),
  Pipeline(steps=[('tf', TfidfVector

In [87]:
names = []

for pipeline in group:
    name = pipeline.steps[1][0]
    names.append(name)
    print("the name is:",name)
    
print("the names are:",names)

the name is: svc
the name is: mnb
the names are: ['svc', 'mnb']


In [34]:
def try_all_voting_combinations():
    possibilities = powerset([svc_pipe, bnb_pipe, mnb_pipe, sgd_pipe, rf_pipe])
    j = 0
    
    #store the combinations and their respective scores in the same order so it can be easily copied into a spreadsheet
    combinations = []
    f1s = []
    precisions = []
    recalls = []
    
    for group in possibilities:
        print("-- COMBINATION",str(j),"--\n")
        model_names = []
        for pipeline in group:
            #get the names of the models in the combinations so we can print it
            name = pipeline.steps[1][0]
            model_names.append(name)
        for_voter = []
        for i in range(len(group)):
            for_voter.append((str(i), group[i]))
        vcf = VotingClassifier(estimators = for_voter, voting = "hard")
        vcf.fit(simplified_training.tokens, simplified_training.label.astype("int"))
        true = simplified_testing.label.astype("int")
        predictions = vcf.predict(simplified_testing.tokens)
        print("combination tried:",model_names)
        combinations.append(model_names)
        f1 = f1_score(true, predictions)
        f1s.append(f1)
        precision = precision_score(true, predictions)
        precisions.append(precision)
        recall = recall_score(true, predictions)
        recalls.append(recall)
        print("f1 score:",f1,"| precision:",precision,"| recall:",recall,"\n")
        j = j + 1
        
    return combinations, f1s, precisions, recalls

In [35]:
possibilities = powerset([svc_pipe, bnb_pipe, mnb_pipe, sgd_pipe, rf_pipe])
len(possibilities)

26

In [36]:
combinations, f1s, precisions, recalls = try_all_voting_combinations()

-- COMBINATION 0 --

combination tried: ['svc', 'bnb']
f1 score: 0.775224337929525 | precision: 0.8754325259515571 | recall: 0.6956009426551453 

-- COMBINATION 1 --

combination tried: ['svc', 'mnb']
f1 score: 0.75284661754856 | precision: 0.8722193481634765 | recall: 0.6622152395915161 

-- COMBINATION 2 --

combination tried: ['bnb', 'mnb']
f1 score: 0.831046487814868 | precision: 0.8682071031236628 | recall: 0.7969363707776905 

-- COMBINATION 3 --

combination tried: ['svc', 'bnb', 'mnb']
f1 score: 0.8700098328416913 | precision: 0.8712091374556912 | recall: 0.8688138256087982 

-- COMBINATION 4 --

combination tried: ['svc', 'sgdcf']
f1 score: 0.7484333034914951 | precision: 0.8699271592091571 | recall: 0.6567164179104478 

-- COMBINATION 5 --

combination tried: ['bnb', 'sgdcf']
f1 score: 0.8089413749472797 | precision: 0.8734061930783242 | recall: 0.753338570306363 

-- COMBINATION 6 --

combination tried: ['svc', 'bnb', 'sgdcf']
f1 score: 0.8524126791843327 | precision: 0.8770

In [37]:
tup = combinations, f1s, precisions, recalls

In [45]:
recalls

[0.6956009426551453,
 0.6622152395915161,
 0.7969363707776905,
 0.8688138256087982,
 0.6567164179104478,
 0.753338570306363,
 0.8291437549096622,
 0.7776904948939513,
 0.8373919874312648,
 0.8880597014925373,
 0.7914375490966221,
 0.7187745483110762,
 0.8550667714061273,
 0.8813825608798115,
 0.8586017282010998,
 0.9175176747839748,
 0.9340141398271798,
 0.8609583660644148,
 0.8181461115475255,
 0.882560879811469,
 0.9245875883739199,
 0.8275726630007856,
 0.9021995286724274,
 0.835820895522388,
 0.8802042419481539,
 0.9131971720345641]

In [38]:
fitted = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,1), max_features = 18000)),
    ("rf", RandomForestClassifier(n_estimators = 800, max_depth = None, max_features = "auto", bootstrap = True, random_state = 1))
])
fitted.fit(simplified_training.tokens, simplified_training.label.astype("int"))
print(classification_report(simplified_testing.label.astype("int"), fitted.predict(simplified_testing.tokens)))

              precision    recall  f1-score   support

           0       0.47      0.06      0.10      1209
           1       0.68      0.97      0.80      2546

    accuracy                           0.68      3755
   macro avg       0.58      0.51      0.45      3755
weighted avg       0.61      0.68      0.58      3755



In [28]:
fitted2 = Pipeline([
        ("cv", CountVectorizer(ngram_range = (1,7), max_features = 23000)), 
        ("sgdcf", SGDClassifier(alpha = 0.0001, l1_ratio = 0.6, penalty = "elasticnet", random_state = 1))
    ]).fit(simplified_training.tokens, simplified_training.label.astype("int"))
print(classification_report(simplified_testing.label.astype("int"), fitted2.predict(simplified_testing.tokens)))

              precision    recall  f1-score   support

           0       0.81      0.77      0.79      1209
           1       0.89      0.91      0.90      2546

    accuracy                           0.87      3755
   macro avg       0.85      0.84      0.85      3755
weighted avg       0.87      0.87      0.87      3755



In [None]:
fitted3 = 

In [37]:
vcf = VotingClassifier([
    ("f2", fitted2),
    ("f", fitted)
], voting = "hard", n_jobs = -1)
vcf.fit(simplified_training.tokens, simplified_training.label.astype("int"))
print(classification_report(simplified_testing.label.astype("int"), vcf.predict(simplified_testing.tokens)))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      1209
           1       0.89      0.90      0.90      2546

    accuracy                           0.86      3755
   macro avg       0.84      0.84      0.84      3755
weighted avg       0.86      0.86      0.86      3755



In [40]:
sgd_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,7), max_features = 23000)), 
    ("sgdcf", SGDClassifier(alpha = 0.0001, l1_ratio = 0.6, penalty = "elasticnet", random_state = 1))
])

svc_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,1), max_features = 3000)),
    ("svc", LinearSVC(C = 1))
])

mnb_pipe = Pipeline([
    ("tf", TfidfVectorizer(ngram_range = (1,9))),
    ("mnb", MultinomialNB(alpha = 0.25))
])

bnb_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,2), max_features = 5000, binary = True)),
    ("bnb", BernoulliNB(alpha = 2))
])

rf_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,1), max_features = 18000)),
    ("rf", RandomForestClassifier(n_estimators = 800, max_depth = None, max_features = "auto", bootstrap = True, random_state = 1))
])

vcf3 = VotingClassifier([
    ("sdg", sgd_pipe),
    ("svc", svc_pipe),
    ("mnb", mnb_pipe),
    ("bnb", bnb_pipe),
    ("rf", rf_pipe)
], voting = "hard", n_jobs = -1)

vcf3.fit(simplified_training.tokens, simplified_training.label.astype("int"))
print(classification_report(simplified_testing.label.astype("int"), vcf3.predict(simplified_testing.tokens)))

              precision    recall  f1-score   support

           0       0.82      0.74      0.78      1209
           1       0.88      0.92      0.90      2546

    accuracy                           0.87      3755
   macro avg       0.85      0.83      0.84      3755
weighted avg       0.86      0.87      0.86      3755



In [43]:
vcf4 = VotingClassifier([
    ("sdg", sgd_pipe),
    ("svc", svc_pipe),
    ("mnb", mnb_pipe),
    #("bnb", bnb_pipe),
    ("rf", rf_pipe)
], voting = "hard", n_jobs = -1)

vcf4.fit(simplified_training.tokens, simplified_training.label.astype("int"))
print(classification_report(simplified_testing.label.astype("int"), vcf4.predict(simplified_testing.tokens)))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1209
           1       0.90      0.90      0.90      2546

    accuracy                           0.86      3755
   macro avg       0.84      0.84      0.84      3755
weighted avg       0.86      0.86      0.86      3755



In [36]:
print(classification_report(simplified_testing.label.astype("int"), vcf.predict(simplified_testing.tokens)))

              precision    recall  f1-score   support

           0       0.78      0.77      0.78      1209
           1       0.89      0.90      0.90      2546

    accuracy                           0.86      3755
   macro avg       0.84      0.84      0.84      3755
weighted avg       0.86      0.86      0.86      3755



In [150]:
# NOW LETS TRY SOME ENSEMBLE LEARNING!

# FIRST LETS TRY Random Forest, LinearSVC, LogisticRegression and the Multinomial NB using tfidf

vcf2 = VotingClassifier(estimators = [
    ("rf", rf_CM_pipe),
    #("svc", svc_CM_pipe),
    #("cnb", cnb_CM_pipe), #this one is is more opposite to random forest so comment out lsvc for now
    ("lr", lr_CM_pipe),
    ("mnbt", mnbt_CM_pipe), #use the naive bayes classifier with tfidf as it has equal precision and recall
], voting = "hard", n_jobs = -1)

vcf2.fit(simplified_training.tokens, simplified_training.label.astype("int"))
print(classification_report(simplified_testing.label.astype("int"), vcf2.predict(simplified_testing.tokens)))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74      1209
           1       0.87      0.88      0.88      2546

    accuracy                           0.83      3755
   macro avg       0.81      0.81      0.81      3755
weighted avg       0.83      0.83      0.83      3755



In [174]:
from sklearn.model_selection import *

In [186]:
def random_search(pipeline, params):
    
    cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 1, random_state = 1)
    clf = RandomizedSearchCV(pipeline, params, scoring = "f1", verbose = 6, n_jobs = -1, cv = cv, n_iter = 600)
    clf.fit(simplified_training.tokens, simplified_training.label.astype("int"))
    return clf

In [183]:
rf_pipeline = Pipeline ([
    ("cv", CountVectorizer()),
    ("rf", RandomForestClassifier(n_jobs = -1))
])

rf_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,20)],
    'cv__max_features' : [1000 * i for i in range (1,30)],
    'rf__n_estimators' : [100 * x for x in range (1,11)],
    'rf__max_depth' : [25, 50, 75, None],
    'rf__max_features' : ["auto", "sqrt"],
    'rf__bootstrap' : [True, False]
}

res = random_search(rf_pipeline, rf_params)
report_scores(res)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 28.9min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed: 44.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 58.9min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 74.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 97.4min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 110.0min finished


---- RESULTS ---- 

The algorithm being optimised was: ('rf', RandomForestClassifier(n_jobs=-1))
The best parameters found were: {'rf__n_estimators': 800, 'rf__max_features': 'auto', 'rf__max_depth': None, 'rf__bootstrap': True, 'cv__ngram_range': (1, 1), 'cv__max_features': 18000}
Score report:

              precision    recall  f1-score   support

           0       0.43      0.05      0.09      1209
           1       0.68      0.97      0.80      2546

    accuracy                           0.67      3755
   macro avg       0.55      0.51      0.44      3755
weighted avg       0.60      0.67      0.57      3755

f1 score: 0.8003246753246753
precision: 0.6820697288323188
recall: 0.9681853888452474


In [193]:
lr_pipeline = Pipeline([
    ("cv", CountVectorizer()),
    ("lr", LogisticRegression(penalty = "l2", n_jobs = -1))
])

lr_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,21)],
    'cv__max_features' : [1000 * i for i in range (1,31)],
    'lr__C' : [10 ** i for i in range (-6, 6)],
    'lr__dual' : [True, False]
}

res = random_search(lr_pipeline, lr_params)
report_scores(res)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | ela

---- RESULTS ---- 

The algorithm being optimised was: ('lr', LogisticRegression(n_jobs=-1))
The best parameters found were: {'lr__dual': False, 'lr__C': 1, 'cv__ngram_range': (1, 3), 'cv__max_features': 22000}
Score report:

              precision    recall  f1-score   support

           0       0.64      0.72      0.68      1209
           1       0.86      0.81      0.83      2546

    accuracy                           0.78      3755
   macro avg       0.75      0.76      0.76      3755
weighted avg       0.79      0.78      0.78      3755

f1 score: 0.8323886639676115
precision: 0.858813700918964
recall: 0.8075412411626081


In [192]:
svc_pipeline = Pipeline([
    ("cv", CountVectorizer()),
    ("svc", LinearSVC())
])

svc_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,30)],
    'cv__max_features' : [1000 * i for i in range (1,30)],
    'svc__C' : [10 ** i for i in range (-6,4)]
}

res = random_search(svc_pipeline, svc_params)
report_scores(res)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   53.3s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | ela

---- RESULTS ---- 

The algorithm being optimised was: ('svc', LinearSVC())
The best parameters found were: {'svc__C': 0.1, 'cv__ngram_range': (1, 2), 'cv__max_features': 28000}
Score report:

              precision    recall  f1-score   support

           0       0.68      0.71      0.70      1209
           1       0.86      0.84      0.85      2546

    accuracy                           0.80      3755
   macro avg       0.77      0.78      0.77      3755
weighted avg       0.80      0.80      0.80      3755

f1 score: 0.851697438951757
precision: 0.8610999598554797
recall: 0.8424980361351139


In [213]:
mnb_pipeline = Pipeline([
    ("cv", TfidfVectorizer()),
    ("mnb", MultinomialNB())
])

mnb_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,30)],
    #see if it is worth not choosing max features or if it is
    'cv__max_features' : [1000 * i for i in range (1,30)] + [None],
    'mnb__alpha' : [0.5 * i for i in range (2 * 0, 2 * 6)] #np.linspace(0.0, 5.0, num = 10),
}

res = random_search(mnb_pipeline, mnb_params)
report_scores(res)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed: 12.3min finished


---- RESULTS ---- 

The algorithm being optimised was: ('mnb', MultinomialNB())
The best parameters found were: {'mnb__alpha': 0.5, 'cv__ngram_range': (1, 2), 'cv__max_features': 21000}
Score report:

              precision    recall  f1-score   support

           0       0.43      0.78      0.56      1209
           1       0.83      0.52      0.64      2546

    accuracy                           0.60      3755
   macro avg       0.63      0.65      0.60      3755
weighted avg       0.70      0.60      0.61      3755

f1 score: 0.6384726921217979
precision: 0.8297738693467337
recall: 0.51885310290652


In [208]:
mnb_pipeline = Pipeline([
    ("cv", TfidfVectorizer()),
    ("mnb", MultinomialNB())
])

mnb_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,30)],
    #see if it is worth not choosing max features or if it is
    'mnb__alpha' : [0.5 * i for i in range (2 * 0, 2 * 6)] #np.linspace(0.0, 5.0, num = 10),
}

res = random_search(mnb_pipeline, mnb_params)
report_scores(res)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 348 candidates, totalling 1740 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1740 out of 1740 | elapsed:  7.5min finished


---- RESULTS ---- 

The algorithm being optimised was: ('mnb', MultinomialNB())
The best parameters found were: {'mnb__alpha': 0.5, 'cv__ngram_range': (1, 2)}
Score report:

              precision    recall  f1-score   support

           0       0.45      0.76      0.57      1209
           1       0.83      0.57      0.68      2546

    accuracy                           0.63      3755
   macro avg       0.64      0.66      0.62      3755
weighted avg       0.71      0.63      0.64      3755

f1 score: 0.6750759168418594
precision: 0.8328530259365994
recall: 0.5675569520816968


In [209]:
mnb_pipeline = Pipeline([
    ("cv", TfidfVectorizer()),
    ("mnb", MultinomialNB())
])

mnb_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,30)],
    'cv__max_features' : [1000 * i for i in range (1,30)],
    'mnb__alpha' : [0.5 * i for i in range (2 * 0, 2 * 6)] #np.linspace(0.0, 5.0, num = 10),
}

res = random_search(mnb_pipeline, mnb_params)
report_scores(res)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | ela

---- RESULTS ---- 

The algorithm being optimised was: ('mnb', MultinomialNB())
The best parameters found were: {'mnb__alpha': 0.5, 'cv__ngram_range': (1, 2), 'cv__max_features': 24000}
Score report:

              precision    recall  f1-score   support

           0       0.43      0.77      0.55      1209
           1       0.82      0.52      0.64      2546

    accuracy                           0.60      3755
   macro avg       0.63      0.64      0.60      3755
weighted avg       0.70      0.60      0.61      3755

f1 score: 0.6390760346487007
precision: 0.8248447204968944
recall: 0.5216025137470542


In [210]:
sgd_pipeline = Pipeline([
    ("cv", CountVectorizer()),
    ("sgd", SGDClassifier(penalty = "elasticnet", max_iter = 1500))
])

sgd_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,30)],
    'cv__max_features' : [1000 * i for i in range (1,30)],
    'sgd__alpha' : [10 ** i for i in range (-6, 4)],
    'sgd__l1_ratio' : [0.1 * i for i in range (0, 11)] #np.linspace(0.0, 1.0, num = 10)
}

res = random_search(sgd_pipeline, sgd_params)
report_scores(res)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | ela

---- RESULTS ---- 

The algorithm being optimised was: ('sgd', SGDClassifier(max_iter=1500, penalty='elasticnet'))
The best parameters found were: {'sgd__l1_ratio': 0.6000000000000001, 'sgd__alpha': 0.0001, 'cv__ngram_range': (1, 7), 'cv__max_features': 23000}
Score report:

              precision    recall  f1-score   support

           0       0.77      0.77      0.77      1209
           1       0.89      0.89      0.89      2546

    accuracy                           0.85      3755
   macro avg       0.83      0.83      0.83      3755
weighted avg       0.85      0.85      0.85      3755

f1 score: 0.892289582107122
precision: 0.8914151313210505
recall: 0.8931657501963864


In [211]:
cnb_pipeline = Pipeline([
    ("cv", CountVectorizer()),
    ("cnb", ComplementNB())
])

cnb_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,30)],
    'cv__max_features' : [1000 * i for i in range (1,30)],
    'cnb__alpha' : [0.5 * i for i in range (2 * 0, 2 * 6)] #np.linspace (0.0, 5.0, num = 10)
}

res = random_search(cnb_pipeline, cnb_params)
report_scores(res)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | ela

---- RESULTS ---- 

The algorithm being optimised was: ('cnb', ComplementNB())
The best parameters found were: {'cv__ngram_range': (1, 4), 'cv__max_features': 25000, 'cnb__alpha': 1.0}
Score report:

              precision    recall  f1-score   support

           0       0.36      0.86      0.51      1209
           1       0.81      0.28      0.41      2546

    accuracy                           0.47      3755
   macro avg       0.59      0.57      0.46      3755
weighted avg       0.66      0.47      0.44      3755

f1 score: 0.41425650014607074
precision: 0.8084378563283923
recall: 0.2784760408483896


In [212]:
bnb_pipeline = Pipeline([
    ("cv", CountVectorizer(binary = True)),
    ("bnb", BernoulliNB())
])

bnb_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,30)],
    'cv__max_features' : [1000 * i for i in range (1,30)],
    'bnb__alpha' : [0.5 * i for i in range (2 * 0, 2 * 6)] #np.linspace (0.0, 5.0, num = 10)
}

res = random_search(bnb_pipeline, bnb_params)
report_scores(res)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | ela

---- RESULTS ---- 

The algorithm being optimised was: ('bnb', BernoulliNB())
The best parameters found were: {'cv__ngram_range': (1, 2), 'cv__max_features': 16000, 'bnb__alpha': 0.0}
Score report:

              precision    recall  f1-score   support

           0       0.37      0.72      0.49      1209
           1       0.75      0.40      0.53      2546

    accuracy                           0.51      3755
   macro avg       0.56      0.56      0.51      3755
weighted avg       0.63      0.51      0.51      3755

f1 score: 0.52620813091281
precision: 0.7538461538461538
recall: 0.40416339355852315




In [206]:
[0.5 * i for i in range (2 * 0, 2 * 5)]

[0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5]

In [214]:
mnb_pipeline = Pipeline([
    ("cv", CountVectorizer()),
    ("mnb", MultinomialNB())
])

mnb_params = {
    'cv__ngram_range' : [(1,x) for x in range (1,30)],
    #see if it is worth not choosing max features or if it is
    'cv__max_features' : [1000 * i for i in range (1,30)],
    'mnb__alpha' : [0.5 * i for i in range (2 * 0, 2 * 5)] #np.linspace(0.0, 5.0, num = 2),
}

res = random_search(mnb_pipeline, mnb_params)
report_scores(res)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | ela

---- RESULTS ---- 

The algorithm being optimised was: ('mnb', MultinomialNB())
The best parameters found were: {'mnb__alpha': 0.5, 'cv__ngram_range': (1, 2), 'cv__max_features': 23000}
Score report:

              precision    recall  f1-score   support

           0       0.36      0.82      0.50      1209
           1       0.78      0.30      0.43      2546

    accuracy                           0.47      3755
   macro avg       0.57      0.56      0.47      3755
weighted avg       0.64      0.47      0.45      3755

f1 score: 0.43451202263083455
precision: 0.7765419615773509
recall: 0.3016496465043205


In [None]:
sgd_pipelineV2 = Pipeline([
    ('cv', CountVectorizer()),
    #('df', Densifier()), #we need to convert the sparse matrix into a dense matrix
    ('sgd', SGDClassifier(n_jobs = -1, penalty = "elasticnet"))
])

sgd_paramsV2 = {
    'cv__ngram_range' : [(1,x) for x in range (1,7)],
    'cv__max_features' : [1000 * i for i in range (4,11)],
    "sgd__alpha" : [10 ** x for x in range (-5, -2)],
    "sgd__l1_ratio" : [0, 0.25, 0.5, 0.75, 1]
    
}

sgdResult_V2 = grid_search(sgd_pipelineV2, sgd_paramsV2)
report_scores(sgdResult_V2)

In [274]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.preprocessing import LabelEncoder

In [293]:
#training_vectorizer = CountVectorizer()
#training_vectorizer.fit(simplified_training.tokens)
#training_vectorizer.transform(simplified_training.tokens).toarray().shape

vectorize_layer = TextVectorization(
    max_tokens = 10000,
    output_sequence_length = 250,
    output_mode = "int"
)

vectorize_layer.adapt(simplified_training.tokens.to_numpy())
vectorize_layer.__dict__

{'_self_setattr_tracking': True,
 '_obj_reference_counts_dict': ObjectIdentityDictionary({<_ObjectIdentityWrapper wrapping 10000>: 1, <_ObjectIdentityWrapper wrapping 1>: 1, <_ObjectIdentityWrapper wrapping 'lower_and_strip_punctuation'>: 1, <_ObjectIdentityWrapper wrapping 'whitespace'>: 1, <_ObjectIdentityWrapper wrapping 'int'>: 1, <_ObjectIdentityWrapper wrapping 250>: 1, <_ObjectIdentityWrapper wrapping True>: 1, <_ObjectIdentityWrapper wrapping 0>: 1, <_ObjectIdentityWrapper wrapping False>: 2, <_ObjectIdentityWrapper wrapping DictWrapper(OrderedDict())>: 1, <_ObjectIdentityWrapper wrapping <tensorflow.python.keras.layers.preprocessing.string_lookup.StringLookup object at 0x000001D8E0C34C70>>: 1}),
 '_max_tokens': 10000,
 '_oov_value': 1,
 '_standardize': 'lower_and_strip_punctuation',
 '_split': 'whitespace',
 '_ngrams_arg': None,
 '_ngrams': None,
 '_output_mode': 'int',
 '_output_sequence_length': 250,
 '_pad_to_max': True,
 '_vocab_size': 0,
 '_called': False,
 '_instrumented

In [306]:
cv = CountVectorizer()
cv.fit(simplified_training.tokens)
res = cv.transform(simplified_training.tokens).todense()
type(res)

numpy.matrix

In [311]:
#convert the matrix made by the CountVectorizer and the labels into a tensorflow dataset
dataset = tf.data.Dataset.from_tensor_slices((res, simplified_training.label.astype("int")))

In [312]:
model = tf.keras.Sequential([
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1)
])

model.fit(dataset)

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.

In [309]:
import operator

real_instances = original_training[original_training.label == "real"]
fake_instances = original_training[original_training.label != "real"]
#fake_instances = simplified_training[not ("real" in simplified_training.label)]
#eal_toks = tokens["real" in tokens.label]
#ake_toks = tokens["real" not in tokens.label]
real_instances.label.value_counts()

real    4921
Name: label, dtype: int64

In [247]:
.shape

(0, 3)

In [261]:
real_cv = CountVectorizer()
real_cv.fit(real_instances.tweetText)

fake_cv = CountVectorizer()
fake_cv.fit(fake_instances.tweetText)

reals = [(k,v) for k, v in real_cv.vocabulary_.items()]
reals.sort(key = operator.itemgetter(1))
reals.reverse()
fakes = [(k,v) for k, v in fake_cv.vocabulary_.items()]
fakes.sort(key = operator.itemgetter(1))
fakes.reverse()

In [264]:
fakess = simplified_training[simplified_training.label == 1]
realss = simplified_training[simplified_training.label == 0]

In [267]:
fakess.lang.value_counts(normalize = True) * 100

en         73.610517
es         10.859342
unknown     8.497221
pt          1.613938
fr          1.303976
nl          0.780248
de          0.684053
it          0.662676
ar          0.609235
ru          0.502351
no          0.245832
sv          0.245832
da          0.181702
fi          0.106883
hu          0.053442
ro          0.042753
Name: lang, dtype: float64

In [268]:
realss.lang.value_counts(normalize = True) * 100

en         82.909978
es          5.588295
unknown     5.344442
fr          1.910181
de          1.219264
it          0.690916
ar          0.467385
sv          0.447064
nl          0.304816
no          0.284495
ru          0.284495
pt          0.243853
da          0.162569
ro          0.081284
fi          0.040642
hu          0.020321
Name: lang, dtype: float64

In [47]:
pd.DataFrame(mNBResult_V7.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_cv__max_features,param_cv__ngram_range,param_nb__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.811465,0.052013,0.182936,0.014244,17000,"(1, 8)",2,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 8), 'nb__alpha': 2.0}",0.837532,0.868768,0.941638,0.918847,0.818393,0.877036,0.046882,548
1,1.774688,0.042030,0.182933,0.018495,17000,"(1, 8)",2.5,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 8), 'nb__alpha': 2.5}",0.835291,0.866933,0.942343,0.920810,0.819748,0.877025,0.047556,553
2,1.774855,0.073815,0.189302,0.012025,17000,"(1, 8)",3,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 8), 'nb__alpha': 3.0}",0.833563,0.865421,0.944935,0.921502,0.818950,0.876874,0.048961,574
3,1.701766,0.060904,0.175637,0.011096,17000,"(1, 8)",3.5,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 8), 'nb__alpha': 3.5}",0.835125,0.864071,0.945408,0.921184,0.823049,0.877767,0.047875,402
4,1.744092,0.097567,0.173818,0.009744,17000,"(1, 8)",4,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 8), 'nb__alpha': 4.0}",0.835085,0.863065,0.945492,0.922234,0.824486,0.878072,0.047834,310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,1.808868,0.033338,0.190575,0.015797,22500,"(1, 12)",5.5,"{'cv__max_features': 22500, 'cv__ngram_range': (1, 12), 'nb__alpha': 5.5}",0.832084,0.860859,0.947341,0.926425,0.826633,0.878669,0.049379,136
716,1.806728,0.043773,0.181072,0.018915,22500,"(1, 12)",6,"{'cv__max_features': 22500, 'cv__ngram_range': (1, 12), 'nb__alpha': 6.0}",0.830762,0.860660,0.944700,0.925381,0.826774,0.877655,0.048682,432
717,1.806973,0.057512,0.185379,0.010972,22500,"(1, 12)",6.5,"{'cv__max_features': 22500, 'cv__ngram_range': (1, 12), 'nb__alpha': 6.5}",0.831104,0.859661,0.944487,0.925735,0.826913,0.877580,0.048670,452
718,1.791002,0.041953,0.194709,0.015359,22500,"(1, 12)",7,"{'cv__max_features': 22500, 'cv__ngram_range': (1, 12), 'nb__alpha': 7.0}",0.829975,0.858731,0.944926,0.921423,0.824938,0.875999,0.048658,625


In [43]:
#how to display the results of the grid search as a large table
pd.DataFrame(mNBResult_V5.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_cv__max_features,param_cv__ngram_range,param_nb__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.700852,0.064033,0.182134,0.016466,17000,"(1, 8)",3,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 8), 'nb__alpha': 3}",0.833563,0.865421,0.944935,0.921502,0.818950,0.876874,0.048961,116
1,1.725590,0.058813,0.177360,0.019758,17000,"(1, 8)",4,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 8), 'nb__alpha': 4}",0.835085,0.863065,0.945492,0.922234,0.824486,0.878072,0.047834,68
2,1.656569,0.041990,0.174861,0.012518,17000,"(1, 8)",5,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 8), 'nb__alpha': 5}",0.833523,0.860460,0.945904,0.921879,0.825911,0.877535,0.048030,99
3,1.654335,0.038229,0.164041,0.012219,17000,"(1, 8)",6,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 8), 'nb__alpha': 6}",0.832386,0.859861,0.946773,0.923475,0.824950,0.877489,0.049031,101
4,1.711115,0.046159,0.182961,0.019486,17000,"(1, 9)",3,"{'cv__max_features': 17000, 'cv__ngram_range': (1, 9), 'nb__alpha': 3}",0.836688,0.863806,0.945209,0.921584,0.821155,0.877688,0.048074,91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,1.771514,0.044925,0.190980,0.024662,22000,"(1, 11)",6,"{'cv__max_features': 22000, 'cv__ngram_range': (1, 11), 'nb__alpha': 6}",0.831440,0.860060,0.946776,0.925381,0.826720,0.878076,0.049182,66
116,1.797188,0.039153,0.178165,0.015074,22000,"(1, 12)",3,"{'cv__max_features': 22000, 'cv__ngram_range': (1, 12), 'nb__alpha': 3}",0.836305,0.865623,0.944833,0.922996,0.824252,0.878802,0.047473,18
117,1.769360,0.048427,0.191083,0.014231,22000,"(1, 12)",4,"{'cv__max_features': 22000, 'cv__ngram_range': (1, 12), 'nb__alpha': 4}",0.834473,0.863467,0.945304,0.924720,0.825944,0.878782,0.048011,20
118,1.807535,0.059350,0.191093,0.015101,22000,"(1, 12)",5,"{'cv__max_features': 22000, 'cv__ngram_range': (1, 12), 'nb__alpha': 5}",0.833599,0.861460,0.947260,0.924392,0.826788,0.878700,0.048610,26


In [39]:
#pipeline which always starts with a countvectorizer
mNaiveBayesPipeline = Pipeline([
    ('cv', CountVectorizer()), 
    #('dt', Densifier()), #used for converting from sparse matrix to dense matrix, not needed here
    ('nb', MultinomialNB())
])

#the second lot of parameters to try
mNaiveBayesParams = {
    'cv__ngram_range': [(1,x) for x in range(14,18)],
    'cv__max_features' : [500 * i for i in range(2 * 12,2 * 17)],
    'nb__alpha' : [0.5 * i for i in range (0,6)]
}

#ignore this one for now
cNBpipeline = Pipeline([('cv', CountVectorizer()), ('cnb', ComplementNB())])
cNBparams = {
    'cv__ngram_range': [(1,x) for x in range(14,18)],
    'cv__max_features' : [1000* i for i in range(11,18)],
    'cnb__alpha' : [1 * i for i in range (0,9)]
}

#add: ('pca', PCA()),
lsvcPipeline = Pipeline([('cv', CountVectorizer()), ('df', Densifier()), ('pca', PCA()), ('lsvc', LinearSVC())])
lsvcParams = {
    'cv__ngram_range': [(1,x) for x in range(10,15)],
    'cv__max_features' : [1000* i for i in range(14,18)],
    'lsvc__C' : [1 * i for i in range(0,3)]
}

rForestPipeline = Pipeline([('cv', CountVectorizer()), ('rf', RandomForestClassifier())])
rForestParams = {
    'cv__ngram_range': [(1,x) for x in range(13,19)],
    'cv__max_features' : [1000* i for i in range(10,17)],
    'rf__n_estimators' : [10 * i for i in range(10,15)]
}


In [34]:
from sklearn.metrics import classification_report

In [35]:
from sklearn.metrics import classification_report

def reportScores(clf):
    
    print("---- RESULTS ----","\n")
    print("The algorithm being optimised was:",clf.estimator.steps[-1])
    print("The best parameters found were:", clf.best_params_)
    y_test_true = simplified_testing.label
    y_test_predictions = clf.predict(simplified_testing.tokens)
    print("Score report:\n")
    #prevent the chance of any of the lists being treated as objects
    print(classification_report(y_test_true.astype('int'), y_test_predictions.astype('int')))

In [None]:
#FIRST SEARCH: ngram 3-16, max features 5-16, alpha 0-9, RESULT: ngram: (1,15), features: 15000, alpha: 0
#so now try making the features and ngram even LARGER! got an accuracy of 81

#SECOND SEARCH: similar not much change, ngram 14-18, features 13-19, alpha 0-2
#RESULTS: features 14000, ngram 15, alpha 0
mNBResult = gridSearch(mNaiveBayesPipeline, mNaiveBayesParams)
reportScores(mNBResult)

In [None]:
# best parameters found from alpha 0-9, max features 5k-16k and ngram (3,16): 
# alpha 2, max features 15000, ngram (1,15) with accuracy of 66. Not very good
cNBResult = gridSearch(cNBpipeline, cNBparams) 
reportScores(cNBResult)

In [None]:
# 66 was okay... tried ngram 13-17, max features 13-17 and estimators 80-110 and best was 13000 features, ngram 1,15 and n estimators 110
#so try again with everything a bit higher
rForestResult = gridSearch(rForestPipeline, rForestParams)
reportScores(rForestResult)

In [None]:
#parameters searched: ngram 10-12, max features 10-12, C 0-2, found: ngram 11, max features 12000
#introducing PCA made the computer die so don't use it again
lsvcResult = gridSearch(lsvcPipeline, lsvcParams)
reportScores(lsvcResult)

In [None]:
# EXTRA ONES TO DO AFTERWARDS, KEEP ADDING THEM BELOW THEN COMMENT THE RESULTS UP ABOVE

In [31]:
rForestResult = gridSearch(rForestPipeline, rForestParams)
reportScores(rForestResult)

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 30.7min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 40.2min
[Parallel(n_jobs=-1)]: Done 1050 out of 1050 | elapsed: 42.6min finished


---- RESULTS ---- 

The algorithm being optimised was: ('rf', RandomForestClassifier())
The best parameters found were: {'cv__max_features': 12000, 'cv__ngram_range': (1, 15), 'rf__n_estimators': 100}
Score report:

              precision    recall  f1-score   support

           0       0.28      0.05      0.08      1209
           1       0.68      0.94      0.79      2546

    accuracy                           0.65      3755
   macro avg       0.48      0.49      0.43      3755
weighted avg       0.55      0.65      0.56      3755



In [32]:
lsvcResult = gridSearch(lsvcPipeline, lsvcParams)
reportScores(lsvcResult)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   56.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 11.0min finished


---- RESULTS ---- 

The algorithm being optimised was: ('lsvc', LinearSVC())
The best parameters found were: {'cv__max_features': 15000, 'cv__ngram_range': (1, 13), 'lsvc__C': 1}
Score report:

              precision    recall  f1-score   support

           0       0.64      0.80      0.71      1209
           1       0.89      0.78      0.84      2546

    accuracy                           0.79      3755
   macro avg       0.77      0.79      0.77      3755
weighted avg       0.81      0.79      0.79      3755



In [None]:
mNBResult = gridSearch(mNaiveBayesPipeline, mNaiveBayesParams)
reportScores(mNBResult)

In [None]:
#NOW TRY DOING THE SAME BUT USING TFIDF INSTEAD

mnbtPipeline = Pipeline([
    ('tf', TfidfVectorizer()), 
    #('dt', Densifier()), #used for converting from sparse matrix to dense matrix
    ('nb', MultinomialNB())
])

#the second lot of parameters to try
mnbtParams = {
    'tf__ngram_range': [(1,0.5 * x) for x in range(2 * 12,2 *17)],
    'tf__norm' : ["l1", "l2"],
    'nb__alpha' : [0.25 * i for i in range (0,6)]
}

#ignore this one for now
cnbtPipeline = Pipeline([('tf', TfidfVectorizer()), ('cnb', ComplementNB())])
cnbtParams = {
    'tf__ngram_range': [(1,x) for x in range(14,18)],
    'tf__norm' : ["l1", "l2"],
    'cnb__alpha' : [1 * i for i in range (0,9)]
}

#add: ('pca', PCA()),
lsvctPipeline = Pipeline([('tf', TfidfVectorizer()), ('df', Densifier()), ('lsvc', LinearSVC())])
lsvctParams = {
    'tf__ngram_range': [(1,x) for x in range(10,15)],
    'tf__norm' : ["l1", "l2"],
    'lsvc__C' : [1 * i for i in range(0,3)]
}

rftPipeline = Pipeline([('tf', TfidfVectorizer()), ('rf', RandomForestClassifier())])
rftParams = {
    'tf__ngram_range': [(1,x) for x in range(13,19)],
    'tf__norm' : ["l1", "l2"],
    'rf__n_estimators' : [10 * i for i in range(10,15)]
}


In [None]:
#FIRST SEARCH: ngram 14-18, alpha 0-6, RESULT: alpha = 0.5, ngram = 14, norm = l2, accuracy = 87

#SECOND SEARCH: 
mnbtResult = gridSearch(mnbtPipeline, mnbtParams)
reportScores(mnbtResult)

In [None]:
cnbtResult = gridSearch(cnbtPipeline, cnbtParams)
reportScores(cnbtResult)

In [None]:
lsvctResult = gridSearch(lsvctPipeline, lsvctParams)
reportScores(lsvctResult)

In [None]:
#FIRST SEARCH: 
rftResult = gridSearch(rftPipeline, rftParams)
reportScores(rftResult)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 39.7min


In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from random import shuffle

In [17]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(simplified_training.tokens)

raw_train = tokenizer.texts_to_sequences(simplified_training.tokens)
raw_test = tokenizer.texts_to_sequences(simplified_testing.tokens)

padded_train = pad_sequences(raw_train, padding = "post", maxlen = 24)
padded_test = pad_sequences(raw_train, padding = "post", maxlen = 24)

#pair each instance with it's label
unsplitwlabels = list(zip(padded_train, simplified_training.label))
testwlabels = list(zip(padded_test, simplified_testing.label))

#shuffle the training data before splitting it into a validation set
shuffle(unsplitwlabels)

In [18]:
#split into training and validation

#take the first n elements of the list
trainwlabels = unsplitwlabels[:12000]

#take the last n elements of the list
validationwlabels = unsplitwlabels[12000:]

In [19]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

11844

In [37]:
model = Sequential()

model.add(layers.Embedding(input_dim = vocab_size, output_dim = 5, input_length = 24))
model.add(layers.AveragePooling1D(24))
model.add(layers.Flatten())
model.add(layers.Dense(5, activation = "relu"))
model.add(layers.Dense(1, activation = "sigmoid"))
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 24, 5)             59220     
_________________________________________________________________
average_pooling1d_3 (Average (None, 1, 5)              0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 5)                 0         
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 30        
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 6         
Total params: 59,256
Trainable params: 59,256
Non-trainable params: 0
_________________________________________________________________


In [38]:
train_data = np.array(list(np.array(x[0]).astype(int) for x in trainwlabels))
train_label = np.array(list(np.array(x[1]).astype(int) for x in trainwlabels))
validation_data = np.array(list(np.array(x[0]).astype(int) for x in validationwlabels))
validation_label = np.array(list(np.array(x[1]).astype(int) for x in validationwlabels))
test_data = np.array(list(np.array(x[0]).astype(int) for x in testwlabels))
test_label = np.array(list(np.array(x[1]).astype(int) for x in testwlabels))

In [39]:
model.fit(
    train_data, train_label,
    epochs = 30, 
    verbose = True,
    validation_data = (validation_data, validation_label)
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1f2b426bbe0>

In [40]:
predictions = model.predict(test_data)

#The model outputs probabilities, so convert them into classes by mapping them to True/False depending on whether they
#are greater than 0.5, then casting this to an integer type will convert True and False into 1s and 0s compatible with 
#the labels from the testing set
predictions = (predictions > 0.5).astype('int').reshape(-1)

In [41]:
from sklearn.metrics import classification_report
print(classification_report(test_label, predictions))

              precision    recall  f1-score   support

           0       0.34      0.02      0.04      1209
           1       0.68      0.98      0.80      2546

    accuracy                           0.67      3755
   macro avg       0.51      0.50      0.42      3755
weighted avg       0.57      0.67      0.56      3755

