In [1]:
import pandas as pd
import numpy as np
import copy

#make the columns as wide as possible so we can see all the text
pd.set_option('display.max_colwidth', None)

In [2]:
original_training = pd.read_csv("mediaeval-2015-trainingset.txt", delimiter = "\t")
original_training.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,263046056240115712,¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake
1,262995061304852481,"@milenagimon: Miren a Sandy en NY! Tremenda imagen del huracán. Parece el ""Día de la Independencia 2"" http://t.co/41jUweux REAL! RT.",192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a la película Día de la Independencia #ID4 #Sandy http://t.co/PTdAXABZ",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
3,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
4,263018881839411200,My fave place in the world #nyc #hurricane #sandy #statueofliberty 🗽 http://t.co/Ex61doZk,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake


In [3]:
#repeat the same process for the testing dataset
original_testing = pd.read_csv("mediaeval-2015-testset.txt", delimiter = "\t")
original_testing.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,578854927457349632,kereeen RT @Shyman33: Eclipse from ISS.... http://t.co/je2hcFpVfN,70824972,eclipse_01,peay_s,Fri Mar 20 09:45:43 +0000 2015,fake
1,578874632670953472,Absolutely beautiful! RT @Shyman33: Eclipse from ISS.... http://t.co/oqwtTL0ThS,344707006,eclipse_01,JaredUcanChange,Fri Mar 20 11:04:02 +0000 2015,fake
2,578891261353984000,“@Shyman33: Eclipse from ISS.... http://t.co/C0VfboScRj” 우주에서본 3.20 일식 Wow! amazing!,224839607,eclipse_01,tpjp1231,Fri Mar 20 12:10:06 +0000 2015,fake
3,578846612312748032,Eclipse from ISS.... http://t.co/En87OtvsU6,134543073,eclipse_01,Shyman33,Fri Mar 20 09:12:41 +0000 2015,fake
4,578975333841551360,@ebonfigli: Éclipse vue de l'ISS... Autre chose... http://t.co/yNBN7c4O51\n\nLa création divine n'a pas de limite 😍,1150728872,eclipse_01,Epimethee_,Fri Mar 20 17:44:11 +0000 2015,fake


In [4]:
# BOOKMARK 1

#we can see that the dataset is skewed towards fake and humor tweets
original_training.label.value_counts(normalize = True) * 100

fake     47.222806
real     34.468025
humor    18.309169
Name: label, dtype: float64

In [5]:
# BOOKMARK 2

#(6742 + 2614) - 4921 = 4435 additional real entries needed to make the dataset balanced
original_training.label.value_counts()

fake     6742
real     4921
humor    2614
Name: label, dtype: int64

In [6]:
# BOOKMARK 3

#there are no non null values to being with we can see
original_training.info()
print("\n")
original_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     14277 non-null  int64 
 1   tweetText   14277 non-null  object
 2   userId      14277 non-null  int64 
 3   imageId(s)  14277 non-null  object
 4   username    14277 non-null  object
 5   timestamp   14277 non-null  object
 6   label       14277 non-null  object
dtypes: int64(2), object(5)
memory usage: 780.9+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     3755 non-null   int64 
 1   tweetText   3755 non-null   object
 2   userId      3755 non-null   int64 
 3   imageId(s)  3755 non-null   object
 4   username    3755 non-null   object
 5   timestamp   3755 non-null   object
 6   label       3755 non-null   object
dtypes: int6

In [7]:
# BOOKMARK 4

#drop all columns apart from the text and the label as none of the other data appears to be useful
original_training = original_training.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)

#Do the same for the testing set
original_testing = original_testing.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)

original_training.head()

Unnamed: 0,tweetText,label
0,¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN,fake
1,"@milenagimon: Miren a Sandy en NY! Tremenda imagen del huracán. Parece el ""Día de la Independencia 2"" http://t.co/41jUweux REAL! RT.",fake
2,"Buena la foto del Huracán Sandy, me recuerda a la película Día de la Independencia #ID4 #Sandy http://t.co/PTdAXABZ",fake
3,Scary shit #hurricane #NY http://t.co/e4JLBUfH,fake
4,My fave place in the world #nyc #hurricane #sandy #statueofliberty 🗽 http://t.co/Ex61doZk,fake


In [8]:
# BOOKMARK 5 

#we can see that not all the posts are in English
original_training[:10]

Unnamed: 0,tweetText,label
0,¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN,fake
1,"@milenagimon: Miren a Sandy en NY! Tremenda imagen del huracán. Parece el ""Día de la Independencia 2"" http://t.co/41jUweux REAL! RT.",fake
2,"Buena la foto del Huracán Sandy, me recuerda a la película Día de la Independencia #ID4 #Sandy http://t.co/PTdAXABZ",fake
3,Scary shit #hurricane #NY http://t.co/e4JLBUfH,fake
4,My fave place in the world #nyc #hurricane #sandy #statueofliberty 🗽 http://t.co/Ex61doZk,fake
5,42nd #time #square #NYC #subway #hurricane http://t.co/daX5YY7X,fake
6,Just in time for #halloween a photo of #hurricane #sandy #frankenstorm http://t.co/xquKB4VN,fake
7,Crazy pic of #Hurricane #Sandy prayers go out to family and friends on the East Coast http://t.co/c4sceiMt,fake
8,#sandy #newyork #hurricane #statueofliberty #USA http://t.co/iQfEbO1E,fake
9,#nyc #hurricane http://t.co/Gv3QxZlq,fake


In [14]:
# BOOKMARK 6

#add a column to store the language, initially empty before langdetect populates it
original_training["lang"] = np.nan
original_testing["lang"] = np.nan
original_training.head()

Unnamed: 0,tweetText,label,lang
0,¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN,fake,
1,"@milenagimon: Miren a Sandy en NY! Tremenda imagen del huracán. Parece el ""Día de la Independencia 2"" http://t.co/41jUweux REAL! RT.",fake,
2,"Buena la foto del Huracán Sandy, me recuerda a la película Día de la Independencia #ID4 #Sandy http://t.co/PTdAXABZ",fake,
3,Scary shit #hurricane #NY http://t.co/e4JLBUfH,fake,
4,My fave place in the world #nyc #hurricane #sandy #statueofliberty 🗽 http://t.co/Ex61doZk,fake,


In [15]:
import langdetect as l
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import nltk.stem as st
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/georgegarrington/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/georgegarrington/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
#view the languages supported by the stemming algorithm
st.SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [19]:
# BOOKMARK 7 TweetHandler class

#responsible for parsing tweets
class TweetHandler:
    
    def __init__(self):
        snowball_langs = list(st.SnowballStemmer.languages)
        #some languages are supported by stemming but NOT supported by language specific tokenizing,
        #only the tokens that are in this set are supported by language specific tokenizing
        self.tokenizer_langs = {"da", "nl", "en", "fi", "fr", "de", "it", "pt", "ru", "es", "sv"}
        langdetect_langs = ["ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "illegal", "pt", "ro", "ru", "es", "sv"]
        #a dictionary to map the corresponding snowball and langdetect properties
        self.lang_dict = dict(zip(langdetect_langs, snowball_langs))
        #declare some custom stop words
        self.custom_stops = ["http","nhttp","https"]

    #takes a tweet, detects its language, removes any stop words in the language, tokenizes and stems
    #specific to the detected language and returns the simplified tokens paired with the language
    def parse_tweet(self, tweet):
        
        try:
            lang_prediction = l.detect(tweet)
            #the nltk name for the predicted language
            nltkprop = self.lang_dict[lang_prediction]
        except:
            #assume english stopwords and stemming if the language cannot be detected
            lang_prediction = "unknown"
            nltkprop = "english"
            
        # if the language is not supported by the tokenizer (including unkown) then assume tokenizing in English, however stemming
        # and stopwords may still be supported in the language that does not support language specific tokenization
        # e.g. arabic, hungarian, romanian so tokenize with the english
        # version of the algorithm if this is the case and use the stemming and stopwords specific to 
        # the language if this is available even if the tokenization algorithm isnt
        # use a python ternary expression to do this
        tokens = nltk.word_tokenize(tweet, language = nltkprop if lang_prediction in self.tokenizer_langs else "english")
        
        #stop words specific to the language
        stop_words = set(stopwords.words(nltkprop))
        
        #stemming algorithm specific to the language detected
        stemmer = st.SnowballStemmer(nltkprop)
        
        # store all tokens to be output as a concatenated string here so that this string
        # can later be fed to a CountVectorizer or TfIDFVectorizer , filter out any unwanted tokens 
        # and don't add them 
        filtered_tokens = ""
        
        for tok in tokens:
            
            #remove any hashtags
            if tok[0] == '#':
                tok = tok[1:]
                
            #discard non alphanumeric strings containing symbols or pure digits, or stop words
            if (not tok.isalnum()) or tok.isdigit() or (tok in stop_words) or tok in self.custom_stops:
                continue;
            
            #carry out stemming specific to the language detected
            filtered_tokens += " " + stemmer.stem(tok)
        
        #comment these out when you do not need to check if it works anymore
        #print("original tokens:", tokens,"\n")
        #print("filtered tokens:", filtered_tokens,"\n")
        
        return filtered_tokens, lang_prediction

In [20]:
# BOOKMARK 8, transform the dataset from a dataset of tweets into a dataset of labelled tokens in concatenated
# string form, along with the detected language

def transform_data(arg):

    #copy the instance given so we don't change the original instance and can keep it in memory and reuse it 
    #if necessary
    dataset = copy.deepcopy(arg)
    th = TweetHandler()
    num_rows = dataset.label.size
    
    #the tweet text will be transformed into tokens so rename the column appropriately
    dataset = dataset.rename(columns = {"tweetText" : "tokens"})
    
    for i in range(num_rows):

        tweet = dataset["tokens"][i]
        label = dataset["label"][i]

        #disregard the humour information for now, map humor and fake to a single class
        if ("humor" in label) or ("fake" in label):
            label = 1
        else:
            label = 0
            
        #for testing
        #print("The old value of the row is:",dataset.loc[i],"\n")
        
        tokens, lang = th.parse_tweet(tweet)
        
        #replace the row with the simplified tokens, the mapped labels and the detected language
        dataset.loc[i] = tokens, label, lang
        
        #for testing
        #print("The new value of the row is:",dataset.loc[i],"\n\n")
    
    return dataset

In [21]:
#transform the data
simplified_training = transform_data(original_training)
simplified_testing = transform_data(original_testing)

In [24]:
# BOOKMARK 9

#get an idea of how many of each language there are, we can see that it is predominantly english
simplified_training.lang.value_counts(normalize = True) * 100

en         76.794845
es          9.091546
unknown     7.410520
fr          1.505919
pt          1.134692
de          0.854521
it          0.672410
nl          0.595363
ar          0.553338
ru          0.427261
sv          0.308188
no          0.245150
da          0.189115
fi          0.112068
ro          0.063038
hu          0.042026
Name: lang, dtype: float64

In [25]:
simplified_training.label.value_counts()

1    9356
0    4921
Name: label, dtype: int64

In [28]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

In [29]:
#class to convert the sparse matrix produced by the count vectorizer into a dense matrix in order for it 
#to be able to be used with different algorithms in a pipeline that require a dense matrix and not a sparse matrix
class Densifier():

    def fit(self, X, y=None, **kwargs):
        return self

    def transform(self, X, y=None, **kwargs):
        return X.todense()

In [19]:
# !!!! DONT THINK YOU NEED THIS !!!!!

def testNaiveBayes():
    
    #pipeline which always starts with a countvectorizer
    pipeline = Pipeline([
        ('cv', CountVectorizer()), 
        #('dt', Densifier()), 
        ('nb', MultinomialNB())
    ])

    #carry out a grid search for optimal parameter selection. Due to constraints of using my laptop this
    #can't be done that extensively, I will leave it to run overnight a few times
    clf = GridSearchCV(pipeline, {
        'cv__ngram_range': [(1,x) for x in range(9,10)],
        'cv__max_features' : [1000* i for i in range(8,12)]
        #,'nb__alpha' : [0.5 * i for i in range (5,7)]
    }, scoring='f1', verbose = 4) 
    #make sure we are focusing on maximizing the f1 score and not a different metric, add some verbosity so we can
    #see the progress of the grid search to get an idea of how much time it is taking
    
    #make sure the labels are treated as ints and not objects
    clf.fit(simplified_training.tokens,simplified_training.label.astype('int'))

In [20]:
def gridSearch(pipeline, params):
    
    #carry out a grid search for optimal parameter selection. Due to constraints of using my laptop this
    #can't be done that extensively, I will leave it to run overnight a few times
    
    #make n_jobs -1 so that all cores that are available are used which should hopefully make it quicker
    
    clf = GridSearchCV(pipeline, params, scoring = "f1", verbose = 6, n_jobs = -1)
    
    #make sure we are focusing on maximizing the f1 score and not a different metric, add some verbosity so we can
    #see the progress of the grid search to get an idea of how much time it is taking
    
    #make sure the labels are treated as ints and not objects
    clf.fit(simplified_training.tokens, simplified_training.label.astype("int"))
    return clf

In [39]:
#pipeline which always starts with a countvectorizer
mNaiveBayesPipeline = Pipeline([
    ('cv', CountVectorizer()), 
    #('dt', Densifier()), #used for converting from sparse matrix to dense matrix
    ('nb', MultinomialNB())
])

#the second lot of parameters to try
mNaiveBayesParams = {
    'cv__ngram_range': [(1,x) for x in range(14,18)],
    'cv__max_features' : [500 * i for i in range(2 * 12,2 * 17)],
    'nb__alpha' : [0.5 * i for i in range (0,6)]
}

#ignore this one for now
cNBpipeline = Pipeline([('cv', CountVectorizer()), ('cnb', ComplementNB())])
cNBparams = {
    'cv__ngram_range': [(1,x) for x in range(14,18)],
    'cv__max_features' : [1000* i for i in range(11,18)],
    'cnb__alpha' : [1 * i for i in range (0,9)]
}

#add: ('pca', PCA()),
lsvcPipeline = Pipeline([('cv', CountVectorizer()), ('df', Densifier()), ('pca', PCA()), ('lsvc', LinearSVC())])
lsvcParams = {
    'cv__ngram_range': [(1,x) for x in range(10,15)],
    'cv__max_features' : [1000* i for i in range(14,18)],
    'lsvc__C' : [1 * i for i in range(0,3)]
}

rForestPipeline = Pipeline([('cv', CountVectorizer()), ('rf', RandomForestClassifier())])
rForestParams = {
    'cv__ngram_range': [(1,x) for x in range(13,19)],
    'cv__max_features' : [1000* i for i in range(10,17)],
    'rf__n_estimators' : [10 * i for i in range(10,15)]
}


In [34]:
from sklearn.metrics import classification_report

In [35]:
def reportScores(clf):
    
    print("---- RESULTS ----","\n")
    print("The algorithm being optimised was:",clf.estimator.steps[-1])
    print("The best parameters found were:", clf.best_params_)
    y_test_true = simplified_testing.label
    y_test_predictions = clf.predict(simplified_testing.tokens)
    print("Score report:\n")
    #prevent the chance of any of the lists being treated as objects
    print(classification_report(y_test_true.astype('int'), y_test_predictions.astype('int')))

In [None]:
#FIRST SEARCH: ngram 3-16, max features 5-16, alpha 0-9, RESULT: ngram: (1,15), features: 15000, alpha: 0
#so now try making the features and ngram even LARGER! got an accuracy of 81

#SECOND SEARCH: similar not much change, ngram 14-18, features 13-19, alpha 0-2
#RESULTS: features 14000, ngram 15, alpha 0
mNBResult = gridSearch(mNaiveBayesPipeline, mNaiveBayesParams)
reportScores(mNBResult)

In [None]:
# best parameters found from alpha 0-9, max features 5k-16k and ngram (3,16): 
# alpha 2, max features 15000, ngram (1,15) with accuracy of 66. Not very good
cNBResult = gridSearch(cNBpipeline, cNBparams) 
reportScores(cNBResult)

In [None]:
# 66 was okay... tried ngram 13-17, max features 13-17 and estimators 80-110 and best was 13000 features, ngram 1,15 and n estimators 110
#so try again with everything a bit higher
rForestResult = gridSearch(rForestPipeline, rForestParams)
reportScores(rForestResult)

In [None]:
#parameters searched: ngram 10-12, max features 10-12, C 0-2, found: ngram 11, max features 12000
#introducing PCA made the computer die so don't use it again
lsvcResult = gridSearch(lsvcPipeline, lsvcParams)
reportScores(lsvcResult)

In [None]:
# EXTRA ONES TO DO AFTERWARDS, KEEP ADDING THEM BELOW THEN COMMENT THE RESULTS UP ABOVE

In [31]:
rForestResult = gridSearch(rForestPipeline, rForestParams)
reportScores(rForestResult)

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 30.7min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 40.2min
[Parallel(n_jobs=-1)]: Done 1050 out of 1050 | elapsed: 42.6min finished


---- RESULTS ---- 

The algorithm being optimised was: ('rf', RandomForestClassifier())
The best parameters found were: {'cv__max_features': 12000, 'cv__ngram_range': (1, 15), 'rf__n_estimators': 100}
Score report:

              precision    recall  f1-score   support

           0       0.28      0.05      0.08      1209
           1       0.68      0.94      0.79      2546

    accuracy                           0.65      3755
   macro avg       0.48      0.49      0.43      3755
weighted avg       0.55      0.65      0.56      3755



In [32]:
lsvcResult = gridSearch(lsvcPipeline, lsvcParams)
reportScores(lsvcResult)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   56.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 11.0min finished


---- RESULTS ---- 

The algorithm being optimised was: ('lsvc', LinearSVC())
The best parameters found were: {'cv__max_features': 15000, 'cv__ngram_range': (1, 13), 'lsvc__C': 1}
Score report:

              precision    recall  f1-score   support

           0       0.64      0.80      0.71      1209
           1       0.89      0.78      0.84      2546

    accuracy                           0.79      3755
   macro avg       0.77      0.79      0.77      3755
weighted avg       0.81      0.79      0.79      3755



In [None]:
mNBResult = gridSearch(mNaiveBayesPipeline, mNaiveBayesParams)
reportScores(mNBResult)

In [None]:
#NOW TRY DOING THE SAME BUT USING TFIDF INSTEAD

mnbtPipeline = Pipeline([
    ('tf', TfidfVectorizer()), 
    #('dt', Densifier()), #used for converting from sparse matrix to dense matrix
    ('nb', MultinomialNB())
])

#the second lot of parameters to try
mnbtParams = {
    'tf__ngram_range': [(1,0.5 * x) for x in range(2 * 12,2 *17)],
    'tf__norm' : ["l1", "l2"],
    'nb__alpha' : [0.25 * i for i in range (0,6)]
}

#ignore this one for now
cnbtPipeline = Pipeline([('tf', TfidfVectorizer()), ('cnb', ComplementNB())])
cnbtParams = {
    'tf__ngram_range': [(1,x) for x in range(14,18)],
    'tf__norm' : ["l1", "l2"],
    'cnb__alpha' : [1 * i for i in range (0,9)]
}

#add: ('pca', PCA()),
lsvctPipeline = Pipeline([('tf', TfidfVectorizer()), ('df', Densifier()), ('lsvc', LinearSVC())])
lsvctParams = {
    'tf__ngram_range': [(1,x) for x in range(10,15)],
    'tf__norm' : ["l1", "l2"],
    'lsvc__C' : [1 * i for i in range(0,3)]
}

rftPipeline = Pipeline([('tf', TfidfVectorizer()), ('rf', RandomForestClassifier())])
rftParams = {
    'tf__ngram_range': [(1,x) for x in range(13,19)],
    'tf__norm' : ["l1", "l2"],
    'rf__n_estimators' : [10 * i for i in range(10,15)]
}


In [None]:
#FIRST SEARCH: ngram 14-18, alpha 0-6, RESULT: alpha = 0.5, ngram = 14, norm = l2, accuracy = 87

#SECOND SEARCH: 
mnbtResult = gridSearch(mnbtPipeline, mnbtParams)
reportScores(mnbtResult)

In [None]:
cnbtResult = gridSearch(cnbtPipeline, cnbtParams)
reportScores(cnbtResult)

In [None]:
lsvctResult = gridSearch(lsvctPipeline, lsvctParams)
reportScores(lsvctResult)

In [None]:
#FIRST SEARCH: 
rftResult = gridSearch(rftPipeline, rftParams)
reportScores(rftResult)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 39.7min
