In [1]:
import pandas as pd
import numpy as np
import copy
import emoji

#make the columns as wide as possible so we can see all the text
pd.set_option('display.max_colwidth', None)

In [2]:
original_training = pd.read_csv("mediaeval-2015-trainingset.txt", delimiter = "\t")

#drop all columns apart from the text and the label as none of the other data appears to be useful
original_training = original_training.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)

#add a column to store the language
original_training["lang"] = np.nan

#view a specific range, uncomment to view only those tweets from certain classes
original_training[10000:10020]

Unnamed: 0,tweetText,label,lang
10000,So touching! RT @DreamCameTrue_: RT @Alyssa_Milano: Thank goodness for people who are kind #sandy http://t.co/yBaVo3FZ /via @CarrieFairygirl,real,
10001,Thank goodness for people who are kind #sandy http://t.co/Pc25SgSy /via @CarrieFairygirl,real,
10002,RT “@Alyssa_Milano: Thank goodness for people who are kind #sandy http://t.co/wfMtqwjl /via @CarrieFairygirl”\n\nComplete fire hazard,real,
10003,RT “@Alyssa_Milano: Thank goodness for people who are kind #sandy http://t.co/wfMtqwjl /via @CarrieFairygirl”\n\nComplete fire hazard,real,
10004,Thank goodness for people who are kind #sandy http://t.co/ghz8E5je /via @CarrieFairygirl,real,
10005,@Alyssa_Milano: Thank goodness for people who are kind #sandy http://t.co/WE72RVCE /via @CarrieFairygirl LOVE THIS!,real,
10006,RT @alyssa_milano: Thank goodness for people who are kind #sandy http://t.co/45y8vlMQ /via @CarrieFairygirl,real,
10007,“@Alyssa_Milano: Thank goodness for people who are kind #sandy http://t.co/wuZ5PyNk /via @CarrieFairygirl” Awesome.,real,
10008,RT @Alyssa_Milano Thank goodness for people who are kind #sandy http://t.co/3g21wFvG /via @CarrieFairygirl,real,
10009,"É, a humanidade ainda tem jeito. “@Alyssa_Milano: Thank goodness for people who are kind #sandy http://t.co/xwu5jSIg /via @CarrieFairygirl”",real,


In [3]:
#repeat the same process for the testing dataset
original_testing = pd.read_csv("mediaeval-2015-testset.txt", delimiter = "\t")
original_testing = original_testing.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)
original_testing["lang"] = np.nan

In [4]:
#we can see that the dataset is skewed towards fake and humor tweets
original_training.label.value_counts()

fake     6742
real     4921
humor    2614
Name: label, dtype: int64

In [5]:
#there are no non null values to being with we can see
original_training.info()
print("\n")
original_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweetText  14277 non-null  object 
 1   label      14277 non-null  object 
 2   lang       0 non-null      float64
dtypes: float64(1), object(2)
memory usage: 334.7+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweetText  3755 non-null   object 
 1   label      3755 non-null   object 
 2   lang       0 non-null      float64
dtypes: float64(1), object(2)
memory usage: 88.1+ KB


In [6]:
import langdetect as l
import nltk
nltk.download('punkt')
import nltk.stem as st
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/georgegarrington/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
#view the languages supported by the stemming algorithm
st.SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [11]:
#responsible for parsing tweets
class TweetHandler:
    
    def __init__(self):
        snowball_langs = list(st.SnowballStemmer.languages)
        self.tokenizer_langs = {"da", "nl", "en", "fi", "fr", "de", "it", "pt", "ru", "es", "sv"}
        langdetect_langs = ["ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "illegal", "pt", "ro", "ru", "es", "sv"]
        #a dictionary to map the corresponding snowball and langdetect properties
        self.lang_dict = dict(zip(langdetect_langs, snowball_langs))
        #declare some custom stop words
        self.custom_stops = ["http","nhttp","https"]

    #takes a tweet, detects its language, removes any stop words in the language,
    #stems tokens in language and returns the simplified tokens paired with the language
    def parse_tweet(self, tweet):
        
        try:
            lang_prediction = l.detect(tweet)
            #the nltk name for the predicted language
            nltkprop = self.lang_dict[lang_prediction]
        except:
            #assume english stopwords and stemming if the language cannot be detected
            lang_prediction = "unknown"
            nltkprop = "english"
            
        #if the language is not supported by the tokenizer (including unkown) then assume tokenizing in English, however stemming
        #and stopwords may still be supported in the language e.g. arabic, hungarian, romanian so tokenize with the english
        #version of the algorithm if this is the case and use the stemming and stopwords specific to the language if this
        #is available even if the tokenization algorithm isnt
        tokens = nltk.word_tokenize(tweet, language = nltkprop if lang_prediction in self.tokenizer_langs else "english")
        
        #stop words specific to the language
        stop_words = set(stopwords.words(nltkprop))
        
        #stemming algorithm specific to the language detected
        stemmer = st.SnowballStemmer(nltkprop)
        
        #store all tokens to be output here, filter out any unwanted tokens and don't add them
        #store the tokens in a space seperated string so that it can be fed into a count vectorizer
        filtered_tokens = ""
        
        for tok in tokens:
            
            #remove any hashtags
            if tok[0] == '#':
                tok = tok[1:]
                
            #discard non alphanumeric strings containing symbols or pure digits, or stop words
            if (not tok.isalnum()) or tok.isdigit() or (tok in stop_words) or tok in self.custom_stops:
                continue;
            
            filtered_tokens += " " + stemmer.stem(tok)
            #carry out stemming specific to the language detected
            #filtered_tokens.add(stemmer.stem(tok))            
        
        #comment these out when you do not need to check if it works anymore
        #print("original tokens:", tokens,"\n")
        #print("filtered tokens:", filtered_tokens,"\n")
        
        #don't think having a column of the languages is really necessary but let's see
        return filtered_tokens, lang_prediction

In [12]:
#change the name of the property to tokens which is more appropriate
#training_set = training_set.rename(columns = {"tweetText" : "tokens"})

#make a new dataset from the original dataset transforming the tweet text into stemmed, simplified and filtered 
#tokens as handled by the TweetHandler class
def transform_data(arg):

    #copy the instance given so we don't change the original instance and can keep it in memory and reuse it 
    #if necessary
    dataset = copy.deepcopy(arg)
    th = TweetHandler()
    num_rows = dataset.label.size
    
    #the tweet text will be transformed into tokens so rename the column appropriately
    dataset = dataset.rename(columns = {"tweetText" : "tokens"})
    
    for i in range(num_rows):

        tweet = dataset["tokens"][i]
        label = dataset["label"][i]

        #disregard the humour information for now, simplify all the classes to two different classes
        if ("humor" in label) or ("fake" in label):
            label = 1
        else:
            label = 0
            
        #for testing
        #print("The old value of the row is:",dataset.loc[i],"\n")
        
        tokens, lang = th.parse_tweet(tweet)
        
        #change the type of tweetText from a string to a set of strings of the stemmed tokenized words
        dataset.loc[i] = tokens, label, lang
        
        #for testing
        #print("The new value of the row is:",dataset.loc[i],"\n\n")
    
    return dataset

In [13]:
#transform the data
simplified_training = transform_data(original_training)
simplified_testing = transform_data(original_testing)

In [14]:
#have a quick look to see if the tokens have been simplified how we want them to, which they have
simplified_training

Unnamed: 0,tokens,label,lang
0,acuerd pelicul el dia despues mañan me recuerd pas huracan sandy,1,es
1,milenagimon mir sandy ny tremend imag huracan parec dia independent real rt,1,es
2,buen fot huracan sandy recuerd pelicul dia independent id4 sandy,1,es
3,scari shit hurrican ny,1,en
4,my fave place world nyc hurrican sandi statueofliberti,1,en
...,...,...,...
14272,bobombdom slap tweetdeck pigfish,1,en
14273,new speci fish found brazil realli good photoshop what you think,1,en
14274,what call pigfish,1,en
14275,pigfish e dop pescecan pesc maial,1,it


In [24]:
#get an idea of how many of each language there are, we can see that it is predominantly english
simplified_training.lang.value_counts(normalize = True) * 100

en         76.829866
es          9.028507
unknown     7.424529
fr          1.512923
pt          1.106675
de          0.882538
it          0.707432
nl          0.616376
ar          0.553338
ru          0.420256
sv          0.287175
no          0.252154
da          0.189115
fi          0.098060
ro          0.049030
hu          0.042026
Name: lang, dtype: float64

In [26]:
simplified_training.label.value_counts()

1    9356
0    4921
Name: label, dtype: int64

In [27]:
simplified_testing[simplified_testing.lang == "en"]

Unnamed: 0,tokens,label,lang
0,kereeen rt shyman33 eclips iss,1,en
1,absolut beauti rt shyman33 eclips iss,1,en
2,shyman33 eclips iss 우주에서본 일식 wow amaz,1,en
3,eclips iss,1,en
8,dit dus rt the solar eclips seen intern space station solareclips iss space,1,en
...,...,...,...
3693,syria syrian hero boy rescu girl shootout,1,en
3695,böhmermann zdf neo varoufaki fake finger varoufak neo magazin royal mit jan b https via youtub,1,en
3743,top stori jan böhmermann ist laut zdf fake see,1,en
3748,zdf neo lay jauch varoufaki stinkefing worth watch,1,en


In [28]:
#view the percentages of each language
simplified_testing.lang.value_counts(normalize = True) * 100

en         73.981358
unknown    15.472703
ar          4.687084
es          1.677763
de          1.091877
pt          0.958722
fr          0.878828
nl          0.559254
it          0.479361
fi          0.106525
sv          0.053262
ru          0.026631
ro          0.026631
Name: lang, dtype: float64

In [29]:
#check if there are any null values after simplifying, there aren't any
simplified_training.info()
print("\n")
simplified_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tokens  14277 non-null  object
 1   label   14277 non-null  object
 2   lang    14277 non-null  object
dtypes: object(3)
memory usage: 334.7+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tokens  3755 non-null   object
 1   label   3755 non-null   object
 2   lang    3755 non-null   object
dtypes: object(3)
memory usage: 88.1+ KB


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA

In [32]:
#class to convert the sparse matrix produced by the count vectorizer into a dense matrix in order for it 
#to be able to be used with different algorithms that require a dense matrix and not a sparse matrix
class Densifier():

    def fit(self, X, y=None, **kwargs):
        return self

    def transform(self, X, y=None, **kwarge):
        return X.todense()

In [59]:
#pipeline which always starts with a countvectorizer
pipeline = Pipeline([
    ('cv', CountVectorizer()), 
    #('dt', Densifier()), 
    ('nb', MultinomialNB())
])

#carry out a grid search for optimal parameter selection. Due to constraints of using my laptop this
#can't be done extensively 
clf = GridSearchCV(pipeline, {
    'cv__ngram_range': [(1,x) for x in range(3,10)],
    'cv__max_features' : [1000* i for i in range(8,12)],
    'nb__alpha' : [0.5 * i for i in range (1,10)]
}, scoring='f1', verbose = 1) 
#make sure we are focusing on maximizing the f1 score and not a different metric, add some verbosity so we can
#see the progress of the grid search to get an idea of how much time it is taking

In [58]:
#make sure the labels are treated as ints and not objects
clf.fit(simplified_training.tokens,simplified_training.label.astype('int'))

Fitting 5 folds for each of 252 candidates, totalling 1260 fits
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5 ....


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5, score=0.841, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5 ....


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5, score=0.880, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5 ....


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5, score=0.928, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5 ....


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.0s remaining:    0.0s


[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5, score=0.897, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5 ....


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.3s remaining:    0.0s


[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=0.5, score=0.798, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0, score=0.839, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0, score=0.877, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0, score=0.929, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0, score=0.897, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.0, score=0.795, total=   0.3s
[CV] cv__max_features=8000, cv__ngram_range=(1, 3), nb__alpha=1.5 ....
[CV]

[CV]  cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.0, score=0.797, total=   0.4s
[CV] cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5, score=0.840, total=   0.4s
[CV] cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5, score=0.867, total=   0.4s
[CV] cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5, score=0.934, total=   0.4s
[CV] cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5, score=0.901, total=   0.5s
[CV] cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=1.5, score=0.803, total=   0.4s
[CV] cv__max_features=8000, cv__ngram_range=(1, 4), nb__alpha=2.0 ....
[CV]

[CV]  cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=1.5, score=0.804, total=   0.4s
[CV] cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0, score=0.836, total=   0.5s
[CV] cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0, score=0.863, total=   0.4s
[CV] cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0, score=0.937, total=   0.5s
[CV] cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0, score=0.906, total=   0.5s
[CV] cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.0, score=0.805, total=   0.5s
[CV] cv__max_features=8000, cv__ngram_range=(1, 5), nb__alpha=2.5 ....
[CV]

[CV]  cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.0, score=0.814, total=   0.5s
[CV] cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5, score=0.834, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5, score=0.864, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5, score=0.935, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5, score=0.910, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=2.5, score=0.818, total=   0.5s
[CV] cv__max_features=8000, cv__ngram_range=(1, 6), nb__alpha=3.0 ....
[CV]

[CV]  cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=2.5, score=0.817, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0, score=0.834, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0, score=0.863, total=   0.8s
[CV] cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0, score=0.936, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0, score=0.912, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.0, score=0.816, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 7), nb__alpha=3.5 ....
[CV]

[CV]  cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.0, score=0.816, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5, score=0.834, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5, score=0.859, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5, score=0.936, total=   0.7s
[CV] cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5, score=0.906, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=3.5, score=0.819, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 8), nb__alpha=4.0 ....
[CV]

[CV]  cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=3.5, score=0.819, total=   0.7s
[CV] cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0, score=0.831, total=   0.7s
[CV] cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0, score=0.856, total=   0.7s
[CV] cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0, score=0.936, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0, score=0.907, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0 ....
[CV]  cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.0, score=0.818, total=   0.6s
[CV] cv__max_features=8000, cv__ngram_range=(1, 9), nb__alpha=4.5 ....
[CV]

[CV]  cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.0, score=0.809, total=   0.3s
[CV] cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5, score=0.835, total=   0.3s
[CV] cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5, score=0.869, total=   0.3s
[CV] cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5, score=0.938, total=   0.3s
[CV] cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5, score=0.908, total=   0.3s
[CV] cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 3), nb__alpha=4.5, score=0.809, total=   0.3s
[CV] cv__max_features=9000, cv__ngram_range=(1, 4), nb__alpha=0.5 ....
[CV]

[CV]  cv__max_features=9000, cv__ngram_range=(1, 4), nb__alpha=4.5, score=0.813, total=   0.4s
[CV] cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5, score=0.841, total=   0.4s
[CV] cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5, score=0.870, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5, score=0.934, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5, score=0.903, total=   0.4s
[CV] cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=0.5, score=0.801, total=   0.4s
[CV] cv__max_features=9000, cv__ngram_range=(1, 5), nb__alpha=1.0 ....
[CV]

[CV]  cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=0.5, score=0.812, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0, score=0.838, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0, score=0.866, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0, score=0.935, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0, score=0.906, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.0, score=0.815, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 6), nb__alpha=1.5 ....
[CV]

[CV]  cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.0, score=0.814, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5, score=0.837, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5, score=0.865, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5, score=0.938, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5, score=0.909, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=1.5, score=0.816, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 7), nb__alpha=2.0 ....
[CV]

[CV]  cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=1.5, score=0.812, total=   0.6s
[CV] cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0, score=0.837, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0, score=0.862, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0, score=0.936, total=   0.6s
[CV] cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0, score=0.910, total=   0.6s
[CV] cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.0, score=0.811, total=   0.5s
[CV] cv__max_features=9000, cv__ngram_range=(1, 8), nb__alpha=2.5 ....
[CV]

[CV]  cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.0, score=0.818, total=   0.9s
[CV] cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5, score=0.835, total=   0.9s
[CV] cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5, score=0.861, total=   0.8s
[CV] cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5, score=0.937, total=   0.8s
[CV] cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5, score=0.912, total=   0.8s
[CV] cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5 ....
[CV]  cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=2.5, score=0.818, total=   0.8s
[CV] cv__max_features=9000, cv__ngram_range=(1, 9), nb__alpha=3.0 ....
[CV]

[CV]  cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=2.5, score=0.807, total=   0.3s
[CV] cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0, score=0.835, total=   0.3s
[CV] cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0, score=0.872, total=   0.3s
[CV] cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0, score=0.937, total=   0.3s
[CV] cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0, score=0.910, total=   0.3s
[CV] cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.0, score=0.807, total=   0.3s
[CV] cv__max_features=10000, cv__ngram_range=(1, 3), nb__alpha=3.5 ..

[CV]  cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.0, score=0.810, total=   0.3s
[CV] cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5, score=0.837, total=   0.4s
[CV] cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5, score=0.864, total=   0.4s
[CV] cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5, score=0.939, total=   0.4s
[CV] cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5, score=0.907, total=   0.4s
[CV] cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=3.5, score=0.811, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 4), nb__alpha=4.0 ..

[CV]  cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=3.5, score=0.812, total=   0.4s
[CV] cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0, score=0.834, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0, score=0.863, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0, score=0.939, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0, score=0.911, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.0, score=0.815, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 5), nb__alpha=4.5 ..

[CV]  cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.0, score=0.818, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5, score=0.830, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5, score=0.861, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5, score=0.940, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5, score=0.914, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 6), nb__alpha=4.5, score=0.820, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 7), nb__alpha=0.5 ..

[CV]  cv__max_features=10000, cv__ngram_range=(1, 7), nb__alpha=4.5, score=0.821, total=   0.5s
[CV] cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5, score=0.842, total=   0.6s
[CV] cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5, score=0.871, total=   0.6s
[CV] cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5, score=0.937, total=   0.6s
[CV] cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5, score=0.908, total=   0.6s
[CV] cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=0.5, score=0.807, total=   0.6s
[CV] cv__max_features=10000, cv__ngram_range=(1, 8), nb__alpha=1.0 ..

[CV]  cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=0.5, score=0.810, total=   0.6s
[CV] cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0, score=0.840, total=   0.6s
[CV] cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0, score=0.865, total=   0.7s
[CV] cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0, score=0.939, total=   0.6s
[CV] cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0, score=0.910, total=   0.6s
[CV] cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0 ...
[CV]  cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.0, score=0.814, total=   0.7s
[CV] cv__max_features=10000, cv__ngram_range=(1, 9), nb__alpha=1.5 ..

[CV]  cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.0, score=0.799, total=   0.3s
[CV] cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5, score=0.839, total=   0.3s
[CV] cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5, score=0.877, total=   0.3s
[CV] cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5, score=0.935, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5, score=0.905, total=   0.3s
[CV] cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=1.5, score=0.802, total=   0.3s
[CV] cv__max_features=11000, cv__ngram_range=(1, 3), nb__alpha=2.0 ..

[CV]  cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=1.5, score=0.808, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0, score=0.839, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0, score=0.869, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0, score=0.936, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0, score=0.905, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.0, score=0.809, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 4), nb__alpha=2.5 ..

[CV]  cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.0, score=0.812, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5, score=0.838, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5, score=0.866, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5, score=0.938, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5, score=0.907, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=2.5, score=0.814, total=   0.4s
[CV] cv__max_features=11000, cv__ngram_range=(1, 5), nb__alpha=3.0 ..

[CV]  cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=2.5, score=0.815, total=   0.5s
[CV] cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0, score=0.833, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0, score=0.865, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0, score=0.941, total=   0.5s
[CV] cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0, score=0.912, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.0, score=0.814, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 6), nb__alpha=3.5 ..

[CV]  cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.0, score=0.818, total=   0.5s
[CV] cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5, score=0.835, total=   0.5s
[CV] cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5, score=0.863, total=   0.5s
[CV] cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5, score=0.940, total=   0.5s
[CV] cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5, score=0.913, total=   0.5s
[CV] cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=3.5, score=0.819, total=   0.5s
[CV] cv__max_features=11000, cv__ngram_range=(1, 7), nb__alpha=4.0 ..

[CV]  cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=3.5, score=0.822, total=   0.8s
[CV] cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0, score=0.832, total=   0.7s
[CV] cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0, score=0.859, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0, score=0.941, total=   0.8s
[CV] cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0, score=0.914, total=   1.2s
[CV] cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.0, score=0.824, total=   1.2s
[CV] cv__max_features=11000, cv__ngram_range=(1, 8), nb__alpha=4.5 ..

[CV]  cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.0, score=0.823, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5, score=0.832, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5, score=0.858, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5, score=0.943, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5, score=0.915, total=   0.6s
[CV] cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5 ...
[CV]  cv__max_features=11000, cv__ngram_range=(1, 9), nb__alpha=4.5, score=0.823, total=   0.7s


[Parallel(n_jobs=1)]: Done 1260 out of 1260 | elapsed: 10.9min finished


GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cv__max_features': [8000, 9000, 10000, 11000],
                         'cv__ngram_range': [(1, 3), (1, 4), (1, 5), (1, 6),
                                             (1, 7), (1, 8), (1, 9)],
                         'nb__alpha': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0,
                                       4.5]},
             scoring='f1', verbose=5)

In [64]:
clf.__dict__

{'scoring': 'f1',
 'estimator': Pipeline(steps=[('cv', CountVectorizer()), ('nb', MultinomialNB())]),
 'n_jobs': None,
 'iid': 'deprecated',
 'refit': True,
 'cv': None,
 'verbose': 1,
 'pre_dispatch': '2*n_jobs',
 'error_score': nan,
 'return_train_score': False,
 'param_grid': {'cv__ngram_range': [(1, 3),
   (1, 4),
   (1, 5),
   (1, 6),
   (1, 7),
   (1, 8),
   (1, 9)],
  'cv__max_features': [8000, 9000, 10000, 11000],
  'nb__alpha': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5]}}

In [63]:
#view the results of the grid search
pd.DataFrame(clf.cv_results_)

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [65]:
y_test_true = simplified_testing.label
y_test_predictions = clf.predict(simplified_testing.tokens)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [54]:
from sklearn.metrics import classification_report

In [56]:
print(classification_report(y_test_true.astype('int'), y_test_predictions.astype('int')))

              precision    recall  f1-score   support

           0       0.64      0.80      0.71      1209
           1       0.89      0.78      0.83      2546

    accuracy                           0.79      3755
   macro avg       0.76      0.79      0.77      3755
weighted avg       0.81      0.79      0.79      3755

