# CS4248 Project - Labelled Unreliable News (LUN)

## Imports

In [1]:
import nltk
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import string
import gensim

from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob
from readability import Readability
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}

stop_words = set(stopwords.words('english'))
# stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stemmer = None
# lemmatizer = None

TEST_SIZE = 0.1
SMOOTHING = 1.0
NGRAM_RANGE = (1, 1)

## Helper Functions

In [3]:
def preprocess(sentence, lower_case=True, remove_punctuation=True, replace_contractions=True):
    if lower_case:
        sentence = sentence.lower()
    if remove_punctuation:
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    if replace_contractions:
        news = sentence.split()
        new_news = []
        for word in news:
            if word in contractions:
                new_news.append(contractions[word])
            else:
                new_news.append(word)
        sentence = " ".join(new_news)
    return sentence

In [4]:
def tokenize(sentence, stemmer=stemmer, lemmatizer=lemmatizer, remove_stop_words=False):
    tokens = word_tokenize(sentence)
    
    if remove_stop_words:
        tokens = [token for token in tokens if token not in stop_words]
    if stemmer:
        tokens = [stemmer.stem(token) for token in tokens]
    if lemmatizer:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        
    return tokens

## Load Data

In [5]:
full_train_df = pd.read_csv('raw_data/fulltrain.csv', header=None)
full_train_df.columns = ['label', 'text']
full_train_df.head()

Unnamed: 0,label,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [6]:
train_df = full_train_df.drop_duplicates(subset=['text'])
print(f"No. training samples (all classes): {len(train_df)}")

No. training samples (all classes): 48652


## Training

In [10]:
X = train_df['text'].values
y = train_df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, stratify=y, random_state=42)
X_train.shape, X_test.shape

((43786,), (4866,))

In [8]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=NGRAM_RANGE, smooth_idf=True, preprocessor=preprocess, tokenizer=tokenize, token_pattern=None)
# Uncomment for default TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer(ngram_range=NGRAM_RANGE, smooth_idf=True)

In [12]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Vocabulary size: {len(feature_names)}")
print(feature_names[:150])

Vocabulary size: 287057
['0' '00' '000' '0000' '000000' '00000000' '00000000000000'
 '0000000000000001834' '00000000001' '0000000001ounce' '000000003'
 '00000001' '0000000rom' '0000001' '0000004' '000001' '00000138'
 '00000153' '00000186' '000002' '000004' '000005' '000006' '0000068'
 '00001' '00001010' '000013s' '000015' '000016' '00003' '00004' '00005'
 '00007' '0000gmt' '0000hrs' '0001' '00010'
 '00010010101010101010101010101010' '00010052' '0001111' '000125' '00016'
 '00016925' '000185' '0001er' '0001ers' '0001erspredictable' '0001that'
 '0001which' '00020004' '0003' '00030' '00033' '00034' '0004' '00049'
 '0005' '00050860' '00053' '000567kln00067q' '0006' '0007' '00071'
 '0007149' '0008' '00080' '000fold' '000s' '000x' '001' '0010' '00100000'
 '00100001' '001000011011010101010101010101010101010010' '00101110' '0015'
 '0016' '0018' '0019' '001as' '001delivering' '001ers' '001essentially'
 '001mwcm2' '001s' '001same' '001that' '001unfuknbeeeleeevable' '002'
 '0021' '00226' '0025' '0

In [13]:
print(X_train_tfidf.shape)
print(X_train_tfidf)

(43786, 287057)
  (0, 19243)	0.06228037137564887
  (0, 17490)	0.047131398921413464
  (0, 21182)	0.06228037137564887
  (0, 213159)	0.022438264955753927
  (0, 48866)	0.043107901331845395
  (0, 29994)	0.03497361987001771
  (0, 54665)	0.017491060155421477
  (0, 56016)	0.016107396006457345
  (0, 34728)	0.048959900575351684
  (0, 6680)	0.018517857479427826
  (0, 630)	0.018009742697625838
  (0, 163447)	0.021167670007759626
  (0, 279676)	0.027837730905439256
  (0, 173365)	0.07770955198058324
  (0, 4836)	0.037897970811578245
  (0, 14640)	0.02061631588966255
  (0, 56536)	0.029259568056674932
  (0, 94314)	0.011674792652959566
  (0, 66111)	0.02906307540372125
  (0, 258015)	0.056877835687428116
  (0, 213503)	0.015154253465198171
  (0, 52731)	0.008114416114198593
  (0, 134090)	0.03645305696911107
  (0, 42247)	0.012222576232641578
  (0, 136050)	0.024303535980486433
  :	:
  (43785, 187968)	0.03801152260274759
  (43785, 103791)	0.02791798374294524
  (43785, 34081)	0.05821512825753399
  (43785, 151482)	

In [15]:
X_train_tfidf.shape, y_train.shape

((43786, 287057), (43786,))

## Logistic Regression

In [None]:
clf = LogisticRegression(random_state=0, max_iter=200).fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_train_tfidf)
print(f"Accuracy: {accuracy_score(y_train, y_pred)}")
print(f"F1 score: {f1_score(y_train, y_pred, average='macro')}")

### Testing

In [None]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = clf.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred, average='macro')}")

### Validation

In [None]:
test_df = pd.read_csv('raw_data/balancedtest.csv', header=None, names=['label', 'text'])
print(f"No. test samples (all classes): {len(test_df)}")
test_df.sample(5)

In [None]:
X_val = test_df['text'].values
y_val = test_df['label'].values
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_val_tfidf.shape

In [None]:
y_pred_val = clf.predict(X_val_tfidf)
print(f"Accuracy: {accuracy_score(y_val, y_pred_val)}")
print(f"F1 score: {f1_score(y_val, y_pred_val, average='macro')}")

In [None]:
print(classification_report(y_val, y_pred_val, target_names=['satire', 'hoax', 'propaganda', 'reliable']))

In [None]:
cm = confusion_matrix(y_val, y_pred_val)
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()	

## Naive Bayes

In [None]:
nb = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_train_tfidf)
print(f"Accuracy: {accuracy_score(y_train, y_pred)}")
print(f"F1 score: {f1_score(y_train, y_pred, average='macro')}")

### Testing

In [None]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = nb.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred, average='macro')}")

### Validation

In [None]:
y_pred_val = nb.predict(X_val_tfidf)
print(f"Accuracy: {accuracy_score(y_val, y_pred_val)}")
print(f"F1 score: {f1_score(y_val, y_pred_val, average='macro')}")

In [None]:
print(classification_report(y_val, y_pred_val, target_names=['satire', 'hoax', 'propaganda', 'reliable']))

In [None]:
cm = confusion_matrix(y_val, y_pred_val)
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()	

## Linear Support Vector Machine

In [None]:
sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=0, max_iter=5, tol=None).fit(X_train_tfidf, y_train)
y_pred = sgd.predict(X_train_tfidf)
print(f"Accuracy: {accuracy_score(y_train, y_pred)}")
print(f"F1 score: {f1_score(y_train, y_pred, average='macro')}")

### Testing

In [None]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = sgd.predict(X_test_tfidf)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred, average='macro')}")

### Validation

In [None]:
y_pred_val = nb.predict(X_val_tfidf)
print(f"Accuracy: {accuracy_score(y_val, y_pred_val)}")
print(f"F1 score: {f1_score(y_val, y_pred_val, average='macro')}")

In [None]:
print(classification_report(y_val, y_pred_val, target_names=['satire', 'hoax', 'propaganda', 'reliable']))

In [None]:
cm = confusion_matrix(y_val, y_pred_val)
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()	

## Random Forest

In [14]:
rfclf = RandomForestClassifier(n_estimators = 100, max_features = 'sqrt').fit(X_train_tfidf, y_train)
y_pred = rfclf.predict(X_train_tfidf)
print(f"Accuracy: {accuracy_score(y_train, y_pred)}")
print(f"F1 score: {f1_score(y_train, y_pred, average='macro')}")

Accuracy: 1.0
F1 score: 1.0


### Hyper-Paramter Tuning

In [15]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [17]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train_tfidf, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


ValueError: 
All the 300 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 331, in fit
    X, y = self._validate_data(
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: "These were the types of blunders expected in August, not September. Not when the games start to count. Faulty headsets. A holding penalty on third-and-short. Consecutive face- mask penalties, at 15 yards apiece. Lining up only 10 men on the field on defense. Being forced to call a timeout because the offense is lined up incorrectly. 'Winning is a result of doing a lot of things right,' Denver Broncos coach Josh McDaniels said Monday. 'Yesterday, we did some things right. We just didn't do enough of them right.' Add it up, and the litany of mistakes, mental miscues and miscommunication had a tangible effect in the Broncos' 24-17 loss Sunday at Jacksonville. Wasn't the four-game preseason meant to get those blunders out of the way? 'When you boil it down, that's really how you lose a game. It's the little things,' cornerback Champ Bailey said. Perhaps no gaffe was more glaring than when the Broncos' defense lined up only 10 players against the Jaguars' offense on a second-and-goal play midway through the third quarter. The play resulted in a touchdown that put Jacksonville ahead 14-7. 'We had a substitution issue,' McDaniels said. Here's how that mistake happened: - Inside linebacker Mario Haggan left the field after the first-down play (a run by Jaguars star Maurice Jones- Drew), and headed to the sideline to talk to trainer Steve Antonopulos. - To replace Haggan, reserve linebacker Joe Mays came on the field at the same time Denver coaches called in the nickel package, with five defensive backs and only two defensive linemen, which sent Kevin Vickerson and Jason Hunter off the field. - Nickel pass rusher Jarvis Moss had come on the field but ran off, perhaps in a miscommunication after seeing Mays in the huddle. No one replaced Moss. -And no one -- not linebacker D.J. Williams, the defensive player in radio communication with the coaches, not the coaches watching from the booth or the coaches on the sideline -- realized the mistake in time to call a timeout. The ball was snapped and Jaguars quarterback David Garrard, who felt no pass-rush pressure, found tight end Marcedes Lewis in the end zone for the TD. 'They changed personnel, we changed personnel. They changed it a little later on that specific play,' McDaniels said. 'We changed with them, but we had a guy come off and a guy that did not go on. That was simply that.' Denver's offense wasn't without miscues, either. Kyle Orton was forced to call a timeout in the fourth quarter when Richard Quinn lined up incorrectly in a two-tight end formation. The CBS camera crew then focused on McDaniels as he lit into Quinn, a second-round pick in the 2009 draft, on the sideline during the timeout. 'It was a miscommunication,' Quinn said. 'But we got it together, and everything was fine after that.' Other gaffes were the result of youth and inexperience, such as the holding penalty called against rookie offensive tackle Zane Beadles on a third-down play during the Broncos' opening possession. The penalty negated a first-down run by Knowshon Moreno. Orton was sacked on the next play, and the Broncos punted. 'It's learning the differences between the college and the pro game, and you can't necessarily take a guy down like that,' said Beadles, a second-round pick from Utah. 'It'll get called every time. It's a learning curve.' Bailey, a nine-time Pro Bowler in his seventh season with the Broncos, admitted that the number of mistakes, especially the mental errors, was frustrating. But, Bailey said, it's reassuring to know they're not fundamental flaws. 'It kind of keeps you optimistic because you know you can correct those,' Bailey said. 'We know we can.' "

--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 331, in fit
    X, y = self._validate_data(
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/Users/danelynn/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: "With hurricane season over it officially ends today it looks like Florida has skated again this year. The season produced nine named storms, three of which became hurricanes, but none that made landfall as hurricanes in the United States. Chris Landsea, science and operations officer at the National Hurricane Center in Miami, offers his assessment of the season. Why such a quiet year? I think the main reason is the El Nio that developed in the Pacific Ocean this year. We get an El Nio every four to six years and it changes the global weather pattern. Some of them are nice, and some of them aren't so nice. The one beneficial aspect is the reduction in Atlantic hurricanes. You tend to get more windshear that disrupts the storms. We've seen promising looking tropical waves but they would sputter or either not develop or barely develop at all. How long is this El Nio expected to last? It's not very predictable. Most last just a year. The question is whether it will stick around for the 2010 hurricane season. Most El Nios last for a year, but we have seen others span two hurricane seasons. It's usually the ones that start a little late that get very strong. It remains to be seen if this El Nio causes two quiet seasons in a row. Quiet, of course, can depend on where you live. Do you think our perception of a season focuses too much on hurricanes' impact on the U.S.? Yes. You could have a very busy year like last year, but the folks in Florida were very fortunate because we did not have any direct impacts. If it's an active year, your chances of getting hit do go up. If you speak of a specific county or city, your chances of getting hit are very small. But if it's a quiet year, and that one hurricane hits your city or county like Miami during the 1992 Hurricane Andrew then it was the worst hurricane season you've experienced in your life. What's a better measure of a season? Just counting the number of storms can be very misleading. We use something called 'accumulated cyclone energy' (a measurement of the total activity of each storm, and the seasonal total of all storms). Is this why the hurricane center's seasonal predictions now offer a possible range of storms instead of a specific number? We're never going to get it exactly right. To provide that kind of precision is a little misleading, at least in our opinion. We express it two ways: One is a range, so this year we said there were 4 to 6 storms expected. We also give the chances of a quiet, average or busy year. For this year, there was a 40 percent chance of a quiet, 50 percent chance of near average, and 10 percent chance of a busy year. So much has been made in recent years about global warming's impact on hurricanes. How does that figure in with this fluctuation in storms? Just like there were some who were misguided in saying the 2005 season (27 named storms) was a harbinger of global warming, there were folks on the other side who said the 2006 and 2007 season meant there was no indication of global warming. In general, Atlantic hurricane activity has been very busy since the mid '90s. That's not to say I don't think there's any global warming influence. I do think hurricanes are being impacted by global warming, but it's a really tiny, tiny component. Even with a Category 5 hurricane with 175 mph winds, maybe 1 to 2 mph could be ascribed to global warming. But if that's the case, it's so small we can't measure it. Andy Boyle can be reached at aboylesptimes.com or (727) 893-8087. "


[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   0.0s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=600; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   0.0s
[CV] END bootstrap=False, max_depth=30

[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=600; total time=   0.0s
[CV] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   0.0s
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   0.0s
[CV] END bootstrap=True, max_depth=80, 

[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   0.0s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=   0.0s
[CV] END bootstrap=False, max_depth

[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1800; total time=   0.0s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=   0.0s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1200; total time=   0.0s
[CV] END bootstrap=True, max_depth=

### Testing

In [None]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = rfclf.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred, average='macro')}")

### Validation

In [None]:
y_pred_val = rfclf.predict(X_val_tfidf)
print(f"Accuracy: {accuracy_score(y_val, y_pred_val)}")
print(f"F1 score: {f1_score(y_val, y_pred_val, average='macro')}")