In [51]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import random

In [52]:
RANDOM_SEED = 42
N_JOBS = -1

In [53]:
random.seed(RANDOM_SEED)

In [54]:
fake_df = pd.read_csv('data/Fake.csv')
true_df = pd.read_csv('data/True.csv')

In [55]:
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [56]:
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [57]:
fake_data = [(f'{fake_df.iloc[index]["title"]}. {fake_df.iloc[index]["text"]}', 1) for index in range(fake_df.shape[0])]

In [58]:
print(len(fake_data))
fake_data[:5]

23481


[(' Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing. Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like th

In [59]:
true_data = []
for index in range(true_df.shape[0]):
    title = true_df.iloc[index]['title']
    text = true_df.iloc[index]['text']
    if text.__contains__('(Reuters) - '):
        text = text[text.index('-') + 2:]
    true_data.append((f'{title}. {text}', 0))

In [60]:
print(len(true_data))
true_data[:5]

21417


[('As U.S. budget fight looms, Republicans flip their fiscal script. The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretion

In [61]:
data = true_data + fake_data
print(len(data))
data[:5]


44898


[('As U.S. budget fight looms, Republicans flip their fiscal script. The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretion

In [62]:
# reducing number of samples for code testing
# remove to train on whole data
true_data = true_data[:1000]
fake_data = fake_data[:1000]

In [63]:
def tokenize(data):
    return [
        (
            word_tokenize(sample[0]),
            sample[1]
        )
        for sample in data
    ]

In [64]:
true_data = tokenize(true_data)
fake_data = tokenize(fake_data)

In [65]:
def remove_stopwords(data):
    stop_words = set(stopwords.words('english'))
    return [
        (
            [word for word in sample[0] if word not in stop_words],
            sample[1]
        )
        for sample in data
    ]

In [66]:
true_data = remove_stopwords(true_data)
fake_data = remove_stopwords(fake_data)

In [67]:
def train_test_split(fake_data, true_data, split=0.1):
    fake_data = [(' '.join(sample[0]), sample[1]) for sample in fake_data]
    true_data = [(' '.join(sample[0]), sample[1]) for sample in true_data]

    random.seed(RANDOM_SEED)

    fake_test = random.sample(fake_data, int(len(fake_data) * split))
    fake_train = [sample for sample in fake_data if not fake_test.__contains__(sample)]
    true_test = random.sample(true_data, int(len(true_data) * split))
    true_train = [sample for sample in true_data if not true_test.__contains__(sample)]

    train_data = fake_train + true_train
    test_data = fake_test + true_test

    random.shuffle(train_data)
    random.shuffle(test_data)

    X_train = [sample[0] for sample in train_data]
    y_train = [sample[1] for sample in train_data]
    X_test = [sample[0] for sample in test_data]
    y_test = [sample[1] for sample in test_data]
    return X_train, y_train, X_test, y_test

In [68]:
X_train, y_train, X_test, y_test = train_test_split(fake_data, true_data)

In [69]:
feature_extractors = {
    'name': 'feature_extractor',
    'models': [
        {
            'model': HashingVectorizer(),
            'parameters': {
                'decode_error': ['strict', 'ignore', 'replace'],
                'strip_accents': ['ascii', 'unicode'],
                'analyzer': ['word', 'char', 'char_wb'],
                'ngram_range': [(1, 1), (1, 2), (2, 2)],
                'lowercase': [True, False],
                'binary': [True, False],
            }
        },
        {
            'model': CountVectorizer(),
            'parameters': {
                'decode_error': ['strict', 'ignore', 'replace'],
                'strip_accents': ['ascii', 'unicode'],
                'analyzer': ['word', 'char', 'char_wb'],
                'ngram_range': [(1, 1), (1, 2), (2, 2)],
                'lowercase': [True, False],
                'binary': [True, False],
            }
        },
        {
            'model': TfidfVectorizer(),
            'parameters': {
                'decode_error': ['strict', 'ignore', 'replace'],
                'strip_accents': ['ascii', 'unicode'],
                'analyzer': ['word', 'char', 'char_wb'],
                'ngram_range': [(1, 1), (1, 2), (2, 2)],
                'lowercase': [True, False],
                'binary': [True, False],
                'norm': ['l1', 'l2'],
                'use_idf': [True, False],
                'smooth_idf': [True, False],
                'sublinear_tf': [True, False],
            }
        }
    ]
}

In [70]:
models = {
    'name': 'classifier',
    'models': [
        {
            'model': MultinomialNB(),
            'parameters': {
                'alpha': np.linspace(0, 1, 4),
                'fit_prior': [True, False],
            }
        },
        {
            'model': LogisticRegression(),
            'parameters': {
                'fit_intercept': [True, False],
                'C': np.linspace(0.1, 1, 4),
                'class_weight': ['balanced', None],
                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                'multi_class': ['auto', 'ovr', 'multinomial'],
                'warm_start': [True, False],
                'n_jobs': [N_JOBS],
                'random_state': [RANDOM_SEED],
            }
        },
        {
            'model': RandomForestClassifier(),
            'parameters': {
                'n_estimators': [int(num) for num in np.linspace(50, 1000, 10)],
                'max_depth': [None] + [num for num in np.linspace(5, 200, 4)],
                'min_samples_split': [int(num) for num in np.linspace(2, 10, 4)],
                'min_samples_leaf': [int(num) for num in np.linspace(1, 10, 4)],
                'max_features': ['auto', 'sqrt', 'log2'],
                'bootstrap': [True, False],
                'warm_start': [True, False],
                'class_weight': ['balanced', 'balanced_subsample', None],
                'n_jobs': [N_JOBS],
                'random_state': [RANDOM_SEED],
            }
        },
        {
            'model': SVC(),
            'parameters': {
                'C': [0.1, 0.5, 1, 10, 50],
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
                'gamma': ['scale', 'auto'],
                'shrinking': [True, False],
                'probability': [True, False],
                'class_weight': ['balanced', None],
                'decision_function_shape': ['ovo', 'ovr'],
                'random_state': [RANDOM_SEED],
            }
        },
        {
            'model': KNeighborsClassifier(),
            'parameters': {
                'n_neighbors': [2],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                'p': [1, 2],
                'n_jobs': [N_JOBS],
            }
        },
    ]
}


In [71]:
def train_evaluate(extractor, classifier, X_train, y_train, X_test, y_test):
    def make_pipeline():
        return Pipeline([
            (feature_extractors['name'], extractor['model']),
            (models['name'], classifier['model'])
        ])

    def make_params():
        extractor_params = {}
        for key, value in extractor['parameters'].items():
            extractor_params[f'{feature_extractors["name"]}__{key}'] = value

        classifier_params = {}
        for key, value in classifier['parameters'].items():
            classifier_params[f'{models["name"]}__{key}'] = value

        return {**extractor_params, **classifier_params}

    def search_fit(pipeline, params):
        search_cv = RandomizedSearchCV(pipeline,
                                       params,
                                       n_iter=20,
                                       cv=3,
                                       random_state=RANDOM_SEED,
                                       n_jobs=N_JOBS)
        search_cv.fit(X_train, y_train)
        return search_cv

    def evaluate(search_cv):
        y_pred = search_cv.predict(X_test)
        return accuracy_score(y_test, y_pred)

    def make_results(search_cv, score):
        return {
            'best_extractor': search_cv.best_estimator_.steps[0][1],
            'model': search_cv.best_estimator_.steps[1][1],
            'parameters': search_cv.best_params_,
            'score': score
        }

    pipeline = make_pipeline()
    params = make_params()
    search_cv = search_fit(pipeline, params)
    score = evaluate(search_cv)
    return make_results(search_cv, score)

In [72]:
# for testing
# train_evaluate(feature_extractors['models'][0], models['models'][1], X_train, y_train, X_test, y_test)

In [73]:
results = []

for extractor in feature_extractors['models']:
    for classifier in models['models']:
        result = train_evaluate(extractor, classifier, X_train, y_train, X_test, y_test)
        print(result)
        results.append(result)

 0.98443702        nan 0.75652105        nan        nan 0.7882072
        nan        nan        nan 0.96330922 0.96720182        nan
        nan        nan]
 0.982775   0.99722129        nan 0.96943331 0.95498331 0.97999629
 0.98388703 0.99666574 0.80656093 0.96776479 0.99833148 0.96554535
 0.96664255        nan]
  warn('class_weight presets "balanced" or '
Traceback (most recent call last):
  File "/Users/daniyarkurmanbayev/Documents/GBC/mlenv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/daniyarkurmanbayev/Documents/GBC/mlenv/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/daniyarkurmanbayev/Documents/GBC/mlenv/lib/python3.8/site-packages/sklearn/naive_bayes.py", line 638, in fit
    self._count(X, Y)
  File "/Users/daniyarkurmanbayev/Documents/GBC/mlenv/lib/python3.8/site-pac

{'best_extractor': HashingVectorizer(binary=True, decode_error='ignore', lowercase=False,
                  ngram_range=(2, 2), strip_accents='unicode'), 'model': MultinomialNB(alpha=0.0), 'parameters': {'feature_extractor__strip_accents': 'unicode', 'feature_extractor__ngram_range': (2, 2), 'feature_extractor__lowercase': False, 'feature_extractor__decode_error': 'ignore', 'feature_extractor__binary': True, 'feature_extractor__analyzer': 'word', 'classifier__fit_prior': True, 'classifier__alpha': 0.0}, 'score': 0.98}
{'best_extractor': HashingVectorizer(analyzer='char', binary=True, decode_error='replace',
                  ngram_range=(1, 2), strip_accents='unicode'), 'model': LogisticRegression(C=0.7, class_weight='balanced', fit_intercept=False,
                   multi_class='multinomial', n_jobs=-1, random_state=42,
                   solver='newton-cg'), 'parameters': {'feature_extractor__strip_accents': 'unicode', 'feature_extractor__ngram_range': (1, 2), 'feature_extractor__lo

In [74]:
results = sorted(results, key=lambda x: x.get('score'), reverse=True)

In [75]:
for result in results:
    print(f"""Vectorizer: {result.get('best_extractor').__class__.__name__}
Classifier: {result.get('model').__class__.__name__}
Score: {result.get('score')}
""")


Vectorizer: HashingVectorizer
Classifier: LogisticRegression
Score: 1.0

Vectorizer: HashingVectorizer
Classifier: RandomForestClassifier
Score: 1.0

Vectorizer: HashingVectorizer
Classifier: SVC
Score: 1.0

Vectorizer: CountVectorizer
Classifier: LogisticRegression
Score: 1.0

Vectorizer: CountVectorizer
Classifier: RandomForestClassifier
Score: 1.0

Vectorizer: CountVectorizer
Classifier: SVC
Score: 1.0

Vectorizer: TfidfVectorizer
Classifier: LogisticRegression
Score: 1.0

Vectorizer: TfidfVectorizer
Classifier: RandomForestClassifier
Score: 1.0

Vectorizer: TfidfVectorizer
Classifier: SVC
Score: 1.0

Vectorizer: CountVectorizer
Classifier: MultinomialNB
Score: 0.995

Vectorizer: CountVectorizer
Classifier: KNeighborsClassifier
Score: 0.99

Vectorizer: TfidfVectorizer
Classifier: MultinomialNB
Score: 0.99

Vectorizer: HashingVectorizer
Classifier: MultinomialNB
Score: 0.98

Vectorizer: TfidfVectorizer
Classifier: KNeighborsClassifier
Score: 0.975

Vectorizer: HashingVectorizer
Class