In [49]:
import spacy
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.inspection import DecisionBoundaryDisplay

from FakeNews.Tokenizer import Tokenizer
from FakeNews.Data import Data
from FakeNews.Cleaner import Cleaner
from FakeNews.Filter import Filter
from FakeNews.Lemmatizer import Lemmatizer
from FakeNews.Predictor import Predictor
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as tts
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF

from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
cleaner = Cleaner()
pp = Pipeline([('tokenizing', Tokenizer()),
              ('filtering', Filter()),
              ('lemmatizing', Lemmatizer())])


In [33]:
pipelines = {}

In [46]:
pipelines['svc'] = Pipeline([('vectorizing', ColumnTransformer([('title', TfidfVectorizer(), 0),
                                                   ('text', TfidfVectorizer(), 1)])),
                ('pca', PCA()),
                ('svc', SVC())])

In [374]:
pipelines['svc_nmf'] = Pipeline([('vectorizing', ColumnTransformer([('title', TfidfVectorizer(), 0),
                                                    ('text', TfidfVectorizer(), 1)])),
                 ('nmf', PCA()),
                 ('svc', SVC())])

In [202]:
pipelines['kmeans'] = Pipeline([('vectorizing', ColumnTransformer([('title', TfidfVectorizer(), 0),
                                                      ('text', TfidfVectorizer(), 1)])),
                   ('pca', PCA()),
                   ('kmeans', KMeans(n_clusters=2, random_state=27)),
                   ('predicting', Predictor())])

In [221]:
pipelines['kmeans_nmf'] = Pipeline([('vectorizing', ColumnTransformer([('title', TfidfVectorizer(), 0),
                                                                   ('text', TfidfVectorizer(), 1)])),
                                ('nmf', NMF()),
                                ('kmeans', KMeans(n_clusters=2, random_state=27)),
                                ('predicting', Predictor())])

In [9]:
data = Data()
data.load()
cleaner.fit(data.X)
data.X = cleaner.transform(data.X)
data.y = cleaner.transform(data.y)

Cleaning...
Cleaning...


In [100]:
data.X

array([['As U.S. budget fight looms, Republicans flip their fiscal script',
        'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportiona

In [10]:
X_train, X_test, y_train, y_test = tts(data.X,
                                       data.y,
                                       train_size = .25,
                                       test_size = .25,
                                       random_state = 42)

In [385]:
X_train50, X_test50, y_train50, y_test50 = tts(data.X,
                                               data.y,
                                               train_size = .5,
                                               test_size = .5,
                                               random_state = 42)

In [11]:
X_train.shape

(9510, 4)

In [12]:
X_train = pp.fit_transform(X_train)

Tokenizing ...
Filtering...
Lemmatizing...


In [82]:
param_grids = {key: {} for key in pipelines.keys()}

In [152]:
gcvs = {key: None for key in pipelines.keys()}

In [475]:
param_grids['svc'] = {}
param_grids['svc']['vectorizing__title__max_features'] = [175]
param_grids['svc']['vectorizing__text__max_features'] = [150]
param_grids['svc']['vectorizing__title__ngram_range'] = [(1,2)]#[(1,3),(1, 2), (1,1)]
param_grids['svc']['vectorizing__text__ngram_range'] = [(1,1)]#[(1,1),(1, 2), (1,3)]
param_grids['svc']['vectorizing__text__min_df'] = [1]#[1,2,3]#[2,3,4]
param_grids['svc']['vectorizing__title__min_df'] = [1]#[1,2,3]#[2,3,4]
param_grids['svc']['vectorizing__title__max_df'] =[.99]#[.99, .95]
param_grids['svc']['vectorizing__text__max_df'] = [.99]#[.99, .95]
param_grids['svc']['svc__kernel'] = ['linear']#['linear', 'rbf']
param_grids['svc']['svc__C'] = [6]#[5,6,7]

In [476]:

gcvs['svc'] = GridSearchCV(pipelines['svc'],
                           param_grid=param_grids['svc'],
                           cv = 3, n_jobs = -1,
                           verbose = 100).fit(X_train50, y_train50)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [477]:
gcvs['svc'].best_score_

np.float64(0.9923238696109359)

In [478]:
gcvs['svc'].score(X_test50, y_test50)

0.9919558359621451

In [479]:
gcvs['svc'].best_params_

{'svc__C': 6,
 'svc__kernel': 'linear',
 'vectorizing__text__max_df': 0.99,
 'vectorizing__text__max_features': 150,
 'vectorizing__text__min_df': 1,
 'vectorizing__text__ngram_range': (1, 1),
 'vectorizing__title__max_df': 0.99,
 'vectorizing__title__max_features': 175,
 'vectorizing__title__min_df': 1,
 'vectorizing__title__ngram_range': (1, 2)}

In [486]:

param_grids['svc_nmf'] = {}
param_grids['svc_nmf']['vectorizing__title__max_features'] = [175]
param_grids['svc_nmf']['vectorizing__text__max_features'] = [150]
param_grids['svc_nmf']['vectorizing__title__ngram_range'] = [(1,2)]#[(1,3),(1, 2), (1,1)]
param_grids['svc_nmf']['vectorizing__text__ngram_range'] = [(1,1)]#[(1,1),(1, 2), (1,3)]
param_grids['svc_nmf']['vectorizing__text__min_df'] = [1]#[1,2,3]#[2,3,4]
param_grids['svc_nmf']['vectorizing__title__min_df'] = [1]#[1,2,3]#[2,3,4]
param_grids['svc_nmf']['vectorizing__title__max_df'] =[.99]#[.99, .95]
param_grids['svc_nmf']['vectorizing__text__max_df'] = [.99]#[.99, .95]
param_grids['svc_nmf']['nmf__n_components'] = [None,80,160,200]
param_grids['svc_nmf']['svc__kernel'] = ['linear']#['linear', 'rbf']
param_grids['svc_nmf']['svc__C'] = [6]#[5,6,7]



In [487]:


gcvs['svc_nmf'] = GridSearchCV(pipelines['svc_nmf'],
                           param_grid=param_grids['svc_nmf'],
                           cv = 3, n_jobs = -1,
                           verbose = 100).fit(X_train50, y_train50)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [488]:
gcvs['svc_nmf'].best_score_

np.float64(0.9930073606729758)

In [489]:
gcvs['svc_nmf'].score(X_test50, y_test50)

0.9919558359621451

In [490]:
gcvs['svc_nmf'].best_params_

{'nmf__n_components': None,
 'svc__C': 6,
 'svc__kernel': 'linear',
 'vectorizing__text__max_df': 0.99,
 'vectorizing__text__max_features': 150,
 'vectorizing__text__min_df': 1,
 'vectorizing__text__ngram_range': (1, 1),
 'vectorizing__title__max_df': 0.99,
 'vectorizing__title__max_features': 175,
 'vectorizing__title__min_df': 1,
 'vectorizing__title__ngram_range': (1, 2)}

In [617]:
param_grids['kmeans'] = {}
param_grids['kmeans']['vectorizing__title__max_features'] = [800]#[800, 1600]#[28,30, 32]
param_grids['kmeans']['vectorizing__text__max_features'] = [2000]#[2,3,4]
param_grids['kmeans']['vectorizing__title__ngram_range'] = [(1,2)]#[(1,1), (1,2), (1,3)]
param_grids['kmeans']['vectorizing__text__ngram_range'] = [(1,3)]#[(1,4), (1,3)]
param_grids['kmeans']['vectorizing__text__min_df'] = [1]#[1,2]
param_grids['kmeans']['vectorizing__title__min_df'] = [4]#[5,4,3,2]
param_grids['kmeans']['vectorizing__text__max_df'] = [.99]#[.99, .98]
param_grids['kmeans']['vectorizing__title__max_df'] = [.99]#[.99, .98]#[5,4,3,2]
param_grids['kmeans']['pca__whiten'] = [False]
param_grids['kmeans']['pca__n_components'] = [450]#[50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
param_grids['kmeans']['kmeans__n_init'] = [1]#[1,2,4]


In [618]:
gcvs['kmeans'] = GridSearchCV(pipelines['kmeans'],
                              param_grids['kmeans'],
                              cv = 3, n_jobs = -1,
                              verbose = 100).fit(X_train50, y_train50)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [619]:
gcvs['kmeans'].best_score_

np.float64(0.9013669821240798)

In [620]:
gcvs['kmeans'].score(X_test50, y_test50)

Predicting...




0.896372239747634

In [621]:
gcvs['kmeans'].best_params_

{'kmeans__n_init': 1,
 'pca__n_components': 450,
 'pca__whiten': False,
 'vectorizing__text__max_df': 0.99,
 'vectorizing__text__max_features': 2000,
 'vectorizing__text__min_df': 1,
 'vectorizing__text__ngram_range': (1, 3),
 'vectorizing__title__max_df': 0.99,
 'vectorizing__title__max_features': 800,
 'vectorizing__title__min_df': 4,
 'vectorizing__title__ngram_range': (1, 2)}

In [830]:
param_grids['kmeans_nmf'] = {}
param_grids['kmeans_nmf']['vectorizing__title__max_features'] = [500]#[400,500,600]#[300,400,500]#[800, 1600]#[28,30, 32]
param_grids['kmeans_nmf']['vectorizing__text__max_features'] = [600]#[600, 700]#[550, 600, 650]#[2,3,4]
param_grids['kmeans_nmf']['vectorizing__title__ngram_range'] = [(1,1)]#[(1,1), (1,2), (1,3)]
param_grids['kmeans_nmf']['vectorizing__text__ngram_range'] = [(1,2)]#[(1,3), (1,2)]#[(1,4), (1,3)]
param_grids['kmeans_nmf']['vectorizing__text__min_df'] = [1]#[1,2]
param_grids['kmeans_nmf']['vectorizing__title__min_df'] = [1]#[5,4,3,2]
param_grids['kmeans_nmf']['vectorizing__text__max_df'] = [.99]#[.99, .98]
param_grids['kmeans_nmf']['vectorizing__title__max_df'] = [.99]#[.99, .98]#[5,4,3,2]
param_grids['kmeans_nmf']['nmf__max_iter'] = [1000]
param_grids['kmeans_nmf']['nmf__n_components'] = [10]#[2,4,8,10,12,14,16,18]#[10,11]#[11,12,13,14,15]#[50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
param_grids['kmeans_nmf']['nmf__random_state'] = [27]
param_grids['kmeans_nmf']['nmf__init'] = ['nndsvda']#['nndsvd', 'nndsvda']
param_grids['kmeans_nmf']['nmf__solver'] = ['cd']#['mu', 'cd']
param_grids['kmeans_nmf']['kmeans__random_state'] = [27]
param_grids['kmeans_nmf']['kmeans__n_init'] = [1]#[1,2,4]


In [831]:
gcvs['kmeans_nmf'] = GridSearchCV(pipelines['kmeans_nmf'],
                              param_grids['kmeans_nmf'],
                              cv = 5, n_jobs = -1,
                              verbose = 100).fit(X_train50, y_train50)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [832]:
gcvs['kmeans_nmf'].best_score_

np.float64(0.9180336487907466)

In [833]:
gcvs['kmeans_nmf'].score(X_test50, y_test50)

Predicting...




0.9086225026288117

In [834]:
gcvs['kmeans_nmf'].best_params_

{'kmeans__n_init': 1,
 'kmeans__random_state': 27,
 'nmf__init': 'nndsvda',
 'nmf__max_iter': 1000,
 'nmf__n_components': 10,
 'nmf__random_state': 27,
 'nmf__solver': 'cd',
 'vectorizing__text__max_df': 0.99,
 'vectorizing__text__max_features': 600,
 'vectorizing__text__min_df': 1,
 'vectorizing__text__ngram_range': (1, 2),
 'vectorizing__title__max_df': 0.99,
 'vectorizing__title__max_features': 500,
 'vectorizing__title__min_df': 1,
 'vectorizing__title__ngram_range': (1, 1)}