In [None]:
import spacy
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.inspection import DecisionBoundaryDisplay

from FakeNews.Tokenizer import Tokenizer
from FakeNews.Data import Data
from FakeNews.Cleaner import Cleaner
from FakeNews.Filter import Filter
from FakeNews.Lemmatizer import Lemmatizer
from FakeNews.Predictor import Predictor
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as tts
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF

from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
cleaner = Cleaner()
pp = Pipeline([('tokenizing', Tokenizer()),
              ('filtering', Filter()),
              ('lemmatizing', Lemmatizer())])


In [33]:
pipelines = {}

In [46]:
pipelines['svc'] = Pipeline([('vectorizing', ColumnTransformer([('title', TfidfVectorizer(), 0),
                                                   ('text', TfidfVectorizer(), 1)])),
                ('pca', PCA()),
                ('svc', SVC())])

In [374]:
pipelines['svc_nmf'] = Pipeline([('vectorizing', ColumnTransformer([('title', TfidfVectorizer(), 0),
                                                    ('text', TfidfVectorizer(), 1)])),
                 ('nmf', PCA()),
                 ('svc', SVC())])

In [202]:
pipelines['kmeans'] = Pipeline([('vectorizing', ColumnTransformer([('title', TfidfVectorizer(), 0),
                                                      ('text', TfidfVectorizer(), 1)])),
                   ('pca', PCA()),
                   ('kmeans', KMeans(n_clusters=2, random_state=27)),
                   ('predicting', Predictor())])

In [221]:
pipelines['kmeans_nmf'] = Pipeline([('vectorizing', ColumnTransformer([('title', TfidfVectorizer(), 0),
                                                                   ('text', TfidfVectorizer(), 1)])),
                                ('nmf', NMF()),
                                ('kmeans', KMeans(n_clusters=2, random_state=27)),
                                ('predicting', Predictor())])

In [9]:
data = Data()
data.load()
cleaner.fit(data.X)
data.X = cleaner.transform(data.X)
data.y = cleaner.transform(data.y)

Cleaning...
Cleaning...


In [100]:
data.X

array([['As U.S. budget fight looms, Republicans flip their fiscal script',
        'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportiona

In [10]:
X_train, X_test, y_train, y_test = tts(data.X,
                                       data.y,
                                       train_size = .25,
                                       test_size = .25,
                                       random_state = 42)

In [385]:
X_train50, X_test50, y_train50, y_test50 = tts(data.X,
                                               data.y,
                                               train_size = .5,
                                               test_size = .5,
                                               random_state = 42)

In [841]:
X_train

Unnamed: 0,0,1,2,3
0,Italy rescues more_than_250 migrants Mediterranean,ROME Reuters More_than_250 migrants rescued central Mediterranean the_night Monday Tuesday Italy_s_Coast_Guard said statement said migrants large rubber dinghy small boats rescued missions ships non governmental organization Migrant arrivals Italy fallen two-thirds_year year July officials working U.N.-backed government Tripoli pressure people smugglers Libyan city Sabratha stop boats leaving Italy bolstering Libyan coast guard s ability turn boats Last_week the_United_Nations began bringing African refugees Italy Libya evacuating detention centers conditions condemned rights groups inhumane,worldnews,"December_26,_2017"
1,RACIST RANT Supreme_Court Justice Exposes Slanted Personal Opinion Law Enforcement,Supreme_Court_Justic_Sotomayor went racist rant law enforcement targeted minorities fear black brown people secret people color disproportionate victims type scrutiny s weighed racist manner hope wise Latina woman richness experiences reach better conclusion white male hasn t lived life Supreme_Court Justice SotomayorSotomayor brings racist view court clouding decisions Americans Supreme_Court Justice Sonia_Sotomayor Monday issued vehement dissent Fourth_Amendment case writing majority s opinion sanctions police stops corrode civil liberties threaten lives fiery objection came case Utah man challenged arrest based stop later found unlawful 5 3 majority opinion Sotomayor wrote dramatic ramifications law abiding citizens targeted police especially minorities secret people color disproportionate victims type scrutiny wrote generations black brown parents given children talk instructing run street hands seen think talking stranger fear officer gun react legitimizing conduct produces double consciousness case tells white black guilty innocent officer verify legal status time added says body subject invasion courts excuse violation rights implies citizen democracy subject carceral state waiting cataloged major decisions the_next_week including cases affirmative action abortion immigration Sotomayor s anger signals quiet term death Justice Antonin_Scalia increasingly contentious born experience inherent physiological cultural differences said jurists women nonwhite gender national origins difference judging CNN,politics,"Jun 21,_2016"
2,EPIC COMMIE_OBAMA PICTURED Vietnam President,Wow commie putz lifting arms restrictions Vietnam Obama gleefully gets photo right bust Ho_Chi_Minh s rich symbolism right,politics,"May_23,_2016"
3,STATE INCLUDE TRANSGENDER CURRICULUM PUBLIC SCHOOLS genders gender gender,s compassionate convincing young boys girls gender God assigned birth means absolutely fact government funded surgeries try sex think identify want contact the_School_Board live Fairfax_County VA let know feel tax dollars radical indoctrination children nation s largest public school systems preparing include gender identity classroom curriculum including lessons sexual fluidity spectrum idea s thing 100 percent boys 100_percent girls Fairfax County Public Schools released report recommending changes family life curriculum grades 7 12 changes critics radical gender ideology formally introduced next_week larger picture attack nature created order said Peter_Sprigg the_Family_Research_Council Human beings created male female current transgender ideology goes way telling genders gender gender supposed affirm plan calls teaching seventh graders transgenderism tenth graders concept sexuality broader spectrum sure smells like unadulterated sex indoctrination load kids going learning middle school Students provided definitions sexual orientation terms heterosexuality homosexuality bisexuality gender identity term transgender district s recommendations state Emphasis placed recognizing experiencing changes role respectful inclusive language promoting environment free bias discrimination Eighth graders taught individual identity occurs lifetime includes component sexual orientation gender identity Individual identity described having parts biological gender gender identity includes transgender gender role sexual orientation includes heterosexual bisexual homosexual district introduce young teenagers concept sexuality broader spectrum tenth grade taught s sexuality develops lifetime Emphasis placed understanding broader boundless fluid spectrum sexuality developed lifetime document states Sexual orientation gender identity terms discussed focus appreciation individual differences imagine parents freaking Parents need protect kids assault said Andrea_Lafferty president Traditional_Values_Coalition imagine place today Last_week school board voted include gender identity district s nondiscrimination policy decision strongly opposed parents Lafferty led opposition nondiscrimination policy warned district moving deconstruction gender end deconstruction gender absolutely told majority people pushing saying clearly motivation School_Board spokesman John_Torre told the_Washington_Times proposed curriculum changes last_week s vote allow boys identity girls use bathrooms locker rooms choice believe purely coincidental matters worse Lafferty contends parents able opt children classes lessons mandatory health curriculum Torre told parents able opt classes including sexual orientation gender identity lessons forthright information Lafferty said telling people truth bullying parents intimidating threatening confess m bit old school sex education believe God created male female reading Bible indicate dozens options m open learning new things asked school district provide textbooks scientific data instruct children dozens dozens possible genders s reply received Torre Lessons developed proposed lesson objectives stated need develop lessons proposed objectives implemented fall_2016 words don t clue the_Family_Research_Council s Sprigg said s pretty good reason t produce textbook fluidity s ideological concept told s scientific warned Fairfax_County s planned curriculum harmful students s going create confusion minds young people don t need confusion sexual identity said board introduce changes May_21 Lafferty said hopes parents turn force voice objections FOX_News,politics,"May_18,_2015"
4,Mexico counter negative tone U.S. presidential race,MEXICO CITY Reuters Mexico government planning diplomatic push counter aggressive rhetoric nationals U.S. election race senior official said Tuesday calls Donald_Trump massive wall divide nations Republican presidential candidates Marco_Rubio Ted_Cruz proposed building wall U.S.-Mexico border Trump taken hardest line the_United_States southern neighbor Republican runner labeled Mexican migrants drug runners rapists proposed mass deportations undocumented immigrants said Mexico killing the_United_States cheap labor Francisco_Guzman Mexican President Enrique_Pena Nieto chief staff Tuesday said time government push negative images Mexico plans use extensive network U.S. consulates highlight benefits U.S.-Mexican relations U.S. economy American people Guzman said initiative include forums U.S. business people politicians public figures Mexico opinions heard Mexico positioning U.S. election debate generally adequate Guzman told group foreign correspondents reflect constructive relationship Mexico U.S. Trump rolled series primary wins Tuesday looked set command Republican nomination battle Mexican government till avoided direct confrontation billionaire real estate developer country diplomat called policies comments ignorant racist Mexican presidents Vicente_Fox Felipe_Calderon publicly condemned Trump comparing Adolf_Hitler going involve U.S. presidential debate want polarize want positive institutional relationship past Guzman said U.S.-Mexico relationship requires bridges fewer walls isolationist solution solution added The_United_States Mexico trade partner destination about_80_percent exports addition sharing cultural family links promote stronger ties Mexico government plans meet final contenders the_White_House major parties selected candidates Guzman said candidate wants project forward opinion relationship Mexico threat opportunity said Writing David_Alire_Garcia Editing Simon_Gardner Andrew_Hay article funded SAP independently created Reuters editorial staff SAP editorial involvement creation production,politicsNews,"March_2,_2016"
...,...,...,...,...
9505,Supreme_Court sympathetic property owner wetlands dispute,"WASHINGTON Reuters The_U.S._Supreme_Court Wednesday appeared likely rule property owners challenge federal government court need permits national water protection law case involving company plans Minnesota peat court heard one-hour argument case balancing property rights environmental law instance landmark 1972 U.S._Clean_Water_Act majority justices appeared sympathetic North_Dakota based Hawkes_Co_Inc fighting Obama administration finding property includes wetlands law mandates property owners permits situations particular plot land falls law jurisdiction important developers property owners finding triggers lengthy expensive permitting process Hawkes lawyers argued company able contest needs permit process Liberal conservative justices alike expressed concern current arrangement burden property owners Conservative Chief Justice John_Roberts said applicants disregard government finding need permit great practical risk Liberal Ruth_Bader_Ginsburg called process arduous expensive Liberal Stephen_Breyer called government decision Hawkes needed permit perfectly suited review courts liberal Elena_Kagan expressed support government raising concerns impact ruling favoring property owners actions government agencies the_Securities_and_Exchange_Commission Property rights advocates said permitting process two_years cost up_to_$270,000 owners facing penalties up_to_$37,500 day noncompliance Business groups including the_National_Association_of_Home_Builders the_U.S. Chamber Commerce 29 states filed court papers opposing Obama administration case case follows justices unanimous 2012 ruling property owners facing enforcement action the_Clean_Water_Act ask court intervene forced comply pay financial penalties Obama administration last_year issued new regulation defining scope federal jurisdiction bodies water federal appeals court rule hold challenged 18 states Only_eight justices participated case following Justice Antonin_Scalia February death ruling the_end_of_June",politicsNews,"March_30,_2016"
9506,HILLARY THUGS_Spray_Paint_20 Cars Outside Trump Rally Video,attendees left Republican presidential nominee Donald_Trump s Saturday rally Bangor find parked cars vandalized white spray paint Bangor police spokesman said officers seeking witnesses reviewing video criminal mischief case New_York_City billionaire s speech blasting rigged system the_Cross_Insurance_Center More_than_20 cars parked the_Bangor_Raceway Buck_Street hit lines white paint Attendees said owners cars appeared drive noticing vandalism Paul_Foster painter Trump supporter Eastbrook van painted said rally couldn t peaceful blamed vandalism Trump opponents saying ain t thinking know Trump supporters Bangor > > leave rally find cars spray painted pic.twitter.com/jW5UzkdkoB BasketOfDeplorables @betioserrano October_15 2016Read GP,politics,"Oct 15,_2016"
9507,Pennsylvania mayors charged pay play corruption case,"PHILADELPHIA Reuters mayor Allentown largest city Pennsylvania latest officials charged long running federal corruption investigation according indictments unsealed Wednesday Edwin_Pawlowski Democrat running fourth term people accused bribery extortion wire fraud charges indictments included total 73 counts Seven pleaded guilty probe Wednesday new defendants include Vaughn_Spencer 70 Democrat previously served black mayor Reading Pennsylvania Pawlowski Spencer essentially sale sign City_Hall sold highest bidder said Louis_Lappen acting U.S. attorney Philadelphia news conference Wednesday news conference Pawlowski called accusation fiction vowed running reelection want clear wrong accepted dime salary said way shape form City_Hall sale Pawlowski run unsuccessfully governor U.S._Senate accused accepting more_than_$150,000 contributions campaigns vendors understanding receive city contracts exchange mayor sought cover scheme deleting emails instructing campaign aides sweeping office listening devices installed law enforcement indictment said Spencer charged similar scheme allegedly directed contract donor engineering firm accused agreeing bribe city council president exchange repealing campaign finance limits lawyer Spencer Geoffrey_Johnson said intends vigorously defend charges declined comment saying reviewing indictment separate schemes outlined indictments included players James_Hickey business consultant named indictments Michael_Fleck previously pleaded guilty served campaign manager mayors lawyer Hickey immediately respond request comment Charges Pawlowski expected Earlier_this_year Allentown managing director pleaded guilty investigation implicated Pawlowski $3_million bid rigging scheme benefit campaign donor",politicsNews,"July_26,_2017"
9508,Congress split privatizing air traffic control,"WASHINGTON Reuters The_U.S._Congress divided privatize nation air traffic control system chambers advance bills expand airline passenger protections Thursday the_Senate_Commerce, Science_and_Transportation_Committee adopted legislation includes Federal_Aviation_Administration reforms unlike House panel spin air traffic control leaves FAA support said Senator Bill_Nelson Democrat panel The_U.S._House_Transportation_and_Infrastructure_Committee approved privatization plan Tuesday putting air traffic control oversight nonprofit corporation President Donald_Trump said modernize air traffic control lower flying costs proposal drawn fire private plane owners rural airports Critics hand control key asset special interests big airlines American_Airlines_Group_Inc United_Airlines Southwest_Airlines_Co_and_JetBlue_Airways_Corp proposal Congress Sept._30 reauthorize FAA Republican Senator John_Thune chairman committee said Senate proposal August recess potentially leaving little time come compromise measure Senate bill require new rules prohibiting cancellation baggage seat selection day change fees reasonable proportional Airlines America airline trade group said result government mandated price controls rejected 2016 U.S. airlines collected $7.1_billion baggage reservation change fees measures illegal airline bump boarded passenger flight April United passenger forcibly removed seat prompting public outrage airline banned practice Senate bill requires new rules mandate airlines promptly refund passengers baggage fees fees receive services House bill require FAA set minimum seat sizes U.S. airlines minimum distance rows protect safety health airline passengers average distance rows seats dropped 35_inches the_1970s about_31_inches today supporters average width airline seats shrunk 18_inches about_16 1/2 inches contentious issue training requirements pilots certain simulated training_hours counted Thune said two-thirds U.S. airports served regional carriers crisis trying attract pilots Democrats said revised rules lead unqualified pilots bills seek speed approval commercial drone use testing studying privacy implications measures require medium- large sized airports provide clean private rooms terminals nursing mothers enshrine ban making flight mobile phone calls law Senate bill direct study airplane air quality require study airlines shrinking airplane bathrooms add seats",politicsNews,"June_29,_2017"


In [12]:
X_train = pp.fit_transform(X_train)

Tokenizing ...
Filtering...
Lemmatizing...


In [None]:

X_train50 = pp.fit_transform(X_train50)


In [82]:
param_grids = {key: {} for key in pipelines.keys()}

In [152]:
gcvs = {key: None for key in pipelines.keys()}

In [475]:
param_grids['svc'] = {}
param_grids['svc']['vectorizing__title__max_features'] = [175]
param_grids['svc']['vectorizing__text__max_features'] = [150]
param_grids['svc']['vectorizing__title__ngram_range'] = [(1,2)]#[(1,3),(1, 2), (1,1)]
param_grids['svc']['vectorizing__text__ngram_range'] = [(1,1)]#[(1,1),(1, 2), (1,3)]
param_grids['svc']['vectorizing__text__min_df'] = [1]#[1,2,3]#[2,3,4]
param_grids['svc']['vectorizing__title__min_df'] = [1]#[1,2,3]#[2,3,4]
param_grids['svc']['vectorizing__title__max_df'] =[.99]#[.99, .95]
param_grids['svc']['vectorizing__text__max_df'] = [.99]#[.99, .95]
param_grids['svc']['svc__kernel'] = ['linear']#['linear', 'rbf']
param_grids['svc']['svc__C'] = [6]#[5,6,7]

In [476]:

gcvs['svc'] = GridSearchCV(pipelines['svc'],
                           param_grid=param_grids['svc'],
                           cv = 3, n_jobs = -1,
                           verbose = 100).fit(X_train50, y_train50)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [477]:
gcvs['svc'].best_score_

np.float64(0.9923238696109359)

In [478]:
gcvs['svc'].score(X_test50, y_test50)

0.9919558359621451

In [479]:
gcvs['svc'].best_params_

{'svc__C': 6,
 'svc__kernel': 'linear',
 'vectorizing__text__max_df': 0.99,
 'vectorizing__text__max_features': 150,
 'vectorizing__text__min_df': 1,
 'vectorizing__text__ngram_range': (1, 1),
 'vectorizing__title__max_df': 0.99,
 'vectorizing__title__max_features': 175,
 'vectorizing__title__min_df': 1,
 'vectorizing__title__ngram_range': (1, 2)}

In [486]:

param_grids['svc_nmf'] = {}
param_grids['svc_nmf']['vectorizing__title__max_features'] = [175]
param_grids['svc_nmf']['vectorizing__text__max_features'] = [150]
param_grids['svc_nmf']['vectorizing__title__ngram_range'] = [(1,2)]#[(1,3),(1, 2), (1,1)]
param_grids['svc_nmf']['vectorizing__text__ngram_range'] = [(1,1)]#[(1,1),(1, 2), (1,3)]
param_grids['svc_nmf']['vectorizing__text__min_df'] = [1]#[1,2,3]#[2,3,4]
param_grids['svc_nmf']['vectorizing__title__min_df'] = [1]#[1,2,3]#[2,3,4]
param_grids['svc_nmf']['vectorizing__title__max_df'] =[.99]#[.99, .95]
param_grids['svc_nmf']['vectorizing__text__max_df'] = [.99]#[.99, .95]
param_grids['svc_nmf']['nmf__n_components'] = [None]#[None,80,160,200]
param_grids['svc_nmf']['svc__kernel'] = ['linear']#['linear', 'rbf']
param_grids['svc_nmf']['svc__C'] = [6]#[5,6,7]



In [487]:


gcvs['svc_nmf'] = GridSearchCV(pipelines['svc_nmf'],
                           param_grid=param_grids['svc_nmf'],
                           cv = 3, n_jobs = -1,
                           verbose = 100).fit(X_train50, y_train50)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [488]:
gcvs['svc_nmf'].best_score_

np.float64(0.9930073606729758)

In [489]:
gcvs['svc_nmf'].score(X_test50, y_test50)

0.9919558359621451

In [490]:
gcvs['svc_nmf'].best_params_

{'nmf__n_components': None,
 'svc__C': 6,
 'svc__kernel': 'linear',
 'vectorizing__text__max_df': 0.99,
 'vectorizing__text__max_features': 150,
 'vectorizing__text__min_df': 1,
 'vectorizing__text__ngram_range': (1, 1),
 'vectorizing__title__max_df': 0.99,
 'vectorizing__title__max_features': 175,
 'vectorizing__title__min_df': 1,
 'vectorizing__title__ngram_range': (1, 2)}

In [617]:
param_grids['kmeans'] = {}
param_grids['kmeans']['vectorizing__title__max_features'] = [800]#[800, 1600]#[28,30, 32]
param_grids['kmeans']['vectorizing__text__max_features'] = [2000]#[2,3,4]
param_grids['kmeans']['vectorizing__title__ngram_range'] = [(1,2)]#[(1,1), (1,2), (1,3)]
param_grids['kmeans']['vectorizing__text__ngram_range'] = [(1,3)]#[(1,4), (1,3)]
param_grids['kmeans']['vectorizing__text__min_df'] = [1]#[1,2]
param_grids['kmeans']['vectorizing__title__min_df'] = [4]#[5,4,3,2]
param_grids['kmeans']['vectorizing__text__max_df'] = [.99]#[.99, .98]
param_grids['kmeans']['vectorizing__title__max_df'] = [.99]#[.99, .98]#[5,4,3,2]
param_grids['kmeans']['pca__whiten'] = [False]
param_grids['kmeans']['pca__n_components'] = [450]#[50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
param_grids['kmeans']['kmeans__n_init'] = [1]#[1,2,4]


In [618]:
gcvs['kmeans'] = GridSearchCV(pipelines['kmeans'],
                              param_grids['kmeans'],
                              cv = 3, n_jobs = -1,
                              verbose = 100).fit(X_train50, y_train50)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [619]:
gcvs['kmeans'].best_score_

np.float64(0.9013669821240798)

In [620]:
gcvs['kmeans'].score(X_test50, y_test50)

Predicting...




0.896372239747634

In [621]:
gcvs['kmeans'].best_params_

{'kmeans__n_init': 1,
 'pca__n_components': 450,
 'pca__whiten': False,
 'vectorizing__text__max_df': 0.99,
 'vectorizing__text__max_features': 2000,
 'vectorizing__text__min_df': 1,
 'vectorizing__text__ngram_range': (1, 3),
 'vectorizing__title__max_df': 0.99,
 'vectorizing__title__max_features': 800,
 'vectorizing__title__min_df': 4,
 'vectorizing__title__ngram_range': (1, 2)}

In [830]:
param_grids['kmeans_nmf'] = {}
param_grids['kmeans_nmf']['vectorizing__title__max_features'] = [500]#[400,500,600]#[300,400,500]#[800, 1600]#[28,30, 32]
param_grids['kmeans_nmf']['vectorizing__text__max_features'] = [600]#[600, 700]#[550, 600, 650]#[2,3,4]
param_grids['kmeans_nmf']['vectorizing__title__ngram_range'] = [(1,1)]#[(1,1), (1,2), (1,3)]
param_grids['kmeans_nmf']['vectorizing__text__ngram_range'] = [(1,2)]#[(1,3), (1,2)]#[(1,4), (1,3)]
param_grids['kmeans_nmf']['vectorizing__text__min_df'] = [1]#[1,2]
param_grids['kmeans_nmf']['vectorizing__title__min_df'] = [1]#[5,4,3,2]
param_grids['kmeans_nmf']['vectorizing__text__max_df'] = [.99]#[.99, .98]
param_grids['kmeans_nmf']['vectorizing__title__max_df'] = [.99]#[.99, .98]#[5,4,3,2]
param_grids['kmeans_nmf']['nmf__max_iter'] = [1000]
param_grids['kmeans_nmf']['nmf__n_components'] = [10]#[2,4,8,10,12,14,16,18]#[10,11]#[11,12,13,14,15]#[50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
param_grids['kmeans_nmf']['nmf__random_state'] = [27]
param_grids['kmeans_nmf']['nmf__init'] = ['nndsvda']#['nndsvd', 'nndsvda']
param_grids['kmeans_nmf']['nmf__solver'] = ['cd']#['mu', 'cd']
param_grids['kmeans_nmf']['kmeans__random_state'] = [27]
param_grids['kmeans_nmf']['kmeans__n_init'] = [1]#[1,2,4]


In [831]:
gcvs['kmeans_nmf'] = GridSearchCV(pipelines['kmeans_nmf'],
                              param_grids['kmeans_nmf'],
                              cv = 5, n_jobs = -1,
                              verbose = 100).fit(X_train50, y_train50)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [832]:
gcvs['kmeans_nmf'].best_score_

np.float64(0.9180336487907466)

In [833]:
gcvs['kmeans_nmf'].score(X_test50, y_test50)

Predicting...




0.9086225026288117

In [834]:
gcvs['kmeans_nmf'].best_params_

{'kmeans__n_init': 1,
 'kmeans__random_state': 27,
 'nmf__init': 'nndsvda',
 'nmf__max_iter': 1000,
 'nmf__n_components': 10,
 'nmf__random_state': 27,
 'nmf__solver': 'cd',
 'vectorizing__text__max_df': 0.99,
 'vectorizing__text__max_features': 600,
 'vectorizing__text__min_df': 1,
 'vectorizing__text__ngram_range': (1, 2),
 'vectorizing__title__max_df': 0.99,
 'vectorizing__title__max_features': 500,
 'vectorizing__title__min_df': 1,
 'vectorizing__title__ngram_range': (1, 1)}