# Project 3 Part 3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from collections import Counter
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, roc_auc_score
import seaborn as sns
from sklearn.svm import SVC

pd.set_option('display.max_columns', None)

### Table of Contents
- [Preprocessing and Modeling](#Preprocessing-and-Modeling)
- [Combine both subreddit dataframes](#Combine-both-subreddit-dataframes)
- [Cleaning function for all_text column](#Cleaning-function-for-all_text-column)
- [EDA](#EDA)

## Preprocessing and Modeling

In [2]:
#read the comb_df csv
comb_df= pd.read_csv('./datasets/comb_df.csv')

In [3]:
comb_df.shape

(1869, 7)

In [4]:
comb_df.head()

Unnamed: 0,name,subreddit,title,selftext,all_text,y,clean_text
0,t3_ggfbbc,investing,This video is the simplest video that explains...,# [https://youtu.be/PqiewtqGYM4](https://youtu...,This video is the simplest video that explains...,0,simplest explains stock people work
1,t3_ggfazw,investing,Non index funds that do well when the market i...,I thought I’d try something a little different...,Non index funds that do well when the market i...,0,non well market flat thought something little ...
2,t3_ggf7zk,investing,What profits should we expect for a company th...,I'm new to investing and have no background in...,What profits should we expect for a company th...,0,profit expect company come first race vaccine ...
3,t3_ggeebs,investing,Daily Advice Thread - All basic help or advice...,"If your question is ""I have $10,000, what do I...",Daily Advice Thread - All basic help or advice...,0,daily basic help question must question person...
4,t3_ggedr4,investing,Group and company f/s and consolidated statements,I can't seem to understand. The difference bet...,Group and company f/s and consolidated stateme...,0,group company f statementsi seem group company...


Majority class StudentLoans will be 1 while investing is 0 for target y

Calculate baseline accuracy to see if model is better than null model (predicting plurality class.)

In [5]:
comb_df['y'].value_counts(normalize=True)

1    0.506153
0    0.493847
Name: y, dtype: float64

The baseline accuracy for student loans is 0.50641 and 0.49359 for investing. We will proceed to create our X which is the clean_text column of the comb_df consisting of cleaned titles and selftext combined and y target classifications where y is the subreddit category in which student loans is 1 and investing is 0. We will proceed to do a train test split on the X and y variables and begin modeling.

In [6]:
#create X matrix and target y
X = comb_df['clean_text']
y = comb_df['y']

In [7]:
#do a train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [8]:
# Instantiate the "CountVectorizer" object, which is sklearn's
# bag of words tool.
cvec = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 1000,
                             min_df=2,
                             max_df=.98) 

In [9]:
# Fit the vectorizer on our corpus.
cvec.fit(X_train)
X_train_cvec = cvec.transform(X_train)

In [10]:
# Convert X_train into a DataFrame.

X_train_cvec_df = pd.DataFrame(X_train_cvec.toarray(),
                          columns=cvec.get_feature_names())
X_train_cvec_df.head()

Unnamed: 0,aapl,ability,able,accept,access,account,accrue,accruing,across,act,action,active,activity,actual,actually,affect,age,agency,aggressively,ago,ai,airline,allocation,allow,almost,along,alternative,although,always,amazing,amazon,amd,america,american,among,amount,amzn,analysis,analyst,annual,another,answer,anyone,anything,anyway,anywhere,app,apparently,appears,apple,application,apply,applying,appreciate,approach,approximately,april,area,argument,article,ask,asking,asset,assistance,assume,assuming,assumption,august,auto,automatic,automatically,available,average,aware,away,bachelor,back,balance,bank,bankruptcy,barrel,base,basic,basically,basis,bear,beat,become,begin,beginning,belief,believe,benefit,berkshire,best,bet,better,big,biggest,bill,billion,bit,bitcoin,bn,boeing,book,borrow,borrower,bottom,bought,break,broker,brokerage,bubble,buffet,buffett,built,bull,bullish,bureau,business,buy,buyback,buying,calculate,calculation,calculator,california,call,came,campus,cancel,cannot,cap,capacity,capital,capitalization,capitalize,car,care,career,case,cash,cause,cent,central,ceo,certain,certainly,chain,chance,change,charge,chart,cheap,check,checking,china,chinese,choice,choose,chunk,circumstance,citizen,city,claim,class,clear,clo,clos,close,closing,co,collection,college,come,coming,comment,commercial,common,community,company,compare,complete,completely,con,concern,consumer,contact,continue,contract,contribute,contribution,corona,coronavirus,corp,corporate,corporation,correct,correctly,cosigner,cost,count,country,couple,course,cover,covid,crash,crazy,create,creating,crisis,crypto,curious,currency,current,currently,customer,cut,dd,debt,december,department,direct,disney,dr,due,earlier,early,earn,earnest,earning,earnings,easy,economic,economy,ed,edit,effect,either,eligible,else,email,emergency,employee,employer,energy,engineering,enough,enter,entering,entire,entirely,environment,eps,equity,especially,essentially,estate,estimate,etc,etf,eu,europe,european,even,event,eventually,ever,every,everyone,everything,exactly,example,exchange,executive,exist,existing,expect,expectation,expecting,expense,expensive,experience,explain,exposure,extra,extremely,eye,face,fact,factor,fafsa,fairly,fall,family,far,fast,faster,fb,february,fee,feel,feeling,ffel,figure,file,filing,fill,final,finally,finance,financial,financially,financials,fine,finish,firm,first,five,float,flow,focus,folk,following,forbearance,force,foreign,forgiven,forgiveness,form,four,france,free,front,full,fully,future,gain,game,garnishment,gave,gdp,general,generally,generate,get,getting,give,given,giving,global,gm,go,goal,gone,google,got,gotten,gov,government,grace,grant,great,greatly,gross,group,grow,growing,growth,guess,guy,half,happen,happening,happens,happy,hate,health,healthcare,hear,hearing,heavily,hello,help,helpful,hey,hi,high,higher,highest,historical,history,hit,home,honestly,hope,hopefully,hoping,hour,house,housing,however,huge,hurt,ibm,ibr,idr,im,imagine,impact,important,impossible,inc,income,increase,increasing,inflation,info,information,initial,insight,institution,insurance,intel,interactive,interest,interesting,international,internet,invest,investment,investor,ira,issue,jan,january,job,july,jump,june,junk,keep,keeping,key,knew,know,knowing,known,lack,lake,large,larger,largest,last,late,later,law,layoff,le,learn,learning,least,leave,left,let,letter,level,leverage,life,likely,limit,line,link,list,literally,little,live,living,loan,local,log,long,longer,look,looking,lose,losing,loss,lost,lot,love,low,lower,lowest,lucky,lump,lyft,mae,mail,main,mainly,major,majority,make,making,management,manager,many,march,margin,mark,market,massive,master,math,matter,may,maybe,mean,meaning,meet,meeting,mention,message,microsoft,might,million,mine,minimum,minute,missing,mistake,mo,mobile,mohela,mom,moment,money,month,monthly,morning,mortgage,mostly,mother,move,movement,moving,msci,msft,much,multiple,musk,must,mutual,name,nasdaq,navient,nbsp,near,nearly,negative,negatively,nelnet,net,never,new,news,next,non,normal,normally,note,nothing,notice,november,number,nursing,nyse,obtain,obvious,obviously,october,offer,offering,office,offset,often,oil,okay,one,online,open,opening,opinion,opportunity,opt,option,original,originally,others,otherwise,outlook,overall,owe,owner,package,page,paper,parent,part,partner,past,pay,paye,paying,payment,payoff,pell,penny,people,per,percent,percentage,performance,perhaps,perkins,person,personal,personally,phone,pick,picking,place,plan,planning,platform,play,please,plus,pocket,point,poor,portfolio,portion,position,positive,possible,possibly,post,posting,potential,potentially,power,practice,pre,premium,pretty,previous,price,principal,principle,prior,private,pro,probably,problem,process,processing,professional,profit,program,project,property,pslf,public,publicly,pull,purchase,purchasing,put,putting,qualify,qualifying,quarter,question,quick,quickly,quite,raise,rally,range,rate,rather,rating,ratio,reach,real,realize,really,reason,reasonable,receive,receiving,recent,recently,recertification,recession,recover,recovery,reference,refinance,refinancing,reflect,regret,regular,reit,relative,relatively,release,relevant,relief,remain,remaining,remove,rent,rental,repaye,repayment,report,reporting,request,requirement,research,reserve,resource,response,responsible,rest,result,resume,retail,retirement,return,revenue,right,rise,risk,roth,roughly,rule,run,running,safe,salary,sale,sallie,save,saving,saw,say,saying,scenario,scholarship,school,schwab,science,score,sec,section,sector,secure,security,seeing,seeking,seem,seems,seen,sell,selling,semester,senior,sense,sent,sentiment,separate,separately,sept,september,service,servicer,servicers,set,several,share,sheet,shell,shit,shop,short,shorting,show,showing,sign,signer,significant,significantly,similar,simple,simply,since,single,site,situation,six,size,slightly,small,smaller,smart,social,sofi,somehow,someone,something,sometimes,somewhere,soon,sorry,sort,source,sp,specific,specifically,spent,split,spouse,spring,spy,st,stable,start,starting,state,statement,status,stay,step,stimulus,stock,stop,storage,store,story,straight,strategy,strike,strong,struggling,stuff,sub,success,suggest,suggestion,sum,summer,super,supply,support,sure,suspension,swing,switch,system,take,taken,taking,talk,talking,tanker,target,tax,teacher,tech,technical,technically,technology,tell,temporary,term,tesla,test,th,thank,thanks,theater,theory,therefore,thesis,thing,think,thinking,though,thought,three,throw,thus,thx,time,tip,title,tl,together,tomorrow,ton,took,tool,top,topic,total,track,transaction,transfer,travel,treasury,trouble,true,trump,trust,trying,tsg,tuition,turn,two,type,uber,uk,unable,unemployment,unfortunately,universal,university,unless,unsub,upcoming,upon,use,user,using,uso,usual,usually,vaccine,valuation,value,variable,via,view,virus,volatility,volume,voo,wage,wait,waiting,warren,watch,wave,way,website,week,well,went,whatever,whether,whole,wife,willing,wise,wish,within,without,work,worker,working,worry,worse,worst,worth,writing,wrong,xb,yahoo,year,yes,yet,yoy,yr,zero
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,2,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0


In [11]:
X_train_cvec_df.shape

(1401, 1000)

In [12]:
# Transform test
X_test_cvec = cvec.transform(X_test)
X_test_cvec_df = pd.DataFrame(X_test_cvec.toarray(),
                         columns=cvec.get_feature_names())

X_test_cvec_df.head()

Unnamed: 0,aapl,ability,able,accept,access,account,accrue,accruing,across,act,action,active,activity,actual,actually,affect,age,agency,aggressively,ago,ai,airline,allocation,allow,almost,along,alternative,although,always,amazing,amazon,amd,america,american,among,amount,amzn,analysis,analyst,annual,another,answer,anyone,anything,anyway,anywhere,app,apparently,appears,apple,application,apply,applying,appreciate,approach,approximately,april,area,argument,article,ask,asking,asset,assistance,assume,assuming,assumption,august,auto,automatic,automatically,available,average,aware,away,bachelor,back,balance,bank,bankruptcy,barrel,base,basic,basically,basis,bear,beat,become,begin,beginning,belief,believe,benefit,berkshire,best,bet,better,big,biggest,bill,billion,bit,bitcoin,bn,boeing,book,borrow,borrower,bottom,bought,break,broker,brokerage,bubble,buffet,buffett,built,bull,bullish,bureau,business,buy,buyback,buying,calculate,calculation,calculator,california,call,came,campus,cancel,cannot,cap,capacity,capital,capitalization,capitalize,car,care,career,case,cash,cause,cent,central,ceo,certain,certainly,chain,chance,change,charge,chart,cheap,check,checking,china,chinese,choice,choose,chunk,circumstance,citizen,city,claim,class,clear,clo,clos,close,closing,co,collection,college,come,coming,comment,commercial,common,community,company,compare,complete,completely,con,concern,consumer,contact,continue,contract,contribute,contribution,corona,coronavirus,corp,corporate,corporation,correct,correctly,cosigner,cost,count,country,couple,course,cover,covid,crash,crazy,create,creating,crisis,crypto,curious,currency,current,currently,customer,cut,dd,debt,december,department,direct,disney,dr,due,earlier,early,earn,earnest,earning,earnings,easy,economic,economy,ed,edit,effect,either,eligible,else,email,emergency,employee,employer,energy,engineering,enough,enter,entering,entire,entirely,environment,eps,equity,especially,essentially,estate,estimate,etc,etf,eu,europe,european,even,event,eventually,ever,every,everyone,everything,exactly,example,exchange,executive,exist,existing,expect,expectation,expecting,expense,expensive,experience,explain,exposure,extra,extremely,eye,face,fact,factor,fafsa,fairly,fall,family,far,fast,faster,fb,february,fee,feel,feeling,ffel,figure,file,filing,fill,final,finally,finance,financial,financially,financials,fine,finish,firm,first,five,float,flow,focus,folk,following,forbearance,force,foreign,forgiven,forgiveness,form,four,france,free,front,full,fully,future,gain,game,garnishment,gave,gdp,general,generally,generate,get,getting,give,given,giving,global,gm,go,goal,gone,google,got,gotten,gov,government,grace,grant,great,greatly,gross,group,grow,growing,growth,guess,guy,half,happen,happening,happens,happy,hate,health,healthcare,hear,hearing,heavily,hello,help,helpful,hey,hi,high,higher,highest,historical,history,hit,home,honestly,hope,hopefully,hoping,hour,house,housing,however,huge,hurt,ibm,ibr,idr,im,imagine,impact,important,impossible,inc,income,increase,increasing,inflation,info,information,initial,insight,institution,insurance,intel,interactive,interest,interesting,international,internet,invest,investment,investor,ira,issue,jan,january,job,july,jump,june,junk,keep,keeping,key,knew,know,knowing,known,lack,lake,large,larger,largest,last,late,later,law,layoff,le,learn,learning,least,leave,left,let,letter,level,leverage,life,likely,limit,line,link,list,literally,little,live,living,loan,local,log,long,longer,look,looking,lose,losing,loss,lost,lot,love,low,lower,lowest,lucky,lump,lyft,mae,mail,main,mainly,major,majority,make,making,management,manager,many,march,margin,mark,market,massive,master,math,matter,may,maybe,mean,meaning,meet,meeting,mention,message,microsoft,might,million,mine,minimum,minute,missing,mistake,mo,mobile,mohela,mom,moment,money,month,monthly,morning,mortgage,mostly,mother,move,movement,moving,msci,msft,much,multiple,musk,must,mutual,name,nasdaq,navient,nbsp,near,nearly,negative,negatively,nelnet,net,never,new,news,next,non,normal,normally,note,nothing,notice,november,number,nursing,nyse,obtain,obvious,obviously,october,offer,offering,office,offset,often,oil,okay,one,online,open,opening,opinion,opportunity,opt,option,original,originally,others,otherwise,outlook,overall,owe,owner,package,page,paper,parent,part,partner,past,pay,paye,paying,payment,payoff,pell,penny,people,per,percent,percentage,performance,perhaps,perkins,person,personal,personally,phone,pick,picking,place,plan,planning,platform,play,please,plus,pocket,point,poor,portfolio,portion,position,positive,possible,possibly,post,posting,potential,potentially,power,practice,pre,premium,pretty,previous,price,principal,principle,prior,private,pro,probably,problem,process,processing,professional,profit,program,project,property,pslf,public,publicly,pull,purchase,purchasing,put,putting,qualify,qualifying,quarter,question,quick,quickly,quite,raise,rally,range,rate,rather,rating,ratio,reach,real,realize,really,reason,reasonable,receive,receiving,recent,recently,recertification,recession,recover,recovery,reference,refinance,refinancing,reflect,regret,regular,reit,relative,relatively,release,relevant,relief,remain,remaining,remove,rent,rental,repaye,repayment,report,reporting,request,requirement,research,reserve,resource,response,responsible,rest,result,resume,retail,retirement,return,revenue,right,rise,risk,roth,roughly,rule,run,running,safe,salary,sale,sallie,save,saving,saw,say,saying,scenario,scholarship,school,schwab,science,score,sec,section,sector,secure,security,seeing,seeking,seem,seems,seen,sell,selling,semester,senior,sense,sent,sentiment,separate,separately,sept,september,service,servicer,servicers,set,several,share,sheet,shell,shit,shop,short,shorting,show,showing,sign,signer,significant,significantly,similar,simple,simply,since,single,site,situation,six,size,slightly,small,smaller,smart,social,sofi,somehow,someone,something,sometimes,somewhere,soon,sorry,sort,source,sp,specific,specifically,spent,split,spouse,spring,spy,st,stable,start,starting,state,statement,status,stay,step,stimulus,stock,stop,storage,store,story,straight,strategy,strike,strong,struggling,stuff,sub,success,suggest,suggestion,sum,summer,super,supply,support,sure,suspension,swing,switch,system,take,taken,taking,talk,talking,tanker,target,tax,teacher,tech,technical,technically,technology,tell,temporary,term,tesla,test,th,thank,thanks,theater,theory,therefore,thesis,thing,think,thinking,though,thought,three,throw,thus,thx,time,tip,title,tl,together,tomorrow,ton,took,tool,top,topic,total,track,transaction,transfer,travel,treasury,trouble,true,trump,trust,trying,tsg,tuition,turn,two,type,uber,uk,unable,unemployment,unfortunately,universal,university,unless,unsub,upcoming,upon,use,user,using,uso,usual,usually,vaccine,valuation,value,variable,via,view,virus,volatility,volume,voo,wage,wait,waiting,warren,watch,wave,way,website,week,well,went,whatever,whether,whole,wife,willing,wise,wish,within,without,work,worker,working,worry,worse,worst,worth,writing,wrong,xb,yahoo,year,yes,yet,yoy,yr,zero
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


### Logistic regression count vectorizer

In [13]:
# Instantiate logistic regression model.
lr = LogisticRegression(solver='lbfgs',max_iter=500)

# Fit model to training data.
lr.fit(X_train_cvec_df,y_train)

# Evaluate model on training data.
lr.score(X_train_cvec_df,y_train) #Accuracy

0.9992862241256245

In [14]:
# Evaluate model on testing data.

lr.score(X_test_cvec_df,y_test)

0.9743589743589743

### Multinomial Bayes count vectorizer

In [15]:
# Instantiate our Multinomial Bayes and fit

nb = MultinomialNB()
multi = nb.fit(X_train_cvec_df, y_train)

In [16]:
# Generate our predictions

predictions = multi.predict(X_test_cvec_df)

In [17]:
# Score our model on the training set.

multi.score(X_train_cvec_df, y_train)

0.9850107066381156

In [18]:
# Score our model on the test set.

multi.score(X_test_cvec_df, y_test)

0.9829059829059829

In [19]:
# Generate a confusion matrix.

confusion_matrix(y_test, predictions)

array([[225,   6],
       [  2, 235]])

In [20]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [21]:
#print out each element in the confusion matrix
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 225
False Positives: 6
False Negatives: 2
True Positives: 235


In [22]:
#prob for positive class
pos_class_prob_sorted = nb.feature_log_prob_[1, :].argsort()
#prob for negative class
neg_class_prob_sorted = nb.feature_log_prob_[0, :].argsort()
#getting the top features 
neg_top_features = np.take(cvec.get_feature_names(), neg_class_prob_sorted)
pos_top_features = np.take(cvec.get_feature_names(), pos_class_prob_sorted)

print(pos_class_prob_sorted)
print(neg_class_prob_sorted)
print(neg_top_features)
print(pos_top_features)

In [23]:
# Instantiate the TFIDF vectorizer transformer.
tvec = TfidfVectorizer()

In [24]:
# Fit the vectorizer on our corpus.
tvec.fit(X_train)
X_train_tvec = tvec.transform(X_train)

In [25]:
tvec_train_df = pd.DataFrame(X_train_tvec.toarray(),
                  columns=tvec.get_feature_names())

In [26]:
X_test_tvec = tvec.transform(X_test)

tvec_test_df = pd.DataFrame(X_test_tvec.toarray(),
                  columns=tvec.get_feature_names())

### Logistic Regression TFIDF vectorizer

In [27]:
# Instantiate logistic regression model.
lr = LogisticRegression(solver='lbfgs',max_iter=500)

# Fit model to training data.
lr.fit(tvec_train_df,y_train)

# Evaluate model on training data.
lr.score(tvec_train_df,y_train) #Accuracy

0.9921484653818701

In [28]:
# Evaluate model on testing data.

lr.score(tvec_test_df,y_test)

0.9700854700854701

### Multinomial Bayes TFIDF vectorizer

In [29]:
# Instantiate our Multinomial Bayes and fit

nb = MultinomialNB()
multi = nb.fit(tvec_train_df, y_train)

In [30]:
# Generate our predictions

predictions = multi.predict(tvec_test_df)
multi.score(tvec_train_df, y_train)

0.9892933618843683

In [31]:
# Score our model on the test set.

multi.score(tvec_test_df, y_test)

0.9786324786324786

In [32]:
# Generate a confusion matrix.

confusion_matrix(y_test, predictions)

array([[222,   9],
       [  1, 236]])

In [33]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [34]:
#print out each element in the confusion matrix
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 222
False Positives: 9
False Negatives: 1
True Positives: 236


In [35]:
#prob for positive class
pos_class_prob_sorted = nb.feature_log_prob_[1, :].argsort()
#prob for negative class
neg_class_prob_sorted = nb.feature_log_prob_[0, :].argsort()
#getting the top features 
neg_top_features = np.take(tvec.get_feature_names(), neg_class_prob_sorted)
pos_top_features = np.take(tvec.get_feature_names(), pos_class_prob_sorted)

## Support vector machine count vectorizer

In [36]:
# Instantiate support vector machine.
svc = SVC()
# Fit support vector machine to training data.
svc.fit(X_train_cvec_df, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [37]:
# Generate predictions.
pred = svc.predict(X_test_cvec_df)

In [38]:
# Measure performance based on accuracy.
print(svc.score(X_train_cvec_df, y_train))
svc.score(X_test_cvec_df,y_test)

0.987152034261242


0.9572649572649573

## Support vector machine TFIDF vectorizer

In [39]:
# Fit support vector machine to training data.
svc.fit(tvec_train_df, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [40]:
# Generate predictions.
pred = svc.predict(tvec_test_df)

In [41]:
# Measure performance based on accuracy.
print(svc.score(tvec_train_df, y_train))
svc.score(tvec_test_df,y_test)

0.9992862241256245


0.9807692307692307

## Logistic Regression count vectorizer CV

In [42]:
#cross val score for logistic regression count vectorizer
lr_model_cvec = make_pipeline(
    cvec,
    LogisticRegression(solver='lbfgs', multi_class='auto', random_state=42)
)
lrcvec_cvscore = cross_val_score(lr_model_cvec, X_train, y_train, cv=5, scoring='roc_auc')
lr_model_cvec.fit(X_train, y_train)
y_pred = lr_model_cvec.predict(X_test)
print('Logistic regression model cvec')
print(f'Train score: {round(lrcvec_cvscore.mean(), 4)}')
print(f'Test score: {round(roc_auc_score(y_test, y_pred), 4)}')
print(f'Number of features: {len(lr_model_cvec.named_steps.countvectorizer.get_feature_names())}')

Logistic regression model cvec
Train score: 0.9941
Test score: 0.9745
Number of features: 1000


## Logistic Regression TFIDF vectorizer CV

In [43]:
lr_model_tvec = make_pipeline(
    tvec,
    LogisticRegression(solver='lbfgs', multi_class='auto', random_state=42)
)
lrtvec_cvscore = cross_val_score(lr_model_tvec, X_train, y_train, cv=5, scoring='roc_auc')
lr_model_tvec.fit(X_train, y_train)
y_pred = lr_model_tvec.predict(X_test)
print('Logistic regression model')
print(f'Train score: {round(lrtvec_cvscore.mean(), 4)}')
print(f'Test score: {round(roc_auc_score(y_test, y_pred), 4)}')
print(f'Number of features: {len(lr_model_tvec.named_steps.tfidfvectorizer.get_feature_names())}')

Logistic regression model
Train score: 0.9978
Test score: 0.9704
Number of features: 6610


## Multinomial Bayes count vectorizer CV

In [44]:
multi_model_cvec = make_pipeline(
    cvec,
    MultinomialNB()
)
multicvec_cvscore = cross_val_score(multi_model_cvec, X_train, y_train, cv=5, scoring='roc_auc')
multi_model_cvec.fit(X_train, y_train)
y_pred = multi_model_cvec.predict(X_test)
print('Multinomial naive Bayes classifier')
print(f'Train score: {round(multicvec_cvscore.mean(), 4)}')
print(f'Test score: {round(roc_auc_score(y_test, y_pred), 4)}')
print(f'Number of features: {len(multi_model_cvec.named_steps.countvectorizer.get_feature_names())}')

Multinomial naive Bayes classifier
Train score: 0.9972
Test score: 0.9828
Number of features: 1000


## Multinomial Bayes TFIDF vectorizer CV

In [45]:
multi_model_tvec = make_pipeline(
    tvec,
    MultinomialNB()
)
multitvec_cvscore = cross_val_score(multi_model_tvec, X_train, y_train, cv=5, scoring='roc_auc')
multi_model_tvec.fit(X_train, y_train)
y_pred = multi_model_tvec.predict(X_test)
print('Multinomial naive Bayes classifier')
print(f'Train score: {round(multitvec_cvscore.mean(), 4)}')
print(f'Test score: {round(roc_auc_score(y_test, y_pred), 4)}')
print(f'Number of features: {len(multi_model_tvec.named_steps.tfidfvectorizer.get_feature_names())}')

Multinomial naive Bayes classifier
Train score: 0.9979
Test score: 0.9784
Number of features: 6610


## GridsearchCV logistic regression count vectorizer

In [46]:
#Gridsearch CV logistic regresssion count vectorizer
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

params = {
    
    'cvec__max_df': [0.25, 0.5, 0.75],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__max_features': [2000,4000,6000,8000,10000],
    'lr__solver': ['lbfgs'],
    'lr__multi_class': ['auto'],
    'lr__random_state': [42]
}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.cvec.get_feature_names())}')

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:   49.7s finished


Best params: {'cvec__max_df': 0.5, 'cvec__max_features': 2000, 'cvec__ngram_range': (1, 2), 'lr__multi_class': 'auto', 'lr__random_state': 42, 'lr__solver': 'lbfgs'}
Train score: 0.9722
Test score: 0.9722
Number of features: 2000


## Gridsearch CV logistic regresssion TFIDF vectorizer

In [47]:
#Gridsearch CV logistic regresssion TFIDF vectorizer
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

params = {
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.25, 0.5, 0.75],
    'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tvec__max_features': [2000,4000,6000,8000,10000],
    'lr__solver': ['lbfgs'],
    'lr__multi_class': ['auto'],
    'lr__random_state': [42]
}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.tvec.get_feature_names())}')

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:   30.8s finished


Best params: {'lr__multi_class': 'auto', 'lr__random_state': 42, 'lr__solver': 'lbfgs', 'tvec__max_df': 0.5, 'tvec__max_features': 6000, 'tvec__ngram_range': (1, 3), 'tvec__stop_words': 'english'}
Train score: 0.9822
Test score: 0.9829
Number of features: 6000


## Gridsearch CV multinomial bayes count vectorizer

In [48]:
#Gridsearch CV multinomial bayes count vectorizer
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

params = {
    
    'cvec__max_df': [0.25, 0.5, 0.75],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__max_features': [2000,4000,6000,8000,10000],
    'nb__alpha': [0.1,0.25,0.5,0.75,1]

}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.cvec.get_feature_names())}')

Fitting 5 folds for each of 225 candidates, totalling 1125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1125 out of 1125 | elapsed:  2.5min finished


Best params: {'cvec__max_df': 0.5, 'cvec__max_features': 8000, 'cvec__ngram_range': (1, 3), 'nb__alpha': 0.5}
Train score: 0.9864
Test score: 0.9893
Number of features: 8000


## Gridsearch CV multinomial TFIDF vectorizer

In [49]:
#Gridsearch CV multinomial TFIDF vectorizer
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

params = {
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.25, 0.5, 0.75],
    'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tvec__max_features': [2000,4000,6000,8000,10000],
    'nb__alpha': [0.1,0.25,0.5,0.75,1]

}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.tvec.get_feature_names())}')

Fitting 5 folds for each of 225 candidates, totalling 1125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1125 out of 1125 | elapsed:  2.3min finished


Best params: {'nb__alpha': 0.1, 'tvec__max_df': 0.5, 'tvec__max_features': 4000, 'tvec__ngram_range': (1, 3), 'tvec__stop_words': 'english'}
Train score: 0.985
Test score: 0.9786
Number of features: 4000


## Gridsearch CV support vector machine count vectorizer

In [50]:
#Gridsearch CV support vector machine count vectorizer
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('svc', SVC())
])

params = {
    
    'cvec__max_df': [0.25, 0.5, 0.75],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__max_features': [2000,4000,6000,8000,10000],
    'svc__kernel': ['linear','poly','rbf']
}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.cvec.get_feature_names())}')

Fitting 5 folds for each of 135 candidates, totalling 675 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 675 out of 675 | elapsed:  2.5min finished


Best params: {'cvec__max_df': 0.5, 'cvec__max_features': 10000, 'cvec__ngram_range': (1, 2), 'svc__kernel': 'linear'}
Train score: 0.9743
Test score: 0.9679
Number of features: 10000


## Gridsearch CV support vector machine TFIDF vectorizer

In [53]:
#Gridsearch CV support vector machine count vectorizer
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('svc', SVC())
])

params = {
    
    'tvec__max_df': [0.25, 0.5, 0.75],
    'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tvec__max_features': [2000,4000,6000,8000,10000],
    'svc__kernel': ['linear','poly','rbf']
}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.tvec.get_feature_names())}')

Fitting 5 folds for each of 135 candidates, totalling 675 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 675 out of 675 | elapsed:  3.7min finished


Best params: {'svc__kernel': 'rbf', 'tvec__max_df': 0.5, 'tvec__max_features': 4000, 'tvec__ngram_range': (1, 3)}
Train score: 0.9843
Test score: 0.9893
Number of features: 4000



|Model|Train Score|Test Score|
|---|---|---|
|Logistic Regression count vectorizer CV|0.9941|0.9745|
|Logistic Regression TFIDF vectorizer CV|0.9978|0.9704|
|Multinomial Bayes count vectorizer CV|0.9972|0.9828|
|Multinomial Bayes TFIDF vectorizer CV|0.9979|0.9784|
||||