# Project 3 Part 3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from collections import Counter
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, roc_auc_score
import seaborn as sns
from sklearn.svm import SVC

pd.set_option('display.max_columns', None)

### Table of Contents
- [Preprocessing and Modeling](#Preprocessing-and-Modeling)
- [Combine both subreddit dataframes](#Combine-both-subreddit-dataframes)
- [Cleaning function for all_text column](#Cleaning-function-for-all_text-column)
- [EDA](#EDA)

## Preprocessing and Modeling

In [2]:
#read the comb_df csv
comb_df= pd.read_csv('./datasets/comb_df.csv')

In [3]:
comb_df.shape

(1939, 7)

In [4]:
comb_df.head()

Unnamed: 0,name,subreddit,title,selftext,all_text,y,clean_text
0,t3_gjnn7e,personalfinance,Overwhelmed By My Finances &amp; Getting Force...,"So, the overview: I'm an out of work cook with...",Overwhelmed By My Finances &amp; Getting Force...,1,finance getting forceful othersso overview wor...
1,t3_gjnmwp,personalfinance,National Insurance - Year is Not Full,According to [tax.service.gov.uk](https://tax....,National Insurance - Year is Not FullAccording...,1,national insurance year one asking voluntary c...
2,t3_gjnlbs,personalfinance,Refinancing Process,I’m in Wyoming. And I’m pursuing a refinance t...,Refinancing ProcessI’m in Wyoming. And I’m pur...,1,refinancing processi wyoming pursuing refinanc...
3,t3_gjned9,personalfinance,Finally maxing out my SEP IRA (as an employee)...,"Like the title says, I am now financially able...",Finally maxing out my SEP IRA (as an employee)...,1,finally maxing sep ira employee title say fina...
4,t3_gjneqq,personalfinance,Advice for limiting damage from joint &amp; au...,My fiance's father is going through a pretty r...,Advice for limiting damage from joint &amp; au...,1,limiting joint user person go rehab fiance fat...


Majority class personalfinance will be 1 while StudentLoans is 0 for target y

Calculate baseline accuracy to see if model is better than null model (predicting plurality class.)

In [5]:
comb_df['y'].value_counts(normalize=True)

1    0.51212
0    0.48788
Name: y, dtype: float64

The baseline accuracy for student loans is 0.48788 and 0.51212 for personalfinance. We will proceed to create our X which is the clean_text column of the comb_df consisting of cleaned titles and selftext combined and y target classifications where y is the subreddit category in which personalfinance is 1 and StudentLoans is 0. We will proceed to do a train test split on the X and y variables and begin modeling.

In [6]:
#create X matrix and target y
X = comb_df['clean_text']
y = comb_df['y']

In [7]:
#do a train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [8]:
type(y_test)

pandas.core.series.Series

In [9]:
# Instantiate the "CountVectorizer" object, which is sklearn's
# bag of words tool.
cvec = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 1000,
                             min_df=2,
                             max_df=.98) 

In [10]:
# Fit the vectorizer on our corpus.
cvec.fit(X_train)
X_train_cvec = cvec.transform(X_train)

In [11]:
# Convert X_train into a DataFrame.

X_train_cvec_df = pd.DataFrame(X_train_cvec.toarray(),
                          columns=cvec.get_feature_names())
X_train_cvec_df.head()

Unnamed: 0,ability,able,absolutely,accept,accepting,access,account,accrue,accruing,across,act,action,activity,actual,actually,affect,age,agency,aggressive,aggressively,ago,agreement,alert,allocation,allow,ally,almost,alone,along,although,always,amazing,american,amount,annual,annually,another,answer,anxiety,anymore,anyone,anything,anyway,anyways,anywhere,apartment,app,apparently,application,apply,applying,appreciate,approach,approval,approximately,apr,april,area,article,ask,asking,asset,assistance,assume,assuming,attack,attempting,august,auto,automatic,automatically,available,avalanche,average,aware,away,awhile,bachelor,back,balance,bank,banking,bankruptcy,basically,become,begin,beginning,believe,benefit,best,better,big,bigger,bill,bit,boat,bonus,book,borrow,borrower,bought,bracket,break,broke,broker,brokerage,brother,built,bureau,business,buy,buyer,buying,ca,calculation,calculator,california,call,calling,came,campus,cancel,cannot,cant,cap,capital,capitalize,car,care,career,case,cash,cat,cause,cc,certain,certainly,chance,change,changing,charge,chase,cheaper,check,checking,choice,choose,chose,chunk,circumstance,citizen,city,claim,claiming,class,clear,close,closer,closing,co,collect,collection,collector,college,come,coming,comment,community,company,complete,completely,computer,con,concept,concern,confirm,confirmation,confusing,contact,continue,contract,contribute,contributing,contribution,control,corona,coronavirus,correct,cosigner,cost,count,country,couple,course,court,cover,covid,crazy,crisis,curious,current,currently,customer,cut,day,debt,december,department,direct,discover,dr,due,earlier,early,earn,earnest,earning,earnings,easily,easy,economy,ed,edit,effect,either,eligibility,eligible,else,email,emergency,employee,employer,employment,enjoy,enough,entire,entirely,equifax,equity,error,escrow,especially,essentially,estate,estimate,etc,etf,europe,even,event,eventually,ever,every,everyone,everything,ex,exact,exactly,example,except,existing,expect,expecting,expense,expensive,experian,experience,explain,extra,extremely,fact,factor,fafsa,fair,fairly,fall,family,far,fargo,fast,faster,father,fear,february,fee,feel,feeling,fell,felt,ffel,fiance,fico,figure,figuring,file,filing,fill,final,finally,finance,financial,financially,fine,finish,finishing,first,five,fix,focus,folk,follow,following,foot,forbearance,forgiven,forgiveness,form,fortunate,four,free,freshman,front,frozen,fsa,full,fully,fun,future,gain,gas,gave,general,generally,get,getting,give,given,giving,go,goal,gone,gonna,google,got,gotten,gov,government,grace,grant,great,greatly,grocery,gross,group,grow,growing,growth,guess,guy,half,happen,happening,happens,happy,hate,health,healthcare,hear,hello,help,helpful,helping,hey,hi,high,higher,highest,hire,history,hit,hoa,home,honestly,hope,hopefully,hoping,hospital,hour,house,housing,however,hr,hsa,huge,hurt,hysa,ibr,idr,im,impact,important,impossible,improvement,income,increase,info,information,initial,input,insight,institution,insurance,interest,international,internet,internship,interview,invest,investment,ira,irs,ish,issue,item,january,job,july,june,karma,keep,keeping,kept,know,knowing,lake,large,larger,last,late,later,law,le,learn,learning,lease,least,leave,leaving,left,legal,let,letter,level,life,light,likely,limit,line,link,list,literally,little,live,living,loan,loansi,local,location,log,lol,long,longer,look,looking,lose,losing,loss,lost,lot,love,low,lower,lowest,luck,lucky,lump,mae,mail,main,major,make,making,management,many,march,mark,market,master,match,math,matter,max,maximum,maxing,may,maybe,mba,mean,meaning,meet,member,mental,mention,merchant,mess,message,might,mile,mine,minimal,minimum,minor,minute,miss,missing,mistake,mo,mobile,mom,moment,money,month,monthly,morning,mortgage,mostly,mother,move,moving,much,multiple,must,mutual,name,navient,nbsp,near,nearly,negative,negotiate,nelnet,net,network,never,new,news,next,nice,non,none,normal,normally,note,nothing,notice,november,number,nurse,nursing,nyc,obtain,obvious,obviously,october,offer,offering,office,offset,often,ok,okay,one,online,onto,open,opening,opinion,opportunity,option,original,originally,others,otherwise,overall,owe,owing,owner,pa,package,page,paper,paperwork,parent,part,particular,partner,party,past,path,pay,paycheck,paye,paying,payment,payoff,paypal,pell,penalty,people,per,percentage,perkins,person,personal,personally,phone,physical,pick,place,plan,planning,play,please,plus,pmi,pocket,point,policy,poor,portfolio,portion,position,possible,possibly,post,potential,potentially,pre,premium,prepare,pretty,previous,previously,price,primary,principal,principle,prior,private,pro,probably,problem,process,processing,professional,profit,program,promotion,property,pslf,pt,public,pull,pulling,purchase,purchasing,purpose,pursue,put,putting,qualify,qualifying,question,quick,quickly,quit,quite,raise,range,rate,rather,ratio,reach,real,realize,really,reason,reasonable,receipt,receive,receiving,recent,recently,recertification,recession,reference,refinance,refinancing,regular,relationship,relatively,relief,remaining,remember,remote,remove,rent,rental,renting,repair,repay,repaye,repayment,report,reporting,request,require,requirement,research,response,responsible,rest,result,retire,retirement,return,review,rich,right,risk,role,roll,rolling,rollover,room,roth,roughly,rule,run,running,safe,salary,sale,sallie,save,saving,saw,say,saying,scam,scenario,scholarship,school,schwab,science,score,search,searching,section,sector,secure,security,seeing,seeking,seem,seems,seen,self,sell,seller,selling,semester,senior,sense,sent,separate,separately,sept,september,serious,service,servicer,servicers,servicing,set,settle,settlement,several,share,shipping,shit,short,show,showing,sign,signer,significant,significantly,similar,simple,simply,since,single,sister,site,sitting,situation,six,slightly,small,smaller,smart,social,sofi,somehow,someone,something,somewhat,somewhere,son,soon,sorry,sort,source,specific,specifically,spent,split,spoke,spouse,spring,st,stable,start,starting,state,statement,stating,status,stay,staying,step,stimulus,stock,stop,story,straight,strategy,struggling,stuck,stuff,sub,submit,suggest,suggestion,sum,summer,super,support,sure,suspension,switch,switching,system,take,taken,taking,talk,talking,target,tax,taxable,teacher,teaching,technically,tell,telling,temporarily,temporary,ten,term,terrible,texas,text,th,thank,thankfully,thanks,therefore,thing,thinking,though,thought,three,throughout,throw,thus,till,time,tip,title,tl,together,tomorrow,ton,took,tool,top,topic,total,totally,town,track,training,transaction,transfer,transferring,travel,trip,trouble,true,trust,trying,tuition,turn,two,type,typically,uk,unable,unemployment,unfortunately,union,university,unless,unsub,unsure,upfront,upon,usa,use,using,usual,usually,utility,value,variable,vehicle,venmo,verify,via,virus,wage,wait,waiting,want,wanting,water,way,website,week,weekly,well,went,whatever,whether,whole,wife,wiki,willing,wise,wish,within,without,work,worker,working,worry,worse,worth,write,wrong,xb,year,yearly,yes,yet,young,yr,zero
0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5,0,0,1,5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,2,2,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,9,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [12]:
X_train_cvec_df.shape

(1454, 1000)

In [13]:
# Transform test
X_test_cvec = cvec.transform(X_test)
X_test_cvec_df = pd.DataFrame(X_test_cvec.toarray(),
                         columns=cvec.get_feature_names())

X_test_cvec_df.head()

Unnamed: 0,ability,able,absolutely,accept,accepting,access,account,accrue,accruing,across,act,action,activity,actual,actually,affect,age,agency,aggressive,aggressively,ago,agreement,alert,allocation,allow,ally,almost,alone,along,although,always,amazing,american,amount,annual,annually,another,answer,anxiety,anymore,anyone,anything,anyway,anyways,anywhere,apartment,app,apparently,application,apply,applying,appreciate,approach,approval,approximately,apr,april,area,article,ask,asking,asset,assistance,assume,assuming,attack,attempting,august,auto,automatic,automatically,available,avalanche,average,aware,away,awhile,bachelor,back,balance,bank,banking,bankruptcy,basically,become,begin,beginning,believe,benefit,best,better,big,bigger,bill,bit,boat,bonus,book,borrow,borrower,bought,bracket,break,broke,broker,brokerage,brother,built,bureau,business,buy,buyer,buying,ca,calculation,calculator,california,call,calling,came,campus,cancel,cannot,cant,cap,capital,capitalize,car,care,career,case,cash,cat,cause,cc,certain,certainly,chance,change,changing,charge,chase,cheaper,check,checking,choice,choose,chose,chunk,circumstance,citizen,city,claim,claiming,class,clear,close,closer,closing,co,collect,collection,collector,college,come,coming,comment,community,company,complete,completely,computer,con,concept,concern,confirm,confirmation,confusing,contact,continue,contract,contribute,contributing,contribution,control,corona,coronavirus,correct,cosigner,cost,count,country,couple,course,court,cover,covid,crazy,crisis,curious,current,currently,customer,cut,day,debt,december,department,direct,discover,dr,due,earlier,early,earn,earnest,earning,earnings,easily,easy,economy,ed,edit,effect,either,eligibility,eligible,else,email,emergency,employee,employer,employment,enjoy,enough,entire,entirely,equifax,equity,error,escrow,especially,essentially,estate,estimate,etc,etf,europe,even,event,eventually,ever,every,everyone,everything,ex,exact,exactly,example,except,existing,expect,expecting,expense,expensive,experian,experience,explain,extra,extremely,fact,factor,fafsa,fair,fairly,fall,family,far,fargo,fast,faster,father,fear,february,fee,feel,feeling,fell,felt,ffel,fiance,fico,figure,figuring,file,filing,fill,final,finally,finance,financial,financially,fine,finish,finishing,first,five,fix,focus,folk,follow,following,foot,forbearance,forgiven,forgiveness,form,fortunate,four,free,freshman,front,frozen,fsa,full,fully,fun,future,gain,gas,gave,general,generally,get,getting,give,given,giving,go,goal,gone,gonna,google,got,gotten,gov,government,grace,grant,great,greatly,grocery,gross,group,grow,growing,growth,guess,guy,half,happen,happening,happens,happy,hate,health,healthcare,hear,hello,help,helpful,helping,hey,hi,high,higher,highest,hire,history,hit,hoa,home,honestly,hope,hopefully,hoping,hospital,hour,house,housing,however,hr,hsa,huge,hurt,hysa,ibr,idr,im,impact,important,impossible,improvement,income,increase,info,information,initial,input,insight,institution,insurance,interest,international,internet,internship,interview,invest,investment,ira,irs,ish,issue,item,january,job,july,june,karma,keep,keeping,kept,know,knowing,lake,large,larger,last,late,later,law,le,learn,learning,lease,least,leave,leaving,left,legal,let,letter,level,life,light,likely,limit,line,link,list,literally,little,live,living,loan,loansi,local,location,log,lol,long,longer,look,looking,lose,losing,loss,lost,lot,love,low,lower,lowest,luck,lucky,lump,mae,mail,main,major,make,making,management,many,march,mark,market,master,match,math,matter,max,maximum,maxing,may,maybe,mba,mean,meaning,meet,member,mental,mention,merchant,mess,message,might,mile,mine,minimal,minimum,minor,minute,miss,missing,mistake,mo,mobile,mom,moment,money,month,monthly,morning,mortgage,mostly,mother,move,moving,much,multiple,must,mutual,name,navient,nbsp,near,nearly,negative,negotiate,nelnet,net,network,never,new,news,next,nice,non,none,normal,normally,note,nothing,notice,november,number,nurse,nursing,nyc,obtain,obvious,obviously,october,offer,offering,office,offset,often,ok,okay,one,online,onto,open,opening,opinion,opportunity,option,original,originally,others,otherwise,overall,owe,owing,owner,pa,package,page,paper,paperwork,parent,part,particular,partner,party,past,path,pay,paycheck,paye,paying,payment,payoff,paypal,pell,penalty,people,per,percentage,perkins,person,personal,personally,phone,physical,pick,place,plan,planning,play,please,plus,pmi,pocket,point,policy,poor,portfolio,portion,position,possible,possibly,post,potential,potentially,pre,premium,prepare,pretty,previous,previously,price,primary,principal,principle,prior,private,pro,probably,problem,process,processing,professional,profit,program,promotion,property,pslf,pt,public,pull,pulling,purchase,purchasing,purpose,pursue,put,putting,qualify,qualifying,question,quick,quickly,quit,quite,raise,range,rate,rather,ratio,reach,real,realize,really,reason,reasonable,receipt,receive,receiving,recent,recently,recertification,recession,reference,refinance,refinancing,regular,relationship,relatively,relief,remaining,remember,remote,remove,rent,rental,renting,repair,repay,repaye,repayment,report,reporting,request,require,requirement,research,response,responsible,rest,result,retire,retirement,return,review,rich,right,risk,role,roll,rolling,rollover,room,roth,roughly,rule,run,running,safe,salary,sale,sallie,save,saving,saw,say,saying,scam,scenario,scholarship,school,schwab,science,score,search,searching,section,sector,secure,security,seeing,seeking,seem,seems,seen,self,sell,seller,selling,semester,senior,sense,sent,separate,separately,sept,september,serious,service,servicer,servicers,servicing,set,settle,settlement,several,share,shipping,shit,short,show,showing,sign,signer,significant,significantly,similar,simple,simply,since,single,sister,site,sitting,situation,six,slightly,small,smaller,smart,social,sofi,somehow,someone,something,somewhat,somewhere,son,soon,sorry,sort,source,specific,specifically,spent,split,spoke,spouse,spring,st,stable,start,starting,state,statement,stating,status,stay,staying,step,stimulus,stock,stop,story,straight,strategy,struggling,stuck,stuff,sub,submit,suggest,suggestion,sum,summer,super,support,sure,suspension,switch,switching,system,take,taken,taking,talk,talking,target,tax,taxable,teacher,teaching,technically,tell,telling,temporarily,temporary,ten,term,terrible,texas,text,th,thank,thankfully,thanks,therefore,thing,thinking,though,thought,three,throughout,throw,thus,till,time,tip,title,tl,together,tomorrow,ton,took,tool,top,topic,total,totally,town,track,training,transaction,transfer,transferring,travel,trip,trouble,true,trust,trying,tuition,turn,two,type,typically,uk,unable,unemployment,unfortunately,union,university,unless,unsub,unsure,upfront,upon,usa,use,using,usual,usually,utility,value,variable,vehicle,venmo,verify,via,virus,wage,wait,waiting,want,wanting,water,way,website,week,weekly,well,went,whatever,whether,whole,wife,wiki,willing,wise,wish,within,without,work,worker,working,worry,worse,worth,write,wrong,xb,year,yearly,yes,yet,young,yr,zero
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,0,0,1,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,3,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


### Logistic regression count vectorizer

In [14]:
# Instantiate logistic regression model.
lr = LogisticRegression(solver='lbfgs',max_iter=500)

# Fit model to training data.
lr.fit(X_train_cvec_df,y_train)

# Evaluate model on training data.
lr.score(X_train_cvec_df,y_train) #Accuracy

0.9938101788170564

In [15]:
# Evaluate model on testing data.

lr.score(X_test_cvec_df,y_test)

0.8742268041237113

### Multinomial Bayes count vectorizer

In [16]:
# Instantiate our Multinomial Bayes and fit

nb = MultinomialNB()
multi = nb.fit(X_train_cvec_df, y_train)

In [17]:
# Generate our predictions

predictions = multi.predict(X_test_cvec_df)

In [18]:
# Score our model on the training set.

multi.score(X_train_cvec_df, y_train)

0.9270976616231087

In [19]:
# Score our model on the test set.

multi.score(X_test_cvec_df, y_test)

0.9195876288659793

In [20]:
# Generate a confusion matrix.

confusion_matrix(y_test, predictions)

array([[219,  18],
       [ 21, 227]])

In [21]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [22]:
#print out each element in the confusion matrix
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 219
False Positives: 18
False Negatives: 21
True Positives: 227


In [23]:
#prob for positive class
pos_class_prob_sorted = nb.feature_log_prob_[1, :].argsort()
#prob for negative class
neg_class_prob_sorted = nb.feature_log_prob_[0, :].argsort()
#getting the top features 
neg_top_features = np.take(cvec.get_feature_names(), neg_class_prob_sorted)
pos_top_features = np.take(cvec.get_feature_names(), pos_class_prob_sorted)

print(pos_class_prob_sorted)
print(neg_class_prob_sorted)
print(neg_top_features)
print(pos_top_features)

In [24]:
# Instantiate the TFIDF vectorizer transformer.
tvec = TfidfVectorizer()

In [25]:
# Fit the vectorizer on our corpus.
tvec.fit(X_train)
X_train_tvec = tvec.transform(X_train)

In [26]:
tvec_train_df = pd.DataFrame(X_train_tvec.toarray(),
                  columns=tvec.get_feature_names())

In [27]:
X_test_tvec = tvec.transform(X_test)

tvec_test_df = pd.DataFrame(X_test_tvec.toarray(),
                  columns=tvec.get_feature_names())

### Logistic Regression TFIDF vectorizer

In [28]:
# Instantiate logistic regression model.
lr = LogisticRegression(solver='lbfgs',max_iter=500)

# Fit model to training data.
lr.fit(tvec_train_df,y_train)

# Evaluate model on training data.
lr.score(tvec_train_df,y_train) #Accuracy

0.9456671251719395

In [29]:
# Evaluate model on testing data.

lr.score(tvec_test_df,y_test)

0.9278350515463918

### Multinomial Bayes TFIDF vectorizer

In [30]:
# Instantiate our Multinomial Bayes and fit

nb = MultinomialNB()
multi = nb.fit(tvec_train_df, y_train)

In [31]:
# Generate our predictions

predictions = multi.predict(tvec_test_df)
multi.score(tvec_train_df, y_train)

0.9580467675378267

In [32]:
# Score our model on the test set.

multi.score(tvec_test_df, y_test)

0.9154639175257732

In [33]:
# Generate a confusion matrix.

confusion_matrix(y_test, predictions)

array([[215,  22],
       [ 19, 229]])

In [34]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [35]:
#print out each element in the confusion matrix
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 215
False Positives: 22
False Negatives: 19
True Positives: 229


In [36]:
#prob for positive class
pos_class_prob_sorted = nb.feature_log_prob_[1, :].argsort()
#prob for negative class
neg_class_prob_sorted = nb.feature_log_prob_[0, :].argsort()
#getting the top features 
neg_top_features = np.take(tvec.get_feature_names(), neg_class_prob_sorted)
pos_top_features = np.take(tvec.get_feature_names(), pos_class_prob_sorted)

## Support vector machine count vectorizer

In [37]:
# Instantiate support vector machine.
svc = SVC()
# Fit support vector machine to training data.
svc.fit(X_train_cvec_df, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [38]:
# Generate predictions.
pred = svc.predict(X_test_cvec_df)

In [39]:
# Measure performance based on accuracy.
print(svc.score(X_train_cvec_df, y_train))
svc.score(X_test_cvec_df,y_test)

0.953232462173315


0.911340206185567

## Support vector machine TFIDF vectorizer

In [40]:
# Fit support vector machine to training data.
svc.fit(tvec_train_df, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [41]:
# Generate predictions.
pred = svc.predict(tvec_test_df)

In [42]:
# Measure performance based on accuracy.
print(svc.score(tvec_train_df, y_train))
svc.score(tvec_test_df,y_test)

0.9931224209078404


0.9278350515463918

## Logistic Regression count vectorizer CV

In [43]:
#cross val score for logistic regression count vectorizer
lr_model_cvec = make_pipeline(
    cvec,
    LogisticRegression(solver='lbfgs', multi_class='auto', random_state=42)
)
lrcvec_cvscore = cross_val_score(lr_model_cvec, X_train, y_train, cv=5, scoring='roc_auc')
lr_model_cvec.fit(X_train, y_train)
y_pred = lr_model_cvec.predict(X_test)
print('Logistic regression model cvec')
print(f'Train score: {round(lrcvec_cvscore.mean(), 4)}')
print(f'Test score: {round(roc_auc_score(y_test, y_pred), 4)}')
print(f'Number of features: {len(lr_model_cvec.named_steps.countvectorizer.get_feature_names())}')

Logistic regression model cvec
Train score: 0.9465
Test score: 0.8745
Number of features: 1000


## Logistic Regression TFIDF vectorizer CV

In [44]:
lr_model_tvec = make_pipeline(
    tvec,
    LogisticRegression(solver='lbfgs', multi_class='auto', random_state=42)
)
lrtvec_cvscore = cross_val_score(lr_model_tvec, X_train, y_train, cv=5, scoring='roc_auc')
lr_model_tvec.fit(X_train, y_train)
y_pred = lr_model_tvec.predict(X_test)
print('Logistic regression model')
print(f'Train score: {round(lrtvec_cvscore.mean(), 4)}')
print(f'Test score: {round(roc_auc_score(y_test, y_pred), 4)}')
print(f'Number of features: {len(lr_model_tvec.named_steps.tfidfvectorizer.get_feature_names())}')

Logistic regression model
Train score: 0.962
Test score: 0.9276
Number of features: 5540


## Multinomial Bayes count vectorizer CV

In [45]:
multi_model_cvec = make_pipeline(
    cvec,
    MultinomialNB()
)
multicvec_cvscore = cross_val_score(multi_model_cvec, X_train, y_train, cv=5, scoring='roc_auc')
multi_model_cvec.fit(X_train, y_train)
y_pred = multi_model_cvec.predict(X_test)
print('Multinomial naive Bayes classifier')
print(f'Train score: {round(multicvec_cvscore.mean(), 4)}')
print(f'Test score: {round(roc_auc_score(y_test, y_pred), 4)}')
print(f'Number of features: {len(multi_model_cvec.named_steps.countvectorizer.get_feature_names())}')

Multinomial naive Bayes classifier
Train score: 0.9552
Test score: 0.9197
Number of features: 1000


## Multinomial Bayes TFIDF vectorizer CV

In [46]:
multi_model_tvec = make_pipeline(
    tvec,
    MultinomialNB()
)
multitvec_cvscore = cross_val_score(multi_model_tvec, X_train, y_train, cv=5, scoring='roc_auc')
multi_model_tvec.fit(X_train, y_train)
y_pred = multi_model_tvec.predict(X_test)
print('Multinomial naive Bayes classifier')
print(f'Train score: {round(multitvec_cvscore.mean(), 4)}')
print(f'Test score: {round(roc_auc_score(y_test, y_pred), 4)}')
print(f'Number of features: {len(multi_model_tvec.named_steps.tfidfvectorizer.get_feature_names())}')

Multinomial naive Bayes classifier
Train score: 0.9638
Test score: 0.9153
Number of features: 5540


## GridsearchCV logistic regression count vectorizer

In [47]:
#Gridsearch CV logistic regresssion count vectorizer
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

params = {
    
    'cvec__max_df': [0.25, 0.5, 0.75],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__max_features': [2000,4000,6000,8000,10000],
    'lr__solver': ['lbfgs'],
    'lr__multi_class': ['auto'],
    'lr__random_state': [42]
}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.cvec.get_feature_names())}')

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:   59.1s finished


Best params: {'cvec__max_df': 0.75, 'cvec__max_features': 10000, 'cvec__ngram_range': (1, 2), 'lr__multi_class': 'auto', 'lr__random_state': 42, 'lr__solver': 'lbfgs'}
Train score: 0.9058
Test score: 0.9031
Number of features: 10000


## Gridsearch CV logistic regresssion TFIDF vectorizer

In [48]:
#Gridsearch CV logistic regresssion TFIDF vectorizer
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

params = {
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.25, 0.5, 0.75],
    'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tvec__max_features': [2000,4000,6000,8000,10000],
    'lr__solver': ['lbfgs'],
    'lr__multi_class': ['auto'],
    'lr__random_state': [42]
}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.tvec.get_feature_names())}')

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:   53.6s finished


Best params: {'lr__multi_class': 'auto', 'lr__random_state': 42, 'lr__solver': 'lbfgs', 'tvec__max_df': 0.75, 'tvec__max_features': 10000, 'tvec__ngram_range': (1, 3), 'tvec__stop_words': 'english'}
Train score: 0.9078
Test score: 0.9175
Number of features: 10000


## Gridsearch CV multinomial bayes count vectorizer

In [49]:
#Gridsearch CV multinomial bayes count vectorizer
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

params = {
    
    'cvec__max_df': [0.25, 0.5, 0.75],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__max_features': [2000,4000,6000,8000,10000],
    'nb__alpha': [0.1,0.25,0.5,0.75,1]

}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.cvec.get_feature_names())}')

Fitting 5 folds for each of 225 candidates, totalling 1125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1125 out of 1125 | elapsed:  3.9min finished


Best params: {'cvec__max_df': 0.25, 'cvec__max_features': 4000, 'cvec__ngram_range': (1, 3), 'nb__alpha': 0.25}
Train score: 0.9195
Test score: 0.9216
Number of features: 4000


## Gridsearch CV multinomial TFIDF vectorizer

In [50]:
#Gridsearch CV multinomial TFIDF vectorizer
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

params = {
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.25, 0.5, 0.75],
    'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tvec__max_features': [2000,4000,6000,8000,10000],
    'nb__alpha': [0.1,0.25,0.5,0.75,1]

}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.tvec.get_feature_names())}')

Fitting 5 folds for each of 225 candidates, totalling 1125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1125 out of 1125 | elapsed:  3.4min finished


Best params: {'nb__alpha': 0.25, 'tvec__max_df': 0.25, 'tvec__max_features': 4000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Train score: 0.9223
Test score: 0.9237
Number of features: 4000


## Gridsearch CV support vector machine count vectorizer

In [52]:
#Gridsearch CV support vector machine count vectorizer
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('svc', SVC())
])

params = {
    
    'cvec__max_df': [0.25, 0.5, 0.75],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__max_features': [2000,4000,6000,8000,10000],
    'svc__kernel': ['linear','poly','rbf']
}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.cvec.get_feature_names())}')

Fitting 5 folds for each of 135 candidates, totalling 675 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 675 out of 675 | elapsed:  5.3min finished


Best params: {'cvec__max_df': 0.75, 'cvec__max_features': 10000, 'cvec__ngram_range': (1, 3), 'svc__kernel': 'linear'}
Train score: 0.8927
Test score: 0.8907
Number of features: 10000


## Gridsearch CV support vector machine TFIDF vectorizer

In [51]:
#Gridsearch CV support vector machine count vectorizer
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('svc', SVC())
])

params = {
    
    'tvec__max_df': [0.25, 0.5, 0.75],
    'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tvec__max_features': [2000,4000,6000,8000,10000],
    'svc__kernel': ['linear','poly','rbf']
}

gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs = -1, verbose=1)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f'Train score: {round(gs.best_score_, 4)}')
print(f'Test score: {round(gs.best_estimator_.score(X_test, y_test), 4)}')
print(f'Number of features: {len(gs.best_estimator_.named_steps.tvec.get_feature_names())}')

Fitting 5 folds for each of 135 candidates, totalling 675 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 675 out of 675 | elapsed:  5.3min finished


Best params: {'svc__kernel': 'linear', 'tvec__max_df': 0.75, 'tvec__max_features': 10000, 'tvec__ngram_range': (1, 2)}
Train score: 0.914
Test score: 0.9258
Number of features: 10000



|Model|Train Score|Test Score|
|---|---|---|
|Logistic Regression count vectorizer CV|0.9941|0.9745|
|Logistic Regression TFIDF vectorizer CV|0.9978|0.9704|
|Multinomial Bayes count vectorizer CV|0.9972|0.9828|
|Multinomial Bayes TFIDF vectorizer CV|0.9979|0.9784|
||||

In [54]:
#Get the predictions for the model that gave the best score to create confusion matrix
svc_tvec = make_pipeline(
    TfidfVectorizer(max_df=0.75,max_features=10000,ngram_range=(1, 2)),
    SVC(kernel='linear')
)
svc_tvec.fit(X_train, y_train)
y_pred = svc_tvec.predict(X_test)

#Generate confusion matrix
confusion_matrix(y_test, y_pred)

array([[219,  18],
       [ 18, 230]])

In [55]:
# Generate classification metrics 
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'Accuracy: {round((tp+tn)/(tp+fp+tn+fn),4)}')
print(f'Misclassification rate: {round((fp+fn)/(tp+fp+tn+fn),4)}')
print(f'Precision: {round(tp/(tp+fp),4)}')
print(f'Recall: {round(tp/(tp+fn),4)}')
print(f'Specificity: {round(tn/(tn+fp),4)}')

Accuracy: 0.9258
Misclassification rate: 0.0742
Precision: 0.9274
Recall: 0.9274
Specificity: 0.9241


In [56]:
# Create DataFrame with column for predicted values.
results = pd.DataFrame(y_pred, columns=['predicted'])



In [57]:
yframe = y_test

In [58]:
yframe.to_frame()

Unnamed: 0,y
1233,0
1676,0
684,1
913,1
1073,0
...,...
10,1
1736,0
968,1
12,1


In [59]:
yframe.head()

1233    0
1676    0
684     1
913     1
1073    0
Name: y, dtype: int64

In [60]:
# Create column for observed values.
results['y_test'] = yframe


In [61]:
results.head()

Unnamed: 0,predicted,y_test
0,0,1.0
1,0,
2,1,
3,1,1.0
4,0,1.0


In [62]:
results.groupby('y_test')['y_test'].count()

y_test
1.0    115
Name: y_test, dtype: int64

In [64]:
results['y_test'].isnull().sum()

370

In [67]:
y_pred.shape

(485,)