In [1]:
from sklearn.naive_bayes import GaussianNB,CategoricalNB,MultinomialNB
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import norm
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
cat_inc_df=pd.read_csv("income_evaluation_cat.csv ")



In [3]:
for col in cat_inc_df:
    print(cat_inc_df[col].unique())

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
[' Male' ' Female']
[' <=50K' ' >50K']


In [4]:
#Prior class probabilities
priors=(cat_inc_df[' income'].value_counts())/len(cat_inc_df)
priors

 <=50K    0.75919
 >50K     0.24081
Name:  income, dtype: float64

In [5]:
under_df=cat_inc_df[cat_inc_df[' income']==' <=50K'] 
over_df=cat_inc_df[cat_inc_df[' income']==' >50K'] 
print(len(under_df),len(over_df))

24720 7841


In [6]:
under_post_probs=([(under_df[' workclass'][under_df[' workclass']==' Private'].count())/len(under_df),(under_df[' education'][under_df[' education']==' Bachelors'].count())/len(under_df),
      (under_df[' race'][under_df[' race']==' White'].count())/len(under_df),(under_df['gender'][under_df['gender']==' Female'].count())/len(under_df)])
under_post_probs

[0.7173543689320389,
 0.1267799352750809,
 0.8373381877022654,
 0.38802588996763754]

In [7]:
over_post_probs=([(over_df[' workclass'][over_df[' workclass']==' Private'].count())/len(over_df),(over_df[' education'][over_df[' education']==' Bachelors'].count())/len(over_df),
      (over_df[' race'][over_df[' race']==' White'].count())/len(over_df),(over_df['gender'][over_df['gender']==' Female'].count())/len(over_df)])
over_post_probs

[0.6329549802321133,
 0.28325468690218086,
 0.9076648386685372,
 0.15036347404667771]

In [8]:
under_prod = priors[0]*(np.prod(under_post_probs))
over_prod = priors[1]*(np.prod(over_post_probs))
print(under_prod,over_prod)
#Since .022 > .0059, the instance should be classified as <$50000

0.022433477852309085 0.005892390339262951


In [9]:
inc_enc_df=pd.get_dummies(cat_inc_df, columns=[' workclass',' education',' race','gender'],drop_first=True)
inc_label_vec=inc_enc_df[' income']
inc_enc_df.drop(columns=' income',inplace=True)

In [10]:
X_train, X_test, inc_train, inc_test = train_test_split(inc_enc_df, inc_label_vec, test_size=0.3, random_state=42)

In [11]:
cnb=CategoricalNB()
cnb.fit(X_train, inc_train)
inc_train_pred = cnb.predict(X_train)
inc_test_pred = cnb.predict(X_test)
print(accuracy_score(inc_train,inc_train_pred),accuracy_score(inc_test,inc_test_pred))

0.7853194103194103 0.7885146893233699


In [12]:
cont_inc_df=pd.read_csv('income_evaluation_continuous.csv')
cont_under_df=cont_inc_df[cont_inc_df[' income']==' <=50K'] 
cont_over_df=cont_inc_df[cont_inc_df[' income']==' >50K'] 
print(len(cont_under_df),len(cont_over_df))

24720 7841


In [13]:

stats_df = pd.DataFrame(cont_under_df.mean())
stats_df['under_sd']=cont_under_df.std()
stats_df['over_mean']=cont_over_df.mean()
stats_df['over_sd']=cont_over_df.std()
stats_df.rename(columns={0: "under_mean"},inplace=True)
stats_df

Unnamed: 0,under_mean,under_sd,over_mean,over_sd
age,36.783738,14.020088,44.249841,10.519028
education_num,9.595065,2.436147,11.611657,2.385129
hours_per_week,38.84021,12.318995,45.473026,11.012971


In [14]:
cont_priors=(cont_inc_df[' income'].value_counts())/len(cont_inc_df)
cont_priors

 <=50K    0.75919
 >50K     0.24081
Name:  income, dtype: float64

In [15]:
X = [30, 10, 45]
cont_under_post_probs=norm(stats_df['under_mean'],stats_df['under_sd']).pdf(X)
cont_over_post_probs=norm(stats_df['over_mean'],stats_df['over_sd']).pdf(X)
print(cont_under_post_probs,cont_over_post_probs)

[0.02531168 0.16151284 0.02857872] [0.01515093 0.1331227  0.03619137]


In [16]:
cont_under_prod = cont_priors[0]*(np.prod(cont_under_post_probs))
cont_over_prod = cont_priors[1]*(np.prod(cont_over_post_probs))
print(cont_under_prod ,cont_over_prod )
#Since .000087 > .0000176, instance should be classified as <=50K

8.869959848024929e-05 1.7578024002484486e-05


In [17]:
cont_inc_label_vec=cont_inc_df[' income']
cont_inc_df.drop(columns=' income',inplace=True)
cont_inc_df.sample(8)
#X_train, X inc_train, inc_test = _test,train_test_split(inc_enc_df, inc_label_vec, test_size=0.3, random_state=42)

Unnamed: 0,age,education_num,hours_per_week
15261,37,13,48
4435,40,10,30
9307,29,10,45
10607,25,9,40
16348,57,9,40
19426,31,13,40
22231,50,10,36
17526,36,9,50


In [18]:
stsc=StandardScaler()
cont_inc_df=stsc.fit_transform(cont_inc_df)
X2_train, X2_test, y2_train, y2_test =train_test_split(cont_inc_df, cont_inc_label_vec, test_size=0.3, random_state=42)

In [19]:
gnb=GaussianNB() 
gnb.fit(X2_train, y2_train)
cont_train_pred = gnb.predict(X2_train)
cont_test_pred = gnb.predict(X2_test)
print(accuracy_score(y2_train,cont_train_pred),accuracy_score(y2_test,cont_test_pred))

0.7993594243594243 0.7988535162247927


In [36]:
true_df=pd.read_csv("true.csv")
fake_df=pd.read_csv("fake.csv")

In [37]:
true_df.drop(columns =['subject', 'date'],inplace=True)
fake_df.drop(columns =['subject', 'date'],inplace=True)
true_df['news_type']='True'
fake_df['news_type']='Fake'

In [38]:
news_df=pd.concat([true_df, fake_df], ignore_index=True, axis=0)
news_df[:5]

Unnamed: 0,title,text,news_type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,True


In [23]:
news_df["news"] = news_df['title']+" "+news_df["text"]
news_df["news"] = news_df["news"].apply(str.lower)
news_df.drop(columns =['title', 'text'],inplace=True)

In [25]:
news_df["news"] = news_df["news"].str.replace('[^\w\s]','')
news_df["news"] = news_df["news"].str.replace('\d+', '')
news_df["news_list"]= news_df["news"].str.split(" ")


In [26]:
news_df=news_df[news_df["news_list"].apply(len) >= 50]
news_df=news_df.reset_index(drop=True)


In [27]:
eng_stopwords = stopwords.words('english') 
news_df["news_list"] = news_df["news_list"].apply(lambda words: [word for word in words if word not in eng_stopwords])


In [28]:
def listToString(s):
    str1 = " " 
    return (str1.join(s))

news_df["clean_news"]  = news_df["news_list"] .apply(listToString)

In [29]:
news_df.drop(columns=['news','news_list'], inplace=True)
news_df.sample(6)

Unnamed: 0,news_type,clean_news
28618,Fake,north carolina gop makes state crappier place...
21869,Fake,trump lashes ceos suggests hell replace ones ...
21536,Fake,puerto ricos governor pleads help trump threa...
14577,True,zimbabwes army seizes power mugabe confined sa...
41850,Fake,boiler room ep deeper game masters chaos str...
24631,Fake,trump supporter unleashes explosive racism hi...


In [30]:
tfid = TfidfVectorizer()
feat_vecs = tfid.fit_transform(news_df["clean_news"])
news_df.drop(columns=['clean_news'], inplace=True)
news_df.sample(5)

Unnamed: 0,news_type
37825,Fake
15390,True
36745,Fake
8338,True
19659,True


In [31]:
X3_train,X3_test, y3_train,y3_test=train_test_split(feat_vecs, news_df, test_size=0.3, random_state=42)
y3_train.sample(5)

Unnamed: 0,news_type
171,True
7011,True
11534,True
18333,True
42476,Fake


In [32]:
mnb=MultinomialNB() 
mnb.fit(X3_train, y3_train)
news_train_pred = mnb.predict(X3_train)
news_test_pred = mnb.predict(X3_test)
print(accuracy_score(y3_train,news_train_pred),accuracy_score(y3_test,news_test_pred))

0.9524313245843112 0.9429232549170845


In [33]:
scores = cross_val_score(mnb,X3_train, y3_train, cv=10, scoring="accuracy")

print(scores,"\n","mean=",np.mean(scores),"sd=",np.std(scores))

[0.9365499  0.94876033 0.93785124 0.94347107 0.94413223 0.94578512
 0.94082645 0.94214876 0.93818182 0.94545455] 
 mean= 0.9423161471107153 sd= 0.003749236738357575


In [34]:
param_grid={'alpha':[.1,.2,.3,.4,.5,.6,.7,.8,.9,1]}
clf = GridSearchCV(mnb,param_grid,scoring='accuracy')
clf.fit(X3_train, y3_train)
print(clf.best_params_,clf.score(X3_train, y3_train))

{'alpha': 0.1} 0.9692902713959869
