In [2]:
"""
@Author: Divyansh.Gupta
"""
import pandas as pd
import numpy as np

In [2]:
import zipfile

# Unpack the zipped training CSV. The empty target path means files are
# extracted relative to the current working directory.
path_to_zip_file = "train.csv.zip"
with zipfile.ZipFile(path_to_zip_file, mode='r') as archive:
    archive.extractall(path="")

In [3]:
# Load the extracted training set and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="train.csv")
data.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Discard every row that contains at least one missing value.
data = data.dropna()
# Dependent variable: the 'label' column.
y = data['label']
# Independent variables: every column except 'label'.
X = data.drop(columns='label')
print("Shape of independent features: {} and dependent features {}".format(X.shape, y.shape))


Shape of independent features: (18285, 4) and dependent features (18285,)


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [6]:
# Work on an independent copy so the feature frame X is left untouched.
# DataFrame.copy() is a deep copy by default: data and index are duplicated.
# A shallow copy (deep=False) would only create a new object that still
# references the original data.
# reset_index() renumbers the rows 0..n-1 (dropna() left gaps in the index)
# and keeps the previous index as a new 'index' column.
message = X.copy().reset_index()

In [7]:
import nltk # NLP processing library
import re # Regular expressions
from nltk.corpus import stopwords, wordnet # Stopword and WordNet corpus readers from nltk
nltk.download('stopwords') # Download the stopword lists
nltk.download('wordnet') # Download the WordNet corpus

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Preprocessing and Stemming
from nltk.stem.porter import PorterStemmer # Stemming
stemm = PorterStemmer() # create object of PorterStemmer
# Build the stopword set once. Calling stopwords.words('english') inside the
# comprehension re-created the list for every single token and scanned it
# linearly; a set gives constant-time membership checks.
english_stopwords = set(stopwords.words('english'))
corpus=[] # One cleaned, stemmed title string per row of `message`
for title in message['title']:
  rev = re.sub('[^a-zA-Z]'," ",title) # Replace every character that is not a-z or A-Z with a space
  rev = rev.lower() # Lowercase all the text so that "USA" and "usa" get the same index
  rev = rev.split() # Split the title into tokens on whitespace
  rev = [stemm.stem(word) for word in rev if word not in english_stopwords] # Drop stopwords, stem the remaining tokens
  rev = " ".join(rev) # Join the tokens back into a single string
  corpus.append(rev) # Append to corpus

# CountVectorization

In [9]:
# Bag-of-words over 1- to 3-grams, capped at voc_size features.
voc_size = 5000
cv = CountVectorizer(ngram_range=(1, 3), max_features=voc_size)
# Fit on the cleaned titles, then convert the sparse count matrix to a dense array.
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()

In [10]:
# Dimensions of the dense count matrix produced by the vectorizer.
x.shape

(18285, 5000)

In [11]:
# Hold out 33% of the count-vectorized rows for testing; the fixed seed keeps
# the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.33,
)

In [12]:
# First ten vocabulary terms in column order.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2. The fitted
# vocabulary_ mapping (term -> column index) is available in every release and,
# ordered by index, yields the same list of strings.
sorted(cv.vocabulary_, key=cv.vocabulary_.get)[:10]

['abandon',
 'abc',
 'abc news',
 'abduct',
 'abe',
 'abedin',
 'abl',
 'abort',
 'abroad',
 'absolut']

In [13]:
# Show the hyper-parameters the CountVectorizer was configured with.
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [14]:
# View the training count matrix as a DataFrame with one column per vocabulary term.
# Column labels are taken from vocabulary_ (term -> column index) ordered by index,
# because get_feature_names() no longer exists in scikit-learn >= 1.2.
count_df = pd.DataFrame(X_train, columns=sorted(cv.vocabulary_, key=cv.vocabulary_.get))
count_df.head()

Unnamed: 0,abandon,abc,abc news,abduct,abe,abedin,abl,abort,abroad,absolut,abstain,absurd,abus,abus new,abus new york,academi,accept,access,access pipelin,access pipelin protest,accid,accident,accord,account,accus,accus trump,achiev,acknowledg,acknowledg emf,acknowledg emf damag,aclu,acquit,acquitt,acr,across,act,act like,act new,act new york,action,...,yahoo,yale,ye,year,year ago,year breitbart,year eve,year later,year new,year new york,year old,year old girl,year sinc,yemen,yet,yet anoth,yiannopoulo,yield,yo,york,york citi,york new,york new york,york state,york time,yorker,young,youth,youtub,zealand,zero,zika,zika viru,zionist,zone,zone new,zone new york,zoo,zu,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Multinomial Naive Bayes Algo

In [15]:
from sklearn import metrics
import numpy as np
import itertools

In [16]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes model created with its default arguments.
classifier = MultinomialNB()

In [17]:
# Train on the count features and predict the held-out rows in one chain
# (fit() returns the estimator itself).
pred = classifier.fit(X_train, y_train).predict(X_test)
# Overall accuracy and the confusion matrix for the test predictions.
score, cm = (
    metrics.accuracy_score(y_test, pred),
    metrics.confusion_matrix(y_test, pred),
)
print("Accuracy:", score)
print("Confusion Matrix \n", cm)

Accuracy: 0.8941176470588236
Confusion Matrix 
 [[3057  362]
 [ 277 2339]]


# Passive Aggressive Classifier Algo

In [18]:
from sklearn.linear_model import PassiveAggressiveClassifier
# The estimator shuffles the training data between epochs by default, so without a
# seed the reported accuracy changes from run to run. 42 matches the seed already
# used for train_test_split.
pac = PassiveAggressiveClassifier(random_state=42)


In [19]:
# Train the passive-aggressive model on the count features and predict the
# held-out rows (fit() returns the estimator itself).
pred1 = pac.fit(X_train, y_train).predict(X_test)
# Overall accuracy and the confusion matrix for the test predictions.
score1, cm1 = (
    metrics.accuracy_score(y_test, pred1),
    metrics.confusion_matrix(y_test, pred1),
)
print("Accuracy:", score1)
print("Confusion Matrix \n", cm1)

Accuracy: 0.9125103562551782
Confusion Matrix 
 [[3126  293]
 [ 235 2381]]


In [20]:
# Per-term log-probabilities for the second class: log P(term | classifier.classes_[1]).
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1; for a
# two-class model coef_[0] was exactly this row of feature_log_prob_.
classifier.feature_log_prob_[1]

array([ -9.09858165,  -8.62857802,  -9.09858165, ..., -10.70801957,
        -8.76210942,  -9.3217252 ])

In [21]:
# 20 terms with the HIGHEST log P(term | label == 1), i.e. the terms most common
# in label-1 titles.
# NOTE(review): this cell was headed "Most Real", but the notebook never states
# whether label 1 means real or fake -- confirm against the dataset description
# before naming this list.
# feature_log_prob_[1] replaces coef_[0] (removed in scikit-learn 1.1) and
# vocabulary_ replaces get_feature_names() (removed in 1.2); values are unchanged.
class1_log_prob = classifier.feature_log_prob_[1]
feature_names = sorted(cv.vocabulary_, key=cv.vocabulary_.get)
sorted(zip(class1_log_prob, feature_names), reverse=True)[:20]

[(-3.9865938652693202, 'trump'),
 (-4.289654630123752, 'hillari'),
 (-4.381870092904864, 'clinton'),
 (-4.873208828997359, 'elect'),
 (-5.227380642717972, 'new'),
 (-5.239959424924832, 'comment'),
 (-5.309856864542211, 'video'),
 (-5.370481486358646, 'war'),
 (-5.385009586921555, 'hillari clinton'),
 (-5.414714741335471, 'fbi'),
 (-5.435020007496217, 'us'),
 (-5.482272892346762, 'email'),
 (-5.493083808450978, 'vote'),
 (-5.5722211290097015, 'obama'),
 (-5.6266152010755, 'world'),
 (-5.759259675681795, 'donald'),
 (-5.7807658809027584, 'donald trump'),
 (-5.81767043783821, 'russia'),
 (-5.832822242858812, 'presid'),
 (-5.848207161698292, 'america')]

In [22]:
# 20 terms with the LOWEST log P(term | label == 1), i.e. the terms that are rarest
# in label-1 titles. Terms with equal scores are ordered alphabetically because the
# (score, term) tuples are compared element by element.
# NOTE(review): this cell was headed "Most Fake", but a low probability under
# label 1 is not by itself a ranking for the other class, and the notebook never
# states which label is fake -- confirm against the dataset description.
# feature_log_prob_[1] replaces coef_[0] (removed in scikit-learn 1.1) and
# vocabulary_ replaces get_feature_names() (removed in 1.2); values are unchanged.
class1_log_prob = classifier.feature_log_prob_[1]
feature_names = sorted(cv.vocabulary_, key=cv.vocabulary_.get)
sorted(zip(class1_log_prob, feature_names), reverse=False)[:20]

[(-10.708019566059964, 'abroad'),
 (-10.708019566059964, 'abus new'),
 (-10.708019566059964, 'abus new york'),
 (-10.708019566059964, 'accid'),
 (-10.708019566059964, 'act new'),
 (-10.708019566059964, 'act new york'),
 (-10.708019566059964, 'adopt'),
 (-10.708019566059964, 'advic'),
 (-10.708019566059964, 'advis new'),
 (-10.708019566059964, 'advis new york'),
 (-10.708019566059964, 'age new'),
 (-10.708019566059964, 'age new york'),
 (-10.708019566059964, 'agenda breitbart'),
 (-10.708019566059964, 'aleppo new'),
 (-10.708019566059964, 'aleppo new york'),
 (-10.708019566059964, 'ali'),
 (-10.708019566059964, 'amazon'),
 (-10.708019566059964, 'america breitbart'),
 (-10.708019566059964, 'america new york'),
 (-10.708019566059964, 'american breitbart')]

# Using TFIDF Vectorizer

In [23]:
# TF-IDF weighting of the same cleaned titles, using the vectorizer's defaults.
tf = TfidfVectorizer()
# Fit on the corpus, then convert the sparse TF-IDF matrix to a dense array.
tfidf_sparse = tf.fit_transform(corpus)
X_tf = tfidf_sparse.toarray()

In [24]:
# Hold out 33% of the TF-IDF rows for testing, with the same seed and test
# fraction as the count-vector split.
from sklearn.model_selection import train_test_split
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(
    X_tf,
    y,
    random_state=42,
    test_size=0.33,
)

In [25]:
# View the training TF-IDF matrix as a DataFrame with one column per vocabulary term.
# Column labels are taken from vocabulary_ (term -> column index) ordered by index,
# because get_feature_names() no longer exists in scikit-learn >= 1.2.
count_df_tf = pd.DataFrame(X_train_tf, columns=sorted(tf.vocabulary_, key=tf.vocabulary_.get))
count_df_tf.head()

Unnamed: 0,aa,aaa,aap,aaron,ab,abandon,abandonado,abba,abbi,abbott,abc,abd,abdel,abdeslam,abdic,abduct,abdul,abe,abedi,abedin,abellera,aber,abgelegenen,abgesprochen,abhorr,abil,abl,aboard,abolish,abolit,aborigin,abort,abound,abram,abramovi,abramson,abran,abridg,abroad,abrog,...,ziff,zika,zilch,zimbabw,zineb,zinedin,zing,zink,zio,zion,zionism,zionist,zip,zodiac,zoe,zoey,zombi,zone,zoo,zoot,zorn,zorunda,zsa,zu,zubaydah,zucker,zuckerberg,zuess,zugleich,zuhdi,zulema,zulu,zumwalt,zur,zuschraubt,zvezda,zw,zweden,zweit,zwo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Multinomial Naive Bayes TFIDF Vectorizer

In [26]:
# Separate Multinomial Naive Bayes model (default arguments) for the TF-IDF features.
classifier_tf = MultinomialNB()

In [27]:
# Train on the TF-IDF features and predict the held-out rows in one chain
# (fit() returns the estimator itself).
pred_tf = classifier_tf.fit(X_train_tf, y_train_tf).predict(X_test_tf)
# Overall accuracy and the confusion matrix for the test predictions.
score_tf, cm_tf = (
    metrics.accuracy_score(y_test_tf, pred_tf),
    metrics.confusion_matrix(y_test_tf, pred_tf),
)
print("Accuracy:", score_tf)
print("Confusion Matrix \n", cm_tf)

Accuracy: 0.8475559237779619
Confusion Matrix 
 [[3312  107]
 [ 813 1803]]


# Passive Aggressive Classifier with TFIDF Vectorizer

In [28]:
# The estimator shuffles the training data between epochs by default, so without a
# seed the reported accuracy changes from run to run. 42 matches the seed already
# used for train_test_split.
pac_tf = PassiveAggressiveClassifier(random_state=42)

In [29]:
# Train the passive-aggressive model on the TF-IDF features and predict the
# held-out rows (fit() returns the estimator itself).
pred_tf1 = pac_tf.fit(X_train_tf, y_train_tf).predict(X_test_tf)
# Overall accuracy and the confusion matrix for the test predictions.
score_tf1, cm_tf1 = (
    metrics.accuracy_score(y_test_tf, pred_tf1),
    metrics.confusion_matrix(y_test_tf, pred_tf1),
)
print("Accuracy:", score_tf1)
print("Confusion Matrix \n", cm_tf1)

Accuracy: 0.9216238608119304
Confusion Matrix 
 [[3106  313]
 [ 160 2456]]
