In [1]:
import numpy as np # for array manipulation and numerical computing
import pandas as pd # for creating and loading dataframes
import re # Provide functions to work with regular expression and use for finding the matching text pattern
from sklearn.metrics import accuracy_score # for evaluting the accuracy of the model
from sklearn.model_selection import train_test_split # for spliting the train and test dataset
from sklearn.linear_model import LogisticRegression # Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer # for converting the text document  into numeric matrics
from nltk.stem.porter import PorterStemmer # for Stemming Process
from nltk.corpus import stopwords # Stop words

In [2]:
# importint Natural Language Toolkit library and downloading the stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\GAURAV
[nltk_data]     PRAJAPATI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english')) # Printing stopwords

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
# Loading training dataset
news_dataset = pd.read_csv('train.csv')

In [5]:
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
news_dataset.shape

(20800, 5)

In [7]:
# finding the null values in the dataset
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
# Fill the null values with empty String
news_dataset = news_dataset.fillna('')

In [9]:
news_dataset.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [10]:
# Merge author and title column to form content column
news_dataset['content'] = news_dataset['author']+news_dataset['title']

In [11]:
# Stemming: It is a process of reducing a word to its root word Eg. acting, actor, actress-> act
port_stem = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('^a-zA-Z',' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [12]:
# Applying the stemming process to content column
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [13]:
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [14]:
# Convert the text document into numeric matrix
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [15]:
# Splitting the Dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [16]:
# Train the model with LoLogistic Regression
model = LogisticRegression()
model.fit(x_train, y_train)

In [17]:
# Prediction on train dataset
training_data_prediction = model.predict(x_train)

In [18]:
# Evaluting the Accuracy
training_data_accuracy = accuracy_score(training_data_prediction, y_train)

In [19]:
print("Training Data Accuracy: ", training_data_accuracy)

Training Data Accuracy:  0.9715745192307692


In [20]:
# Prediction on test dataset
testing_data_prediction = model.predict(x_test)

In [21]:
testing_data_accuracy = accuracy_score(testing_data_prediction, y_test)

In [22]:
print("Testing Data Accuracy: ", testing_data_accuracy)

Testing Data Accuracy:  0.9550480769230769
