In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cosmi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cosmi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cosmi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
real_news = pd.read_csv('/Users/cosmi/OneDrive/Desktop/True.csv')
fake_news = pd.read_csv('/Users/cosmi/OneDrive/Desktop/Fake.csv')

real_news['label'] = 1
fake_news['label'] = 0

def format_data(data):
   
    #Join title and text into a single column
    data['text'] = data['title'] + data['text']
    
    #Keep only the text and label columns
    data = data[['text','label']]
    return data
    
real_news = format_data(real_news)
fake_news = format_data(fake_news)


#Merge both datasets into a single dataset
news_data = pd.concat([real_news, fake_news])

In [3]:
news_data['text'] = news_data['text'].fillna('')

def clean_text(text):
    
    #convert to lowercase
    text = text.lower()
    
    #remove punctuation
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    
    #remove stopwords
    stop = stopwords.words('english')
    text = ' '.join(word for word in text.split() if word not in stop)
    
    #remove non-ASCII characters
    text = ''.join(ch for ch in text if ord(ch)<128)
    
    #remove apostrophes
    text = text.replace("'", "")
    
    #remove single characters 
    ' '.join(word for word in text.split() if len(word) > 1 )
    
    return text
        
news_data['text'] = news_data['text'].apply(clean_text)

In [4]:
#Tokenization 
news_data['text'] = news_data['text'].apply(nltk.word_tokenize)

In [5]:
#Lemmatization

lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

news_data['text'] = news_data['text'].apply(lemmatize)

In [6]:
#Train, Test split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(news_data['text'], news_data['label'], test_size=0.3, 
                                                    stratify=news_data['label'])

In [7]:
#td-idf vectorization 

def identity_function(text):
    return text

vectorizer = TfidfVectorizer(tokenizer = identity_function, lowercase = False, ngram_range = (1,3))

x_train_tfidf = vectorizer.fit_transform(x_train) #Only fit vectorizer to train data
x_test_tfidf = vectorizer.transform(x_test) 


In [8]:
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [9]:
#Preparing an input to be classified 
input_text = ""
def prepare_input(text):
       
    text = clean_text(text)
    text = nltk.word_tokenize(text)
    text = lemmatize(text)
    text = [text]
    text = pd.Series(text)
    text = vectorizer.transform(text)
    return text
    
tfidf_input = prepare_input(input_text)
tfidf_input

<1x8903248 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

# Logistic Regression Classifier

In [11]:
#Model fitting and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logReg_model = LogisticRegression()
logReg_model.fit(x_train_tfidf, y_train)

y_predictions = logReg_model.predict(x_test_tfidf)
accuracy_score(y_test, y_predictions)

0.9852264291017075

# Passive Aggresive Classifier 

In [12]:
from sklearn.linear_model import PassiveAggressiveClassifier
pa_model = PassiveAggressiveClassifier()
pa_model.fit(x_train_tfidf, y_train)

y_predictions = pa_model.predict(x_test_tfidf)
accuracy_score(y_test, y_predictions)

0.9932442464736452

# Random Forest Classifer

In [13]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(x_train_tfidf, y_train)

y_predictions = rf_model.predict(x_test_tfidf)
accuracy_score(y_test, y_predictions)


0.9853006681514477

In [14]:
#Saving Models 
import pickle
lg = open('LogReg_Classifier.sav', 'wb')
pa = open('PA_Classifier.sav', 'wb')
rf = open('RF_Classifier.sav', 'wb')

pickle.dump(logReg_model, lg)
pickle.dump(pa_model, pa)
pickle.dump(rf_model, rf)

lg.close()
pa.close()
rf.close()