In [None]:
# Silence all warnings by swapping warnings.warn for a do-nothing stand-in
import warnings


def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing."""
    return None


warnings.warn = warn

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Computations
import itertools

# Modelling Algorithms
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

# Modelling Helpers
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Visualization
import matplotlib.pyplot as plt

## Load Data

In [None]:
# Read the labelled training CSV and the competition test CSV into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/fake-news/train.csv",
        "/kaggle/input/fake-news/test.csv",
    )
)


In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Report (rows, columns) of both frames, one per line
print(
    f"Train Shape : {train.shape}",
    f"Test Shape : {test.shape}",
    sep="\n",
)


## Handling missing values

In [None]:
# Print column dtypes and non-null counts for the training frame
train.info()

In [None]:
# Count missing values per column
train.isnull().sum()

In [None]:
# Count how many columns there are of each dtype
train.dtypes.value_counts()

# We do not drop the rows with missing values: as seen in the prior kernel, about 13% of the data is missing, and removing those rows would deprive the model of that 13% of training examples. Instead, we fill the missing values with a blank string.

In [None]:
# Replace every missing value with a single-space string in both frames
train, test = train.fillna(' '), test.fillna(' ')

# We train the model on the concatenation of the title, the author and the main text. Giving the model more words per article should make it more general and may increase its reliability.

In [None]:
# Add a 'total' column joining title, author and text with single spaces
for frame in (test, train):
    frame['total'] = frame['title'] + ' ' + frame['author'] + ' ' + frame['text']

In [None]:
# Have a glance at our training set after filling and adding 'total':
# info() prints the column summary, head() is displayed as the cell output
train.info()
train.head()

In [None]:
# Hold out 20% of the labelled rows for evaluation; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    train['total'],
    train['label'],
    test_size=0.20,
    random_state=0,
)

## Vectorizing Data

### 1. Count Vectorizer 

In [None]:
# Bag-of-words features: unigrams and bigrams, English stop words removed
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))

# Learn the vocabulary on the training split only, then encode both splits
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
# Encode the competition test set with the already-fitted count vectorizer
test_counts_vector = count_vectorizer.transform(test['total'].values)

### 2. Tf-IDF Vectorizer 

In [None]:
# TF-IDF features: unigrams and bigrams, English stop words removed
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

# Learn vocabulary and IDF weights on the training split only, then encode both splits
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Encode the competition test set with the already-fitted TF-IDF vectorizer
test_tfidf_vector = tfidf_vectorizer.transform(test['total'].values)

# Playing with the algorithms

In [None]:
# Creating a function that outputs a confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray of shape (n_classes, n_classes)
        Confusion matrix indexed as ``cm[true, predicted]`` (rows are drawn on
        the y axis, columns on the x axis).
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, each row is divided by its sum so cells show the fraction of
        each true class, printed with two decimals. Rows summing to zero are
        shown as 0.00 instead of NaN. If False, the raw values are shown.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)
    if normalize:
        # Row-wise normalisation; skip empty rows to avoid division by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so it stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
def precision_recall(true_label, predicted_label):
    """Print precision, recall and accuracy of the predictions, one per line.

    Both arguments are passed unchanged to the scikit-learn scorers.
    Nothing is returned.
    """
    report = (
        ('Precision: %f', precision_score),
        ('Recall: %f', recall_score),
        ('Accuracy: %f', metrics.accuracy_score),
    )
    # Compute and print each metric in turn, in the order listed above
    for template, scorer in report:
        print(template % scorer(true_label, predicted_label))

### 1. Multinomial Naive Bayes with Count Vectorizer (BagofWords)

In [None]:
# Baseline Naive Bayes on bag-of-words counts with smoothing alpha = 0.1
nb_classifier = MultinomialNB(alpha=0.1).fit(count_train, y_train)
pred_nb_count = nb_classifier.predict(count_test)
precision_recall(y_test, pred_nb_count)


In [None]:
# Sweep the Naive Bayes smoothing parameter alpha over [0, 1) in steps of 0.05
# and report the hold-out metrics for each value
for alpha in np.arange(0, 1, .05):
    nb_classifier_tune = MultinomialNB(alpha=alpha).fit(count_train, y_train)
    pred_tune = nb_classifier_tune.predict(count_test)
    precision_recall(y_test, pred_tune)
    # The alpha label is printed after the metrics it belongs to
    print("Alpha: {:.2f} ".format(alpha))

The best score is obtained for alpha = 0.15, and is equal to 0.94279.

In [None]:
# Refit with the tuned smoothing value (alpha = 0.15) and plot the confusion matrix
nb_classifier = MultinomialNB(alpha=0.15).fit(count_train, y_train)
pred_nb_count = nb_classifier.predict(count_test)
precision_recall(y_test, pred_nb_count)

cm = metrics.confusion_matrix(y_test, pred_nb_count, labels=[0, 1])
plot_confusion_matrix(
    cm,
    classes=['TRUE', 'FAKE'],
    title='Confusion matrix for a MultinomialNB with Count Vectorizer',
)

Although our model has a good overall accuracy of 94.3 %, it does not score well on false negatives: 223 fake news articles are classified as true news with this model, which is not pleasing to see. So we will try the Tf-IDF vectorizer with this same model to see if it performs better.

### 2. Multinomial Naive Bayes with TF-IDF Vectorizer 

In [None]:
# Baseline Naive Bayes on TF-IDF features with smoothing alpha = 0.1
nb_classifier = MultinomialNB(alpha=0.1).fit(tfidf_train, y_train)
pred_nb_tfidf = nb_classifier.predict(tfidf_test)
precision_recall(y_test, pred_nb_tfidf)


In [None]:
# Sweep the Naive Bayes smoothing parameter alpha over [0, 0.1) in steps of 0.01
# and report the hold-out metrics for each value
for alpha in np.arange(0, 0.1, .01):
    nb_classifier_tune = MultinomialNB(alpha=alpha).fit(tfidf_train, y_train)
    pred_tune = nb_classifier_tune.predict(tfidf_test)
    precision_recall(y_test, pred_tune)
    # The alpha label is printed after the metrics it belongs to
    print("Alpha: {:.2f} ".format(alpha))

In [None]:
# Refit with the chosen smoothing value (alpha = 0.01) and plot the confusion matrix
nb_classifier = MultinomialNB(alpha=0.01).fit(tfidf_train, y_train)
pred_nb_tfidf = nb_classifier.predict(tfidf_test)
precision_recall(y_test, pred_nb_tfidf)

cm2 = metrics.confusion_matrix(y_test, pred_nb_tfidf, labels=[0, 1])
plot_confusion_matrix(
    cm2,
    classes=['TRUE', 'FAKE'],
    title='Confusion matrix for a MultinomialNB with Tf-IDF',
)

The confusion matrix above confirms that this new model is slightly better (its accuracy score is 94.4 %).
However, too many fake news articles are still labeled as true news.
Let's try another model, the Passive Aggressive Classifier, which is commonly used for text classification.

### 3. Passive Aggressive Classifier With Count Vectorizer

In [None]:
# Passive Aggressive classifier on bag-of-words counts.
# PassiveAggressiveClassifier is already imported in the notebook's import
# cell, so it is not re-imported here.
# random_state is fixed because the classifier shuffles the training data by
# default; without a seed the metrics and confusion matrix change on every run.
linear_classifier = PassiveAggressiveClassifier(max_iter=10, random_state=0)
linear_classifier.fit(count_train, y_train)
pred_linear_count = linear_classifier.predict(count_test)
precision_recall(y_test, pred_linear_count)

# Rows are true labels, columns are predictions, in the order [0, 1]
cm6 = metrics.confusion_matrix(y_test, pred_linear_count, labels=[0,1])
plot_confusion_matrix(cm6, classes=['TRUE','FAKE'], title ='Confusion matrix for a PA Classifier with Count Vectorizer')

We get much better results than with the MultinomialNB model, both in terms of accuracy and in terms of false negatives. Only 60 fake news articles were labeled as true news this time.
Let's try with the Tf-IDF method.

### 4. Passive Aggressive Classifier With TF-IDF Vectorizer

In [None]:
# Passive Aggressive classifier on TF-IDF features.
# random_state is fixed because the classifier shuffles the training data by
# default; without a seed the metrics and confusion matrix change on every run.
linear_classifier = PassiveAggressiveClassifier(max_iter=10, random_state=0)
linear_classifier.fit(tfidf_train, y_train)
pred_linear_tfidf = linear_classifier.predict(tfidf_test)
precision_recall(y_test, pred_linear_tfidf)

# Rows are true labels, columns are predictions, in the order [0, 1]
cm5 = metrics.confusion_matrix(y_test, pred_linear_tfidf, labels=[0,1])
plot_confusion_matrix(cm5, classes=['TRUE','FAKE'], title ='Confusion matrix for a PA Classifier with Tf-IDF')

Although we observe more false negatives, the overall accuracy is much better, so this is our best model so far.
Let's try with Logistic Regression now !

### 5. Logistic Regression with TF-IDF Vectorizer 

In [None]:
# Logistic regression on TF-IDF features; C=1e5 means very weak regularisation
logreg = LogisticRegression(C=1e5).fit(tfidf_train, y_train)

# Hard class predictions, plus the predicted probability of class 1
pred_logreg_tfidf = logreg.predict(tfidf_test)
pred_logreg_tfidf_proba = logreg.predict_proba(tfidf_test)[:, 1]
precision_recall(y_test, pred_logreg_tfidf)

cm4 = metrics.confusion_matrix(y_test, pred_logreg_tfidf, labels=[0, 1])
plot_confusion_matrix(
    cm4,
    classes=['TRUE', 'FAKE'],
    title='Confusion matrix for a Logistic Regression with Tf-IDF',
)

In [None]:
# Predict the competition test set and pair each label with its article id
predictions = logreg.predict(test_tfidf_vector)
pred = pd.DataFrame(predictions, columns=['label']).assign(id=test['id'])

# Show how many articles fall into each predicted class
pred.groupby('label').count()

In [None]:
# Write the TF-IDF logistic-regression predictions to CSV without the index column
pred.to_csv('tfidf_pred.csv', index=False)


This model has a very high accuracy score, and only 58 records were misclassified. So far, Logistic Regression performs best! Let's try the same model with CountVectorizer, i.e. the Bag of Words approach.

### 6.  Logistic Regression with CountVectorizer

In [None]:
# Logistic regression on bag-of-words counts; C=1e5 means very weak regularisation
logreg = LogisticRegression(C=1e5).fit(count_train, y_train)
pred_logreg_count = logreg.predict(count_test)
precision_recall(y_test, pred_logreg_count)

cm3 = metrics.confusion_matrix(y_test, pred_logreg_count, labels=[0, 1])
plot_confusion_matrix(
    cm3,
    classes=['TRUE', 'FAKE'],
    title='Confusion matrix for a Logistic Regression with Count Vectorizer',
)

In [None]:
# Predict the competition test set and pair each label with its article id
predictions1 = logreg.predict(test_counts_vector)
pred1 = pd.DataFrame(predictions1, columns=['label']).assign(id=test['id'])

# Show how many articles fall into each predicted class
pred1.groupby('label').count()

In [None]:
# Write the CountVectorizer logistic-regression predictions (pred1) to CSV.
# The TF-IDF predictions live in `pred`; writing that frame here would make
# this file a duplicate of tfidf_pred.csv.
pred1.to_csv('countvect_pred.csv', index=False)


# This is the best model. Even though the accuracy score is a bit lower, fewer fake news articles are labeled as true news, i.e. only 44. Therefore, I choose this model because it keeps the accuracy high while minimizing the false negative rate!

# The accuracy according to the kaggle submission on the Test set is 97.82%