# FINAL PROJECT

Input data files are available in the "../data/input/" directory.

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import pandas as pd
import itertools

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay

## Helper Functions

In [None]:
def plot_points(X, y):
    """Scatter the first two columns of ``X``, coloured by ``y``, and show the figure.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, 0]`` / ``X[:, 1]`` indexing.
        Column 0 is drawn on the horizontal axis, column 1 on the vertical axis.
    y : values passed to ``plt.scatter`` as ``c`` and mapped through a
        blue/orange colormap.
    """
    two_colour_map = ListedColormap(['blue', 'orange'])
    horizontal, vertical = X[:, 0], X[:, 1]
    plt.scatter(horizontal, vertical, c=y, cmap=two_colour_map)
    plt.show()

In [None]:
def plot_DB_and_test_data(model, X_train, X_test, y_test):
    """Draw the model's decision regions and overlay the test points.

    The regions are computed by calling ``model.predict`` on a dense grid built
    from two coordinates, so ``model`` must accept two-column input.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train : 2-D array supporting ``X_train[:, 0]`` / ``X_train[:, 1]``;
        its first two columns set the extent of the plotted grid.
    X_test, y_test : test points and labels, forwarded to ``plot_points``.
    """
    # Grid bounds come from the X_train argument (padded by 0.5 on every side).
    # The previous code read a global named ``X`` here, ignoring the parameter.
    x_min, x_max = X_train[:, 0].min() - .5, X_train[:, 0].max() + .5
    y_min, y_max = X_train[:, 1].min() - .5, X_train[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))

    cm = ListedColormap(['royalblue', 'moccasin'])
    # Predict one label per grid node, then restore the grid shape for contourf.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=cm, alpha=.8)
    plt.title('Decision Boundary')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')

    plot_points(X_test, y_test)

    plt.show()

In [None]:
### https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
def confusion_matrix(model, X_train, X_test, y_test):
    """Plot and print the confusion matrix of ``model`` on the test split.

    Parameters
    ----------
    model : fitted estimator handed to ``ConfusionMatrixDisplay.from_estimator``.
    X_train : accepted for call-site compatibility; not used by this function.
    X_test, y_test : evaluation features and true labels.
    """
    matrix_display = ConfusionMatrixDisplay.from_estimator(
        model,
        X_test,
        y_test,
        cmap=plt.cm.Blues,
    )
    matrix_display.ax_.set_title("Confusion Matrix")

    # Heading and raw matrix on consecutive lines of text output.
    print("Confusion Matrix", matrix_display.confusion_matrix, sep="\n")

    plt.show()

# Fake News Classifier

##### We construct word vectors in several ways and evaluate classification performance on each of them to develop our NLP models.

In [None]:
# Load data and perform some preprocessing

# dropna() returns a new frame rather than modifying in place, so it is chained
# here to make sure rows with missing values are actually removed.
# The string labels are then encoded as integers: REAL -> 1, FAKE -> 0.
df = (
    pd.read_csv('data/input/combinedData.csv')
    .dropna()
    .assign(label=lambda frame: frame['label'].map({'REAL': 1, 'FAKE': 0}))
)

In [None]:
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Model input is the 'text' column; the target is the 'label' column.
X = df['text']
y = df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Using CountVectorizer

The CountVectorizer provides a simple way to tokenize a collection of text documents, build a vocabulary of known words, and encode new documents using that vocabulary.

In [None]:
# The vocabulary is learned from the training text only and then reused,
# unchanged, to encode the test text.
cv_vectorizer = CountVectorizer(stop_words='english')
cv_train_counts = cv_vectorizer.fit_transform(X_train.values)
cv_test_counts = cv_vectorizer.transform(X_test.values)

# Convert the vectorizer output to dense arrays for the model cells.
cv_train = cv_train_counts.toarray()
cv_test = cv_test_counts.toarray()

### Using TfidfVectorizer

The TfidfVectorizer tokenizes documents, learns the vocabulary and the inverse document frequency weightings, and lets you encode new documents. Alternatively, if you already have a fitted CountVectorizer, you can combine it with a TfidfTransformer to compute just the inverse document frequencies and start encoding documents.

In [None]:
# The vocabulary and IDF weights are learned from the training text only and
# then reused, unchanged, to encode the test text.
tv_vectorizer = TfidfVectorizer(stop_words='english')
tv_train_weights = tv_vectorizer.fit_transform(X_train)
tv_test_weights = tv_vectorizer.transform(X_test)

# Convert the vectorizer output to dense arrays for the model cells.
tv_train = tv_train_weights.toarray()
tv_test = tv_test_weights.toarray()

# Comparing Classification Models using CountVectorizer

In [None]:
%%time
### Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
clf = GaussianNB().fit(cv_train, y_train)
pred = clf.predict(cv_test)

confusion_matrix(clf, cv_train, cv_test, y_test)
accuracy = accuracy_score(y_test, pred)
print("Model's Accuracy (Gaussian Naive Bayes):", accuracy)


In [None]:
%%time
### Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

# Train on the count features, then label the held-out documents.
clf = MultinomialNB().fit(cv_train, y_train)
pred = clf.predict(cv_test)

confusion_matrix(clf, cv_train, cv_test, y_test)
accuracy = accuracy_score(y_test, pred)
print("Model's Accuracy (Multinomial Naive Bayes):", accuracy)

In [None]:
%%time
### Decision Tree

from sklearn.tree import DecisionTreeClassifier

# A fixed seed makes the fitted tree, and therefore the reported accuracy,
# repeatable between runs; it matches the random_state=0 used by the other
# seeded estimators in this notebook.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(cv_train, y_train)

pred = clf.predict(cv_test)
confusion_matrix(clf, cv_train, cv_test, y_test)
print("Model's Accuracy (Decision Tree):", accuracy_score(y_test, pred))

In [None]:
%%time
### Linear Support Vector Machine

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Features are standardised inside the pipeline before reaching the linear SVM.
clf = make_pipeline(
    StandardScaler(),
    LinearSVC(random_state=0, tol=1e-5),
)
clf.fit(cv_train, y_train)

pred = clf.predict(cv_test)
confusion_matrix(clf, cv_train, cv_test, y_test)
accuracy = accuracy_score(y_test, pred)
print("Model's Accuracy (Linear Support Vector Machine):", accuracy)

In [None]:
%%time
### Passive Aggressive Classifier

from sklearn.linear_model import PassiveAggressiveClassifier
# Imported here as well so the cell does not depend on an earlier model cell
# having been run first.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Features are standardised inside the pipeline before reaching the classifier.
clf = make_pipeline(StandardScaler(), PassiveAggressiveClassifier(random_state=0, tol=1e-5))
clf.fit(cv_train, y_train)

pred = clf.predict(cv_test)
confusion_matrix(clf, cv_train, cv_test, y_test)
# The label now names this model; it previously said "Linear Support Vector Machine".
print("Model's Accuracy (Passive Aggressive Classifier):", accuracy_score(y_test, pred))

In [None]:
%%time
### Support Vector Machine with RBF Kernel
from sklearn import svm

# Features are standardised inside the pipeline before reaching the RBF-kernel SVM.
clf = make_pipeline(
    StandardScaler(),
    svm.SVC(kernel='rbf'),
)
clf.fit(cv_train, y_train)

pred = clf.predict(cv_test)
confusion_matrix(clf, cv_train, cv_test, y_test)
accuracy = accuracy_score(y_test, pred)
print("Model's Accuracy (Support Vector Machine with RBF Kernel):", accuracy)

In [None]:
%%time
### Multi-Layer Perceptron Network
from sklearn.neural_network import MLPClassifier

# One hidden layer of 20 tanh units. The fixed seed makes the weight
# initialisation, and therefore the reported accuracy, repeatable between
# runs; it matches the random_state=0 used by the other seeded estimators.
clf = MLPClassifier(activation='tanh', hidden_layer_sizes=(20,), max_iter=2000, random_state=0)
clf.fit(cv_train, y_train)

pred = clf.predict(cv_test)
confusion_matrix(clf, cv_train, cv_test, y_test)
print("Model's Accuracy (Multi-Layer Perceptron Neural Network):", accuracy_score(y_test, pred))

# Comparing Classification Models using TfidfVectorizer

In [None]:
%%time
### Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
clf = GaussianNB().fit(tv_train, y_train)
pred = clf.predict(tv_test)

confusion_matrix(clf, tv_train, tv_test, y_test)
accuracy = accuracy_score(y_test, pred)
print("Model's Accuracy (Gaussian Naive Bayes):", accuracy)

In [None]:
%%timeit
### Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(tv_train, y_train)

pred = clf.predict(tv_test)
#plot_DB_and_test_data(clf,cv_train, cv_test, y_test)
confusion_matrix(clf,tv_train, tv_test, y_test)
# print(pred)
print("Model's Accuracy (Multinomial Naive Bayes):", accuracy_score(y_test, pred))

In [None]:
%%time
### Decision Tree

from sklearn.tree import DecisionTreeClassifier

# A fixed seed makes the fitted tree, and therefore the reported accuracy,
# repeatable between runs; it matches the random_state=0 used by the other
# seeded estimators in this notebook.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(tv_train, y_train)

pred = clf.predict(tv_test)
confusion_matrix(clf, tv_train, tv_test, y_test)
print("Model's Accuracy (Decision Tree):", accuracy_score(y_test, pred))

In [None]:
%%time
### Linear Support Vector Machine

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Features are standardised inside the pipeline before reaching the linear SVM.
clf = make_pipeline(
    StandardScaler(),
    LinearSVC(random_state=0, tol=1e-5),
)
clf.fit(tv_train, y_train)

pred = clf.predict(tv_test)
confusion_matrix(clf, tv_train, tv_test, y_test)
accuracy = accuracy_score(y_test, pred)
print("Model's Accuracy (Linear Support Vector Machine):", accuracy)

In [None]:
%%timeit

### Passive Aggressive Classifier

from sklearn.linear_model import PassiveAggressiveClassifier

clf = make_pipeline(StandardScaler(), PassiveAggressiveClassifier(random_state=0, tol=1e-5))
clf.fit(tv_train, y_train)

pred = clf.predict(tv_test)
#plot_DB_and_test_data(clf,cv_train, cv_test, y_test)
confusion_matrix(clf,tv_train, tv_test, y_test)
# print(pred)
print("Model's Accuracy (Linear Support Vector Machine):", accuracy_score(y_test, pred))

In [None]:
%%time

### Support Vector Machine with RBF Kernel

from sklearn import svm

# Features are standardised inside the pipeline before reaching the RBF-kernel SVM.
clf = make_pipeline(
    StandardScaler(),
    svm.SVC(kernel='rbf'),
)
clf.fit(tv_train, y_train)

pred = clf.predict(tv_test)
confusion_matrix(clf, tv_train, tv_test, y_test)
accuracy = accuracy_score(y_test, pred)
print("Model's Accuracy (Support Vector Machine with RBF Kernel):", accuracy)

In [None]:
%%time

### Multi-Layer Perceptron Network
from sklearn.neural_network import MLPClassifier

# One hidden layer of 20 tanh units. The fixed seed makes the weight
# initialisation, and therefore the reported accuracy, repeatable between
# runs; it matches the random_state=0 used by the other seeded estimators.
clf = MLPClassifier(activation='tanh', hidden_layer_sizes=(20,), max_iter=2000, random_state=0)
clf.fit(tv_train, y_train)

pred = clf.predict(tv_test)
confusion_matrix(clf, tv_train, tv_test, y_test)
print("Model's Accuracy (Multi-Layer Perceptron Neural Network):", accuracy_score(y_test, pred))