In [1]:
import os
import pandas as pd
from time import time

In [2]:
dataset_folder_name = "DM_dataset_hw2"
dataset_file_name = "fake_news_dataset.csv"

In [3]:
ws_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
dataset_path = os.path.join(ws_path, dataset_folder_name, dataset_file_name)

In [4]:
dataset = pd.read_csv(dataset_path)

In [5]:
# Null check
dataset

Unnamed: 0,target,tweet,score
0,True,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,5
1,True,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,3
2,True,THE SUPREME COURT is siding with super rich pr...,4
3,True,@POTUS Biden Blunders\n\nBroken campaign promi...,5
4,True,@OhComfy I agree. The confluence of events rig...,4
...,...,...,...
134193,False,Joe Biden's family owned African slaves....\n\...,5
134194,False,"Joe Bidens great, great grandfather was a slav...",4
134195,False,"@ChevyChaseToGo ""Joe Bidens great-grandfather ...",5
134196,False,@JoeBiden Facts are Bidens VP Kamala Harris Gr...,3


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD

In [11]:
def display_scores(model, X_train, X_test, y_train, y_test):
    train_start = time()
    model.fit(X_train, y_train)
    print("Train time: ", time() - train_start)
    test_start = time()
    prediction = model.predict(X_test)
    print("Test time: ", time() - test_start)
    print(classification_report(prediction, y_test))

# TF-IDF Only

In [62]:
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(dataset["tweet"])
X_train, X_test, y_train, y_test = train_test_split(X, dataset["target"], test_size=0.2, random_state=42)

In [37]:
X

<134198x169270 sparse matrix of type '<class 'numpy.float64'>'
	with 2504114 stored elements in Compressed Sparse Row format>

In [23]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.03248858451843262
Test time:  79.83325672149658
              precision    recall  f1-score   support

       False       0.95      0.99      0.97     12543
        True       0.99      0.95      0.97     14297

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



In [24]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  76.22720861434937
Test time:  0.032271623611450195
              precision    recall  f1-score   support

       False       0.98      0.98      0.98     13079
        True       0.98      0.98      0.98     13761

    accuracy                           0.98     26840
   macro avg       0.98      0.98      0.98     26840
weighted avg       0.98      0.98      0.98     26840



In [63]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.06399273872375488
Test time:  0.012064933776855469
              precision    recall  f1-score   support

       False       0.93      0.96      0.95     12728
        True       0.96      0.94      0.95     14112

    accuracy                           0.95     26840
   macro avg       0.95      0.95      0.95     26840
weighted avg       0.95      0.95      0.95     26840



# TF-IDF + FS

In [38]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=2000)
X = vectorizer.fit_transform(dataset["tweet"])
X_train, X_test, y_train, y_test = train_test_split(X, dataset["target"], test_size=0.2, random_state=42)

In [39]:
X

<134198x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1684751 stored elements in Compressed Sparse Row format>

In [27]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.018564224243164062
Test time:  86.1012933254242
              precision    recall  f1-score   support

       False       0.70      0.99      0.82      9178
        True       1.00      0.78      0.87     17662

    accuracy                           0.85     26840
   macro avg       0.85      0.88      0.85     26840
weighted avg       0.89      0.85      0.85     26840



In [28]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  39.396164417266846
Test time:  0.027048349380493164
              precision    recall  f1-score   support

       False       0.96      0.97      0.97     12991
        True       0.97      0.97      0.97     13849

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



In [29]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.035776615142822266
Test time:  0.0049402713775634766
              precision    recall  f1-score   support

       False       0.92      0.95      0.93     12673
        True       0.95      0.92      0.94     14167

    accuracy                           0.94     26840
   macro avg       0.93      0.94      0.94     26840
weighted avg       0.94      0.94      0.94     26840



# TF-IDF PCA

In [57]:
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(dataset["tweet"])
svd = TruncatedSVD(n_components=1000, random_state=42)
X = svd.fit_transform(X)

In [58]:
X.shape

(134198, 1000)

In [59]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.04211544990539551
Test time:  56.516252517700195
              precision    recall  f1-score   support

       False       0.70      0.99      0.82      9178
        True       1.00      0.78      0.87     17662

    accuracy                           0.85     26840
   macro avg       0.85      0.88      0.85     26840
weighted avg       0.89      0.85      0.85     26840



In [60]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  37.894444942474365
Test time:  0.02352142333984375
              precision    recall  f1-score   support

       False       0.96      0.97      0.97     12996
        True       0.97      0.97      0.97     13844

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



In [61]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.022018909454345703
Test time:  0.0049817562103271484
              precision    recall  f1-score   support

       False       0.92      0.95      0.93     12673
        True       0.95      0.92      0.94     14167

    accuracy                           0.94     26840
   macro avg       0.93      0.94      0.94     26840
weighted avg       0.94      0.94      0.94     26840



# N-Gram

In [7]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
X = vectorizer.fit_transform(dataset["tweet"])
X_train, X_test, y_train, y_test = train_test_split(X, dataset["target"], test_size=0.2, random_state=42)

In [8]:
X

<134198x3265511 sparse matrix of type '<class 'numpy.float64'>'
	with 7422383 stored elements in Compressed Sparse Row format>

In [12]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.042620182037353516
Test time:  80.97906064987183
              precision    recall  f1-score   support

       False       0.93      0.99      0.96     12349
        True       0.99      0.94      0.96     14491

    accuracy                           0.96     26840
   macro avg       0.96      0.96      0.96     26840
weighted avg       0.96      0.96      0.96     26840



In [13]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  1046.8513236045837
Test time:  0.09778380393981934
              precision    recall  f1-score   support

       False       0.98      0.98      0.98     13055
        True       0.98      0.98      0.98     13785

    accuracy                           0.98     26840
   macro avg       0.98      0.98      0.98     26840
weighted avg       0.98      0.98      0.98     26840



In [14]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.40999269485473633
Test time:  0.07758069038391113
              precision    recall  f1-score   support

       False       0.96      0.98      0.97     12788
        True       0.99      0.97      0.98     14052

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



# N-Gram FS

In [15]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=2000)
X = vectorizer.fit_transform(dataset["tweet"])
X_train, X_test, y_train, y_test = train_test_split(X, dataset["target"], test_size=0.2, random_state=42)

In [16]:
X

<134198x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1770041 stored elements in Compressed Sparse Row format>

In [17]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.02704167366027832
Test time:  66.8125422000885
              precision    recall  f1-score   support

       False       0.76      0.99      0.86     10059
        True       0.99      0.82      0.90     16781

    accuracy                           0.88     26840
   macro avg       0.88      0.90      0.88     26840
weighted avg       0.91      0.88      0.88     26840



In [18]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  44.02996802330017
Test time:  0.04000425338745117
              precision    recall  f1-score   support

       False       0.96      0.97      0.97     13015
        True       0.97      0.97      0.97     13825

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



In [19]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.03705430030822754
Test time:  0.002951383590698242
              precision    recall  f1-score   support

       False       0.93      0.95      0.94     12735
        True       0.95      0.93      0.94     14105

    accuracy                           0.94     26840
   macro avg       0.94      0.94      0.94     26840
weighted avg       0.94      0.94      0.94     26840



# N-Gram PCA

In [21]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
X = vectorizer.fit_transform(dataset["tweet"])
svd = TruncatedSVD(n_components=1000, random_state=42)
X = svd.fit_transform(X)

KeyboardInterrupt: 

In [None]:
X

In [None]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

In [None]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

In [None]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)