In [1]:
import os
import pandas as pd
from time import time

In [2]:
dataset_folder_name = "DM_dataset_hw2"
dataset_file_name = "fake_news_dataset.csv"

In [3]:
ws_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
dataset_path = os.path.join(ws_path, dataset_folder_name, dataset_file_name)

In [4]:
dataset = pd.read_csv(dataset_path)

In [5]:
# Null check
dataset

Unnamed: 0,target,tweet,score
0,True,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,5
1,True,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,3
2,True,THE SUPREME COURT is siding with super rich pr...,4
3,True,@POTUS Biden Blunders\n\nBroken campaign promi...,5
4,True,@OhComfy I agree. The confluence of events rig...,4
...,...,...,...
134193,False,Joe Biden's family owned African slaves....\n\...,5
134194,False,"Joe Bidens great, great grandfather was a slav...",4
134195,False,"@ChevyChaseToGo ""Joe Bidens great-grandfather ...",5
134196,False,@JoeBiden Facts are Bidens VP Kamala Harris Gr...,3


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [7]:
def display_scores(model, X_train, X_test, y_train, y_test):
    train_start = time()
    model.fit(X_train, y_train)
    print("Train time: ", time() - train_start)
    test_start = time()
    prediction = model.predict(X_test)
    print("Test time: ", time() - test_start)
    print(classification_report(prediction, y_test))

In [15]:
def display_regression_scores(model, X_train, X_test, y_train, y_test):
    train_start = time()
    model.fit(X_train, y_train)
    print("Train time: ", time() - train_start)
    test_start = time()
    prediction = model.predict(X_test)
    print("Test time: ", time() - test_start)
    mae = mean_absolute_error(y_test, prediction)
    rmse = mean_squared_error(y_test, prediction, squared=False)
    print("MAE: ", mae, "RMSE: ", rmse)

# TF-IDF Only

In [9]:
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(dataset["tweet"])
X_train, X_test, y_train, y_test = train_test_split(X, dataset["target"], test_size=0.2, random_state=42)

In [10]:
X

<134198x169270 sparse matrix of type '<class 'numpy.float64'>'
	with 2504114 stored elements in Compressed Sparse Row format>

In [23]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.03248858451843262
Test time:  79.83325672149658
              precision    recall  f1-score   support

       False       0.95      0.99      0.97     12543
        True       0.99      0.95      0.97     14297

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



In [16]:
knnR = KNeighborsRegressor(n_neighbors=3)
display_regression_scores(knnR, X_train, X_test, y_train, y_test)

Train time:  0.016558170318603516
Test time:  97.08044815063477
MAE:  0.14270988574267263 RMSE:  0.23763510289134437


In [24]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  76.22720861434937
Test time:  0.032271623611450195
              precision    recall  f1-score   support

       False       0.98      0.98      0.98     13079
        True       0.98      0.98      0.98     13761

    accuracy                           0.98     26840
   macro avg       0.98      0.98      0.98     26840
weighted avg       0.98      0.98      0.98     26840



In [17]:
dcR = DecisionTreeRegressor()
display_regression_scores(dcR, X_train, X_test, y_train, y_test)

Train time:  115.1116418838501
Test time:  0.02310466766357422
MAE:  0.019485842026825632 RMSE:  0.13959169755693077


In [63]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.06399273872375488
Test time:  0.012064933776855469
              precision    recall  f1-score   support

       False       0.93      0.96      0.95     12728
        True       0.96      0.94      0.95     14112

    accuracy                           0.95     26840
   macro avg       0.95      0.95      0.95     26840
weighted avg       0.95      0.95      0.95     26840



In [20]:
nbR = LinearRegression()
display_regression_scores(nbR, X_train, X_test, y_train, y_test)

Train time:  92.33088374137878
Test time:  0.003396272659301758
MAE:  0.2569401164122587 RMSE:  0.3718085277789252


# TF-IDF + FS

In [21]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=2000)
X = vectorizer.fit_transform(dataset["tweet"])
X_train, X_test, y_train, y_test = train_test_split(X, dataset["target"], test_size=0.2, random_state=42)

In [39]:
X

<134198x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1684751 stored elements in Compressed Sparse Row format>

In [27]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.018564224243164062
Test time:  86.1012933254242
              precision    recall  f1-score   support

       False       0.70      0.99      0.82      9178
        True       1.00      0.78      0.87     17662

    accuracy                           0.85     26840
   macro avg       0.85      0.88      0.85     26840
weighted avg       0.89      0.85      0.85     26840



In [22]:
knnR = KNeighborsRegressor(n_neighbors=3)
display_regression_scores(knnR, X_train, X_test, y_train, y_test)

Train time:  0.02972698211669922
Test time:  155.02567195892334
MAE:  0.14654744162940886 RMSE:  0.36207438998740177


In [28]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  39.396164417266846
Test time:  0.027048349380493164
              precision    recall  f1-score   support

       False       0.96      0.97      0.97     12991
        True       0.97      0.97      0.97     13849

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



In [23]:
dcR = DecisionTreeRegressor()
display_regression_scores(dcR, X_train, X_test, y_train, y_test)

Train time:  67.03870296478271
Test time:  0.03938603401184082
MAE:  0.03087800915669768 RMSE:  0.17519291556510336


In [29]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.035776615142822266
Test time:  0.0049402713775634766
              precision    recall  f1-score   support

       False       0.92      0.95      0.93     12673
        True       0.95      0.92      0.94     14167

    accuracy                           0.94     26840
   macro avg       0.93      0.94      0.94     26840
weighted avg       0.94      0.94      0.94     26840



In [24]:
nbR = LinearRegression()
display_regression_scores(nbR, X_train, X_test, y_train, y_test)

Train time:  1.2061469554901123
Test time:  0.0016748905181884766
MAE:  0.16350531272912402 RMSE:  0.21647101403660274


# TF-IDF PCA

In [25]:
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(dataset["tweet"])
svd = TruncatedSVD(n_components=1000, random_state=42)
X = svd.fit_transform(X)

In [58]:
X.shape

(134198, 1000)

In [59]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.04211544990539551
Test time:  56.516252517700195
              precision    recall  f1-score   support

       False       0.70      0.99      0.82      9178
        True       1.00      0.78      0.87     17662

    accuracy                           0.85     26840
   macro avg       0.85      0.88      0.85     26840
weighted avg       0.89      0.85      0.85     26840



In [26]:
knnR = KNeighborsRegressor(n_neighbors=3)
display_regression_scores(knnR, X_train, X_test, y_train, y_test)

Train time:  0.11029982566833496
Test time:  85.5843997001648
MAE:  0.14654744162940886 RMSE:  0.36207438998740177


In [60]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  37.894444942474365
Test time:  0.02352142333984375
              precision    recall  f1-score   support

       False       0.96      0.97      0.97     12996
        True       0.97      0.97      0.97     13844

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



In [27]:
dcR = DecisionTreeRegressor()
display_regression_scores(dcR, X_train, X_test, y_train, y_test)

Train time:  52.03927493095398
Test time:  0.02969217300415039
MAE:  0.031214336542205397 RMSE:  0.17620764250037513


In [61]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.022018909454345703
Test time:  0.0049817562103271484
              precision    recall  f1-score   support

       False       0.92      0.95      0.93     12673
        True       0.95      0.92      0.94     14167

    accuracy                           0.94     26840
   macro avg       0.93      0.94      0.94     26840
weighted avg       0.94      0.94      0.94     26840



In [28]:
nbR = LinearRegression()
display_regression_scores(nbR, X_train, X_test, y_train, y_test)

Train time:  0.9429681301116943
Test time:  0.0014371871948242188
MAE:  0.16350531272912402 RMSE:  0.21647101403660274


# N-Gram

In [29]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
X = vectorizer.fit_transform(dataset["tweet"])
X_train, X_test, y_train, y_test = train_test_split(X, dataset["target"], test_size=0.2, random_state=42)

In [None]:
X

<134198x3265511 sparse matrix of type '<class 'numpy.float64'>'
	with 7422383 stored elements in Compressed Sparse Row format>

In [None]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.042620182037353516
Test time:  80.97906064987183
              precision    recall  f1-score   support

       False       0.93      0.99      0.96     12349
        True       0.99      0.94      0.96     14491

    accuracy                           0.96     26840
   macro avg       0.96      0.96      0.96     26840
weighted avg       0.96      0.96      0.96     26840



In [30]:
knnR = KNeighborsRegressor(n_neighbors=3)
display_regression_scores(knnR, X_train, X_test, y_train, y_test)

Train time:  0.043862104415893555
Test time:  90.23232293128967
MAE:  0.1610655737704918 RMSE:  0.25398789857905973


In [None]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  1046.8513236045837
Test time:  0.09778380393981934
              precision    recall  f1-score   support

       False       0.98      0.98      0.98     13055
        True       0.98      0.98      0.98     13785

    accuracy                           0.98     26840
   macro avg       0.98      0.98      0.98     26840
weighted avg       0.98      0.98      0.98     26840



In [31]:
dcR = DecisionTreeRegressor()
display_regression_scores(dcR, X_train, X_test, y_train, y_test)

Train time:  478.3501591682434
Test time:  0.0937800407409668
MAE:  0.018964232488822653 RMSE:  0.13771068400390238


In [None]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.40999269485473633
Test time:  0.07758069038391113
              precision    recall  f1-score   support

       False       0.96      0.98      0.97     12788
        True       0.99      0.97      0.98     14052

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



In [32]:
nbR = LinearRegression()
display_regression_scores(nbR, X_train, X_test, y_train, y_test)

Train time:  239.56717801094055
Test time:  0.006571054458618164
MAE:  0.14034388413244994 RMSE:  0.18852741434996928


# N-Gram FS

In [33]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=2000)
X = vectorizer.fit_transform(dataset["tweet"])
X_train, X_test, y_train, y_test = train_test_split(X, dataset["target"], test_size=0.2, random_state=42)

In [None]:
X

<134198x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1770041 stored elements in Compressed Sparse Row format>

In [None]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

Train time:  0.02704167366027832
Test time:  66.8125422000885
              precision    recall  f1-score   support

       False       0.76      0.99      0.86     10059
        True       0.99      0.82      0.90     16781

    accuracy                           0.88     26840
   macro avg       0.88      0.90      0.88     26840
weighted avg       0.91      0.88      0.88     26840



In [34]:
knnR = KNeighborsRegressor(n_neighbors=3)
display_regression_scores(knnR, X_train, X_test, y_train, y_test)

Train time:  0.007241010665893555
Test time:  84.15569376945496
MAE:  0.11346249379036263 RMSE:  0.25705801095638664


In [None]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

Train time:  44.02996802330017
Test time:  0.04000425338745117
              precision    recall  f1-score   support

       False       0.96      0.97      0.97     13015
        True       0.97      0.97      0.97     13825

    accuracy                           0.97     26840
   macro avg       0.97      0.97      0.97     26840
weighted avg       0.97      0.97      0.97     26840



In [35]:
dcR = DecisionTreeRegressor()
display_regression_scores(dcR, X_train, X_test, y_train, y_test)

Train time:  72.89871954917908
Test time:  0.08632493019104004
MAE:  0.03264059586712344 RMSE:  0.18012726730285405


In [None]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

Train time:  0.03705430030822754
Test time:  0.002951383590698242
              precision    recall  f1-score   support

       False       0.93      0.95      0.94     12735
        True       0.95      0.93      0.94     14105

    accuracy                           0.94     26840
   macro avg       0.94      0.94      0.94     26840
weighted avg       0.94      0.94      0.94     26840



In [36]:
nbR = LinearRegression()
display_regression_scores(nbR, X_train, X_test, y_train, y_test)

Train time:  4.1034369468688965
Test time:  0.0016241073608398438
MAE:  0.15610730194561637 RMSE:  0.21010360115666923


# N-Gram PCA

In [37]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
X = vectorizer.fit_transform(dataset["tweet"])
svd = TruncatedSVD(n_components=1000, random_state=42)
X = svd.fit_transform(X)

In [None]:
X

In [None]:
knn = KNeighborsClassifier(n_neighbors = 3)
display_scores(knn, X_train, X_test, y_train, y_test)

In [None]:
knnR = KNeighborsRegressor(n_neighbors=3)
display_regression_scores(knnR, X_train, X_test, y_train, y_test)

In [None]:
dc = DecisionTreeClassifier()
display_scores(dc, X_train, X_test, y_train, y_test)

In [None]:
dcR = DecisionTreeRegressor()
display_regression_scores(dcR, X_train, X_test, y_train, y_test)

In [None]:
nb = MultinomialNB()
display_scores(nb, X_train, X_test, y_train, y_test)

In [None]:
nbR = LinearRegression()
display_regression_scores(nbR, X_train, X_test, y_train, y_test)