In [None]:
import os
import pandas as pd
from textblob import *
import nltk
import numpy as np
import openpyxl
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import *
'''
    WELFAKE
'''



# Read the processed WELFAKE CSV and expose its "label" column under the
# additional name "flag", which the following cells read as the class vector.
data = pd.read_csv(
    r"../data/Des_fake_news/WELFAKE_PROCESSED_FULL.csv"
).assign(flag=lambda frame: frame["label"])



In [None]:
from openpyxl import load_workbook, Workbook

'''
    Define filtered dataset, classes, features, dataframe for model accuracies, and excel file for results
'''

'''Drop NA'''
# Rows containing any missing value are discarded before features are extracted.
filtered = data.dropna()
classes = filtered["flag"].to_numpy()

# Column groups; each feature matrix below is a plain column selection.
tweet_columns = [
    "text_NN_tweets", "title_NN_tweets",
    "text_log_tweets", "title_log_tweets",
    "text_tb_sub_class", "title_tb_sub_class",
]
imdb_columns = [
    "text_NN_imdb", "title_NN_imdb",
    "text_log_imdb", "title_log_imdb",
]
raw_columns = [
    "text_tb_pol", "text_tb_sub", "title_tb_pol", "title_tb_sub",
    "title_vader_comp", "title_vader_neg", "title_vader_neu", "title_vader_pos",
    "text_vader_comp", "text_vader_neg", "text_vader_neu", "text_vader_pos",
]

# tweet-sentiment classifier outputs only
tweet_features = filtered[tweet_columns].to_numpy()

# IMDB-sentiment classifier outputs only
imdb_features = filtered[imdb_columns].to_numpy()

# both sentiment classifiers: tweet columns followed by IMDB columns
tweet_and_imdb = filtered[tweet_columns + imdb_columns].to_numpy()

# raw polarity / subjectivity scores from TextBlob and VADER
raw_features = filtered[raw_columns].to_numpy()

# One row per model family is added later; one column per feature set.
accuracy_df = pd.DataFrame(
    columns=[
        "tweet_classifier",
        "imdb_classifier",
        "combined_classifier",
        "raw_sentiments",
    ]
)

EXCEL_FILE = r"../data/Des_fake_news/Sentiment_Analysis_Results/WELFAKE_RESULTS.xlsx"

# Start from an empty workbook, replacing any results file left by a previous run.
book = Workbook()
book.save(filename=EXCEL_FILE)
book.close()


In [None]:
'''
  Logistic Regression work
'''


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix




# Logistic regression (LBFGS solver, L2 penalty) evaluated on each feature set.
# Every feature set uses the same 85/15 split (random_state=42), so the test
# labels are identical across feature sets and across the later model cells.
log_feature_sets = {
    "tweet_classifier": tweet_features,
    "imdb_classifier": imdb_features,
    "combined_classifier": tweet_and_imdb,
    "raw_sentiments": raw_features,
}

lbgfs = LogisticRegression(penalty="l2", solver="lbfgs")

logOutput = {}       # feature-set name -> test accuracy
logMatrices = {}     # feature-set name -> confusion matrix
log_predictions = {} # feature-set name -> predicted test labels

for name, features in log_feature_sets.items():
    x_train, x_test, y_train, y_test = train_test_split(
        features, classes, test_size=0.15, random_state=42
    )
    lbgfs.fit(x_train, y_train)
    log_predictions[name] = lbgfs.predict(x_test)
    logOutput[name] = accuracy_score(y_test, log_predictions[name])
    # Stored as float, matching what an average over repeated splits would give.
    logMatrices[name] = confusion_matrix(y_test, log_predictions[name]).astype(float)

# Names read by the ROC cell.
log_combined_pred = log_predictions["combined_classifier"]
log_raw_pred = log_predictions["raw_sentiments"]
log_y_actual = y_test

# Append the confusion matrices to the existing results workbook. Append mode
# keeps the sheets already in the file, and the context manager saves and
# closes the writer; "replace" keeps a re-run of this cell idempotent.
with pd.ExcelWriter(
    EXCEL_FILE, engine="openpyxl", mode="a", if_sheet_exists="replace"
) as writer:
    for name, matrix in logMatrices.items():
        pd.DataFrame(matrix).to_excel(writer, sheet_name=f"matrix_log_{name}")

accuracy_df.loc["Log Regression"] = logOutput







In [None]:
'''
    Build and compile neural net models
'''

import tensorflow as tf

def _build_binary_classifier(n_features):
    """Return a compiled one-hidden-layer binary classifier.

    The network flattens an (n_features, 1) input, applies a ReLU hidden layer
    of width n_features, and ends in a single sigmoid unit. It is compiled with
    the Adam optimizer, binary cross-entropy loss and an accuracy metric.
    """
    net = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(n_features, 1)),
        tf.keras.layers.Dense(n_features, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return net


# Number of input features for each feature set.
tweet_len = tweet_features.shape[1]
imdb_len = imdb_features.shape[1]
combined_len = tweet_and_imdb.shape[1]
raw_len = raw_features.shape[1]

# One independent model per feature set, built in the same order as before.
tweet_model = _build_binary_classifier(tweet_len)
imdb_model = _build_binary_classifier(imdb_len)
combined_model = _build_binary_classifier(combined_len)
raw_model = _build_binary_classifier(raw_len)










In [None]:
'''
    NN results
'''

# Number of training epochs, applied to every network so that the accuracies
# collected in accuracy_df are comparable across feature sets.
EPOCHS = 40

# feature-set name -> (compiled model, feature matrix)
nn_runs = {
    "tweet_classifier": (tweet_model, tweet_features),
    "imdb_classifier": (imdb_model, imdb_features),
    "combined_classifier": (combined_model, tweet_and_imdb),
    "raw_sentiments": (raw_model, raw_features),
}

nnOutput = {}       # feature-set name -> test accuracy
nnMatrices = {}     # feature-set name -> confusion matrix
nn_predictions = {} # feature-set name -> thresholded 0/1 test predictions

for name, (model, features) in nn_runs.items():
    x_train, x_test, y_train, y_test = train_test_split(
        features, classes, test_size=0.15, random_state=42
    )
    model.fit(x_train, y_train, epochs=EPOCHS, verbose=2)
    # Train-set evaluation is only logged (verbose=2); the test-set accuracy is recorded.
    model_loss1, model_acc1 = model.evaluate(x_train, y_train, verbose=2)
    model_loss2, model_acc2 = model.evaluate(x_test, y_test, verbose=2)
    nnOutput[name] = model_acc2
    # The final layer is a single sigmoid unit; outputs >= 0.5 are labelled 1.
    nn_predictions[name] = (model.predict(x_test).ravel() >= 0.5).astype(int)
    nnMatrices[name] = confusion_matrix(y_test, nn_predictions[name])

# Names read by the ROC cell.
tweet_pred = nn_predictions["tweet_classifier"]
imdb_pred = nn_predictions["imdb_classifier"]
combined_pred = nn_predictions["combined_classifier"]
raw_pred = nn_predictions["raw_sentiments"]

# Append the confusion matrices to the existing results workbook. Append mode
# keeps the sheets already in the file, and the context manager saves and
# closes the writer; "replace" keeps a re-run of this cell idempotent.
with pd.ExcelWriter(
    EXCEL_FILE, engine="openpyxl", mode="a", if_sheet_exists="replace"
) as writer:
    for name, matrix in nnMatrices.items():
        pd.DataFrame(matrix).to_excel(writer, sheet_name=f"matrix_NN_{name}")

accuracy_df.loc["Neural Net"] = nnOutput





In [None]:
'''
	Categorical feature sets for Naive Bayes and Random Forests
'''


# Outputs of both sentiment classifiers (TextBlob subjectivity class, NN and
# logistic-regression labels for text and title).
categorical_classifier_columns = [
    "text_tb_sub_class", "title_tb_sub_class",
    "text_NN_imdb", "text_NN_tweets",
    "title_NN_imdb", "title_NN_tweets",
    "text_log_imdb", "text_log_tweets",
    "title_log_imdb", "title_log_tweets",
]
categorical_tweet_and_imdb = filtered.loc[:, categorical_classifier_columns].to_numpy()

# Class-valued columns derived from the TextBlob and VADER scores.
categorical_raw_columns = [
    "title_vader_class", "text_vader_class",
    "text_tb_pol_class", "text_tb_sub_class",
    "title_tb_pol_class", "title_tb_sub_class",
]
categorical_raw_features = filtered.loc[:, categorical_raw_columns].to_numpy()

In [None]:
'''
  Naive Bayes work
'''


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix




# Categorical Naive Bayes evaluated on the two categorical feature sets.
# The tweet-only and imdb-only feature sets are not evaluated here, so their
# accuracy columns are filled with 0 as placeholders.
nb_feature_sets = {
    "combined_classifier": categorical_tweet_and_imdb,
    "raw_sentiments": categorical_raw_features,
}

nb = CategoricalNB()

nbOutput = {"tweet_classifier": 0, "imdb_classifier": 0}
nbMatrices = {}     # feature-set name -> confusion matrix
nb_predictions = {} # feature-set name -> predicted test labels

for name, features in nb_feature_sets.items():
    x_train, x_test, y_train, y_test = train_test_split(
        features, classes, test_size=0.15, random_state=42
    )
    nb.fit(x_train, y_train)
    nb_predictions[name] = nb.predict(x_test)
    nbOutput[name] = accuracy_score(y_test, nb_predictions[name])
    # Store the full confusion matrix. Averaging a single 2-D matrix over
    # axis 0 would reduce it to per-column means and lose the matrix.
    nbMatrices[name] = confusion_matrix(y_test, nb_predictions[name])

# Names read by the ROC cell.
nb_combined_pred = nb_predictions["combined_classifier"]
nb_raw_pred = nb_predictions["raw_sentiments"]
log_y_actual = y_test

# Append the confusion matrices to the existing results workbook. Append mode
# keeps the sheets already in the file, and the context manager saves and
# closes the writer; "replace" keeps a re-run of this cell idempotent.
with pd.ExcelWriter(
    EXCEL_FILE, engine="openpyxl", mode="a", if_sheet_exists="replace"
) as writer:
    for name, matrix in nbMatrices.items():
        pd.DataFrame(matrix).to_excel(writer, sheet_name=f"matrix_nb_{name}")

accuracy_df.loc["Naive Bayes"] = nbOutput

accuracy_df


In [None]:
'''
    Random forests work
'''

from sklearn.ensemble import RandomForestClassifier

# Random forest evaluated on the two categorical feature sets.
# The tweet-only and imdb-only feature sets are not evaluated here, so their
# accuracy columns are filled with 0 as placeholders.
rf_feature_sets = {
    "combined_classifier": categorical_tweet_and_imdb,
    "raw_sentiments": categorical_raw_features,
}

# Seeded like the train/test splits so a re-run reproduces the same forest.
rf = RandomForestClassifier(random_state=42)

rfOutput = {"tweet_classifier": 0, "imdb_classifier": 0}
rfMatrices = {}     # feature-set name -> confusion matrix
rf_predictions = {} # feature-set name -> predicted test labels

for name, features in rf_feature_sets.items():
    x_train, x_test, y_train, y_test = train_test_split(
        features, classes, test_size=0.15, random_state=42
    )
    rf.fit(x_train, y_train)
    rf_predictions[name] = rf.predict(x_test)
    rfOutput[name] = accuracy_score(y_test, rf_predictions[name])
    # Store the full confusion matrix. Averaging a single 2-D matrix over
    # axis 0 would reduce it to per-column means and lose the matrix.
    rfMatrices[name] = confusion_matrix(y_test, rf_predictions[name])

# Names read by the ROC cell.
rf_combined_pred = rf_predictions["combined_classifier"]
rf_raw_pred = rf_predictions["raw_sentiments"]
log_y_actual = y_test

# Append the confusion matrices to the existing results workbook. Append mode
# keeps the sheets already in the file, and the context manager saves and
# closes the writer; "replace" keeps a re-run of this cell idempotent.
with pd.ExcelWriter(
    EXCEL_FILE, engine="openpyxl", mode="a", if_sheet_exists="replace"
) as writer:
    for name, matrix in rfMatrices.items():
        pd.DataFrame(matrix).to_excel(writer, sheet_name=f"matrix_rf_{name}")

accuracy_df.loc["Random Forest"] = rfOutput

accuracy_df



In [None]:
'''
    Finally, save accuracy metrics to the spreadsheet
'''

# Append the accuracy table to the existing results workbook. Append mode keeps
# the sheets already in the file, and the context manager saves and closes the
# writer; "replace" keeps a re-run of this cell idempotent.
with pd.ExcelWriter(
    EXCEL_FILE, engine="openpyxl", mode="a", if_sheet_exists="replace"
) as writer:
    accuracy_df.to_excel(writer, sheet_name="predicion_accuracies")



In [None]:
import openpyxl.drawing
from sklearn import metrics
import matplotlib.pyplot as plt
import io

# Imported explicitly so the image class does not depend on another module
# having loaded the openpyxl.drawing.image submodule first.
from openpyxl.drawing.image import Image as ExcelImage

# Every model cell splits with the same classes, test_size and random_state,
# so the y_test left by the last executed cell lines up with every prediction
# vector plotted below.
y = y_test

# (legend template, predicted test labels), in plotting order.
# NOTE(review): all of these are hard 0/1 predictions (the NN outputs are
# thresholded at 0.5 and the sklearn models use .predict), so each ROC "curve"
# has a single operating point. Pass predicted probabilities / raw sigmoid
# outputs here if full curves are wanted.
roc_inputs = [
    ('Raw NN (%0.2f)', raw_pred),
    ('Sentiment Classifier NN  (%0.2f)', combined_pred),
    ('Raw Log (%0.2f)', log_raw_pred),
    ('Sentiment Classifier Log  (%0.2f)', log_combined_pred),
    ('Raw Naive Bayes (%0.2f)', nb_raw_pred),
    ('Sentiment Classifier Naive Bayes  (%0.2f)', nb_combined_pred),
    # Label corrected: this series is the random forest, not Naive Bayes.
    ('Raw Random Forest (%0.2f)', rf_raw_pred),
    ('Sentiment Classifier Random Forest  (%0.2f)', rf_combined_pred),
]

plt.figure()
lw = 2
for label_template, predictions in roc_inputs:
    fpr, tpr, _ = metrics.roc_curve(y, np.asarray(predictions), pos_label=1)
    # The legend entry carries the area under this series' curve.
    plt.plot(fpr, tpr, lw=lw, label=label_template % metrics.auc(fpr, tpr))

# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC)')
plt.legend(loc="lower right")

# Render the figure to an in-memory PNG, then release the figure.
png = io.BytesIO()
plt.savefig(png, format="png")
plt.close()
png.seek(0)  # rewind so the image is read from the start of the buffer

# Embed the PNG at cell A1 of the workbook's active sheet.
book = load_workbook(EXCEL_FILE)
try:
    ws = book.active
    img = ExcelImage(png)
    img.anchor = "A1"
    ws.add_image(img)
    book.save(filename=EXCEL_FILE)
finally:
    book.close()



In [None]:
# Fraction of test labels that fall in the first (lowest-sorted) class value.
unique, counts = np.unique(y_test, return_counts=True)
counts[0] / counts.sum()