In [1]:
import os
import pandas as pd
from textblob import *
import nltk
import numpy as np
import openpyxl
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import *
# ISOT dataset: read the pre-processed CSV into the frame every later cell
# works from.
ISOT_CSV = r"../data/Des_fake_news/ISOT_PROCESSED.csv"
data = pd.read_csv(ISOT_CSV)



In [2]:
from openpyxl import load_workbook, Workbook

# Define the filtered dataset, the class labels, the four feature matrices,
# the dataframe that collects model accuracies, and the Excel file for results.

# Drop every row that has a missing value in any column.
filtered = data.dropna()
classes = filtered["flag"].to_numpy()

# Column groups are named once so the combined matrix is always built from
# exactly the same columns as its two parts.
TWEET_COLUMNS = [
    "text_NN_tweets", "title_NN_tweets",
    "text_log_tweets", "title_log_tweets",
    "text_tb_sub_class", "title_tb_sub_class",
]
IMDB_COLUMNS = [
    "text_NN_imdb", "title_NN_imdb",
    "text_log_imdb", "title_log_imdb",
]
RAW_COLUMNS = [
    "text_tb_pol", "text_tb_sub", "title_tb_pol", "title_tb_sub",
    "title_vader_comp", "title_vader_neg", "title_vader_neu", "title_vader_pos",
    "text_vader_comp", "text_vader_neg", "text_vader_neu", "text_vader_pos",
]

# Tweet sentiment classifier outputs only.
tweet_features = filtered[TWEET_COLUMNS].to_numpy()

# IMDB sentiment classifier outputs only.
imdb_features = filtered[IMDB_COLUMNS].to_numpy()

# Both sentiment classifiers: tweet columns first, then IMDB columns.
tweet_and_imdb = filtered[TWEET_COLUMNS + IMDB_COLUMNS].to_numpy()

# Raw polarity and subjectivity scores from TextBlob and VADER.
raw_features = filtered[RAW_COLUMNS].to_numpy()

# One column per feature set; rows are added per model family
# (logistic regression, neural network).
accuracy_df = pd.DataFrame(
    columns=["tweet_classifier", "imdb_classifier", "combined_classifier", "raw_sentiments"]
)

EXCEL_FILE = r"../data/Des_fake_news/Sentiment_Analysis_Results/ISOT_RESULTS.xlsx"
# Start from a fresh workbook, overwriting any results file left by an earlier run.
book = Workbook()
book.save(filename=EXCEL_FILE)


In [3]:
'''
  Logistic Regression work
'''


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix




# Logistic-regression baseline.
# For each feature set: split 85/15, fit, and record accuracy plus the
# confusion matrix. This is repeated over N_LOG_ITERATIONS seeded splits and
# the per-feature-set averages are reported and written to the results workbook.

# Number of seeded train/test splits to average over. The loop bound and the
# averaging divisor both read this value, so they cannot fall out of sync.
# Currently a single split; raise it (e.g. to 100) for a more stable estimate.
N_LOG_ITERATIONS = 1

# Feature matrices keyed by the accuracy_df column they are reported under.
log_feature_sets = {
  "tweet_classifier": tweet_features,
  "imdb_classifier": imdb_features,
  "combined_classifier": tweet_and_imdb,
  "raw_sentiments": raw_features,
}

# Pin the label order so every confusion matrix has the same shape, even if a
# split happens to miss a class; otherwise the matrices could not be averaged.
class_labels = np.unique(classes)

# Running accuracy sums and per-split confusion matrices, per feature set.
score_sums = {name: 0.0 for name in log_feature_sets}
logMatrices = {name: [] for name in log_feature_sets}

# log regression model, LBFGS with L2 penalty (fit() retrains from scratch each call)
lbgfs = LogisticRegression(penalty="l2", solver="lbfgs")

for i in range(N_LOG_ITERATIONS):
  print(f"Iteration {i}")
  for name, features in log_feature_sets.items():
    # The iteration index doubles as the seed, so every feature set is
    # evaluated on the same row split within an iteration.
    x_train, x_test, y_train, y_test = train_test_split(features, classes, test_size=0.15, random_state=i)
    lbgfs.fit(x_train, y_train)
    y_pred = lbgfs.predict(x_test)
    score_sums[name] += accuracy_score(y_test, y_pred)
    logMatrices[name].append(confusion_matrix(y_test, y_pred, labels=class_labels))

# average accuracies
logOutput = {name: total / N_LOG_ITERATIONS for name, total in score_sums.items()}

# element-wise mean confusion matrix per feature set
logMatrices = {name: np.mean(np.array(matrices), axis=0) for name, matrices in logMatrices.items()}

# Write confusion matrices and save. Append mode adds sheets to the workbook
# that already exists at EXCEL_FILE (it must exist before this runs), and the
# context manager saves and closes the file. "replace" makes re-running this
# cell overwrite its own sheets instead of failing or duplicating them.
with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
  for name, mean_matrix in logMatrices.items():
    pd.DataFrame(mean_matrix).to_excel(writer, sheet_name=f"matrix_log_{name}")

accuracy_df.loc["Log Regression"] = logOutput
accuracy_df






Iteration 0


  writer.book = book


Unnamed: 0,tweet_classifier,imdb_classifier,combined_classifier,raw_sentiments
Log Regression,0.640172,0.525776,0.640024,0.709404


In [20]:
# Build and compile one small neural-network classifier per feature set.
import tensorflow as tf

DIR = r"Fake_news_nn"
# Names of models already on disk. Nothing in this cell reads the list, so a
# missing directory yields an empty list instead of aborting the cell.
saved_models = os.listdir(DIR) if os.path.isdir(DIR) else []

# File names to use when persisting a model, e.g.
# tweet_model.save(f"{DIR}/{tweets_name}")
tweets_name = "ISOT_tweets.keras"
imdb_name = "ISOT_imdb.keras"
combined_name = "ISOT_combined.keras"
raw_name = "ISOT_raw.keras"

# Number of input columns in each feature matrix.
tweet_len = tweet_features.shape[1]
imdb_len = imdb_features.shape[1]
combined_len = tweet_and_imdb.shape[1]
raw_len = raw_features.shape[1]


def build_binary_classifier(n_features):
  """Return a compiled one-hidden-layer binary classifier.

  The network takes flat rows of ``n_features`` values, passes them through a
  ReLU hidden layer of the same width, and emits a single sigmoid output. It is
  compiled with the Adam optimizer, binary cross-entropy loss and an accuracy
  metric.

  Args:
    n_features: number of columns in the feature matrix the model is fed.

  Returns:
    A compiled ``tf.keras`` Sequential model.
  """
  model = tf.keras.models.Sequential([
    # Declare the input with an explicit Input object rather than an
    # `input_shape=` argument on the first layer. The rows are already flat
    # vectors, so no Flatten layer is needed.
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(n_features, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model


tweet_model = build_binary_classifier(tweet_len)
imdb_model = build_binary_classifier(imdb_len)
combined_model = build_binary_classifier(combined_len)
raw_model = build_binary_classifier(raw_len)









  super().__init__(**kwargs)


In [22]:
# NN results.
# Only the raw-sentiment network is trained here; the tweet network can be
# trained the same way by splitting tweet_features and fitting tweet_model.

# Hold out 15% of the rows with a fixed seed so the split is repeatable.
raw_split = train_test_split(raw_features, classes, test_size=0.15, random_state=42)
x_train, x_test, y_train, y_test = raw_split

# The fit call stays last so the cell still shows the returned History object.
raw_model.fit(x_train, y_train, epochs=100, verbose=2)



Epoch 1/100
1192/1192 - 1s - 1ms/step - accuracy: 0.7692 - loss: 0.4871
Epoch 2/100
1192/1192 - 1s - 996us/step - accuracy: 0.7730 - loss: 0.4828
Epoch 3/100
1192/1192 - 1s - 969us/step - accuracy: 0.7736 - loss: 0.4794
Epoch 4/100
1192/1192 - 1s - 974us/step - accuracy: 0.7761 - loss: 0.4767
Epoch 5/100
1192/1192 - 1s - 954us/step - accuracy: 0.7775 - loss: 0.4742
Epoch 6/100
1192/1192 - 1s - 950us/step - accuracy: 0.7781 - loss: 0.4723
Epoch 7/100
1192/1192 - 1s - 955us/step - accuracy: 0.7796 - loss: 0.4706
Epoch 8/100
1192/1192 - 1s - 961us/step - accuracy: 0.7799 - loss: 0.4692
Epoch 9/100
1192/1192 - 1s - 988us/step - accuracy: 0.7807 - loss: 0.4682
Epoch 10/100
1192/1192 - 1s - 962us/step - accuracy: 0.7821 - loss: 0.4671
Epoch 11/100
1192/1192 - 1s - 1ms/step - accuracy: 0.7823 - loss: 0.4660
Epoch 12/100
1192/1192 - 1s - 1ms/step - accuracy: 0.7823 - loss: 0.4651
Epoch 13/100
1192/1192 - 1s - 973us/step - accuracy: 0.7822 - loss: 0.4645
Epoch 14/100
1192/1192 - 1s - 973us/step

<keras.src.callbacks.history.History at 0x24f22e0cc10>

In [23]:
# Score the fitted raw-sentiment network on the training rows and on the
# held-out rows; each evaluate call yields a (loss, accuracy) pair.
train_eval = raw_model.evaluate(x_train, y_train, verbose=2)
test_eval = raw_model.evaluate(x_test, y_test, verbose=2)
(model_loss1, model_acc1), (model_loss2, model_acc2) = train_eval, test_eval
print(f"Train / Test Accuracy: {model_acc1*100:.1f}% / {model_acc2*100:.1f}%")

1192/1192 - 1s - 1ms/step - accuracy: 0.7880 - loss: 0.4479
211/211 - 0s - 955us/step - accuracy: 0.7884 - loss: 0.4515
Train / Test Accuracy: 78.8% / 78.8%


In [None]:
# Turn the network's sigmoid outputs into hard 0/1 labels with a 0.50 cut-off,
# then compare them with the held-out labels.
probabilities = raw_model.predict(x_test)
predicted = (probabilities >= 0.50).astype(int).ravel().tolist()
matrix = confusion_matrix(y_test, predicted)

# Distinct held-out labels and how many rows carry each one.
uniques, counts = np.unique(y_test, return_counts=True)



[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


0.5214678353885009