In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import math

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Load covid_training
covid_training = pd.read_csv("data/covid_training.tsv", delimiter="\t") 
# Load covid_test_public
covid_test_public = pd.read_csv("data/covid_test_public.tsv", header=None, delimiter="\t")

In [3]:
# Training set 399 tweets
X_covid_training = covid_training.iloc[:, 1].to_numpy()
Y_covid_training = covid_training.iloc[:, 2].to_numpy()

# Test set 55 tweet
X_covid_test_public = covid_test_public.iloc[:, 1].to_numpy()
Y_covid_test_public = covid_test_public.iloc[:, 2].to_numpy()

X_covid_test_public_tweet_ids = covid_test_public.iloc[:, 0].to_numpy()

In [4]:
# Counting tokens in OV and FV

total_count = Counter()
for tweet in X_covid_training:
    tokens = tweet.lower().split()
    total_count += Counter(tokens)

total_count = total_count.most_common()
words = [i[0] for i in total_count]

def two_or_more_words(word_count):
    return True if word_count[1] >= 2 else False

total_count_fv = list(filter(two_or_more_words, total_count))
words_fv = [i[0] for i in total_count_fv]

In [5]:
# Splitting the dataset into 2 classes (yes, no)

X_covid_training
Y_covid_training

X_covid_training_yes = []
X_covid_training_no = []

for i, c in enumerate(Y_covid_training):
    if c == 'yes':
        X_covid_training_yes.append(X_covid_training[i])
    elif c == 'no':
        X_covid_training_no.append(X_covid_training[i])

P_prior_yes = len(X_covid_training_yes) / (len(X_covid_training_yes) + len(X_covid_training_no))
P_prior_no = len(X_covid_training_no) / (len(X_covid_training_yes) + len(X_covid_training_no))

In [6]:
# yes
X_covid_training_tokens_yes = np.empty((0,0))

total_count_yes = Counter()
for tweet in X_covid_training_yes:
    tokens = tweet.lower().split()
    total_count_yes += Counter(tokens)
    X_covid_training_tokens_yes = np.append(X_covid_training_tokens_yes, np.array(tokens))

total_count_yes = total_count_yes.most_common()
words_yes = [i[0] for i in total_count_yes]
vocab_yes_len = len(X_covid_training_tokens_yes)
frequencies_yes = [i[1] for i in total_count_yes]


def two_or_more(count):
    return True if count >= 2 else False

frequencies_yes_fv = list(filter(two_or_more, frequencies_yes))
words_yes_fv = words_yes[0:len(frequencies_yes_fv)]

def filter_for_fv_yes(token):
    return True if token in words_yes_fv else False
    
X_covid_training_tokens_yes_fv = list(filter(filter_for_fv_yes, X_covid_training_tokens_yes))
vocab_yes_len_fv = len(X_covid_training_tokens_yes_fv)

# no
X_covid_training_tokens_no = np.empty((0,0))

total_count_no = Counter()
for tweet in X_covid_training_no:
    tokens = tweet.lower().split()
    total_count_no += Counter(tokens)
    X_covid_training_tokens_no = np.append(X_covid_training_tokens_no, np.array(tokens))

total_count_no = total_count_no.most_common()
words_no = [i[0] for i in total_count_no]
vocab_no_len = len(X_covid_training_tokens_no)
frequencies_no = [i[1] for i in total_count_no]

frequencies_no_fv = list(filter(two_or_more, frequencies_no))
words_no_fv = words_no[0:len(frequencies_no_fv)]

def filter_for_fv_no(token):
    return True if token in words_no_fv else False
    
X_covid_training_tokens_no_fv = list(filter(filter_for_fv_no, X_covid_training_tokens_no))
vocab_no_len_fv = len(X_covid_training_tokens_no_fv)


In [7]:
# Computing conditional probabilities
P_cond_yes = []
P_cond_no = []
delta = 0.01
vocab_size = len(words)
for word in total_count:
    if word[0] in words_yes:
#         print(f"{words_yes.index(word[0])} {frequencies_yes[words_yes.index(word[0])]} {frequencies_yes[words_yes.index(word[0])] / vocab_yes_len}")
        prob = (frequencies_yes[words_yes.index(word[0])] + delta) / (vocab_yes_len + vocab_size * delta)
        prob = math.log10(prob)
        P_cond_yes.append(prob)
    else:
        prob = delta / (vocab_yes_len + vocab_size * delta)
        prob = math.log10(prob)
        P_cond_yes.append(prob)
    if word[0] in words_no:
        prob = (frequencies_no[words_no.index(word[0])] + delta) / (vocab_no_len + vocab_size * delta)
        prob = math.log10(prob)
        P_cond_no.append(prob)
    else:
        prob = delta / (vocab_no_len + vocab_size * delta)
        prob = math.log10(prob)
        P_cond_no.append(prob)
        
# Computing conditional probabilities
P_cond_yes_fv = []
P_cond_no_fv = []
delta = 0.01
vocab_size_fv = len(words_fv)
for word in total_count_fv:
    if word[0] in words_yes_fv:
#         print(f"{words_yes.index(word[0])} {frequencies_yes[words_yes.index(word[0])]} {frequencies_yes[words_yes.index(word[0])] / vocab_yes_len}")
        prob = (frequencies_yes_fv[words_yes_fv.index(word[0])] + delta) / (vocab_yes_len_fv + vocab_size_fv * delta)
        prob = math.log10(prob)
        P_cond_yes_fv.append(prob)
    else:
        prob = delta / (vocab_yes_len + vocab_size_fv * delta)
        prob = math.log10(prob)
        P_cond_yes_fv.append(prob)
    if word[0] in words_no_fv:
        prob = (frequencies_no_fv[words_no_fv.index(word[0])] + delta) / (vocab_no_len_fv + vocab_size_fv * delta)
        prob = math.log10(prob)
        P_cond_no_fv.append(prob)
    else:
        prob = delta / (vocab_no_len + vocab_size_fv * delta)
        prob = math.log10(prob)
        P_cond_no_fv.append(prob)


In [8]:
# Compute Scores for Test set

Scores_yes = []
Scores_no = []
Scores_yes_fv = []
Scores_no_fv = []

for tweet in X_covid_test_public:
    tokens = tweet.lower().split()
    Score_yes = P_prior_yes
    Score_no = P_prior_no
    Score_yes_fv = P_prior_yes
    Score_no_fv = P_prior_no
    for token in tokens:
        if token in words:
            prob_yes = P_cond_yes[words.index(token)]
            prob_no = P_cond_no[words.index(token)]
            Score_yes *= prob
            Score_no *= prob
        if token in words_fv:
            prob_yes = P_cond_yes_fv[words_fv.index(token)]
            prob_no = P_cond_no_fv[words_fv.index(token)]
            Score_yes_fv *= prob
            Score_no_fv *= prob
            
    Scores_yes.append(Score_yes)
    Scores_no.append(Score_no)
    Scores_yes_fv.append(Score_yes_fv)
    Scores_no_fv.append(Score_no_fv)
    
    
# find yes or no from max score

In [9]:
corrects = 0
wrongs = 0

# format 
# yes = 0
# no = 1
Y_pred_yes = []
Y_pred_no = []
Y_test = []

scientific_notation = "{:.2e}"
out_trace_1 = []
for i in range(0, len(Scores_yes)):
    predicted = "yes" if Scores_yes[i] >= Scores_no[i] else "no"
    actual = Y_covid_test_public[i]
    
    Y_pred_yes_val = 0 if Scores_yes[i] >= Scores_no[i] else 1
    Y_pred_yes.append(Y_pred_yes_val)
    Y_pred_no.append(1 if Y_pred_yes_val == 0 else 0)
    Y_test.append(0 if Y_covid_test_public[i] == "yes" else 1)
    
    
    correct = "correct" if predicted == actual else "wrong"
    if correct == "correct":
        corrects += 1
    elif correct == "wrong":
        wrongs += 1
        
    
    score = max(Scores_yes[i], Scores_no[i])
    line = f"{X_covid_test_public_tweet_ids[i]}  {predicted}  {scientific_notation.format(score)}  {actual}  {correct}"
    out_trace_1.append(line)
    
report_yes = classification_report(Y_test, Y_pred_yes, digits=4)
report_no = classification_report(Y_test, Y_pred_no, digits=4)
conf = confusion_matrix(Y_test, Y_pred_yes)
print(report_yes)
print(report_no)
print(conf)

accuracy = accuracy_score(Y_test, Y_pred_yes)
precision_yes = precision_score(Y_test, Y_pred_yes)
precision_no = precision_score(Y_test, Y_pred_no)
recall_yes = recall_score(Y_test, Y_pred_yes)
recall_no = recall_score(Y_test, Y_pred_no)
f1_yes = f1_score(Y_test, Y_pred_yes)
f1_no = f1_score(Y_test, Y_pred_no)
out_eval_1 = f"""{'{0:.4f}'.format(accuracy)}
{'{0:.4f}'.format(precision_yes)}  {'{0:.4f}'.format(precision_no)}
{'{0:.4f}'.format(recall_yes)}  {'{0:.4f}'.format(recall_no)}
{'{0:.4f}'.format(f1_yes)}  {'{0:.4f}'.format(f1_no)}"""

print(out_eval_1)

              precision    recall  f1-score   support

           0     0.6087    0.4242    0.5000        33
           1     0.4062    0.5909    0.4815        22

    accuracy                         0.4909        55
   macro avg     0.5075    0.5076    0.4907        55
weighted avg     0.5277    0.4909    0.4926        55

              precision    recall  f1-score   support

           0     0.5938    0.5758    0.5846        33
           1     0.3913    0.4091    0.4000        22

    accuracy                         0.5091        55
   macro avg     0.4925    0.4924    0.4923        55
weighted avg     0.5128    0.5091    0.5108        55

[[14 19]
 [ 9 13]]
0.4909
0.4062  0.3913
0.5909  0.4091
0.4815  0.4000


In [10]:
corrects_fv = 0
wrongs_fv = 0

# format 
# yes = 0
# no = 1
Y_pred_yes_fv = []
Y_pred_no_fv = []

scientific_notation = "{:.2e}"
out_trace_2 = []
for i in range(0, len(Scores_yes_fv)):
    predicted = "yes" if Scores_yes_fv[i] >= Scores_no_fv[i] else "no"
    actual = Y_covid_test_public[i]
    
    Y_pred_yes_val = 0 if Scores_yes_fv[i] >= Scores_no_fv[i] else 1
    Y_pred_yes_fv.append(Y_pred_yes_val)
    Y_pred_no_fv.append(1 if Y_pred_yes_val == 0 else 0)
    
    correct = "correct" if predicted == actual else "wrong"
    if correct == "correct":
        corrects_fv += 1
    elif correct == "wrong":
        wrongs_fv += 1
        
    
    score = max(Scores_yes_fv[i], Scores_no_fv[i])
    line = f"{X_covid_test_public_tweet_ids[i]}  {predicted}  {scientific_notation.format(score)}  {actual}  {correct}"
    out_trace_2.append(line)
    
report_yes_fv = classification_report(Y_test, Y_pred_yes_fv, digits=4)
report_no_fv = classification_report(Y_test, Y_pred_no_fv, digits=4)
conf_fv = confusion_matrix(Y_test, Y_pred_yes_fv)
print(report_yes_fv)
print(report_no_fv)
print(conf_fv)

accuracy_fv = accuracy_score(Y_test, Y_pred_yes_fv)
precision_yes_fv = precision_score(Y_test, Y_pred_yes_fv)
precision_no_fv = precision_score(Y_test, Y_pred_no_fv)
recall_yes_fv = recall_score(Y_test, Y_pred_yes_fv)
recall_no_fv = recall_score(Y_test, Y_pred_no_fv)
f1_yes_fv = f1_score(Y_test, Y_pred_yes_fv)
f1_no_fv = f1_score(Y_test, Y_pred_no_fv)
out_eval_2 = f"""{'{0:.4f}'.format(accuracy_fv)}
{'{0:.4f}'.format(precision_yes_fv)}  {'{0:.4f}'.format(precision_no_fv)}
{'{0:.4f}'.format(recall_yes_fv)}  {'{0:.4f}'.format(recall_no_fv)}
{'{0:.4f}'.format(f1_yes_fv)}  {'{0:.4f}'.format(f1_no_fv)}"""

print(out_eval_2)

              precision    recall  f1-score   support

           0     0.6429    0.5455    0.5902        33
           1     0.4444    0.5455    0.4898        22

    accuracy                         0.5455        55
   macro avg     0.5437    0.5455    0.5400        55
weighted avg     0.5635    0.5455    0.5500        55

              precision    recall  f1-score   support

           0     0.5556    0.4545    0.5000        33
           1     0.3571    0.4545    0.4000        22

    accuracy                         0.4545        55
   macro avg     0.4563    0.4545    0.4500        55
weighted avg     0.4762    0.4545    0.4600        55

[[18 15]
 [10 12]]
0.5455
0.4444  0.3571
0.5455  0.4545
0.4898  0.4000


In [11]:
# True Positive TP     False Negative FN
# False Positive FP    True Negative TN

# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)
# Accuracy = (TP + TN) / total


In [12]:
trace_1 = open("output_files/traceNB-BOW-OV.txt", "w")
for line in out_trace_1:
    trace_1.write(line)
    trace_1.write("\n")
trace_1.close()

In [13]:
eval_1 = open("output_files/evalNB-BOW-OV.txt", "w")
eval_1.write(out_eval_1)

eval_1.close()

In [14]:
trace_2 = open("output_files/traceNB-BOW-FV.txt", "w")
for line in out_trace_2:
    trace_2.write(line)
    trace_2.write("\n")
trace_2.close()

In [15]:
eval_2 = open("output_files/evalNB-BOW-FV.txt", "w")
eval_2.write(out_eval_2)

eval_2.close()