# Data Cleaning and Pre-Processing

##### First, we import our data and necessary libraries. 

In [41]:
import numpy as np
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn import metrics
import pandas as pd

combo_df = pd.read_csv("data/monkeypox.csv", encoding='latin-1')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloadi

##### Next, we set a random seed to keep our results consistent each time the script is ran. We then perform basic data cleaning, including removing blank rows, conversion to lowercase, and tokenization of text stream into meaningful elements.

In [42]:
np.random.seed(333)

combo_df = combo_df.rename(columns={'ï»¿number': 'id'})

#remove any blank rows
combo_df['text'].dropna(inplace=True)

#convert all to lowercase
combo_df['text'] = combo_df['text'].astype(str)
combo_df['text'] = combo_df['text'].str.lower()
print(combo_df.head(10))

#tokenization
combo_df['tokenized_text'] = combo_df.apply(lambda row: nltk.word_tokenize(row["text"]), axis=1)
print(combo_df['tokenized_text'].head(10))

   id           created_at                                               text  \
0  18  2022-07-08 10:12:04  much of the focus on #monkeypox recently has b...   
1  31  2022-07-09 00:00:00  icymi: the first probable case of monkeypox in...   
2  32  2022-07-09 00:00:01  who: #monkeypox outbreak not yet a global publ...   
3  33  2022-07-09 00:00:04  according to the cdc, monkeypox is usually spr...   
4  34  2022-07-09 00:00:07  lgbtq advocates and health care organizations ...   
5  35  2022-07-09 00:00:13  @slinderboy @smg4official anyone can contract ...   
6  37  2022-07-09 00:00:17  new info: the number of monkeypox cases in tex...   
7  39  2022-07-09 00:00:19  here's how you can get tested for monkeypox if...   
8  41  2022-07-09 00:00:27  the u.s. may be losing the fight against monke...   
9  42  2022-07-09 00:00:31  07/08/2022 11:11 pm utc  :newspaper: monkeypox...   

                  source  user is verified  user has url  \
0        Twitter Web App             False      

##### The following step includes word lemmatization. This process reduces words into a common meaning and removes stop words and non alphabetic terms in order to have a tokenized version of the text where we have reduced strings to their basic meaning.

In [43]:
#WordNetLemmatizer: remove stop words, non-alpha text, and word lemmatization
pos_map = defaultdict(lambda : wn.NOUN)
pos_map['J'] = wn.ADJ
pos_map['V'] = wn.VERB
pos_map['R'] = wn.ADV

In [44]:
for i, text in enumerate(combo_df['tokenized_text']):
    final_words = []
    word_lem = WordNetLemmatizer()
    for word, tag in pos_tag(text):
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_lem.lemmatize(word, pos=pos_map[tag[0]])
            final_words.append(word_final)
    combo_df.loc[i, 'tokenized_text'] = str(final_words)

##### The following three cells prepare data to be used in machine learning techniques. The first is splitting data into training data used to build the model and testing data to predict outcomes of misinformation or not misinformation. 

##### The next cell leverages LabelEncoder to transform words into numerical values, and the following cell vectorizes data into numerical feature vectors that summarize frequency of words.

In [45]:
#split data into training and testing set: 30% testing, 70% training
train_id, test_id, train_x, test_x, train_y, test_y = model_selection.train_test_split(combo_df['id'],combo_df['tokenized_text'], combo_df['binary_class'], test_size=0.3)

In [46]:
#encoding to numerical values that the model can understand
Encoder = LabelEncoder()
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.fit_transform(test_y)

In [47]:
#word vectorization--turn collection of text into numerical feature vectors using
# term frequency -- inverse document (TF-IDF)

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(combo_df['tokenized_text'])

train_x_Tfidf = Tfidf_vect.transform(train_x)
test_x_Tfidf = Tfidf_vect.transform(test_x)

print(Tfidf_vect.vocabulary_)
print(train_x_Tfidf)

  (0, 2564)	0.0951777891421927
  (0, 2146)	0.45899488805972394
  (0, 1777)	0.2904729848549999
  (0, 1764)	0.3526569415494313
  (0, 1756)	0.2363152438141047
  (0, 1691)	0.48871321572176873
  (0, 446)	0.5261538806807776
  (1, 4453)	0.75316683159535
  (1, 2984)	0.21143592817681348
  (1, 2702)	0.5863414306065816
  (1, 2563)	0.2103290251364713
  (2, 4980)	0.5417729324755914
  (2, 4644)	0.39871080046720786
  (2, 2975)	0.3258306308218311
  (2, 2667)	0.3954364610072977
  (2, 2634)	0.3140995437373291
  (2, 2564)	0.06826823130574446
  (2, 1957)	0.3034305519609314
  (2, 768)	0.26237142612146186
  (2, 681)	0.14404253768220682
  (3, 4184)	0.3250652924372735
  (3, 3897)	0.6000306758470783
  (3, 3748)	0.3728961976181629
  (3, 3045)	0.25040714245622825
  (3, 2984)	0.10468285424889852
  :	:
  (4046, 2564)	0.09365685663918087
  (4046, 2241)	0.29535865718734816
  (4046, 1585)	0.546989420490643
  (4047, 4119)	0.4168046177863536
  (4047, 2984)	0.12107597551839042
  (4047, 2563)	0.12044212219663318
  (4047,

# Using a Support Vector Machine to Classify Tweets
We have 91.48% accuracy in classifying Tweets as misinformation or not misinformation with 92.75% precision and 57.19% recall. Higher precision means that the algorithm is more accurate in returning positive predictions, whereas higher recall means that there is a higher quantity in positive predictions. We have a lower recall score, which means that there may be fewer positive predictions. However, with our high precision, the positive predictions that we do make are generally accurate. 

In this SVM, there are 1623 support vectors in the data, meaning that those are the data points closest to the hyperplane in the SVM. These data points are the most difficult to classify as misinformation or not misinformation. 
### How to use in recommender system: 
In a content-based filtering recommendation algorithm, we can use the misinformation classification results made by the SVM to not recommend content that has been classified as "misinformation", which is labeled by "1". `prediction_SVM` is an array containing the classification of Tweets as "0"s and "1"s. By flagging the "1"s, we can filter out the misinformation recommended to users.

In [48]:
def run_SVM(x_tfidf, y):
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(x_tfidf, y)
    prediction_SVM = SVM.predict(test_x_Tfidf)
    return prediction_SVM, SVM

prediction_SVM, SVM = run_SVM(train_x_Tfidf, train_y)

prediction_df = pd.DataFrame({'id': test_id, 'Testing Text': test_x, 'SVM Classifier': prediction_SVM})
combo_df_testingset = prediction_df.merge(combo_df, on='id', how='inner', suffixes=('_1', '_2'))
    
print('{} support vectors in the data'.format(len(SVM.support_)))
print('SVM Accuracy Score -> {:.2f} percent'.format(metrics.accuracy_score(prediction_SVM, test_y)*100))
print('SVM Precision Score -> {:.2f} percent'.format(metrics.precision_score(test_y, prediction_SVM) * 100))
print('SVM Recall Score -> {:.2f} percent'.format(metrics.recall_score(test_y, prediction_SVM) * 100))

1623 support vectors in the data
SVM Accuracy Score -> 91.48 percent
SVM Precision Score -> 92.75 percent
SVM Recall Score -> 57.19 percent


# Calculating Social Credibility Scores for Users
We use Kang et al.'s method of calculating social credibility, a formula using user metadata such as retweet count and follower count, to calculate. The credibility score runs on a scale from 0 to 10, with 10 being most credible and 0 least credible. 
### How to use in recommender system:
In a collaborative filtering recommendation algorithm, we can use credibility scores to have those with less credibility to show up less in one's social media feed. 

In [49]:
#data passed in should be the dataset with the tweets, id, and data about the tweets like retweets etc.
def get_credibility(data):
    mean_retweet_count = data['retweet_count'].mean()
    mean_follower_count = data['followers count'].mean()
    total_tweet_count = data.size
    def cred_score(retweets, followers, user_tweet_count):
        utility = abs(((retweets * followers)/user_tweet_count) - ((mean_retweet_count * mean_follower_count)/total_tweet_count))
        standardized = np.sqrt((utility ** 2)/(total_tweet_count-1))
        return 10 - (standardized * 100)
    # data['credibility'] = cred_score(data['retweet_count'],
    #                                         data['followers count'],
    #                                         data['tweet count'])
    data['credibility'] = cred_score(data['retweet_count'],
                                         data['followers count'],
                                         data['tweet count'])
    #if the score is less than 0, then make it 0 because the score is low enough to not recommend the user in the algorithm
    data.loc[combo_df_testingset['credibility'] < 0, "credibility"] = 1.0

get_credibility(combo_df_testingset)
print(combo_df_testingset['credibility'])
print("The maximum social credibility score is: ", combo_df_testingset['credibility'].max())
print("The minimumsocial credibility score is: ", combo_df_testingset['credibility'].min())
print("The mean social credibility score is: ", combo_df_testingset['credibility'].mean())


0       7.370248
1       7.370248
2       7.370248
3       1.000000
4       4.511804
          ...   
1732    7.370248
1733    7.370248
1734    7.370248
1735    7.370248
1736    7.370248
Name: credibility, Length: 1737, dtype: float64
The maximum social credibility score is:  9.953450871490194
The minimumsocial credibility score is:  0.4662727786163856
The mean social credibility score is:  7.235849981491154


In [50]:
# combo_df[['id', 'credibility']].to_csv('data/credibility_scores.csv', index=False)
combo_df_testingset.head()

Unnamed: 0,id,Testing Text,SVM Classifier,created_at,text,source,user is verified,user has url,user description,user created at,...,quote_count,followers count,following count,tweet count,listed_count,user location,binary_class,ternary_class,tokenized_text,credibility
0,4,"['money', 'pox', 'spread', 'wonder', 'gon', 'n...",1,2022-07-07 18:14:40,money pox is spreading... and i'm wondering if...,Twitter Web App,False,True,( -_-)ï¸»ãâä¸:collision: I'm jealous of m...,2018-03-05 00:25:50,...,0,268,133,9062,3,On a Screen Near You!,1,1,"['money', 'pox', 'spread', 'wonder', 'gon', 'n...",7.370248
1,958,"['man', 'likely', 'spread', 'monkeypox', 'texa...",0,2022-07-09 03:41:56,uhâ¦. man likely spread #monkeypox at texas b...,Twitter for iPhone,False,False,We think in generalities; we live in details. ...,2009-04-30 11:59:23,...,0,543,4532,35606,33,Somewhere,0,0,"['man', 'likely', 'spread', 'monkeypox', 'texa...",7.370248
2,1188,"['u', 'monkey', 'monkeypoxvaccine', 'dos', 'de...",0,2022-07-09 05:08:48,#us has 17m :monkey: #monkeypoxvaccine doses i...,Twitter Web App,False,False,International Health Programmes 3 Decades #HI...,2018-03-29 16:07:40,...,0,100,152,12315,1,Global,0,9,"['u', 'monkey', 'monkeypoxvaccine', 'dos', 'de...",7.370248
3,301,"['nyc', 'democrat', 'blast', 'biden', 'monkeyp...",0,2022-07-09 00:50:57,nyc democrats blast biden over monkeypox vacci...,SocialFlow,True,True,Breaking news & features from The New York Pos...,2008-11-18 19:46:36,...,2,2794294,10965,485468,20966,"New York, NY",0,0,"['nyc', 'democrat', 'blast', 'biden', 'monkeyp...",1.0
4,2402,"['monkeypox', 'touch', 'state', 'yeah', 'mask'...",0,2022-07-09 13:08:43,monkeypox just touched down in my state yeah t...,Twitter for iPhone,False,True,"Jusme Kamil :kiss_mark: 5'5"", 6'8"" worth of ga...",2019-01-31 03:21:30,...,1,52555,1221,46365,65,soufside,0,9,"['monkeypox', 'touch', 'state', 'yeah', 'mask'...",4.511804


Here we are creating a dictionary that maps an ID for each tweet to the text of the tweet, the SVM classifier which determines if it's misinformation (1) or not (0), and the credibility score of the user who tweeted it.

In [51]:
id_info_dict = combo_df_testingset.set_index('id')[['text', 'SVM Classifier', 'credibility']].apply(tuple, axis=1).to_dict()
#this dictionary maps each tweet id to it's text, the misinfo classifier, and the credibility score.
#text is position 0, SVM is 1, credibility is 2

Here, we are generating interactions to each tweet for 200 users. Interactions include likes, retweets, and comments. 

In [326]:
import pandas as pd
from faker import Faker
import random

fake = Faker()

# generate a fake user with a specified misinformation preference
def generate_user(user_id, misinformation_preference):
    return {
        'user_id': user_id,
        'misinformation_preference': misinformation_preference,
    }

def generate_interactions(users, tweets, num_interactions, misinformation_preference):
    interactions = []

    tweets['binary_class'] = tweets['binary_class'].astype(int)

    # filter tweets based on misinformation_preference
    if misinformation_preference == 'misinformation':
        filtered_tweets = tweets[tweets['binary_class'] == 1]
    elif misinformation_preference == 'non-misinformation':
        filtered_tweets = tweets[tweets['binary_class'] == 0]
    else:
        filtered_tweets = tweets

    for _ in range(num_interactions):
        user_id = random.choice(users)['user_id']
        
        if not filtered_tweets.empty:

            tweet_ids = filtered_tweets['id'].tolist()
            tweet_id = random.choice(tweet_ids)

            interaction_type = random.choice(['like', 'retweet', 'comment'])
            interactions.append({'user_id': user_id, 'tweet_id': tweet_id, 'interaction_type': interaction_type})

    return pd.DataFrame(interactions)

def generate_fake_data(num_users, real_tweets_df, num_interactions_per_user):
    users = []
    interactions = []

    for user_id in range(1, num_users + 1):
        if user_id <= 75:
            misinformation_preference = 'misinformation'
        elif user_id <= 125:
            misinformation_preference = 'non-misinformation'
        else:
            misinformation_preference = None

        users.append(generate_user(user_id, misinformation_preference))
        
        user_interactions = generate_interactions(users[-1:], real_tweets_df, num_interactions_per_user, misinformation_preference)
        interactions.append(user_interactions)

    users_df = pd.DataFrame(users)
    interactions_df = pd.concat(interactions, ignore_index=True)

    return users_df, interactions_df


real_tweets_df = combo_df_testingset
# print(len(combo_df_testingset))

# generate a fake dataset with 200 users and 50 interactions per user
num_users = 200
num_interactions_per_user = 50

fake_users_df, fake_interactions_df = generate_fake_data(num_users, real_tweets_df, num_interactions_per_user)

print("Fake Users DataFrame:")
print(fake_users_df.head())

print("\nFake Interactions DataFrame:")
print(fake_interactions_df.head())

fake_users_df.to_csv("data/preference_fake_users.csv", index=False)
fake_interactions_df.to_csv("data/preference_fake_interactions.csv", index=False)

Fake Users DataFrame:
   user_id misinformation_preference
0        1            misinformation
1        2            misinformation
2        3            misinformation
3        4            misinformation
4        5            misinformation

Fake Interactions DataFrame:
   user_id  tweet_id interaction_type
0        1      3500          comment
1        1        24             like
2        1      4634             like
3        1      2585          comment
4        1      5778          comment


The next few cells outline our recommendation system. We use a k-nearest-neighbors approach and both a content-filtering system (the SVM score) and a collaborative-filtering system (the credibility score) to craft recommendations. The standard recommender only uses KNN, but recommender2 uses the two scores.

In [327]:
from collections import defaultdict
import math
import csv

interactions = []

with open("data/preference_fake_interactions.csv", "r") as file:
    k = 0
    for line in file:
        if (k == 0):
            pass
        else:
            li = line.strip().split(",")
            interactions.append(li)
        k = k + 1

interactions_dict = {}
userset = set()

for interaction in interactions:
    user = int(interaction[0])
    tweet = int(interaction[1])
    interaction_type = interaction[2]
    userset.add(user)
    if (user not in interactions_dict.keys()):
        interactions_dict[user] = {}
    if (tweet not in interactions_dict[user].keys()):
        interactions_dict[user][tweet] = 0           
    if (interaction_type == "like"):
        interactions_dict[user][tweet] = interactions_dict[user][tweet] + 2
    if (interaction_type == "comment"):
        interactions_dict[user][tweet] = interactions_dict[user][tweet] + 3
    if (interaction_type == "retweet"):
        interactions_dict[user][tweet] = interactions_dict[user][tweet] + 2

def angulardistance(combinedlist):
    a = 0
    b = 0
    c = 0
    for i in combinedlist:
        x = i[0]
        y = i[1]
        a += x * y
        b += x * x
        c += y * y
    return 1 - a/((math.sqrt(b)*math.sqrt(c)))

def ratingdistance(user1, user2, threshold):
    combined = defaultdict(list)
    combinedlist = []
    for tweet in interactions_dict[user1].keys():
        combined[tweet].append(interactions_dict[user1][tweet])
    for tweet in interactions_dict[user2].keys():
        combined[tweet].append(interactions_dict[user2][tweet])
        if (len(combined[tweet]) > 1):
            combinedlist.append(combined[tweet])
    if (len(combinedlist) > threshold - 1):
    #    print(user2, len(combinedlist))
       return angulardistance(combinedlist)
    else:
        return 1

def knearestneighbor(u, S, threshold, k):
    if (k > len(S)):
        return S
    neighbors = []
    for user in S:
        if (user != u):
            distance = ratingdistance(u, user, threshold)
            neighbors.append((user, distance))
    # print(neighbors)
    neighbors.sort(key=lambda x: x[1])
    ret = []
    for i in range(k):
        ret.append(neighbors[i][0])
    # print(ret)
    return(ret)

def get_credibility_scores():
    credibility_dict = {id: info[2] for id, info in id_info_dict.items()}
    return credibility_dict

This standard recommender doesn't integrate the credibility or SVM scores

In [328]:
def recommender_standard(u, nrecs, k):
    interactions_copy = interactions_dict.copy()
    neighbors = knearestneighbor(u, userset, 3, k)
    tweets = defaultdict(list)
    for tweet in interactions_copy[u].keys():
        tweets[tweet] = ["PASS"]
    for neighbor in neighbors:
        for tweet in interactions_copy[neighbor].keys():
            if (tweet in tweets.keys()):
                if (tweets[tweet] != ["PASS"]):
                    tweets[tweet][0] += 1  # adjust the interaction count
                    tweets[tweet][1] = tweets[tweet][1] * interactions_copy[neighbor][tweet]
            else:
                tweets[tweet] = [1, interactions_copy[neighbor][tweet]]
    smoothedprediction = []
    for tweet, data in tweets.items():
        if (data != ["PASS"]):
            average = data[1]/data[0]
            prediction = (1 + (data[0]*average))/(1 + data[0])
            smoothedprediction.append([id_info_dict[tweet][0], prediction, id_info_dict[tweet][1]])
            # smoothedprediction.append([id_info_dict[tweet][0], data, id_info_dict[tweet][1]])
    smoothedprediction.sort(key=lambda x: x[1], reverse=True)
    return smoothedprediction[0:nrecs]

In [329]:
recommendations = recommender_standard(1, 10, 10)
for line in recommendations:
    print(line)

['@sparkybru @crystal92075604 no. i voted for hillary, and thought trump was full of shit about the swamp. now i see how wrong i was, and would not be surprised to see an outbreak of monkeypox before the 2024 election and 100% vote by mail.', 41.285714285714285, 1]
["@tonyhinton2016 sa and portugal are ahead of us with their wave, it peaked and is falling without restrictions or bits of cloth, had little impact on anything. we should be the same, but yeah, fear is what they thrive on. they're not getting anywhere with monkeypox so let's get covid going!", 27.166666666666668, 0]
['@cynthiamckinney monkey pox was caught exclusively by promiscuous homosexuals which is why when this news was revealed in pride month they shut down the story of its contagion.', 20.166666666666668, 0]
["@kprather88 and start making monkeypox tests widely available &amp; let people know it isn't a gay disease and is airborne", 10.333333333333334, 1]
["leaked document between bill gates and who from 2021 correc

In [330]:
recommendations = recommender_standard(51, 10, 10)
for line in recommendations:
    print(line)

["monkeypox is airborne, fabric borne, can pass via skin to skin contact, is infectious once symptoms occur. needless to say, our minimizers are telling us it's not airborne, but it is. there is a 1-11% fatality rate, so this is not something to fool around with.", 16.4, 1]
["@drericding @kavitapmd @yaneerbaryam is anyone going to address the issue that, like sars, monkeypox was also recently researched by the nih and wuhan institute, and that the virus's dna muted 12 times faster than you'd expect a dna based virus normally would?", 14.6, 1]
['starting to think the monkeypox is actually a new hiv and maybe not related to a pox', 12.166666666666666, 0]
['@theeconomist almost anyone = every one paid to talk nonsense and spread the propaganda and the dummies who got scared. everyone else knew it was rubbish. just like monkey pox is now.', 9.25, 0]
["leaked document between bill gates and who from 2021 correctly 'predicted' the exact day monkeypox outbreak would begin â\x80\x93 mic drop p

In [331]:
from collections import defaultdict

def recommender2(u, nrecs, k):
    interactions_copy = interactions_dict.copy()
    neighbors = knearestneighbor(u, userset, 3, k)
    tweets = defaultdict(list)
    credibility_scores = get_credibility_scores()
    penalty = 1.0
    for tweet in interactions_copy[u].keys():
        tweets[tweet] = ["PASS"]
    for neighbor in neighbors:
        for tweet in interactions_copy[neighbor].keys():
            credibility_score = credibility_scores.get(tweet, 1.0)
            if tweet in tweets:
                if tweets[tweet] != ["PASS"]:
                    if is_misinformation(tweet):
                        # print("misinformation penalty")
                        penalty = 0.2
                    else:
                        # print("no penalty")
                        penalty = 1.0
                    tweets[tweet][0] += 1
                    tweets[tweet][1] += credibility_score * penalty * interactions_copy[neighbor][tweet]
            else:
                tweets[tweet] = [credibility_score * penalty, credibility_score * penalty * interactions_copy[neighbor][tweet]]

    smoothed_prediction = []

    #generate smoothed predictions based on interactions
    for tweet, data in tweets.items():
        if data != ["PASS"]:
            credibility_adjusted_count = data[0]
            credibility_adjusted_score = data[1] / data[0]  # adjusted average based on credibility
            prediction = (1 + (credibility_adjusted_count * credibility_adjusted_score)) / (1 + credibility_adjusted_count)
            smoothed_prediction.append([id_info_dict[tweet][0], prediction, id_info_dict[tweet][1]])

    # sort and return the top nrecs recommendations
    smoothed_prediction.sort(key=lambda x: x[1], reverse=True)
    return smoothed_prediction[:nrecs]

def is_misinformation(tweet_id):
    misinformation_data = id_info_dict[tweet_id][1]
    return misinformation_data == 1

def read_misinformation_csv(file_path):
    misinformation_dict = {}
    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)
        for row in reader:
            tweet_id, prediction = row[0], int(row[1])
            misinformation_dict[tweet_id] = prediction
    return misinformation_dict

In [332]:
recommendations = recommender2(1, 10, 20)
for line in recommendations:
    print(line)

["leaked document between bill gates and who from 2021 correctly 'predicted' the exact day monkeypox outbreak would begin â\x80\x93 mic drop politics _url_", 17.68627516846826, 0]
["my mom just warned me about monkey pox and encouraged me to get a vaccine if it's available and i think it's very cute she thinks i am a sexually active individual :face_with_tears_of_joy::face_with_tears_of_joy:", 15.98705914642687, 0]
['@freedomkiwi well covid, ukraine and now monkey pox... feels like a distraction technique used by a magician to cover what is really hapening', 14.185216995216447, 0]
["monkeypox is basically an std at this point. here's why. _url_", 11.53875655455364, 0]
['@foxnews if you take it up the dirt road you might get monkey pox #fuckgates&amp;soros. follow the $ and you find the criminal', 11.49242144058525, 0]
['@awokezombie try to find out if they printed the ballots for november ....yet... you know monkey pox is going to make us use paperballots... last time they started prin

In [333]:
recommendations = recommender2(51, 10, 10)
for line in recommendations:
    print(line)

['@theeconomist almost anyone = every one paid to talk nonsense and spread the propaganda and the dummies who got scared. everyone else knew it was rubbish. just like monkey pox is now.', 12.743239655144585, 0]
['new - #who\'s tedros is "concerned by the scale and spread" of the #monkeypox virus outbreak. #plandemic2.0 #bioweapons #inplainsight #nuremberg2.0 #faucilied #globalistelites #medicaltyranny2.0 -vs- #constitutionalduty #freedom #humanrights _url_', 12.168332233609288, 0]
['starting to think the monkeypox is actually a new hiv and maybe not related to a pox', 11.994128303732499, 0]
['. . very good observation thomas.  they also will not report the vaxx status of the people who have contracted "monkey pox" (aka, side effect of the jab) _url_', 11.870927639505707, 0]
['@nosoul_first for the prevention monkey pox, keep you mouth and ass shut and we should be just fine. :grimacing_face::grinning_squinting_face::grinning_squinting_face:', 11.744029243296222, 0]
["leaked document be

Finally, we run trials over each of the 100 generated users in the set. We run each user through both the standard recommender and our updated recommender that utilizes SVM and credibility. We generate a timeline of 10 recommended tweets per user. Then, we run a proportions hypothesis test over the average number of tweets in each timeline that are misinformation. The hypothesis test shows that we can reject the null hypothesis, and that the difference in proportions is significant and therefore our update recommendation system is better.

In [336]:
#statistical analysis

import numpy as np
from statsmodels.stats.proportion import proportions_ztest

misinfo_control_total = 0
misinfo_adjusted_total = 0

for i in range(1,200):
    group_control = recommender_standard(i, 10, 10)
    misinfo_control = [inner_list[2] for inner_list in group_control]
    misinfo_control_total += np.sum(misinfo_control)

for j in range(1,200):
    group_adjusted = recommender2(j, 10, 10)
    misinfo_adjusted = [inner_list[2] for inner_list in group_adjusted]
    misinfo_adjusted_total += np.sum(misinfo_adjusted)

misinfo_control_total = misinfo_control_total/200
misinfo_adjusted_total = misinfo_adjusted_total/200

count = np.array([misinfo_control_total, misinfo_adjusted_total])
nobs = np.array([10, 10])

print(count)
print(nobs)

stat, pval = proportions_ztest(count, nobs)

print(f"z-stat: {stat}")
print(f"p-value: {pval}")

# alpha significance level 0.05
alpha = 0.05
if pval <= alpha:
    print("Reject the null hypothesis. There is a significant difference in proportions.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference in proportions.")

[3.79 0.25]
[10 10]
z-stat: 1.9715651543998296
p-value: 0.04865926833860468
Reject the null hypothesis. There is a significant difference in proportions.
