# Data Cleaning and Pre-Processing

##### First, we import our data and necessary libraries. 

In [8]:
import numpy as np
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn import metrics
import pandas as pd

combo_df = pd.read_csv("data/monkeypox.csv", encoding='latin-1')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/ameyarao/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloadi

##### Next, we set a random seed to keep our results consistent each time the script is ran. We then perform basic data cleaning, including removing blank rows, conversion to lowercase, and tokenization of text stream into meaningful elements.

In [9]:
np.random.seed(333)

combo_df = combo_df.rename(columns={'ï»¿number': 'id'})

#remove any blank rows
combo_df['text'].dropna(inplace=True)

#convert all to lowercase
combo_df['text'] = combo_df['text'].astype(str)
combo_df['text'] = combo_df['text'].str.lower()
print(combo_df.head(10))

#tokenization
combo_df['tokenized_text'] = combo_df.apply(lambda row: nltk.word_tokenize(row["text"]), axis=1)
print(combo_df['tokenized_text'].head(10))

   id           created_at                                               text  \
0  18  2022-07-08 10:12:04  much of the focus on #monkeypox recently has b...   
1  31  2022-07-09 00:00:00  icymi: the first probable case of monkeypox in...   
2  32  2022-07-09 00:00:01  who: #monkeypox outbreak not yet a global publ...   
3  33  2022-07-09 00:00:04  according to the cdc, monkeypox is usually spr...   
4  34  2022-07-09 00:00:07  lgbtq advocates and health care organizations ...   
5  35  2022-07-09 00:00:13  @slinderboy @smg4official anyone can contract ...   
6  37  2022-07-09 00:00:17  new info: the number of monkeypox cases in tex...   
7  39  2022-07-09 00:00:19  here's how you can get tested for monkeypox if...   
8  41  2022-07-09 00:00:27  the u.s. may be losing the fight against monke...   
9  42  2022-07-09 00:00:31  07/08/2022 11:11 pm utc  :newspaper: monkeypox...   

                  source  user is verified  user has url  \
0        Twitter Web App             False      

##### The following step includes word lemmatization. This process reduces words into a common meaning and removes stop words and non alphabetic terms in order to have a tokenized version of the text where we have reduced strings to their basic meaning.

In [10]:
#WordNetLemmatizer: remove stop words, non-alpha text, and word lemmatization
pos_map = defaultdict(lambda : wn.NOUN)
pos_map['J'] = wn.ADJ
pos_map['V'] = wn.VERB
pos_map['R'] = wn.ADV

In [15]:
for i, text in enumerate(combo_df['tokenized_text']):
    final_words = []
    word_lem = WordNetLemmatizer()
    for word, tag in pos_tag(text):
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_lem.lemmatize(word, pos=pos_map[tag[0]])
            final_words.append(word_final)
    combo_df.loc[i, 'tokenized_text'] = str(final_words)

##### The following three cells prepare data to be used in machine learning techniques. The first is splitting data into training data used to build the model and testing data to predict outcomes of misinformation or not misinformation. 

##### The next cell leverages LabelEncoder to transform words into numerical values, and the following cell vectorizes data into numerical feature vectors that summarize frequency of words.

In [16]:
#split data into training and testing set: 30% testing, 70% training
train_id, test_id, train_x, test_x, train_y, test_y = model_selection.train_test_split(combo_df['id'],combo_df['tokenized_text'], combo_df['binary_class'], test_size=0.3)

In [17]:
#encoding to numerical values that the model can understand
Encoder = LabelEncoder()
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.fit_transform(test_y)

In [18]:
#word vectorization--turn collection of text into numerical feature vectors using
# term frequency -- inverse document (TF-IDF)

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(combo_df['tokenized_text'])

train_x_Tfidf = Tfidf_vect.transform(train_x)
test_x_Tfidf = Tfidf_vect.transform(test_x)

print(Tfidf_vect.vocabulary_)
print(train_x_Tfidf)

  (0, 2743)	0.45950478989660204
  (0, 2564)	0.1083987701617995
  (0, 1131)	0.5701658852605342
  (0, 114)	0.6723212907342443
  (1, 4943)	0.21443797478366788
  (1, 4579)	0.354752899096195
  (1, 4172)	0.16924502178410306
  (1, 2919)	0.38599054674481903
  (1, 2726)	0.38599054674481903
  (1, 2564)	0.058521788999674466
  (1, 2261)	0.31055045628413247
  (1, 1773)	0.3078183228668068
  (1, 1049)	0.29619889326250354
  (1, 952)	0.38599054674481903
  (1, 903)	0.2649612456138795
  (2, 4420)	0.26313880540589885
  (2, 4321)	0.22534177930547833
  (2, 3432)	0.19668270164664636
  (2, 2984)	0.11526324100975391
  (2, 2953)	0.33673936866111925
  (2, 2863)	0.21325704853995484
  (2, 2563)	0.11465981834165044
  (2, 1951)	0.23835677566083874
  (2, 1871)	0.16610801961767707
  (2, 1628)	0.22881706474315022
  :	:
  (4047, 2015)	0.1858766769532917
  (4047, 1894)	0.24161723072402713
  (4047, 1720)	0.22528175045135596
  (4047, 1496)	0.17590083465907788
  (4047, 1380)	0.176679294869653
  (4047, 360)	0.523701125678053

# Using a Support Vector Machine to Classify Tweets
We have 91.48% accuracy in classifying Tweets as misinformation or not misinformation with 92.75% precision and 57.19% recall. Higher precision means that the algorithm is more accurate in returning positive predictions, whereas higher recall means that there is a higher quantity in positive predictions. We have a lower recall score, which means that there may be fewer positive predictions. However, with our high precision, the positive predictions that we do make are generally accurate. 

In this SVM, there are 1623 support vectors in the data, meaning that those are the data points closest to the hyperplane in the SVM. These data points are the most difficult to classify as misinformation or not misinformation. 
### How to use in recommender system: 
In a content-based filtering recommendation algorithm, we can use the misinformation classification results made by the SVM to not recommend content that has been classified as "misinformation", which is labeled by "1". `prediction_SVM` is an array containing the classification of Tweets as "0"s and "1"s. By flagging the "1"s, we can filter out the misinformation recommended to users.

In [19]:
def run_SVM(x_tfidf, y):
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(x_tfidf, y)
    prediction_SVM = SVM.predict(test_x_Tfidf)
    return prediction_SVM, SVM

prediction_SVM, SVM = run_SVM(train_x_Tfidf, train_y)

prediction_df = pd.DataFrame({'id': test_id, 'Testing Text': test_x, 'SVM Classifier': prediction_SVM})
combo_df_testingset = prediction_df.merge(combo_df, on='id', how='inner', suffixes=('_1', '_2'))
    
print('{} support vectors in the data'.format(len(SVM.support_)))
print('SVM Accuracy Score -> {:.2f} percent'.format(metrics.accuracy_score(prediction_SVM, test_y)*100))
print('SVM Precision Score -> {:.2f} percent'.format(metrics.precision_score(test_y, prediction_SVM) * 100))
print('SVM Recall Score -> {:.2f} percent'.format(metrics.recall_score(test_y, prediction_SVM) * 100))

1581 support vectors in the data
SVM Accuracy Score -> 91.13 percent
SVM Precision Score -> 89.25 percent
SVM Recall Score -> 59.32 percent


# Calculating Social Credibility Scores for Users
We use Kang et al.'s method of calculating social credibility, a formula using user metadata such as retweet count and follower count, to calculate. The credibility score runs on a scale from 0 to 10, with 10 being most credible and 0 least credible. 
### How to use in recommender system:
In a collaborative filtering recommendation algorithm, we can use credibility scores to have those with less credibility to show up less in one's social media feed. 

In [30]:
#data passed in should be the dataset with the tweets, id, and data about the tweets like retweets etc.
def get_credibility(data):
    mean_retweet_count = data['retweet_count'].mean()
    mean_follower_count = data['followers count'].mean()
    total_tweet_count = data.size
    def cred_score(retweets, followers, user_tweet_count):
        utility = abs(((retweets * followers)/user_tweet_count) - ((mean_retweet_count * mean_follower_count)/total_tweet_count))
        standardized = np.sqrt((utility ** 2)/(total_tweet_count-1))
        return 10 - (standardized * 100)
    # data['credibility'] = cred_score(data['retweet_count'],
    #                                         data['followers count'],
    #                                         data['tweet count'])
    data['credibility'] = cred_score(data['retweet_count'],
                                         data['followers count'],
                                         data['tweet count'])
    #if the score is less than 0, then make it 0 because the score is low enough to not recommend the user in the algorithm
    data.loc[combo_df_testingset['credibility'] < 0, "credibility"] = 1.0

get_credibility(combo_df_testingset)
print(combo_df_testingset['credibility'])
print("The maximum social credibility score is: ", combo_df_testingset['credibility'].max())
print("The minimumsocial credibility score is: ", combo_df_testingset['credibility'].min())
print("The mean social credibility score is: ", combo_df_testingset['credibility'].mean())


0       8.820066
1       8.971598
2       8.820066
3       8.820066
4       8.820066
          ...   
1732    8.820066
1733    8.955235
1734    8.820066
1735    1.000000
1736    8.820066
Name: credibility, Length: 1737, dtype: float64
The maximum social credibility score is:  9.986691620982931
The minimumsocial credibility score is:  0.5202831909619423
The mean social credibility score is:  8.625018736748268


In [31]:
# combo_df[['id', 'credibility']].to_csv('data/credibility_scores.csv', index=False)
combo_df_testingset.head()

Unnamed: 0,id,Testing Text,SVM Classifier,created_at,text,source,user is verified,user has url,user description,user created at,...,quote_count,followers count,following count,tweet count,listed_count,user location,binary_class,ternary_class,tokenized_text,credibility
0,4186,"['monkey', 'pox', 'real', 'part', 'two', 'covi...",0,2022-07-09 19:43:37,"monkey pox is the real part two for covid, was...",Twitter for iPhone,False,False,,2016-09-18 17:19:25,...,0,300,198,2298,0,"Kingston, JA",0,9,"['monkey', 'pox', 'real', 'part', 'two', 'covi...",8.820066
1,302,"['top', 'democrat', 'run', 'represent', 'heart...",0,2022-07-09 00:51:11,"""top democrats running to represent the heart ...",TweetDeck,False,True,City Hall for The NY Post. Under that cynical ...,2008-10-09 02:55:10,...,1,13695,578,180867,489,Room 9,0,0,"['top', 'democrat', 'run', 'represent', 'heart...",8.971598
2,897,"['epicenter', 'u', 'monkeypox', 'outbreak']",0,2022-07-09 03:21:50,n.y. now the epicenter of us monkeypox outbrea...,Twitter for iPhone,False,False,"Christian first, then a freedom-loving Patriot...",2022-04-27 02:18:57,...,0,82,584,303,0,,0,0,"['epicenter', 'u', 'monkeypox', 'outbreak']",8.820066
3,5357,"['wheww', 'read', 'thread', 'mutual', 'monkeyp...",0,2022-07-09 23:32:38,wheww after reading that thread from my mutual...,Twitter for iPhone,False,False,"just a shy, awkward, unpopular kid who grew up...",2021-01-10 18:43:27,...,1,1804,935,26247,1,"Washington, DC",0,9,"['wheww', 'read', 'thread', 'mutual', 'monkeyp...",8.820066
4,5287,"['enoughofthisnon', 'dailmamamoose', 'mommabea...",1,2022-07-09 23:19:54,@twk4usa @enoughofthisnon @dailmamamoose @mich...,Twitter for iPad,False,False,"Conservatives, happily married, No DM, No trai...",2020-03-11 08:05:33,...,0,23322,15533,179461,8,south carolina,1,1,"['enoughofthisnon', 'dailmamamoose', 'mommabea...",8.820066


In [32]:
id_info_dict = combo_df_testingset.set_index('id')[['text', 'SVM Classifier', 'credibility']].apply(tuple, axis=1).to_dict()
#this dictionary maps each tweet id to it's text, the misinfo classifier, and the credibility score.
print(id_info_dict)
#text is 0, SVM is 1, credibility is 2



In [33]:
import pandas as pd
from faker import Faker
import random

# Set up Faker to generate fake data
fake = Faker()

# Function to generate a fake user with a specified misinformation preference
def generate_user(user_id, misinformation_preference):
    return {
        'user_id': user_id,
        'misinformation_preference': misinformation_preference,
    }

def generate_interactions(users, tweets, num_interactions, misinformation_preference):
    interactions = []

    # Cast the 'binary_class' column to int
    tweets['binary_class'] = tweets['binary_class'].astype(int)

    # Filter tweets based on misinformation_preference
    if misinformation_preference == 'misinformation':
        filtered_tweets = tweets[tweets['binary_class'] == 1]
    elif misinformation_preference == 'non-misinformation':
        filtered_tweets = tweets[tweets['binary_class'] == 0]
    else:
        filtered_tweets = tweets  # No specific preference, use all tweets

    for _ in range(num_interactions):
        user_id = random.choice(users)['user_id']
        
        # Check if there are tweets with the specified condition
        if not filtered_tweets.empty:
            # Convert the 'number' column to a list for random.choice
            tweet_ids = filtered_tweets['id'].tolist()

            # Use random.choice on the list of tweet_ids
            tweet_id = random.choice(tweet_ids)

            interaction_type = random.choice(['like', 'retweet', 'comment'])
            interactions.append({'user_id': user_id, 'tweet_id': tweet_id, 'interaction_type': interaction_type})

    return pd.DataFrame(interactions)

def generate_fake_data(num_users, real_tweets_df, num_interactions_per_user):
    users = []
    interactions = []

    for user_id in range(1, num_users + 1):
        if user_id <= 25:
            misinformation_preference = 'misinformation'
        elif user_id <= 75:
            misinformation_preference = 'non-misinformation'
        else:
            misinformation_preference = None

        users.append(generate_user(user_id, misinformation_preference))
        
        user_interactions = generate_interactions(users[-1:], real_tweets_df, num_interactions_per_user, misinformation_preference)
        interactions.append(user_interactions)

    users_df = pd.DataFrame(users)
    interactions_df = pd.concat(interactions, ignore_index=True)

    return users_df, interactions_df

# Replace this with your actual real dataset
# real_tweets_df = pd.read_csv("data/monkeypox-followup.csv")

real_tweets_df = combo_df_testingset

# Generate a fake dataset with 100 users and 5 interactions per user
num_users = 100
num_interactions_per_user = 25

fake_users_df, fake_interactions_df = generate_fake_data(num_users, real_tweets_df, num_interactions_per_user)

# Display the generated dataframes
print("Fake Users DataFrame:")
print(fake_users_df.head())

print("\nFake Interactions DataFrame:")
print(fake_interactions_df.head())

fake_users_df.to_csv("data/preference_fake_users.csv", index=False)
fake_interactions_df.to_csv("data/preference_fake_interactions.csv", index=False)

Fake Users DataFrame:
   user_id misinformation_preference
0        1            misinformation
1        2            misinformation
2        3            misinformation
3        4            misinformation
4        5            misinformation

Fake Interactions DataFrame:
   user_id  tweet_id interaction_type
0        1      1066          retweet
1        1      1939             like
2        1      5269          retweet
3        1      5259          retweet
4        1      1579          comment


In [34]:
from collections import defaultdict
import math
import csv

interactions = []

with open("data/preference_fake_interactions.csv", "r") as file:
    k = 0
    for line in file:
        if (k == 0):
            pass
        else:
            li = line.strip().split(",")
            interactions.append(li)
        k = k + 1

interactions_dict = {}
userset = set()

for interaction in interactions:
    user = int(interaction[0])
    tweet = int(interaction[1])
    interaction_type = interaction[2]
    userset.add(user)
    if (user not in interactions_dict.keys()):
        interactions_dict[user] = {}
    if (tweet not in interactions_dict[user].keys()):
        interactions_dict[user][tweet] = 0           
    if (interaction_type == "like"):
        interactions_dict[user][tweet] = interactions_dict[user][tweet] + 10
    if (interaction_type == "comment"):
        interactions_dict[user][tweet] = interactions_dict[user][tweet] + 15
    if (interaction_type == "retweet"):
        interactions_dict[user][tweet] = interactions_dict[user][tweet] + 10

def angulardistance(combinedlist):
    a = 0
    b = 0
    c = 0
    for i in combinedlist:
        x = i[0]
        y = i[1]
        a += x * y
        b += x * x
        c += y * y
    return 1 - a/((math.sqrt(b)*math.sqrt(c)))

def ratingdistance(user1, user2, threshold):
    combined = defaultdict(list)
    combinedlist = []
    for tweet in interactions_dict[user1].keys():
        combined[tweet].append(interactions_dict[user1][tweet])
    for tweet in interactions_dict[user2].keys():
        combined[tweet].append(interactions_dict[user2][tweet])
        if (len(combined[tweet]) > 1):
            combinedlist.append(combined[tweet])
    if (len(combinedlist) > threshold - 1):
       return angulardistance(combinedlist)
    else:
        return 1.0

def knearestneighbor(u, S, threshold, k):
    if (k > len(S)):
        return S
    neighbors = []
    for user in S:
        if (user != u):
            distance = ratingdistance(u, user, threshold)
            neighbors.append((user, distance))
    neighbors.sort(key=lambda x: x[1])
    ret = []
    for i in range(k):
        ret.append(neighbors[i][0])
    return(ret)

def get_credibility_scores():
    credibility_dict = {id: info[2] for id, info in id_info_dict.items()}
    return credibility_dict

def recommender(u, nrecs, k):
    interactions_copy = interactions_dict.copy()
    neighbors = knearestneighbor(u, userset, 3, k)
    tweets = defaultdict(list) #keeps track of the tweets that neighbors have interacted with
    credibility_scores = get_credibility_scores()
    for tweet in interactions_copy[u].keys():
        tweets[tweet] = ["PASS"]
    for neighbor in neighbors:
        for tweet in interactions_copy[neighbor].keys():
            credibility_score = credibility_scores.get(tweet, 1.0)  # Default to 1.0 if credibility score is not available
            if (tweet in tweets.keys()):
                if (tweets[tweet] != ["PASS"]):
                    tweets[tweet][0] += credibility_score  # Adjust the interaction count
                    tweets[tweet][1] = credibility_score * interactions_copy[neighbor][tweet]
            else:
                tweets[tweet] = [credibility_score, credibility_score * interactions_copy[neighbor][tweet]]
    smoothedprediction = []
    for tweet, data in tweets.items():
        if (data != ["PASS"]):
            credibility_adjusted_count = data[0]
            credibility_adjusted_score = data[1] / data[0]  # Adjusted average based on credibility
            prediction = (1 + (credibility_adjusted_count * credibility_adjusted_score)) / (1 + credibility_adjusted_count)
#         if (data != ["PASS"]):
#             average = data[1]/data[0]
#             prediction = (10 + (data[0]*average))/(1 + data[0])
            smoothedprediction.append([id_info_dict[tweet][0], prediction])
    smoothedprediction.sort(key=lambda x: x[1], reverse=True)
    return smoothedprediction[0:nrecs]

In [35]:
recommendations = recommender(1, 10, 10)
for line in recommendations:
    print(line)

['they tried with monkey pox and polio and failed. next up is this marburg virus. _url_', 18.065186147544797]
['stfu there isnt no such thing as monkey poxâ\x80¦.its called your trying to lock people down again. its not and i repeat not going to work _url_', 18.065186147544797]
["us - washington state: the monkeypox virus continues to circulate in king county, the state's most populous county, and is now likely spreading between residents, local health officials said. _url_ h/t shiloh", 13.661945470035363]
["doesn't this sound like the hiv/aids of the 1980's? isn't hiv a side effect of the jab? so is monkey pox just a cover for the poison jabs adverse reactions? _url_", 13.598463783186311]
['@william91520642 @jglankford17 same thing with monkey pox. only men sleeping with other men catch it. someone was injected with it either without their knowledge or volunteered for a clinical trial. they want all gay, trans men to get the monkey pox jab which in turn will affect other men they slee

In [36]:
recommendations = recommender(26, 10, 10)
for line in recommendations:
    print(line)

['. . very good observation thomas.  they also will not report the vaxx status of the people who have contracted "monkey pox" (aka, side effect of the jab) _url_', 22.59098070164202]
["@genjibear @cdcdirector issue of masks maybe less impt than another glitch-stating #monkeypox is contagious only after someone's symptomatic-nope. body fluids (semen) contagious before someone's symptomatic.they're needlessly spreading monkeypox. is it desire to deceive or failure to read or think?", 22.55602460742501]
['#ebola #monkeypox professional saltwater pool for prevention or use the bathtub with good salt. no drink salt water, no. just bath with a little salt. with fever or otherwise. i used a cloth with warm saltwater for covid fever and it worked. _url_', 22.55602460742501]
['@jeff_kaye moneypox is but another deception.', 22.55602460742501]
['stfu there isnt no such thing as monkey poxâ\x80¦.its called your trying to lock people down again. its not and i repeat not going to work _url_', 18.06

This standard recommender doesn't integrate the credibility or SVM scores

In [53]:
def recommender_standard(u, nrecs, k):
    interactions_copy = interactions_dict.copy()
    neighbors = knearestneighbor(u, userset, 3, k)
    tweets = defaultdict(list)
    for tweet in interactions_copy[u].keys():
        tweets[tweet] = ["PASS"]
    for neighbor in neighbors:
        for tweet in interactions_copy[neighbor].keys():
            if (tweet in tweets.keys()):
                if (tweets[tweet] != ["PASS"]):
                    tweets[tweet][0] += 1  # Adjust the interaction count
                    tweets[tweet][1] = tweets[tweet][1] * interactions_copy[neighbor][tweet]
            else:
                tweets[tweet] = [1, interactions_copy[neighbor][tweet]]
    smoothedprediction = []
    for tweet, data in tweets.items():
        if (data != ["PASS"]):
            average = data[1]/data[0]
            prediction = (10 + (data[0]*average))/(1 + data[0])
            smoothedprediction.append([id_info_dict[tweet][0], prediction])
    smoothedprediction.sort(key=lambda x: x[1], reverse=True)
    return smoothedprediction[0:nrecs]

In [54]:
recommendations = recommender_standard(1, 10, 10)
for line in recommendations:
    print(line)

['@awokezombie try to find out if they printed the ballots for november ....yet... you know monkey pox is going to make us use paperballots... last time they started printing them in march.', 565.0]
["i like when we are supposed to pretend it's not 99.9% being spread by multiple partner butt sex with people who have obvious penis and anus sores.  why do we have to say stop having sex with people who have pustules on their genitals?!? why. is. it. necessary? #monkeypox _url_", 565.0]
['@jujuberunnin monkey pox slowly morphing into an std is wild', 377.5]
['the monkey pox is a scam.', 377.5]
["@iqfy_ @sqeptiq @nixonist @nypost if gay men weren't being promiscuous and spreading monkey pox this wouldn't be a problem", 252.5]
["@cdcgov @cdcdirector i know this isn't you. you wouldn't be sweating with worry from any pandemic. #covidisnotover #covidisairborne #monkeypoxisairborne #monkeypox _url_", 128.33333333333334]
['@popcrave monkey pox has been eradicated', 86.66666666666667]
['tucker ca

In [55]:
recommendations = recommender(26, 10, 10)
for line in recommendations:
    print(line)

['. . very good observation thomas.  they also will not report the vaxx status of the people who have contracted "monkey pox" (aka, side effect of the jab) _url_', 22.59098070164202]
["@genjibear @cdcdirector issue of masks maybe less impt than another glitch-stating #monkeypox is contagious only after someone's symptomatic-nope. body fluids (semen) contagious before someone's symptomatic.they're needlessly spreading monkeypox. is it desire to deceive or failure to read or think?", 22.55602460742501]
['#ebola #monkeypox professional saltwater pool for prevention or use the bathtub with good salt. no drink salt water, no. just bath with a little salt. with fever or otherwise. i used a cloth with warm saltwater for covid fever and it worked. _url_', 22.55602460742501]
['@jeff_kaye moneypox is but another deception.', 22.55602460742501]
['stfu there isnt no such thing as monkey poxâ\x80¦.its called your trying to lock people down again. its not and i repeat not going to work _url_', 18.06

In [56]:
from collections import defaultdict

def recommender2(u, nrecs, k):
    interactions_copy = interactions_dict.copy()
    neighbors = knearestneighbor(u, userset, 3, k)
    tweets = defaultdict(list)
    credibility_scores = get_credibility_scores()
    penalty = 1.0
    for tweet in interactions_copy[u].keys():
        tweets[tweet] = ["PASS"]
    for neighbor in neighbors:
        for tweet in interactions_copy[neighbor].keys():
            credibility_score = credibility_scores.get(tweet, 1.0)
            if tweet in tweets:
                if tweets[tweet] != ["PASS"]:
                    if is_misinformation(tweet):
                        penalty = 0.05
                    else:
                        penalty = 1.0
                    tweets[tweet][0] += credibility_score * penalty 
                    tweets[tweet][1] += credibility_score * penalty * interactions_copy[neighbor][tweet]
            else:
                tweets[tweet] = [credibility_score * penalty, credibility_score * penalty * interactions_copy[neighbor][tweet]]

    smoothed_prediction = []

    # Generate smoothed predictions based on interactions
    for tweet, data in tweets.items():
        if data != ["PASS"]:
            credibility_adjusted_count = data[0]
            credibility_adjusted_score = data[1] / data[0]  # Adjusted average based on credibility
            prediction = (1 + (credibility_adjusted_count * credibility_adjusted_score)) / (1 + credibility_adjusted_count)
            smoothed_prediction.append([id_info_dict[tweet][0], prediction, id_info_dict[tweet][1]])

    # Sort and return the top nrecs recommendations
    smoothed_prediction.sort(key=lambda x: x[1], reverse=True)
    return smoothed_prediction[:nrecs]

def is_misinformation(tweet_id):
    misinformation_data = id_info_dict[tweet_id][1]
    return misinformation_data == 1

def read_misinformation_csv(file_path):
    misinformation_dict = {}
    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)
        for row in reader:
            tweet_id, prediction = row[0], int(row[1])
            misinformation_dict[tweet_id] = prediction
    return misinformation_dict

In [57]:
recommendations = recommender2(1, 10, 10)
for line in recommendations:
    print(line)

["@cdcgov @cdcdirector i know this isn't you. you wouldn't be sweating with worry from any pandemic. #covidisnotover #covidisairborne #monkeypoxisairborne #monkeypox _url_", 22.231279525358755, 0]
['they tried with monkey pox and polio and failed. next up is this marburg virus. _url_', 18.065186147544797, 1]
["@ksatnews it's bc china keeps makin it stronger n now workin on the monkey pox", 17.71855816875416, 1]
['@popcrave monkey pox has been eradicated', 16.614813148754976, 0]
["let's really call monkeypox what it is!!! keep your pants on for two weeks, just two weeks. it's spread by sex, got it? it's a fact. be responsible for your behavior.  opinion | let's call monkeypox what it is: a pandemic _url_", 14.25639609104024, 0]
['is monkeypox the new aids? _url_', 14.248932368640586, 0]
["@universevery how am i going to get monkey pox, when i don't rub it against random people for extended period of times? i had covid, it was was cold symptoms. how would a vaccine help me out? still can

In [58]:
recommendations = recommender2(26, 10, 10)
for line in recommendations:
    print(line)

["@genjibear @cdcdirector issue of masks maybe less impt than another glitch-stating #monkeypox is contagious only after someone's symptomatic-nope. body fluids (semen) contagious before someone's symptomatic.they're needlessly spreading monkeypox. is it desire to deceive or failure to read or think?", 22.55602460742501, 0]
['#ebola #monkeypox professional saltwater pool for prevention or use the bathtub with good salt. no drink salt water, no. just bath with a little salt. with fever or otherwise. i used a cloth with warm saltwater for covid fever and it worked. _url_', 22.55602460742501, 0]
['@abcnews really! gees, this is the decade that is going to keep on giving! some countries have monkey pox doubling every 8 days, other reporting cases in kids under 10, now #marburgvirus !! give us a break! has someone been playing in the lab again!!', 14.248932368640586, 0]
['starting to think the monkeypox is actually a new hiv and maybe not related to a pox', 14.248932368640586, 0]
['@william

In [76]:
#statistical analysis

import numpy as np
from statsmodels.stats.proportion import proportions_ztest


group_control = recommender2(1, 10, 10)
group_adjusted = recommender2(26, 10, 10)

misinfo_control = [inner_list[2] for inner_list in group_control]
misinfo_adjusted = [inner_list[2] for inner_list in group_adjusted]


count = np.array([np.sum(misinfo_control == 1), np.sum(misinfo_adjusted == 1)])
nobs = np.array([len(misinfo_control), len(misinfo_adjusted)])

stat, pval = proportions_ztest(count, nobs)

# Output the results
print(f"Z-statistic: {stat}")
print(f"P-value: {pval}")

# Check the significance level (e.g., 0.05)
alpha = 0.05
if pval < alpha:
    print("Reject the null hypothesis. There is a significant difference in proportions.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference in proportions.")


Z-statistic: nan
P-value: nan
Fail to reject the null hypothesis. There is no significant difference in proportions.


  zstat = value / std
