In [1]:
import pandas as pd
import os
import pickle

import preprocessor as p
import re
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.test.utils import datapath

In [2]:
quote_csv = [f"Quote_Network/{file}" for file in os.listdir("Quote_Network") if file.endswith(".csv")]
reply_csv = [f"Reply_Network/{file}" for file in os.listdir("Reply_Network") if file.endswith(".csv")]

In [3]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text])

def clean_tweet(tweet, column):
    ## remove contractions
    tweet[column] = tweet[column].apply(lambda x: [contractions.fix(word) for word in x.split()])

    ## convert back into string so that tokenization can be done
    tweet[column] = [' '.join(map(str, l)) for l in tweet[column]]

    ## tokenize
    tweet[column] = tweet[column].apply(word_tokenize)

    ## convert tokens to lowercase
    tweet[column] = tweet[column].apply(lambda x: [word.lower() for word in x])

    ## remove punctuations and numerics
    tweet[column] = tweet[column].apply(lambda x: [word for word in x if re.search('^[a-z]+$',word)])

    ## remove stopwords
    stop_list = stopwords.words('english')
    tweet[column] = tweet[column].apply(lambda x: [word for word in x if word not in stop_list])
    
    tweet[column] = tweet[column].apply(lambda x: lemmatize_words(x))
    return tweet

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [5]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [6]:
def check_topic(date, text):
    if date < '2020-04-01':
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("sg_start_lda_model_18"))
        dictionary = corpora.Dictionary.load(datapath("sg_start_lda_model_18.id2word"))
        num_topics = 18
        prefix = "SG_Start_Topic_"
    elif date >= '2020-04-01' and date < '2020-06-01':
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("sg_circuit_lda_model_17"))
        dictionary = corpora.Dictionary.load(datapath("sg_circuit_lda_model_17.id2word"))
        num_topics = 17
        prefix = "SG_Circuit_Topic_"
    elif date >= '2020-06-01' and date < '2021-12-01':
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("sg_phases_lda_model_11"))
        dictionary = corpora.Dictionary.load(datapath("sg_phases_lda_model_11.id2word"))
        num_topics = 11
        prefix = "SG_Phases_Topic_"
    elif date >= '2021-12-01' and date < '2023-02-01':
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("sg_acute_lda_model_18"))
        dictionary = corpora.Dictionary.load(datapath("sg_acute_lda_model_18.id2word"))
        num_topics = 18
        prefix = "SG_Acute_Topic_"
    else:
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("sg_green_lda_model_6"))
        dictionary = corpora.Dictionary.load(datapath("sg_green_lda_model_6.id2word"))
        num_topics = 6
        prefix = "SG_Green_Topic_"
        
    corpus = dictionary.doc2bow(text)
    top_topics = lda_model.get_document_topics(corpus, minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(num_topics)]
    return f"{prefix}{topic_vec.index(max(topic_vec))+1}"

In [7]:
def determine_sentiment(score):
    if score < 0:
        return "Negative"
    elif score > 0:
        return "Positive"
    else:
        return "Neutral"

In [8]:
def preprocess(text):
    text = p.clean(text)
    tokenization = word_tokenize(text) 
    stop_words = set(stopwords.words('english'))
    tokenization = [w for w in tokenization if not w in stop_words]   
    return text

In [9]:
def fake_or_real(pred):
    if pred == 0:
        return "Real"
    else:
        return "Fake"

In [10]:
def load_vectorizer_and_model():
    return pickle.load(open("../../models/misinformation/vectorizer.txt", "rb")), pickle.load(open("../../models/misinformation/logreg.txt", "rb"))

In [11]:
vectorizer, model = load_vectorizer_and_model()

In [12]:
modded_quote_dfs = []

for csv in quote_csv:
    df = pd.read_csv(csv)
    df2 = df.copy()
    df2 = clean_tweet(df, 'renderedContent')
    
    # Retrieve topics
    data = df2['renderedContent'].values.tolist()
    data_words = list(sent_to_words(data))
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    data_words_trigrams = make_trigrams(data_words)
    topics = []
    for i in range(len(df2)):
        topics.append(check_topic(df2.iloc[i].date, data_words_trigrams[i]))
    df["Topic"] = topics
    
    # Calculate sentiments
    df2["renderedContent"] = df2['renderedContent'].apply(lambda tweet:str(tweet))
    sentiment_scores = df2['renderedContent'].apply(sid.polarity_scores)
    sentiments = sentiment_scores.apply(lambda x: x["compound"])
    sentiment_label = sentiments.apply(determine_sentiment)
    df["Sentiment"] = sentiment_label
    
    # Check if real / fake news
    df["renderedContent"] = df["renderedContent"].apply(preprocess)
    verdict_labels = []
    for i in range(len(df)):
        vectorized_text = vectorizer.transform([df.iloc[i].renderedContent])
        verdict_labels.append(fake_or_real(model.predict(vectorized_text)[0]))
    df["Verdict"] = verdict_labels
    
    modded_quote_dfs.append(df)

In [13]:
i=1

for df in modded_quote_dfs:
    df.to_csv(f"{i}.csv")
    i+=1

In [13]:
modded_quote_dfs

[                        date  \
 0  2021-06-07 13:56:03+00:00   
 
                                      renderedContent  \
 0  million dos administered poland begin covid va...   
 
                                user               Topic Sentiment Verdict  
 0  https://twitter.com/BogdziewiczM  SG_Phases_Topic_10  Positive    Fake  ,
                         date  \
 0  2023-02-28 02:08:38+00:00   
 1  2023-02-28 00:08:34+00:00   
 2  2023-02-27 18:56:15+00:00   
 3  2023-02-28 00:08:34+00:00   
 
                                      renderedContent  \
 0              hong kong scrap mask mandate mar http   
 1  coronavirus origin still mystery year pandemic...   
 2            china must honest origin envoy say http   
 3  coronavirus origin still mystery year pandemic...   
 
                                   user             Topic Sentiment Verdict  
 0  https://twitter.com/ChannelNewsAsia  SG_Green_Topic_4   Neutral    Fake  
 1  https://twitter.com/ChannelNewsAsia  SG_Green_T

In [14]:
modded_reply_dfs = []

for csv in reply_csv:
    df = pd.read_csv(csv)
    df2 = df.copy()
    df2 = clean_tweet(df, 'renderedContent')
    
    # Retrieve topics
    data = df2['renderedContent'].values.tolist()
    data_words = list(sent_to_words(data))
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    data_words_trigrams = make_trigrams(data_words)
    topics = []
    for i in range(len(df2)):
        topics.append(check_topic(df2.iloc[i].date, data_words_trigrams[i]))
    df["Topic"] = topics
    
    # Calculate sentiments
    df2["renderedContent"] = df2['renderedContent'].apply(lambda tweet:str(tweet))
    sentiment_scores = df2['renderedContent'].apply(sid.polarity_scores)
    sentiments = sentiment_scores.apply(lambda x: x["compound"])
    sentiment_label = sentiments.apply(determine_sentiment)
    df["Sentiment"] = sentiment_label
    
    # Check if real / fake news
    df["renderedContent"] = df["renderedContent"].apply(preprocess)
    verdict_labels = []
    for i in range(len(df)):
        vectorized_text = vectorizer.transform([df.iloc[i].renderedContent])
        verdict_labels.append(fake_or_real(model.predict(vectorized_text)[0]))
    df["Verdict"] = verdict_labels
    
    modded_reply_dfs.append(df)

In [15]:
i=1

for df in modded_reply_dfs:
    df.to_csv(f"Reply_{i}.csv")
    i+=1

In [15]:
modded_reply_dfs

[                         date  \
 0   2022-12-24 16:48:10+00:00   
 1   2022-12-12 15:58:10+00:00   
 2   2022-11-28 10:52:02+00:00   
 3   2022-11-27 19:40:31+00:00   
 4   2022-11-27 16:40:27+00:00   
 5   2022-11-27 08:11:22+00:00   
 6   2022-10-09 03:19:32+00:00   
 7   2022-10-08 03:38:30+00:00   
 8   2022-10-08 03:22:08+00:00   
 9   2022-10-08 03:19:00+00:00   
 10  2022-10-08 02:38:51+00:00   
 11  2022-08-11 09:44:22+00:00   
 12  2022-08-06 13:33:00+00:00   
 13  2022-08-03 08:33:30+00:00   
 14  2022-04-06 05:04:16+00:00   
 15  2022-04-06 02:33:17+00:00   
 16  2022-04-04 04:34:03+00:00   
 17  2022-04-03 13:10:38+00:00   
 18  2022-04-02 16:07:45+00:00   
 19  2022-04-02 08:53:24+00:00   
 20  2022-04-02 04:32:26+00:00   
 21  2022-04-02 04:06:39+00:00   
 22  2022-03-15 05:58:21+00:00   
 23  2022-03-14 16:08:58+00:00   
 24  2022-03-14 13:53:40+00:00   
 
                                       renderedContent  \
 0                   first wave people recovered covid  