In [1]:
import pandas as pd
import os
import pickle

import preprocessor as p
import re
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.test.utils import datapath

In [2]:
quote_csv = [f"Quote Network/{file}" for file in os.listdir("Quote Network") if file.endswith(".csv")]
reply_csv = [f"Reply Network/{file}" for file in os.listdir("Reply Network") if file.endswith(".csv")]

In [3]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text])

def clean_tweet(tweet, column):
    ## remove contractions
    tweet[column] = tweet[column].apply(lambda x: [contractions.fix(word) for word in x.split()])

    ## convert back into string so that tokenization can be done
    tweet[column] = [' '.join(map(str, l)) for l in tweet[column]]

    ## tokenize
    tweet[column] = tweet[column].apply(word_tokenize)

    ## convert tokens to lowercase
    tweet[column] = tweet[column].apply(lambda x: [word.lower() for word in x])

    ## remove punctuations and numerics
    tweet[column] = tweet[column].apply(lambda x: [word for word in x if re.search('^[a-z]+$',word)])

    ## remove stopwords
    stop_list = stopwords.words('english')
    tweet[column] = tweet[column].apply(lambda x: [word for word in x if word not in stop_list])
    
    tweet[column] = tweet[column].apply(lambda x: lemmatize_words(x))
    return tweet

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [5]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [6]:
def check_topic(date, text):
    if date < '2020-03-22':
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("us_start_lda_model_15"))
        dictionary = corpora.Dictionary.load(datapath("us_start_lda_model_15.id2word"))
        num_topics = 15
        prefix = "US_Start_Topic_"
    elif date >= '2020-03-22' and date < '2020-05-01':
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("us_circuit_lda_model_11"))
        dictionary = corpora.Dictionary.load(datapath("us_circuit_lda_model_11.id2word"))
        num_topics = 11
        prefix = "US_Circuit_Topic_"
    elif date >= date >= '2020-05-01' and date < '2021-01-01':
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("us_2020_lda_model_7"))
        dictionary = corpora.Dictionary.load(datapath("us_2020_lda_model_7.id2word"))
        num_topics = 7
        prefix = "US_2020_Topic_"
    elif date >= date >= '2021-01-01' and date < '2022-01-01':
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("us_2021_lda_model_11"))
        dictionary = corpora.Dictionary.load(datapath("us_2021_lda_model_11.id2word"))
        num_topics = 11
        prefix = "US_2021_Topic_"
    else:
        lda_model = gensim.models.ldamodel.LdaModel.load(datapath("us_2022_lda_model_19"))
        dictionary = corpora.Dictionary.load(datapath("us_2022_lda_model_19.id2word"))
        num_topics = 19
        prefix = "US_2022_Topic_"
        
    corpus = dictionary.doc2bow(text)
    top_topics = lda_model.get_document_topics(corpus, minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(num_topics)]
    return f"{prefix}{topic_vec.index(max(topic_vec))+1}"

In [7]:
def determine_sentiment(score):
    if score < 0:
        return "Negative"
    elif score > 0:
        return "Positive"
    else:
        return "Neutral"

In [8]:
def preprocess(text):
    text = p.clean(text)
    tokenization = word_tokenize(text) 
    stop_words = set(stopwords.words('english'))
    tokenization = [w for w in tokenization if not w in stop_words]   
    return text

In [9]:
def fake_or_real(pred):
    if pred == 0:
        return "Real"
    else:
        return "Fake"

In [10]:
def load_vectorizer_and_model():
    return pickle.load(open("../../models/misinformation/vectorizer.txt", "rb")), pickle.load(open("../../models/misinformation/logreg.txt", "rb"))

In [11]:
vectorizer, model = load_vectorizer_and_model()

In [12]:
modded_quote_dfs = []

for csv in quote_csv:
    df = pd.read_csv(csv)
    df2 = df.copy()
    df2 = clean_tweet(df, 'renderedContent')
    
    # Retrieve topics
    data = df2['renderedContent'].values.tolist()
    data_words = list(sent_to_words(data))
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    data_words_trigrams = make_trigrams(data_words)
    topics = []
    for i in range(len(df2)):
        topics.append(check_topic(df2.iloc[i].date, data_words_trigrams[i]))
    df["Topic"] = topics
    
    # Calculate sentiments
    df2["renderedContent"] = df2['renderedContent'].apply(lambda tweet:str(tweet))
    sentiment_scores = df2['renderedContent'].apply(sid.polarity_scores)
    sentiments = sentiment_scores.apply(lambda x: x["compound"])
    sentiment_label = sentiments.apply(determine_sentiment)
    df["Sentiment"] = sentiment_label
    
    # Check if real / fake news
    df["renderedContent"] = df["renderedContent"].apply(preprocess)
    verdict_labels = []
    for i in range(len(df)):
        vectorized_text = vectorizer.transform([df.iloc[i].renderedContent])
        verdict_labels.append(fake_or_real(model.predict(vectorized_text)[0]))
    df["Verdict"] = verdict_labels
    
    modded_quote_dfs.append(df)

In [13]:
modded_quote_dfs

[                        date  \
 0  2020-07-04 13:18:56+00:00   
 1  2020-07-03 03:53:57+00:00   
 2  2020-06-23 13:16:42+00:00   
 3  2020-05-28 02:16:43+00:00   
 4  2020-05-09 23:28:41+00:00   
 5  2020-04-30 23:24:09+00:00   
 6  2020-04-26 18:53:19+00:00   
 7  2020-04-07 13:25:41+00:00   
 8  2020-04-07 00:45:52+00:00   
 
                                      renderedContent  \
 0  kim guilfoyle speaking indoor event south dako...   
 1                                 died covid florida   
 2  u rank world coronavirus test per million peop...   
 3  president acknowledged country surpassed covid...   
 4  kurtschlichter trump said merkel abe praised p...   
 5  astounding many people like brian kilmeade amp...   
 6  trump enjoyed vacation last month turned coron...   
 8  one month ago today trump told u want let peop...   
 
                           user               Topic Sentiment Verdict  
 0  https://twitter.com/atrupar     US_2020_Topic_3  Positive    Fake  
 1  https

In [14]:
modded_reply_dfs = []

for csv in reply_csv:
    df = pd.read_csv(csv)
    df2 = df.copy()
    df2 = clean_tweet(df, 'renderedContent')
    
    # Retrieve topics
    data = df2['renderedContent'].values.tolist()
    data_words = list(sent_to_words(data))
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    data_words_trigrams = make_trigrams(data_words)
    topics = []
    for i in range(len(df2)):
        topics.append(check_topic(df2.iloc[i].date, data_words_trigrams[i]))
    df["Topic"] = topics
    
    # Calculate sentiments
    df2["renderedContent"] = df2['renderedContent'].apply(lambda tweet:str(tweet))
    sentiment_scores = df2['renderedContent'].apply(sid.polarity_scores)
    sentiments = sentiment_scores.apply(lambda x: x["compound"])
    sentiment_label = sentiments.apply(determine_sentiment)
    df["Sentiment"] = sentiment_label
    
    # Check if real / fake news
    df["renderedContent"] = df["renderedContent"].apply(preprocess)
    verdict_labels = []
    for i in range(len(df)):
        vectorized_text = vectorizer.transform([df.iloc[i].renderedContent])
        verdict_labels.append(fake_or_real(model.predict(vectorized_text)[0]))
    df["Verdict"] = verdict_labels
    
    modded_reply_dfs.append(df)

In [15]:
modded_reply_dfs

[Empty DataFrame
 Columns: [date, renderedContent, user, Topic, Sentiment, Verdict]
 Index: [],
                           date  \
 0    2022-11-17 19:19:07+00:00   
 1    2022-11-10 23:33:12+00:00   
 2    2022-11-05 12:57:38+00:00   
 3    2022-11-05 00:06:29+00:00   
 4    2022-10-24 22:15:02+00:00   
 ..                         ...   
 223  2020-06-24 21:28:27+00:00   
 224  2020-06-21 16:59:30+00:00   
 225  2020-06-17 15:09:15+00:00   
 226  2020-06-14 15:11:33+00:00   
 227  2020-04-22 04:09:48+00:00   
 
                                        renderedContent  \
 0                       ghostofsurf dineshdsouza covid   
 1                 two original three booster covid yet   
 2    mcgeheenaomi marypeltola rural king mom house ...   
 3          mcgeheenaomi marypeltola cool covid vaccine   
 4    pediatricnursey yeah policy get away failing c...   
 ..                                                 ...   
 223  imagine pissed trying keep people safe fuck ka...   
 224  covi