# Data Cleaning

This notebook combines the previously scraped Twitter data, cleans it, and adds any additional columns needed prior to analysis.

# Load Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import networkx as nx
import community as louvain
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import nltk

  import pandas.util.testing as tm


# Load and Combine Datasets

Load all tweets and user information scraped from Dec 18 - 24, 2020 (plus the missing tweets retrieved on Jan 9, 2021) and combine them into one tweets dataframe and one users dataframe.

In [2]:
# read every daily tweet scrape (plus the later "missing tweets" scrape) with all columns as strings
tweet_csv_paths = [
    "output/2020-12-18 Tweets/final_results.csv",
    "output/2020-12-19 Tweets/final_results.csv",
    "output/2020-12-20 Tweets/final_results.csv",
    "output/2020-12-21 Tweets/final_results.csv",
    "output/2020-12-22 Tweets/final_results.csv",
    "output/2020-12-23 Tweets/final_results.csv",
    "output/2020-12-24 Tweets/final_results.csv",
    "output/2021-01-09 Missing Tweets/missing_tweet_results.csv",
]

# the numbered names are kept because the following cells refer to each frame individually
(df_connections1, df_connections2, df_connections3, df_connections4,
 df_connections5, df_connections6, df_connections7, df_connections8) = [
    pd.read_csv(csv_path, dtype = str) for csv_path in tweet_csv_paths
]

In [3]:
# stack the individual tweet frames on a fresh 0..n-1 index, then discard rows that are
# identical in every column
tweet_frames = [df_connections1, df_connections2, df_connections3, df_connections4,
                df_connections5, df_connections6, df_connections7, df_connections8]
df_connections = pd.concat(tweet_frames, ignore_index = True).drop_duplicates()

In [4]:
# report the row count of each input frame, followed by the combined, de-duplicated frame
numbered_tweet_frames = enumerate([df_connections1, df_connections2, df_connections3, df_connections4,
                                   df_connections5, df_connections6, df_connections7, df_connections8],
                                  start = 1)
for frame_no, frame in numbered_tweet_frames :
    print("# of rows in df_connections" + str(frame_no) + ":", len(frame))
print("# of rows after consolidation and dropping duplicates:", len(df_connections))

# of rows in df_connections1: 2286
# of rows in df_connections2: 2527
# of rows in df_connections3: 2385
# of rows in df_connections4: 2981
# of rows in df_connections5: 3746
# of rows in df_connections6: 3797
# of rows in df_connections7: 4929
# of rows in df_connections8: 1080
# of rows after consolidation and dropping duplicates: 8946


One of the Twitter users who was banned during our scraping process was lotusoak2. We were able to retrieve his user information from a previous scrape. Prior to the ban, this user was a very active anti-vax bot, posting dozens of tweets and retweets every day. Due to the account ban and the deletion of his tweets, most of this account's posts are no longer included in our dataset.

In [5]:
# get lotusoak2's user information from an older scrape taken before the account was suspended
user_lotusoak = pd.read_csv("output/followers_ids_1000/followers2 - OLD.csv", dtype = str)

# keep a single row for this user id; .copy() makes the result an independent frame so that
# adding a column below cannot raise SettingWithCopyWarning
user_lotusoak = user_lotusoak[user_lotusoak["id"] == "424664120"].drop_duplicates("id").copy()

# add an empty source_id column
# NOTE(review): presumably this mirrors a column of the nodes.csv frames it is concatenated with - confirm
user_lotusoak["source_id"] = None

In [6]:
# read every daily user ("nodes") scrape, plus the users of the later "missing tweets" scrape,
# with all columns as strings
user_csv_paths = [
    "output/2020-12-18 Tweets/nodes.csv",
    "output/2020-12-19 Tweets/nodes.csv",
    "output/2020-12-20 Tweets/nodes.csv",
    "output/2020-12-21 Tweets/nodes.csv",
    "output/2020-12-22 Tweets/nodes.csv",
    "output/2020-12-23 Tweets/nodes.csv",
    "output/2020-12-24 Tweets/nodes.csv",
    "output/2021-01-09 Missing Tweets/missing_nodes.csv",
]

# the numbered names are kept because the following cells refer to each frame individually
(df_users1, df_users2, df_users3, df_users4,
 df_users5, df_users6, df_users7, df_users8) = [
    pd.read_csv(csv_path, dtype = str) for csv_path in user_csv_paths
]

In [7]:
# stack all user frames (including the recovered lotusoak2 row) on a fresh index and keep
# only the first row seen for each user id
user_frames = [df_users1, df_users2, df_users3, df_users4, df_users5, df_users6, df_users7,
               df_users8, user_lotusoak]
df_users = pd.concat(user_frames, ignore_index = True).drop_duplicates("id")

In [8]:
# report the row count of each input frame, followed by the combined, de-duplicated frame
numbered_user_frames = enumerate([df_users1, df_users2, df_users3, df_users4,
                                  df_users5, df_users6, df_users7, df_users8],
                                 start = 1)
for frame_no, frame in numbered_user_frames :
    print("# of rows in df_users" + str(frame_no) + ":", len(frame))
print("# of rows in user_lotusoak:", len(user_lotusoak))
print("# of rows after consolidation and dropping duplicates:", len(df_users))

# of rows in df_users1: 1949
# of rows in df_users2: 2028
# of rows in df_users3: 1928
# of rows in df_users4: 2552
# of rows in df_users5: 3091
# of rows in df_users6: 3129
# of rows in df_users7: 4337
# of rows in df_users8: 70
# of rows in user_lotusoak: 1
# of rows after consolidation and dropping duplicates: 7170


# Identify Existence of Original Anti-Vax Hashtags

In [9]:
# Add one 0/1 column per original anti-vaxx hashtag to the tweets dataframe:
# "hashtag_<tag>" is 1 when "#<tag>" occurs anywhere in the tweet text, otherwise 0.
#
# Differences from a plain `"#tag" in text` test per row:
#   * case = False : hashtags are matched regardless of capitalisation ("#AntiVax" counts as
#                    "#antivax"), since the tweet text keeps whatever casing the author typed
#   * na = False   : a tweet whose text is missing is flagged 0 instead of raising a TypeError
#   * regex = False: the tag is searched as literal text, not as a pattern
# Matching is still by substring, so "#vaxxed" also matches a longer tag such as "#vaxxed2".
tracked_hashtags = ["novax", "antivax", "cdcwhistleblower", "vaccineinjury", "vaxxed", "cdcfraud"]

for tag in tracked_hashtags :
    df_connections["hashtag_" + tag] = (
        df_connections["text"]
        .str.contains("#" + tag, case = False, regex = False, na = False)
        .astype(int)
    )

# Delete Non-Vaccine Related Conversations and Separately Classify Pro-Vaxx Conversations

In [10]:
# Hand-curated lists of IDs (kept as strings, like every column read from the CSVs).
# Both lists are matched against the "conversation_id" column in the next cell.
#
# NOTE(review): "1339030239104684034" appears in BOTH lists below. Because the next cell removes
# the non_vax conversations before assigning a stance, its pro_vax entry never matches anything.
# Confirm which list it really belongs to.

# conversations that are not about vaccines; they are removed from the dataset
non_vax = ["1339388053312270337", "1338969616870043649", "1339030239104684034", "1338952212903469058",
            "1338200367520051200", "1339380932453703681", "1339365858678730752", "1339482593608265728",
            "1339918799336845316", "1339996165773946880", "1339379035281915904", "1339953954994745345",
            "1339118105570627584", "1339409746202251265", "1339414394434854912", "1339413581016694784",
            "1339415314052784130", "1337800649052971008", "1337765454044155905", "1339789526328365056",
            "1339758947624873984", "1339884959138533376", "1339953895129407489", "1340151293470359552",
            "1340130416292872192", "1340375819219316739", "1340367744596922368", "1340210236641202176",
            "1340138727633403906", "1340588046413164544", "1340549469373407233", "1340520673349873665",
            "1340443581064986624", "1340806702069915650", "1340706934165561346", "1340720711451721737",
            "1341646772054200320", "1341125324495609856", "1341157386715852801", "1341751074626134019",
            "1341737824182181888", "1342032447727411201", "1342033228291567617", "1342045857244454912",
            "1342074921225224192", "1342077527242788864"]

# conversations started by a pro-vaccine tweet; every remaining conversation that is not listed
# here is labelled "Anti-Vaxx" in the next cell
pro_vax = ["1339977177215864832", "1339964294205644801", "1339941480715202562", "1340089843431432194",
           "1340043296467537924", "1339877425916895237", "1339966479870128128", "1339912370135699456",
           "1339746759728349188", "1339030239104684034", "1340079856881549315", "1340073476544110592",
           "1340029547102801920", "1339964673190346752", "1339719347057512449", "1339810557159899136",
           "1339737699239014400", "1339104569134108673", "1339337119102345216", "1339373768867721217",
           "1340096675336085509", "1340055177965969408", "1339795110117126144", "1339858348636684289",
           "1339869889276878848", "1339871792505249793", "1339884210312626176", "1339883453886701568",
           "1340003033514708998", "1339988673589338113", "1340012317178691584", "1340024326725263362",
           "1340039535619674114", "1339994129477058560", "1339926719239761924", "1339936903852937217", 
           "1339943707706048513", "1339946410335072257", "1339948903798550528", "1339948908760469506",
           "1339963994442969092", "1340409103043182592", "1340391271777505280", "1340358203377393667",
           "1340141733808656384", "1340163822103904256", "1340369588089925632", "1340371706679091201",
           "1340371714560192512", "1340401519993053186", "1340401898877280258", "1340359157787062272",
           "1340280089855057920", "1340316181136355328", "1340320030043586562", "1340326413887483904",
           "1340328526432657409", "1340329716553805826", "1340146728285184000", "1340175552888639489",
           "1340417017044713472", "1340690524072775680", "1340694484477116417", "1340628813710016512",
           "1340590137852497920", "1340591279323942914", "1340593741078306817", "1340609468514615299",
           "1340621485409357824", "1340638855297458176", "1340647546822209548", "1340658576889659392", 
           "1340669956913029120", "1340673684491677703", "1340688411791863808", "1340577939096780800", 
           "1340563660649861121", "1340447163806076929", "1340507672014188546", "1340535566736220161", 
           "1340538249899634688", "1340552918127292418", "1340440166339633153", "1340703936735490048",
           "1340796796298293256", "1340701332676562944", "1340703925318656004", "1340718687171522560",
           "1340719018932654080", "1340747429977710593", "1340749221595635724", "1340824591686615044",
           "1340900385838526464", "1340900393266634753", "1340991028363489280", "1340794462121947138",
           "1340764307034943488", "1340764316321112064", "1340779419774947333", "1341005946668105728", 
           "1340968352781516800", "1341334935471271938", "1341379857482170370", "1341419840116965377", 
           "1341465325099954182", "1341049209454247936", "1341442116874948609", "1341177788825518085", 
           "1341571312419004416", "1341204484312068096", "1341534894460084226", "1341579710904582157", 
           "1341582658657275906", "1341594850525933568", "1341635905665187841", "1341640071523557376", 
           "1341655136838496266", "1341655151887704065", "1341655163954716673", "1341660449062998016", 
           "1341511710784499712", "1341572415999119361", "1341519257570603010", "1341036136228122629",
           "1341053946337456128", "1341634930812567552", "1341036141009653763", "1341055582623698944", 
           "1341066274386948097", "1341428687510663169", "1341295720956112896", "1341307899113517056",
           "1341383326985310210", "1341413569594798087", "1341428700856913921", "1341292823400865796",
           "1341443748761382914", "1341485645395611650", "1341247587815469056", "1341073940526080003", 
           "1341073946347761665", "1341081415715307522", "1341103434506924033", "1341126712940236801",
           "1341133416096477185", "1341156874616512512", "1341806497798471683", "1341778869674672128",
           "1341810094586163200", "1341791010922393600", "1341745903191941120", "1341791006220570624",
           "1341798349431779331", "1341806064304607232", "1341806073964052482", "1341815754367381508", 
           "1341716018742382592", "1341884107798806528", "1341899794835529728", "1342213747566374918", 
           "1342213739714662409", "1341881623697117185", "1341896757026779136", "1341972354902142976",
           "1342002518180687878", "1342017554936705025", "1341881612989038594", "1342100410052583424", 
           "1341821258082693125"]

In [11]:
#delete non-vaxx conversations
# .copy() makes the filtered result an independent frame, so adding the "stance" column below
# does not raise SettingWithCopyWarning
df_connections = df_connections[~df_connections["conversation_id"].isin(non_vax)].copy()

#classify tweets between pro-vaxx and anti-vaxx
# a tweet is "Pro-Vaxx" when its conversation is listed in pro_vax; every other remaining tweet
# (including one with a missing conversation_id) is "Anti-Vaxx"
df_connections["stance"] = np.where(df_connections["conversation_id"].isin(pro_vax),
                                    "Pro-Vaxx", "Anti-Vaxx")

In [12]:
# y = df_connections[df_connections["id"].isin(pro_vax)]

# for i in y.index :
#     print("Tweet:", df_connections["id"].loc[i])
#     print(df_connections["text"].loc[i])
#     print("--------------")

# Replace User IDs with Usernames in Tweets Dataframe

After repeatedly having trouble with Gephi, we decided to switch to a different application for SNA graphing - Polinode. While Polinode does not handle networks with more than 50,000 nodes (Gephi handles up to 100,000), it is sufficient for our project.

Polinode requires uploading edges and nodes in Excel format. However, long IDs are cut off and shown in scientific notation in Excel. Therefore, we must use usernames directly in our edges table instead of relying on the IDs to link the edges and nodes tables in Excel.

In [13]:
#build a lookup from user id (index) to username, with its single column named for the author side
usernames = df_users.set_index("id")[["username"]]
usernames = usernames.rename(columns = {"username" : "author_username"})

#attach each tweet's author username by matching author_id against the lookup index
df_connections = df_connections.join(usernames, on = "author_id")

In [14]:
#relabel the lookup's only column so that this second join produces "source_username"
usernames = usernames.rename(columns = {usernames.columns[0] : "source_username"})

#attach the username of the source tweet's user by matching source_user_id against the lookup index
df_connections = df_connections.join(usernames, on = "source_user_id")

# NLP Analysis

Use NLP and K-Means to cluster the different topics discussed in the tweets dataset.

## Load Libraries

In [15]:
# !pip install spacy
# !python -m spacy download en

In [16]:
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
import nltk
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from string import punctuation
import collections
from collections import Counter
import en_core_web_sm

## Data Cleaning

In [17]:
# keep just the tweet text, indexed by tweet id, as the input to the NLP steps
tweets = df_connections.loc[:, ["id", "text"]].set_index("id")
tweets.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
1339977177215864832,Remember when #antivax charlatan @delbigtree a...
1338876187976863744,Yale #STUDY Finds Link Between #Vaccination an...
1339294501375004675,"This film stops the lie, which producer Polly ..."
1339448531074093056,#Glyphosate Found in Childhood #Vaccines\n\nht...
1340048362146570242,#China plans to let victims sue #vaccine manuf...


In [18]:
#stop list: NLTK's English stopwords plus a handful of punctuation characters
punc = list(".,\"'?!:;()[]{}%")
stop_words = set(stopwords.words('english')) | set(punc)

#TF-IDF matrix over the raw tweet texts (default tokenisation), used here only to count the vocabulary
# NOTE(review): newer scikit-learn releases expect stop_words as a list and renamed
# get_feature_names() to get_feature_names_out() - check against the installed version.
text = tweets["text"].values
vectorizer = TfidfVectorizer(stop_words = stop_words)
X = vectorizer.fit_transform(text)
word_features = vectorizer.get_feature_names()
len(word_features)

14095

In [19]:
from nltk.stem.snowball import SnowballStemmer

# shared English Snowball stemmer, and a tokenizer that keeps only runs of letters and apostrophes
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def tokenize(text):
    """Lower-case ``text``, split it with ``tokenizer`` and return the list of stemmed tokens."""
    stems = []
    for token in tokenizer.tokenize(text.lower()):
        stems.append(stemmer.stem(token))
    return stems

In [20]:
## test out vectorizers
# vectorizer2 = TfidfVectorizer(stop_words = stop_words, tokenizer = tokenize)
# X2 = vectorizer2.fit_transform(text)
# word_features2 = vectorizer2.get_feature_names()
# len(word_features2)

  'stop_words.' % sorted(inconsistent))


11602

In [21]:
# TF-IDF matrix built with the stemming tokenizer, capped at 1,000 features
# NOTE(review): stop_words holds unstemmed words while tokenize() emits stems, so stemmed forms
# of stop words can remain in the vocabulary. Changing this would change the clusters computed
# from X3, so it is only flagged here.
vectorizer3 = TfidfVectorizer(max_features = 1000, tokenizer = tokenize, stop_words = stop_words)
X3 = vectorizer3.fit_transform(text)
word_features3 = vectorizer3.get_feature_names()
len(word_features3)

1000

In [22]:
from sklearn.cluster import KMeans

# fit K-means on X3 once per candidate cluster count; for each fit, print the ten
# highest-weighted terms of every centroid and store the per-tweet labels in `tweets`
# under a "<k>_clusters" column
# NOTE(review): the KMeans arguments are left exactly as they were because a later cell
# hard-codes the meaning of the cluster numbers; n_jobs is not accepted by newer scikit-learn
# releases - check against the installed version.

clusters = [2, 3, 4, 5, 10, 15]

for k in clusters :
    kmeans = KMeans(n_clusters = k, n_init = 5, n_jobs = -1, random_state = 42).fit(X3)
    print("Number of clusters:", k)
    # per centroid: feature indices ordered from largest to smallest weight, first ten only
    common_words = kmeans.cluster_centers_.argsort()[:, ::-1][:, :10]
    for num, centroid in enumerate(common_words):
        top_terms = ', '.join(word_features3[word] for word in centroid)
        print("{} : {}".format(num, top_terms))
    tweets["{}_clusters".format(k)] = kmeans.labels_



Number of clusters: 2
0 : rt, vaccin, peopl, chriswicknew, one, get, whi, georebekah, take, like
1 : co, https, vaccin, rt, covid, realdonaldtrump, get, receiv, drmadej, nurs




Number of clusters: 3
0 : vaccin, realdonaldtrump, peopl, get, chriswicknew, like, take, covid, want, think
1 : rt, vaccin, peopl, laptop, georgia, dear, voter, florida, governor, bring
2 : co, https, vaccin, rt, covid, get, drmadej, watch, nurs, receiv




Number of clusters: 4
0 : vaccin, realdonaldtrump, peopl, chriswicknew, get, like, take, co, https, covid
1 : laptop, georgia, dear, voter, florida, governor, bring, imagin, girl, georebekah
2 : co, https, rt, vaccin, covid, drmadej, nurs, watch, get, receiv
3 : rt, vaccin, peopl, refus, opinion, evil, moderna, realdonaldtrump, get, pfizer




Number of clusters: 5
0 : vaccin, realdonaldtrump, peopl, get, chriswicknew, like, covid, take, want, think
1 : laptop, georgia, dear, voter, florida, governor, bring, imagin, girl, georebekah
2 : co, https, rt, vaccin, covid, drmadej, receiv, dircdcyrqc, antivax, realdonaldtrump
3 : rt, peopl, vaccin, opinion, evil, moderna, pfizer, deni, whi, correct
4 : vaccin, get, rt, nurs, co, https, covid, pl, secret, meet




Number of clusters: 10
0 : https, co, vaccin, realdonaldtrump, covid, antivax, chriswicknew, peopl, get, amp
1 : realdonaldtrump, overwhelm, immedi, approv, start, vaccin, distribut, moderna, rollout, review
2 : vaccin, covid, co, https, rt, get, nurs, watch, hous, secret
3 : rt, peopl, vaccin, refus, opinion, bar, wear, whi, deni, show
4 : laptop, georgia, dear, voter, florida, governor, bring, imagin, girl, georebekah
5 : drmadej, dircdcyrqc, co, https, rt, gskfjgebju, nigel, farag, happen, hsrywrrrl
6 : evil, probabl, correct, gtconway, rt, anaphylaxi, rfk, jr, fightback, counsel
7 : sbp, qokm, unbeliev, equal, racial, true, distribut, marklevinshow, even, co
8 : pollytommey, viral, interview, produc, event, current, chanc, busydrt, listen, yet
9 : safe, gwinezrx, vasovag, teamtrump, worri, absolut, littl, vice, gsx, bdzkdr




Number of clusters: 15
0 : vaccin, realdonaldtrump, peopl, get, covid, chriswicknew, like, take, richardursomd, know
1 : co, covid, https, pelosi, nanci, speaker, pgswmksmfj, wfpfupxp, vaccin, rt
2 : peopl, deni, wear, refus, mask, opinion, chriswicknew, rt, vaccin, bu
3 : probabl, correct, gtconway, rfk, jr, fightback, counsel, wise, josephjflynn, rt
4 : anti, vaxxer, pl, secret, meet, today, plan, expos, ccdhate, stop
5 : penc, mike, teamtrump, receiv, watch, coronavirus, vice, bdzkdr, gsx, presid
6 : distribut, overwhelm, immedi, approv, sbp, qokm, unbeliev, racial, equal, start
7 : co, https, drmadej, rt, dircdcyrqc, vaxx, realdonaldtrump, antivax, vaccin, covid
8 : preciouslindi, gatesfound, nihdirector, tx, gat, nih, connect, billgat, theori, conspiraci
9 : howleyreport, herebi, volunt, video, refus, live, take, abroad, multi, run
10 : evil, wrought, industri, pure, pharmaceut, corrupt, realcandaceo, fauci, dr, gate
11 : laptop, georgia, dear, voter, florida, governor, bring, ima

In [23]:
# for i in range(tweets.shape[0]) :
#     if (tweets.iloc[i]["5_clusters"] == 1) :
#         print("Cluster:", tweets.iloc[i]["5_clusters"])
#         print(tweets.iloc[i]["text"])
#         print("------------")

After looking through the different clusterings and how they separate the tweets, we have determined that the tweet categories can be separated fairly well with only 5 clusters. Therefore, only the results of the 5-cluster run are kept.

In [24]:
#join the 5_cluster separation with tweet data
nlp_clusters = tweets[["5_clusters"]].rename(columns = {"5_clusters" : "nlp_clusters"})

# `tweets` has one row per row of df_connections, so a tweet id that occurs k times in
# df_connections also occurs k times in this lookup's index. Joining against a non-unique index
# returns every match, i.e. k*k rows for such an id, which duplicates tweets and inflates every
# count derived from them. Keep one label per tweet id (the first seen) so the join leaves the
# number of rows in df_connections unchanged.
nlp_clusters = nlp_clusters[~nlp_clusters.index.duplicated(keep = "first")]

df_connections = df_connections.join(nlp_clusters, on = "id")

In [25]:
# renumber the rows 0..n-1; the previous index is kept as a new "index" column
df_connections = df_connections.reset_index(drop = False)

In [26]:
# replace the cluster numbers (as strings) with the category names chosen for them
# NOTE(review): the numbering is whatever the 5-cluster K-means fit produced; re-check this
# mapping against the printed top terms whenever the clustering is re-run.
cluster_names = {"0" : "Political/Conspiracy",
                 "1" : "Other",
                 "2" : "Info-Sharing",
                 "3" : "COVID Vaccine",
                 "4" : "Other"}
df_connections["nlp_clusters"] = df_connections["nlp_clusters"].astype(str).replace(cluster_names)

# Create Edges and Nodes

## Tweets by Users

In [27]:
#one row per tweet: author -> source user, using the column names specified by Polinode
#(the tweet text is carried in "weight" only so that it can be counted below)
edges_user = df_connections[["author_username", "source_username", "text"]].rename(
    columns = {"author_username" : "Source", "source_username" : "Target", "text" : "weight"})

#drop rows that have a missing value in any of the three columns
edges_user = edges_user.dropna()

#keep the per-tweet rows as the unweighted edge list for later calculations
edges_user_unweighted = edges_user.copy()

#weighted edge list: number of rows per (Source, Target) pair, heaviest pairs first
pair_counts = edges_user.groupby(["Source", "Target"]).count().reset_index()
edges_user = pair_counts.sort_values("weight", ascending = False).reset_index(drop = True)

In [28]:
# preview the five heaviest user-to-user edges
edges_user.head(5)

Unnamed: 0,Source,Target,weight
0,SputnikInt,SputnikInt,41
1,keryn_1,carnivalist2,28
2,carnivalist2,carnivalist2,27
3,carnivalist2,keryn_1,17
4,TheHiddenJewell,TheHiddenJewell,16


In [29]:
#get information and attributes needed for nodes
nodes_user = df_users[['username', 'verified', 'location', 'created_at', 'description',
                       'public_metrics.followers_count', 'public_metrics.following_count',
                       'public_metrics.tweet_count']]

#set columns names as specified by Polinode
nodes_user =  nodes_user.rename(columns = {"username" : "Name", "public_metrics.followers_count" : "followers_count",
                                           "public_metrics.following_count" : "following_count",
                                           "public_metrics.tweet_count" : "tweet_count"})

#ensure the node list only have users appearing on the edges list
#(pd.concat is used because Series.append was removed in pandas 2.0; concat works on old and new versions)
edge_usernames = pd.concat([edges_user["Source"], edges_user["Target"]]).dropna()
nodes_user = nodes_user[nodes_user["Name"].isin(edge_usernames)]

In [30]:
# preview the first five user nodes
nodes_user.head(5)

Unnamed: 0,Name,verified,location,created_at,description,followers_count,following_count,tweet_count
0,thereal_truther,False,California,2015-11-29T21:35:35.000Z,"Exposing the lies, propaganda & hate of the an...",7014,5484,120569
1,FloBo2018,False,,2018-01-28T07:03:02.000Z,Parler: @FloBo\nAlso @therealFloBo1\n\n#1A #2A...,3026,4555,26805
2,and_kell,False,,2009-05-06T18:25:00.000Z,Find me on Parler @andkell,4477,1812,82057
3,PupBellus,False,"Manchester, England",2011-10-11T02:42:16.000Z,"Rubber Pup, Dom, Ferral, Playful, Happy, Naughty!",2321,350,848
4,MrCrazee,False,"Plymouth, Devon",2009-01-26T22:16:00.000Z,Independent Parliamentary Candidate for Plymou...,2269,2653,11063


## Tweets

In [31]:
# build the tweet-to-tweet edge list: one (Source, Target) pair for every reply, quote and
# retweet reference a tweet carries, checked in that order for each tweet
edges_tweet = df_connections[["id", "replied_to_tweet_id", "quoted_tweet_id", "retweeted_id"]]
source = []
target = []

for tweet_id, *referenced_ids in edges_tweet.itertuples(index = False, name = None) :
    for referenced_id in referenced_ids :
        # a missing reference is NaN, which is the only value not equal to itself
        if referenced_id == referenced_id :
            source.append(tweet_id)
            target.append(referenced_id)

edges_tweet = pd.DataFrame(columns = ["Source", "Target"]).assign(Source = source, Target = target)

In [32]:
# keep an edge only when its target tweet is itself present in df_connections
# (targets that are not in the dataset, e.g. tweets deleted by Twitter, are dropped)
target_is_known = edges_tweet["Target"].isin(df_connections["id"])
edges_tweet = edges_tweet[target_is_known]

In [33]:
# tweet attributes to carry on each node
node_attribute_cols = ["id", "text", "hashtag_novax", "hashtag_antivax", "hashtag_cdcwhistleblower",
                       "hashtag_vaccineinjury", "hashtag_vaxxed", "hashtag_cdcfraud", "stance",
                       "author_username", "source_username", "nlp_clusters"]

# one node per tweet id: keep the first row seen for each id
nodes_tweet = df_connections[node_attribute_cols].drop_duplicates("id")

In [34]:
# get list of tweets included in edges and drop duplicates
# (pd.concat is used because Series.append was removed in pandas 2.0; concat works on old and new versions)
tweet_ids = pd.concat([edges_tweet["Source"], edges_tweet["Target"]]).drop_duplicates()
tweet_ids = tweet_ids.tolist()

In [35]:
# keep only the tweets that occur in the edge list, then rename "id" to the "Name" column
# used for the node tables
appears_in_edges = nodes_tweet["id"].isin(tweet_ids)
nodes_tweet = nodes_tweet[appears_in_edges].rename(columns = {"id" : "Name"})

In [36]:
#write every edges and nodes table to CSV without the index column
exports = [
    (edges_user, "output/Combined Tweets/2020-12-24 edges_user.csv"),
    (edges_user_unweighted, "output/Combined Tweets/2020-12-24 edges_user_unweighted.csv"),
    (edges_tweet, "output/Combined Tweets/2020-12-24 edges_tweet.csv"),
    (nodes_user, "output/Combined Tweets/2020-12-24 nodes_user.csv"),
    (nodes_tweet, "output/Combined Tweets/2020-12-24 nodes_tweet.csv"),
]
for table, csv_path in exports :
    table.to_csv(csv_path, index = False)