# _Experimentation: March 24, 2020_

**TL;DR** --> Explore strategies for analyzing the text of tweets, and check whether there are any potential instances of disinformation.

In [52]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# import libraries
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import random
import string
import os
import re
from tqdm.autonotebook import tqdm
tqdm.pandas()

# Matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

  del sys.path[0]
  from pandas import Panel


## _Load Data_

In [2]:
def load_data(origpath, datapath, filename):
    """
    Load a pickled object (expected to be a dataframe) from a data directory.

    The working directory is switched to ``datapath`` for the read and is then
    always set to ``origpath``, even when reading the file raises.

    Parameters
    ----------
    origpath : str
        Directory to switch to once loading has finished (or failed).
    datapath : str
        Directory that contains the pickle file.
    filename : str
        Name of the pickle file inside ``datapath``.

    Returns
    -------
    Whatever ``pd.read_pickle`` returns for the file.

    Raises
    ------
    Any exception raised by ``os.chdir`` or ``pd.read_pickle`` is propagated.
    """
    # change directory to where data is located
    os.chdir(datapath)
    try:
        # SECURITY: unpickling can execute arbitrary code -- only load trusted files
        return pd.read_pickle(filename)
    finally:
        # change directory back to original, even if the read failed
        os.chdir(origpath)

In [3]:
# NOTE(review): absolute, machine-specific paths -- consider deriving these from a
# configurable base directory so the notebook runs outside this container.
# directory passed to load_data as `origpath` (the directory it switches back to)
origpath = "/notebooks/CovidDisinfo-Detect/experiments"
# directory passed to load_data as `datapath` (where the pickle file is read from)
datapath = "/notebooks/CovidDisinfo-Detect/data/interim"
# name of the pickle file to load from `datapath`
filename = "coronavirus_20200324.pkl"

In [34]:
%%time
df = load_data(origpath, datapath, filename)

CPU times: user 404 ms, sys: 254 ms, total: 658 ms
Wall time: 657 ms


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 300011 entries, 2020-03-23 06:59:59+00:00 to 2020-03-22 23:57:53+00:00
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  300011 non-null  int64 
 1   conversation_id     300011 non-null  int64 
 2   user_id             300011 non-null  int64 
 3   username            300011 non-null  object
 4   name                300011 non-null  object
 5   tweet               300011 non-null  object
 6   mentions            300011 non-null  object
 7   urls                300011 non-null  object
 8   photos              300011 non-null  object
 9   replies_count       300011 non-null  int64 
 10  retweets_count      300011 non-null  int64 
 11  likes_count         300011 non-null  int64 
 12  hashtags            300011 non-null  object
 13  link                300011 non-null  object
 14  retweet             300011 non-null  bool  
 15  quote

In [36]:
df[:5]

Unnamed: 0_level_0,id,conversation_id,user_id,username,name,tweet,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,link,retweet,quote_url,video,reply_to_userids,reply_to_usernames
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020-03-23 06:59:59+00:00,1241877316734717953,1241877316734717952,4128946034,sanjacintoclan,SanJacintoClan,"Trump, at White House briefing, outlines broad...",none,https://www.foxnews.com/politics/trump-coronav...,none,0,0,1,none,https://twitter.com/SanJacintoClan/status/1241...,False,none,0,4128946034,SanJacintoClan
2020-03-23 06:59:59+00:00,1241877315648401408,1241877315648401408,858501829,harrietnix,"Harriet Nix, The Egyptian Queen",BREAKING: Rand Paul Has Tested Positive For Th...,none,https://trendingpolitics.com/breaking-rand-pau...,none,0,1,1,none,https://twitter.com/HarrietNix/status/12418773...,False,none,0,858501829,HarrietNix
2020-03-23 06:59:59+00:00,1241877315623112704,1241877315623112704,614853,vajra,E Brown,Spanish opera singer Placido Domingo tests pos...,none,https://reut.rs/2WwhA3H,none,0,0,0,none,https://twitter.com/vajra/status/1241877315623...,False,none,0,614853,vajra
2020-03-23 06:59:59+00:00,1241877315447148544,1241877315447148544,374822255,skift,Skift,"This week, travel startups @bimbleapp and @Pru...","bimbleapp,pruvoweb",https://bit.ly/3949hyr,none,0,0,2,none,https://twitter.com/skift/status/1241877315447...,False,none,0,374822255842419111162482689717347878344257536,"skift,bimbleapp,PruvoWeb"
2020-03-23 06:59:59+00:00,1241877312871845891,1241877312871845888,489756858,gpbgeorge,GEORGE(Dancing&more),Coronavirus: IOC considers postponing 2020 Tok...,none,https://twitter.com/i/events/1219057585707315201,none,0,0,1,#coronavirus,https://twitter.com/GPBGeorge/status/124187731...,False,none,0,489756858,GPBGeorge


## _Clean up Tweet Text_

In [37]:
# share of tweets whose text contains at least one newline character
# (a tweet with "\n\n" necessarily contains "\n", so a single literal check suffices)
df["tweet"].str.contains("\n", regex=False).mean()

0.2652002759898804

In [38]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [39]:
def remove_pattern(input_text, pattern, space=False):
    """
    Remove every match of a regular expression from a string.

    Parameters
    ----------
    input_text : str
        Text to clean.
    pattern : str
        Regular expression describing what to remove.
    space : bool, optional
        If true, each match is replaced with a single space; otherwise
        (default) it is replaced with the empty string.

    Returns
    -------
    str
        ``input_text`` with all non-overlapping matches of ``pattern`` replaced.
    """
    replacement = " " if space else ""
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex breaks on matches that contain metacharacters (e.g. "?" or "("
    # inside a URL) and lets a short match eat the prefix of a longer one.
    return re.sub(pattern, replacement, input_text)

In [40]:
def spacy_tokenizer(parser, text):
    """
    Tokenize ``text`` with ``parser`` and return normalized tokens without stop words.

    Each token is reduced to its lowercased, stripped lemma; tokens whose lemma
    is the "-PRON-" marker fall back to their lowercased surface form. Any
    resulting string found in ``STOP_WORDS`` is dropped.

    Parameters
    ----------
    parser : callable
        Called as ``parser(text)``; must yield tokens exposing ``lemma_`` and ``lower_``.
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    kept = []
    for token in parser(text):
        if token.lemma_ == "-PRON-":
            # the lemma is only a marker here, so keep the lowercased word itself
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()
        # skip stop words
        if normalized not in STOP_WORDS:
            kept.append(normalized)
    return kept

In [46]:
def clean_tweet(df):
    """
    Given a dataframe, clean up the tweet column to (hopefully) make it easier to analyze.

    A "clean_tweet" column is added to ``df`` in place (the input dataframe is
    mutated) and that same dataframe is returned. The "tweet" column itself is
    not modified.

    Cleaning steps, in order:
      1. replace newline characters with a space
      2. remove user mentions
      3. remove URLs (must happen before punctuation is stripped, otherwise
         only the leading "http"/"https" fragment of a link is removed)
      4. replace everything that is not a letter or "#" with a space
      5. drop short words
      6. drop the "#" sign but keep the hashtag word
      7. tokenize, lemmatize and remove stop words via ``spacy_tokenizer``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named "tweet".

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with the added "clean_tweet" column.
    """
    # English language object used for tokenization; no trained model is loaded here
    parser = English()
    # start from the raw text, replacing newline characters with a space
    clean = df["tweet"].str.replace("\n", " ", regex=False)
    # removes all user mentions (since we have them in a separate column already)
    print("Removing user mentions...")
    clean = clean.str.replace(r"@[\w]*", "", regex=True)
    # remove urls while they are still intact
    print("Removing URLs...")
    clean = clean.str.replace(r"http\S+", "", regex=True)
    # remove special characters, numbers, punctuations
    # (regex=True is explicit: the pandas default for str.replace is literal from 2.0 on)
    print("Removing special characters, numbers, punctuations...")
    clean = clean.str.replace(r"[^a-zA-Z#]", " ", regex=True)
    # removing short words
    # NOTE(review): the condition keeps only words LONGER than 3 characters, while
    # the message below says "less than 3" -- confirm which threshold is intended
    print("Removing short words (character length less than 3)...")
    clean = clean.apply(lambda x: " ".join([w for w in x.split() if len(w) > 3]))
    # remove hashtags (but keep words)
    print("Removing hashtags (but keeping word(s))...")
    clean = clean.str.replace("#", "", regex=False)
    # tokenize with spacy
    print("Tokenizing w/ spaCy...")
    clean = clean.progress_apply(lambda x: " ".join(spacy_tokenizer(parser, x)))
    # attach the result to the caller's dataframe
    df["clean_tweet"] = clean
    print("Done!")
    return df

In [55]:
%%time
df_cleantext = clean_tweet(df)

Removing user mentions...
Removing special characters, numbers, punctuations...
Removing short words (character length less than 3)...
Removing URLs...
Removing hashtags (but keeping word(s))...
Tokenizing w/ spaCy...


HBox(children=(FloatProgress(value=0.0, max=300011.0), HTML(value='')))


Done!
CPU times: user 1min 31s, sys: 501 ms, total: 1min 31s
Wall time: 1min 31s


In [60]:
df_cleantext[:3]

Unnamed: 0_level_0,id,conversation_id,user_id,username,name,tweet,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,link,retweet,quote_url,video,reply_to_userids,reply_to_usernames,clean_tweet
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2020-03-23 06:59:59+00:00,1241877316734717953,1241877316734717952,4128946034,sanjacintoclan,SanJacintoClan,"Trump, at White House briefing, outlines broad...",none,https://www.foxnews.com/politics/trump-coronav...,none,0,0,1,none,https://twitter.com/SanJacintoClan/status/1241...,False,none,0,4128946034,SanJacintoClan,trump white house briefing outlines broad meas...
2020-03-23 06:59:59+00:00,1241877315648401408,1241877315648401408,858501829,harrietnix,"Harriet Nix, The Egyptian Queen",BREAKING: Rand Paul Has Tested Positive For Th...,none,https://trendingpolitics.com/breaking-rand-pau...,none,0,1,1,none,https://twitter.com/HarrietNix/status/12418773...,False,none,0,858501829,HarrietNix,breaking rand paul tested positive chinese cor...
2020-03-23 06:59:59+00:00,1241877315623112704,1241877315623112704,614853,vajra,E Brown,Spanish opera singer Placido Domingo tests pos...,none,https://reut.rs/2WwhA3H,none,0,0,0,none,https://twitter.com/vajra/status/1241877315623...,False,none,0,614853,vajra,spanish opera singer placido domingo tests pos...


In [57]:
#!pip install vaderSentiment

## _VADER Sentiment_

In [58]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [59]:
def vader_score(text):
    """
    Return the "compound" entry of the VADER polarity scores for ``text``.

    Uses the module-level ``analyzer`` object.
    """
    scores = analyzer.polarity_scores(text)
    return scores["compound"]

In [63]:
df_cleantext["vader_comp"] = df_cleantext["tweet"].progress_apply(vader_score)

HBox(children=(FloatProgress(value=0.0, max=300011.0), HTML(value='')))




In [64]:
df_cleantext.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 300011 entries, 2020-03-23 06:59:59+00:00 to 2020-03-22 23:57:53+00:00
Data columns (total 21 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  300011 non-null  int64  
 1   conversation_id     300011 non-null  int64  
 2   user_id             300011 non-null  int64  
 3   username            300011 non-null  object 
 4   name                300011 non-null  object 
 5   tweet               300011 non-null  object 
 6   mentions            300011 non-null  object 
 7   urls                300011 non-null  object 
 8   photos              300011 non-null  object 
 9   replies_count       300011 non-null  int64  
 10  retweets_count      300011 non-null  int64  
 11  likes_count         300011 non-null  int64  
 12  hashtags            300011 non-null  object 
 13  link                300011 non-null  object 
 14  retweet             300011 non-null  b

In [13]:
# what are the most mentioned users?
df["mentions"].value_counts()[:20]

none                              200171
youtube                             3856
realdonaldtrump                     2878
randpaul                            1852
gatewaypundit                        973
ukchange                             756
mailonline                           615
borisjohnson                         601
nbcnews                              548
googlenews                           504
yahoo                                400
jeffreestar,pulte                    383
nypost                               350
abc                                  346
elijahdaniel                         341
narendramodi                         319
newyorker                            312
usatoday                             296
rollingstone                         284
realdonaldtrump,jbpritzker,cnn       266
Name: mentions, dtype: int64

In [14]:
# what are the most common hashtags? 
df["hashtags"].value_counts()[:20]

none                               188465
#coronavirus                        23232
#coronavirus,#covid19                2048
#covid19,#coronavirus                1656
#covidー19,#coronavirus               1063
#foxnews                             1048
#covid19                              910
#coronavirus,#covidー19                899
#smartnews                            849
#covidー19                             589
#covidactnow                          571
#topbuzz                              429
#cultforgood                          341
#santrampalji_canendcorona            269
#breaking                             226
#peopleoverprofits,#coronavirus       183
#coronavirus,#stayathome              175
#randpaul,#coronavirus                169
#coronavirus,#socialdistancing        154
#trump,#coronavirus                   149
Name: hashtags, dtype: int64

In [18]:
# what are the most common urls?
df["urls"].value_counts()[:10]

none                                                                                                                  141952
https://twitter.com/i/events/1239712380373708800                                                                         871
https://twitter.com/i/events/1219057585707315201                                                                         735
https://twitter.com/i/events/1241837095108272134                                                                         521
https://www.cnn.com/2020/03/22/politics/rand-paul-coronavirus/index.html                                                 484
https://twitter.com/RandPaul/status/1241780756617273345                                                                  396
https://www.nbcnews.com/politics/congress/rand-paul-becomes-first-known-senator-test-positive-coronavirus-n1166111       360
https://www.washingtonpost.com/world/2020/03/22/coronavirus-latest-news/                                                 349


In [66]:
from datetime import datetime
origpath = "/notebooks/CovidDisinfo-Detect/experiments"
datapath = "/notebooks/CovidDisinfo-Detect/data/interim"

In [68]:
def save_dataframe(df, origpath, datapath):
    """
    Given a dataframe, saves it as a pickle file and stores in correct data folder.

    The file is written to ``datapath`` as ``coronavirus_<YYYYMMDD>.pkl``, where
    the date is today's date. The working directory is switched to ``datapath``
    for the write and is then always set to ``origpath``, even when writing fails.

    Parameters
    ----------
    df : object with a ``to_pickle(path)`` method (e.g. a pandas DataFrame)
        Data to save.
    origpath : str
        Directory to switch to once saving has finished (or failed).
    datapath : str
        Directory in which the pickle file is created.

    Returns
    -------
    None
    """
    # change directory to where the data should be stored
    os.chdir(datapath)
    try:
        # stamp the filename with the current date
        date = datetime.now().strftime("%Y%m%d")
        # write the dataframe to a pickle file in the current directory
        df.to_pickle(f"coronavirus_{date}.pkl")
        print(f"Pickle file saved at {os.getcwd()}")
    finally:
        # change directory back to original, even if saving failed
        os.chdir(origpath)

In [69]:
save_dataframe(df_cleantext, origpath, datapath)

Pickle file saved at /notebooks/CovidDisinfo-Detect/data/interim
