In [1]:
import sys
sys.path.insert(0, '../')
import config as cf
import pandas as pd

### Load US_Reopen Data:

In [2]:
# Read the US-reopen tweet export and discard every row that has a missing
# value in any column, then report the size and schema of what is left.
df = pd.read_csv(cf.US_REOPEN_DATA).dropna()
print("Shape = ", df.shape)
df.info()

Shape =  (44186, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44186 entries, 0 to 44185
Data columns (total 8 columns):
id               44186 non-null int64
created_at       44186 non-null object
original_text    44186 non-null object
clean_text       44186 non-null object
sentiment        44186 non-null object
lang             44186 non-null object
screen_name      44186 non-null object
location         44186 non-null object
dtypes: int64(1), object(7)
memory usage: 3.0+ MB


### Remove Retweet Data:

In [3]:
# Keep only original tweets: drop every row whose raw text starts with "RT".
# The mask is built with the vectorised string accessor instead of a row-wise
# apply, which is faster and still yields a boolean Series on an empty frame.
# NOTE(review): a bare "RT" prefix also matches non-retweets whose text merely
# begins with those letters; "RT @" may be the intended marker - confirm
# before tightening, since it would change the row count.
is_retweet = df["original_text"].str.startswith("RT", na=False)
df = df[~is_retweet]
print(df.shape)
print("Shape = ", df.shape)
df.info()

(11410, 8)
Shape =  (11410, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11410 entries, 8 to 44185
Data columns (total 8 columns):
id               11410 non-null int64
created_at       11410 non-null object
original_text    11410 non-null object
clean_text       11410 non-null object
sentiment        11410 non-null object
lang             11410 non-null object
screen_name      11410 non-null object
location         11410 non-null object
dtypes: int64(1), object(7)
memory usage: 802.3+ KB


### Preprocess and Extract Words:

In [4]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import gazetteers
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re, calendar

# Vocabulary filters, all lower-cased so they compare equal to lower-cased tokens.
# NOTE: `stopwords` and `gazetteers` rebind the NLTK corpus readers imported above
# to plain lists; the names are left unchanged in case other cells read them.
stopwords  = [w.lower() for w in stopwords.words('english')]
gazetteers = [x.lower() for x in gazetteers.words()]
skips = ["covid", "corona", "new", "novel", "open", "amp", "repu"]
calendars = [m.lower() for m in list(calendar.month_name) + list(calendar.day_name)]

# Exact-match exclusions merged into one frozenset so each token costs a single
# hash lookup instead of three linear list scans. This is a snapshot: re-run the
# cell after editing any of the lists above.
_EXCLUDED_WORDS = frozenset(stopwords) | frozenset(gazetteers) | frozenset(calendars)
# Substring exclusions: a token is dropped when it merely *contains* one of these.
_SKIP_FRAGMENTS = tuple(s.lower() for s in skips)
# Built once and shared by every call instead of once per tweet / once per word.
_TOKENIZER = TweetTokenizer()
_LEMMATIZER = WordNetLemmatizer()

def get_key_words(tweet):
    """Reduce one raw tweet to a list of lemmatized key words.

    Steps, in order:
      1. Lower-case the text.
      2. Delete everything matched by cf.RX_MENTION, cf.RX_HASHTAG, cf.RX_URL
         and cf.RX_EMAIL (mentions, hashtags, URLs and e-mail addresses, going
         by the pattern names - the patterns themselves live in `config`).
      3. Tokenize with NLTK's TweetTokenizer.
      4. Delete matches of cf.RX_ONLY_AB from each token (intended to leave
         only alphabetic characters).
      5. Drop tokens of three characters or fewer, tokens equal to a stop
         word, gazetteer entry, month name or weekday name, and tokens that
         contain any fragment listed in `skips`.
      6. Lemmatize the survivors with WordNetLemmatizer (default part of speech).

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Key words in their original order; duplicates are kept.
    """
    tweet = tweet.lower()
    for pattern in (cf.RX_MENTION, cf.RX_HASHTAG, cf.RX_URL, cf.RX_EMAIL):
        tweet = re.sub(pattern, '', tweet)

    key_words = []
    for token in _TOKENIZER.tokenize(tweet):
        word = re.sub(cf.RX_ONLY_AB, '', token)
        if len(word) <= 3:
            continue
        # The tokenizer may decode HTML entities, so normalise case again
        # before comparing against the lower-cased vocabularies.
        lowered = word.lower()
        if lowered in _EXCLUDED_WORDS:
            continue
        if any(fragment in lowered for fragment in _SKIP_FRAGMENTS):
            continue
        key_words.append(_LEMMATIZER.lemmatize(word))
    return key_words

# Extract the key words of every remaining tweet and store them next to the
# raw text, then show the schema to confirm the new column is populated.
extracted_words = df["original_text"].apply(get_key_words)
df["key_words"] = extracted_words
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11410 entries, 8 to 44185
Data columns (total 9 columns):
id               11410 non-null int64
created_at       11410 non-null object
original_text    11410 non-null object
clean_text       11410 non-null object
sentiment        11410 non-null object
lang             11410 non-null object
screen_name      11410 non-null object
location         11410 non-null object
key_words        11410 non-null object
dtypes: int64(1), object(8)
memory usage: 891.4+ KB


### Top T = 15 most frequent N-grams for N = 1, 2, 3:

In [5]:
import collections
from collections import Counter
from decimal import Decimal

N = 3 # highest n-gram order analysed; the loop below covers n = 1..N
T = 15 # maximum number of top-ranked n-grams printed for each n

def n_grams(kw, n):
    """Return every contiguous window of length ``n`` taken from ``kw``.

    ``kw`` is any sliceable sequence (e.g. a list of words). Windows are
    returned left to right as slices of ``kw``; when ``kw`` is shorter than
    ``n`` the result is an empty list.
    """
    window_count = len(kw) - n + 1
    return [kw[start:start + n] for start in range(window_count)]

# Rank the n-grams of every order 1..N by corpus-wide frequency and print the
# T most frequent ones as (n-gram, count, share) tuples.
token_lists = df["key_words"].tolist()  # loop-invariant: one word list per tweet
for n in range(1, N+1):
    # n-grams are built per tweet, so no n-gram ever spans two tweets.
    ngram_counts = Counter(
        ' '.join(gram)
        for tokens in token_lists
        for gram in n_grams(tokens, n)
    )
    total = Decimal(sum(ngram_counts.values()))
    # most_common() orders by count, descending; ties keep first-seen order.
    # The share is count / total formatted to three significant figures in
    # scientific notation. It is a fraction (not multiplied by 100), and it is
    # formatted straight from the exact quotient: rounding to six decimals
    # first would discard digits for shares below about 1e-4.
    top = [
        (phrase, hits, "{:.2E}".format(Decimal(hits) / total))
        for phrase, hits in ngram_counts.most_common()[:T]
    ]
    print("{}-grams(word,count,percentage):\n{}\n".format(n, top))


1-grams(word,count,perentage):
[('state', 1275, '1.65E-2'), ('business', 1179, '1.53E-2'), ('plan', 587, '7.60E-3'), ('economy', 548, '7.09E-3'), ('people', 485, '6.28E-3'), ('today', 477, '6.17E-3'), ('back', 454, '5.88E-3'), ('case', 444, '5.75E-3'), ('testing', 431, '5.58E-3'), ('country', 419, '5.42E-3'), ('need', 416, '5.38E-3'), ('phase', 401, '5.19E-3'), ('county', 378, '4.89E-3'), ('restaurant', 351, '4.54E-3'), ('week', 340, '4.40E-3')]

2-grams(word,count,perentage):
[('testing site', 141, '2.14E-3'), ('social distancing', 107, '1.62E-3'), ('white house', 88, '1.34E-3'), ('state begin', 62, '9.41E-4'), ('small business', 61, '9.26E-4'), ('look like', 59, '8.95E-4'), ('public health', 56, '8.50E-4'), ('stay home', 48, '7.28E-4'), ('back work', 45, '6.83E-4'), ('wear mask', 43, '6.52E-4'), ('hair salon', 41, '6.22E-4'), ('next week', 39, '5.92E-4'), ('task force', 34, '5.16E-4'), ('business begin', 34, '5.16E-4'), ('contact tracing', 33, '5.01E-4')]

3-grams(word,count,perentag