## Twitter data extract

In [1]:
# Load libraries
import pandas as pd
import time
import json

# Load Twitter API client
from searchtweets import ResultStream, gen_rule_payload, load_credentials, collect_results

### Define API arguments

In [2]:
# Connection arguments for the 30-day search endpoint, read from the
# "search_30day_tweets_api" entry of .twitter_keys.yaml
premium_30day_search_args = load_credentials(filename=".twitter_keys.yaml",
                 yaml_key="search_30day_tweets_api",
                 env_overwrite=False)

# Connection arguments for the full-archive search endpoint (used for tweets older
# than 30 days), read from the "search_fullarchive_tweets_api" entry of the same file
premium_fullarchive_search_args = load_credentials(filename=".twitter_keys.yaml",
                 yaml_key="search_fullarchive_tweets_api",
                 env_overwrite=False)


Grabbing bearer token from OAUTH
Grabbing bearer token from OAUTH


### Extract last 30 days data

In [None]:
# Pull recent tweets through the 30-day search endpoint, one daily window per request.
# Tweets older than 30 days are fetched in a separate cell because they go through a
# different endpoint with its own quota.
# NOTE(review): windows start on days 01-29 only, so the last days of May are never
# requested - confirm this is intended.

write_to_disk = False  # set to True to call the API and (over)write data.json
if write_to_disk:
    all_tweets = []
    for day in range(1, 30):
        # Zero-padded day numbers for the start and end of the window
        from_date = str(day).zfill(2)
        to_date = str(day + 1).zfill(2)

        rule = gen_rule_payload(
            "UK coronavirus lang:en",
            from_date=f"2020-05-{from_date}",
            to_date=f"2020-05-{to_date}",
            results_per_call=100,
        )
        daily_tweets = collect_results(
            rule,
            max_results=500,
            result_stream_args=premium_30day_search_args,
        )

        all_tweets += daily_tweets
        print(len(all_tweets))

        # Pause between requests to avoid hitting API limits
        time.sleep(10)

        # Save everything collected so far after each window
        with open('data.json', 'w', encoding='utf-8') as out_file:
            json.dump(all_tweets, out_file, indent=4, ensure_ascii=False)
        
        

### Extract data older than 30 days

In [None]:
# Extract tweets older than 30 days using the Twitter full-archive search endpoint.
# These use a different endpoint (and a different quota) than the 30-day search.
# NOTE(review): windows start on days 01-29 only, so the final day(s) of the month are
# never requested - confirm this is intended.
write_to_disk = False
if write_to_disk:
    month = '04' # specify month number

    # Continue from whatever is already saved in data.json. The cell used to rely on an
    # `all_tweets` list left in the kernel by the 30-day cell and raised NameError on a
    # fresh kernel; loading the saved file keeps the accumulate-and-rewrite behaviour.
    try:
        with open('data.json', 'r', encoding='utf-8') as f:
            all_tweets = json.load(f)
    except FileNotFoundError:
        all_tweets = []

    for t in range(1, 30):
        from_date = str(t).zfill(2)
        to_date = str(t+1).zfill(2)

        rule = gen_rule_payload(
            "UK coronavirus lang:en",
            from_date=f"2020-{month}-{from_date}",
            to_date=f"2020-{month}-{to_date}",
            results_per_call=100
        )

        tweets = collect_results(rule, max_results=150, result_stream_args=premium_fullarchive_search_args)

        all_tweets.extend(tweets)
        print(f"2020-{month}-{to_date}")
        print(len(all_tweets))
        print('---')
        # Set a sleep time to avoid hitting API limits
        time.sleep(10)

        # Write extracted tweets to a JSON file after every window so progress is kept
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(all_tweets, f, indent=4, ensure_ascii=False)

### Load tweet data

In [3]:
# Read the JSON file that contains the tweet data.
# data.json is written as UTF-8 with ensure_ascii=False, so decode it as UTF-8
# explicitly instead of relying on the platform default encoding.
with open('data.json', 'r', encoding='utf-8') as f:
    tweets = json.load(f)

# Check number of tweets
len(tweets)

22250

In [4]:
# Print the text of the first three tweets.
# A plain loop is used instead of a list comprehension so the cell does not build and
# display a throwaway list of None values.
for tweet in tweets[:3]:
    print(tweet.get('text'), '\n\n')

RT @reactionlife: Why is Germany able to test for coronavirus so much more than the UK? - @reactionlife https://t.co/9xltrMYhNu 


RT @cdhawesi: 'Absolutely wrong': how UK's coronavirus test strategy unravelled | Coronavirus outbreak | The Guardian https://t.co/gJK3mppm… 


RT @Independent: US Coast Guard orders foreign cruise ships to care for suspected coronavirus passengers on board 'indefinitely' https://t.… 




[None, None, None]

In [5]:
# Collect every key that appears on at least one tweet object, sorted alphabetically.
# A single set comprehension replaces rebuilding list(set(...)) on every iteration.
tweet_keys = sorted({key for tweet in tweets for key in tweet})
tweet_keys

['contributors',
 'coordinates',
 'created_at',
 'display_text_range',
 'entities',
 'extended_entities',
 'extended_tweet',
 'favorite_count',
 'favorited',
 'filter_level',
 'geo',
 'id',
 'id_str',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'is_quote_status',
 'lang',
 'matching_rules',
 'place',
 'possibly_sensitive',
 'quote_count',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'quoted_status_permalink',
 'reply_count',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'source',
 'text',
 'truncated',
 'user']

In [6]:
# De-duplicate tweets by id, keeping the first occurrence of each id.
# A dict keyed by id gives constant-time membership checks (looking ids up in a list
# is quadratic in the number of tweets) and preserves first-seen order.
tweets_by_id = {}
for t in tweets:
    tweets_by_id.setdefault(t.get('id'), t)

ids = list(tweets_by_id)
unique_tweets = list(tweets_by_id.values())

# If this equals len(tweets), every tweet id is unique
len(unique_tweets)

22250

In [9]:
# Build a slim (id, created_at, text) record per tweet and save the table as CSV
write_text_tweets = False

if write_text_tweets:

    text_tweets = []
    for tweet in unique_tweets:
        # For a retweet, take the fields from the embedded original tweet
        source = tweet.get('retweeted_status') or tweet
        # Use the extended payload's full text when one is present
        extended = source.get('extended_tweet')
        text_tweets.append({
            'id': source.get('id'),
            'created_at': source.get('created_at'),
            'text': extended.get('full_text') if extended else source.get('text'),
        })

    print(len(text_tweets))

    df = pd.DataFrame(text_tweets)
    df.to_csv('tweets.csv', encoding='utf-8', index=False)
    print('Tweet text saved to file!')
            

22250
Tweet text saved to file!


## Text cleaning

In [None]:
def remove_emojis(text):
    """Return `text` with every run of characters matched by the emoji pattern removed.

    NOTE(review): some of the ranges are very wide (for example U+24C2-U+1F251 and
    U+10000-U+10FFFF), so characters other than emoji that fall inside them are
    stripped too - confirm this is acceptable for the input text.
    """
    import re

    # Members of the character class, kept in their original order
    char_class = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # (repeated range, kept as-is)
        u"\U000024C2-\U0001F251"  # very wide range, see docstring note
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # every supplementary-plane code point
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
    )
    pattern = re.compile("[" + char_class + "]+", re.UNICODE)
    return pattern.sub('', text)

In [None]:
# Define text processing function
def preprocess(text):
    """Clean a raw tweet string.

    Steps, in order: replace HTML line breaks, the entities ``&amp``/``&gt``/``&lt``
    and non-breaking spaces with a space; replace literal backslash-n sequences and
    quote characters with a space; drop @mentions and URLs; strip emoji through
    ``remove_emojis``; delete ASCII punctuation; collapse runs of whitespace.

    Args:
        text: The tweet text as a ``str``.

    Returns:
        The cleaned, single-spaced text.
    """
    import re, string

    # These used to be passed to str.replace wrapped in parentheses (regex-style
    # groups), which only matched the literal text "(<br/>)" etc. and never fired.
    text = re.sub(r'<br/>|&amp;?|&gt;?|&lt;?|\xa0', ' ', text)

    # Literal two-character backslash-n sequences; real newlines are collapsed by the
    # whitespace normalisation at the end.
    text = text.replace(r'\n', ' ')
    text = text.replace('"', ' ')
    text = text.replace("'", ' ')

    # Remove @mentions and URLs (http://, https:// or www.)
    text = re.sub(r"(?:@|https?://|www\.)\S+", "", text)

    # Remove emoji
    text = remove_emojis(text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Collapse all whitespace runs into single spaces and trim the ends
    text = " ".join(text.split())

    return text

## Preparing labeled data for training

In [None]:
import pandas as pd

# Load the hand-labelled tweets and keep the raw frame untouched so the cell can be
# re-run safely.
labeled_raw = pd.read_csv('tweets_labeled.csv', encoding='utf-8')

# Build the cleaned frame as one non-mutating chain. Assigning columns on a filtered
# slice and renaming in place (as before) can trigger SettingWithCopyWarning.
labeled = (
    labeled_raw[labeled_raw['label'].notnull()]      # keep labelled rows only
    .rename(columns={'text': 'sentence'})
    .assign(
        # newlines -> spaces, then the shared cleaning function, then lower-case
        sentence=lambda d: (
            d['sentence']
            .replace(r'\n', ' ', regex=True)
            .apply(preprocess)
            .str.lower()
        ),
        # cast through int first so the label is stored as an integer-formatted string
        label=lambda d: d['label'].astype(int).astype(str),
    )
)
print(labeled.shape)
labeled.head()

In [None]:
# Inspect the last rows of the prepared `labeled` frame
labeled.tail()

In [None]:
# Absolute number of rows per label
labeled['label'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the three splits are reproducible across kernel restarts
SPLIT_SEED = 42

df = labeled.copy(deep=True)

X = df[['sentence']]
y = df[['label']]

# Hold out 20% of the rows as the test set, stratified by label
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                stratify=y,
                                                test_size=0.20,
                                                random_state=SPLIT_SEED)

df_test = pd.concat([X_test, y_test], axis=1)
# Everything that is not test data; split once more into train and validation below
df_train_full = pd.concat([X_train, y_train], axis=1)

# Carve the validation set out of the non-test rows (25% of them, stratified) so that
# no row is in both df_train and df_validation. Sampling the validation rows from
# df_train without removing them leaks validation data into training.
df_train, df_validation = train_test_split(df_train_full,
                                           stratify=df_train_full['label'],
                                           test_size=0.25,
                                           random_state=SPLIT_SEED)
df_validation = df_validation.reset_index(drop=True)

# Label proportions should be close to identical across the three splits
print(df_train.label.value_counts(normalize=True))
print(df_validation.label.value_counts(normalize=True))
print(df_test.label.value_counts(normalize=True))

### Save train, validation and test sets for training a TensorFlow BERT model

In [None]:
# Write each split to its own tab-separated file (no index column)
for split_frame, tsv_name in (
    (df_train, 'train.tsv'),
    (df_validation, 'dev.tsv'),
    (df_test, 'test.tsv'),
):
    split_frame.to_csv(tsv_name, encoding='utf-8', index=False, sep='\t')

## Sentiment analysis with custom trained model

## Sentiment analysis with vaderSentiment module

In [None]:
# Sentiment analyser class from the vaderSentiment package
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Single analyser instance, used by print_sentiment_scores
analyser = SentimentIntensityAnalyzer()

In [None]:
def print_sentiment_scores(sentence):
    """Print `sentence` followed by the polarity scores the analyser gives it.

    The sentence is left-aligned and padded with dashes to 40 characters. Scoring is
    done with the module-level `analyser`.
    """
    scores = analyser.polarity_scores(sentence)
    line_template = "{:-<40} {}"
    print(line_template.format(sentence, str(scores)))

In [None]:
# The 'text' column is renamed to 'sentence' when the labelled data is prepared, so
# `labeled.text` no longer exists; score the sentence at position 15 instead.
print_sentiment_scores(labeled['sentence'].values[15])

## Sentiment140 dataset

In [11]:
import pandas as pd

# Column names assigned to the Sentiment140 training file when it is read
sentiment140_columns = [
    'polarity',
    'tweet_id',
    'date_of_tweet',
    'query',
    'user',
    'text',
]

# NOTE(review): the recorded output shows 1,048,576 rows, which equals the Excel
# worksheet row limit - confirm the CSV was not truncated by a spreadsheet export.
df = pd.read_csv(
    'sentiment_140/training_utf.csv',
    encoding='utf-8',
    names=sentiment140_columns,
)
print(df.shape)
df.head(10)

(1048576, 6)


Unnamed: 0,polarity,tweet_id,date_of_tweet,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?
