In [66]:
# Load libraries
import pandas as pd
import time
import json

# Load Twitter API client
from searchtweets import ResultStream, gen_rule_payload, load_credentials, collect_results

In [67]:
# Define arguments to be able to connect to Twitter API.
# Credentials come from the "search_30day_tweets_api" section of the local
# .twitter_keys.yaml file, so no keys are hard-coded in the notebook.
# NOTE(review): env_overwrite=False presumably keeps environment variables from
# overriding the values in the file — confirm against the searchtweets docs.
premium_search_args = load_credentials(filename=".twitter_keys.yaml",
                 yaml_key="search_30day_tweets_api",
                 env_overwrite=False)


Grabbing bearer token from OAUTH


In [None]:
# Extract last 30 days tweets using Twitter API
# (guarded by a flag so re-running the notebook does not call the API again)
write_to_disk = False
if write_to_disk:
    all_tweets = []
    # One request window per day: from day N to day N + 1 of May 2020, N = 1..29
    for day in range(1, 30):
        from_date = f"{day:02d}"
        to_date = f"{day + 1:02d}"

        rule = gen_rule_payload(
            "UK coronavirus lang:en",
            from_date=f"2020-05-{from_date}",
            to_date=f"2020-05-{to_date}",
            results_per_call=100
        )

        tweets = collect_results(
            rule,
            max_results=500,
            result_stream_args=premium_search_args,
        )

        all_tweets += tweets
        # Running total of collected tweets
        print(len(all_tweets))
        # Pause between windows to avoid hitting API limits
        time.sleep(10)

    # Write extracted tweets to a JSON file
    with open('data.json', 'w', encoding='utf-8') as f:
        json.dump(all_tweets, f, indent=4, ensure_ascii=False)

In [68]:
# Read JSON file that contains tweet data.
# data.json is written as UTF-8 with ensure_ascii=False, so it must be read
# back with the same encoding rather than the platform default (which is not
# UTF-8 on every system and would fail on or garble emoji/non-ASCII text).
with open('data.json', 'r', encoding='utf-8') as f:
    tweets = json.load(f)

# Check number of tweets
len(tweets)

14500

In [84]:
# Preview the text of the first three tweets. A plain loop is used because
# print() returns None: a list comprehension here would build and display a
# throwaway [None, None, None] as the cell result.
for tweet in tweets[0:3]:
    print(tweet.get('text'), '\n\n')

RT @BBCWorld: Dogs are being trained to find out if they can detect people with coronavirus 🐶

https://t.co/QXmVzTsBoM https://t.co/TTndNp6… 


RT @sallywilts: @BBCNewsnight Now I understand Sage are going to advise the government that masks are useless. Yet again. No wonder we’re t… 


RT @LGAcomms: 💷 Council finances
😷 PPE
🏘️ Supporting the shielded
🧠 Mental health
👩🏼‍⚕️ Public health
🧒🏻 Children &amp; young people
🙋🏽 Volunte… 




[None, None, None]

In [89]:
# Check all keys associated with individual tweet objects.
# Keys are accumulated in a set so duplicates are discarded as they arrive,
# instead of rebuilding and de-duplicating a list on every iteration.
tweet_keys = set()
for t in tweets:
    tweet_keys.update(t.keys())

# Sorted list of every distinct key seen across all tweets
tweet_keys = sorted(tweet_keys)
tweet_keys

['contributors',
 'coordinates',
 'created_at',
 'display_text_range',
 'entities',
 'extended_entities',
 'extended_tweet',
 'favorite_count',
 'favorited',
 'filter_level',
 'geo',
 'id',
 'id_str',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'is_quote_status',
 'lang',
 'matching_rules',
 'place',
 'possibly_sensitive',
 'quote_count',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'quoted_status_permalink',
 'reply_count',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'source',
 'text',
 'truncated',
 'user']

In [29]:
# Check whether all tweets are unique tweets.
# `unique_tweets` keeps the first occurrence of each tweet id (input order is
# preserved) and `ids` lists those ids in the same order. Membership is tested
# against a set, so the pass is linear rather than quadratic in tweet count.
unique_tweets = []
ids = []
seen_ids = set()
for t in tweets:
    tweet_id = t.get('id')
    if tweet_id not in seen_ids:
        seen_ids.add(tweet_id)
        ids.append(tweet_id)
        unique_tweets.append(t)

len(unique_tweets)

14500

In [63]:
# Extract text content from individual tweets
# (guarded by a flag so tweets.csv is only rewritten on request)
write_text_tweets = False

if write_text_tweets:

    text_tweets = []
    for t in unique_tweets:
        # For a retweet, take id / timestamp / text from the retweeted status;
        # otherwise take them from the tweet itself.
        source = t.get('retweeted_status') or t
        # Prefer extended_tweet.full_text when present, else fall back to text.
        extended = source.get('extended_tweet')
        text_tweets.append({
            'id': source.get('id'),
            'created_at': source.get('created_at'),
            'text': extended.get('full_text') if extended else source.get('text'),
        })

    len(text_tweets)

    # One row per tweet with columns id, created_at, text
    df = pd.DataFrame(text_tweets)
    df.to_csv('tweets.csv', encoding='utf-8', index=False)
            

14500

In [16]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Create a single analyser instance; print_sentiment_scores reads it as a
# module-level name.
analyser = SentimentIntensityAnalyzer()

In [17]:
def print_sentiment_scores(sentence):
    """Print a sentence next to its polarity scores.

    The sentence is left-aligned and padded with '-' to a width of 40
    characters (longer sentences are printed unpadded), followed by a space
    and the value returned by the module-level ``analyser.polarity_scores``.

    Parameters
    ----------
    sentence : str
        Text to score.
    """
    scores = analyser.polarity_scores(sentence)
    print(f"{sentence:-<40} {scores}")

In [18]:
# Score an example sentence longer than the 40-character pad width, so it is
# printed without '-' padding.
print_sentiment_scores("I just got a call from my boss - does he realise it's Saturday?")

I just got a call from my boss - does he realise it's Saturday? {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [27]:
# Score a short sentence; it is padded with '-' up to 40 characters.
print_sentiment_scores("I hate.")

I hate.--------------------------------- {'neg': 0.787, 'neu': 0.213, 'pos': 0.0, 'compound': -0.5719}


In [32]:
import pandas as pd

# Load the labelled training tweets (presumably the Sentiment140 dataset,
# going by the folder name). Column names are supplied explicitly through
# `names`.
df = pd.read_csv(
    'sentiment_140/training_utf.csv',
    names=['polarity', 'tweet_id', 'date_of_tweet', 'query', 'user', 'text'],
    encoding='utf-8',
)
# (rows, columns) of the loaded frame
print(df.shape)
df.head(50)

(1048576, 6)


Unnamed: 0,polarity,tweet_id,date_of_tweet,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [None]:
import re, string

def preprocess(ReviewText):
    """Replace HTML remnants in a Series of strings with single spaces.

    The patterns below are applied one after another through
    ``Series.str.replace``; every match is replaced by one space.

    Parameters
    ----------
    ReviewText : pandas.Series
        Series of strings to clean (used through its ``.str`` accessor).

    Returns
    -------
    pandas.Series
        A new Series with the substitutions applied.
    """
    patterns = (
        '(<br/>)',
        # A whole anchor element. Non-greedy, so two links in the same string
        # are removed separately instead of swallowing the text between them.
        r'(<a\b[^>]*>).*?(</a>)',
        # The trailing ';' is optional so a complete entity such as "&amp;"
        # is consumed without leaving a stray semicolon behind.
        '(&amp;?)',
        '(&gt;?)',
        '(&lt;?)',
        # Non-breaking space
        '(\xa0)',
    )
    for pattern in patterns:
        # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as
        # a literal string by default, under which these parenthesised
        # patterns would never match.
        ReviewText = ReviewText.str.replace(pattern, ' ', regex=True)
    return ReviewText
# NOTE(review): no visible cell creates a 'full_text' column on df (the frames
# built in earlier cells use 'text') — confirm which frame this cell expects.
df['tweets'] = preprocess(df['full_text'])

def remove_whitespace(x):
    """Collapse every run of whitespace in ``x`` into a single space.

    Leading and trailing whitespace is dropped as a side effect of splitting.
    """
    words = x.split()
    return " ".join(words)

def remove_punctuation(x):
    """Return ``x`` with every character in ``string.punctuation`` deleted."""
    # Third argument of maketrans lists the characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    return x.translate(deletion_table)

# Punctuation stripping is left disabled; only whitespace is normalised.
# df['tweets'] = df['tweets'].apply(remove_punctuation)
df['tweets'] = df['tweets'].apply(remove_whitespace)

# NOTE(review): this rebinds `tweets`, which an earlier cell uses for the list
# of raw tweet dicts loaded from data.json.
tweets = df.tweets.values.tolist()

In [None]:
import string, re

def preprocess(ReviewText):
    """Replace HTML remnants in a Series of strings with single spaces.

    Note: this redefines the ``preprocess`` from an earlier cell; in this
    variant the anchor-tag pattern (and a digit pattern) are left disabled.

    Parameters
    ----------
    ReviewText : pandas.Series
        Series of strings to clean (used through its ``.str`` accessor).

    Returns
    -------
    pandas.Series
        A new Series with the substitutions applied.
    """
    patterns = (
        '(<br/>)',
        # '(<a).(>).(</a>)',  # disabled
        # The trailing ';' is optional so a complete entity such as "&amp;"
        # is consumed without leaving a stray semicolon behind.
        '(&amp;?)',
        '(&gt;?)',
        '(&lt;?)',
        # Non-breaking space
        '(\xa0)',
        # '\d+',  # disabled
    )
    for pattern in patterns:
        # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as
        # a literal string by default, under which these parenthesised
        # patterns would never match.
        ReviewText = ReviewText.str.replace(pattern, ' ', regex=True)
    return ReviewText
# NOTE(review): no visible cell creates a 'Description' column on df (the
# frames built in earlier cells use 'text') — confirm which frame this expects.
df['Description'] = preprocess(df['Description'])

def remove_url(x):
    """Return ``x`` with every ``scheme://host[/path...]`` URL removed.

    The pattern matches a scheme, ``://``, a host made of one or more
    dot-separated labels, and any number of ``/``-prefixed path segments
    (each running up to the next ``/`` or whitespace). The ``*`` quantifiers
    are required: without them only the first dot-label and one character per
    path segment are matched, which leaves URL tails behind in the text.

    Parameters
    ----------
    x : str
        Text that may contain URLs.

    Returns
    -------
    str
        The text with matched URLs replaced by the empty string.
    """
    return re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', x)

def remove_whitespace(x):
    """Normalise whitespace: split ``x`` on any whitespace run and rejoin the
    pieces with single spaces (leading/trailing whitespace disappears).
    """
    return " ".join(piece for piece in x.split())

def remove_punctuation(x):
    """Drop every character of ``x`` that appears in ``string.punctuation``."""
    return "".join(ch for ch in x if ch not in string.punctuation)

# Strip URLs first, then collapse whitespace, which also closes the gaps left
# where URLs were removed. Punctuation stripping is left disabled.
df['Description'] = df['Description'].apply(remove_url)
# df['Description'] = df['Description'].apply(remove_punctuation)
df['Description'] = df['Description'].apply(remove_whitespace)

In [None]:
# Cleaned description strings, one per row of df. The name must be `texts`
# because the final comprehension reads `texts`; binding it as `exts` raised
# NameError there.
texts = df['Description'].values.tolist()

# NOTE(review): `b` is not defined in any visible cell — presumably a sequence
# of dicts carrying a 'label' key; confirm where it comes from.
labels = [x['label'] for x in b]

# Map numeric label codes to names; codes missing from the mapping pass
# through unchanged.
dic = {2:'positive', 3:'negative', 4:'neutral'}

new_labels = [dic.get(n, n) for n in labels]

new_labels[:20]

# Pair each text with its label; zip stops at the shorter of the two lists.
c = [{'text': x, 'label': y} for x, y in zip(texts, new_labels)]