# Sentiment analysis using Deep Learning on Twitter Data

__Concepts covered:__
* Twitter API
* Sentiment analysis
* Deep Learning for inference
* Huggingface models
* Transformers

## Install transformers library

In [65]:
!pip install -q transformers

## Connect to Twitter

In [83]:
import json
import pandas as pd
import numpy as np

In [84]:
# JSON file holding the Twitter API credentials
credentials_filename = 'twitter_credentials.json'

# Open the file through the variable above so that changing the filename
# in one place is enough.
with open(credentials_filename) as f:
    credentials = json.load(f)

# Print only the key names, not the secret values
print(credentials.keys())

dict_keys(['CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_TOKEN', 'ACCESS_SECRET'])


In [85]:
# Import Tweepy
import tweepy

# Authenticate: build the handler from the consumer key/secret pair, attach the
# access token/secret pair, then wrap the handler in an API client.
auth = tweepy.OAuthHandler(
    credentials['CONSUMER_KEY'],
    credentials['CONSUMER_SECRET'],
)
auth.set_access_token(
    credentials['ACCESS_TOKEN'],
    credentials['ACCESS_SECRET'],
)
api = tweepy.API(auth)

## Get tweets

In [86]:
def tweets2dataframe(tweets):
    """Convert an iterable of tweet objects into a pandas DataFrame.

    Parameters
    ----------
    tweets : iterable
        Tweet objects exposing ``id``, ``created_at``, ``user.screen_name`` and
        either ``full_text`` or ``text``. The iterable is consumed exactly once,
        so a one-shot iterator is fine.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``TweetId``, ``created_at`` (reduced
        to the date only), ``User`` and ``Tweet``. An empty input yields an
        empty frame with the same four columns.
    """
    columns = ['TweetId', 'created_at', 'User', 'Tweet']

    data = [
        [
            tweet.id,
            tweet.created_at,
            tweet.user.screen_name,
            # Fall back to ``text`` for tweet objects that carry no ``full_text``
            tweet.full_text if hasattr(tweet, 'full_text') else tweet.text,
        ]
        for tweet in tweets
    ]

    df = pd.DataFrame(data, columns=columns)

    # Express created_at as date only. Going through pd.to_datetime gives the
    # column a datetime dtype even when there are no rows, so the .dt accessor
    # does not raise on an empty search result.
    df['created_at'] = pd.to_datetime(df['created_at']).dt.date

    return df
    

In [87]:
# Query text and the maximum number of tweets to collect
search, limit = "greta thunberg", 500

#Search tweets: English-language results, requesting the extended tweet
#representation (the one that provides `full_text`)
cursor = tweepy.Cursor(api.search_tweets, q=search, lang="en", tweet_mode='extended')
tweets = cursor.items(limit)

#Make dataframe
df = tweets2dataframe(tweets)

In [88]:
# Display the collected tweets (the notebook abbreviates long frames with "...")
df

Unnamed: 0,TweetId,created_at,User,Tweet
0,1589276006661390342,2022-11-06,mwmw7mwmw,RT @PeterSweden7: Greta Thunberg says that she...
1,1589275985379495936,2022-11-06,miv_joe,RT @PeterSweden7: Greta Thunberg says that she...
2,1589275972280340480,2022-11-06,BellProtection,RT @PeterSweden7: Greta Thunberg says that she...
3,1589275965167128576,2022-11-06,mbarminski,RT @PeterSweden7: Greta Thunberg says that she...
4,1589275945483264001,2022-11-06,TomBeen,RT @PeterSweden7: Greta Thunberg says that she...
...,...,...,...,...
495,1589264804870782976,2022-11-06,bambambioo,RT @PeterSweden7: Greta Thunberg says that she...
496,1589264771018555392,2022-11-06,Wh1tneyyy,RT @willfulchaos: 77 degrees in november.. GRE...
497,1589264756808245254,2022-11-06,Awildcole,RT @willfulchaos: 77 degrees in november.. GRE...
498,1589264752802672642,2022-11-06,ArshooterMd,RT @PeterSweden7: Greta Thunberg says that she...


In [89]:
# Show 5 randomly chosen rows; the selection changes on every run
df.sample(5)

Unnamed: 0,TweetId,created_at,User,Tweet
34,1589275252269674505,2022-11-06,Reaganaddicted,RT @PeterSweden7: Greta Thunberg says that she...
220,1589270985672622080,2022-11-06,gpse7en,RT @PeterSweden7: Greta Thunberg says that she...
98,1589274067299098625,2022-11-06,MarkRosser20,RT @PeterSweden7: Greta Thunberg says that she...
6,1589275891708088320,2022-11-06,Lola90244421,RT @PeterSweden7: Greta Thunberg says that she...
168,1589272536407498752,2022-11-06,DavidCo82529586,RT @PeterSweden7: Greta Thunberg says that she...


In [90]:
# Number of collected tweets for each calendar date
df['created_at'].value_counts()

2022-11-06    500
Name: created_at, dtype: int64

In [91]:
#Tweets per user (the five most active accounts)
df.User.value_counts().head()

jeannetix_         2
DJPM6              2
mwmw7mwmw          1
feu_serpent        1
rakeshg45180200    1
Name: User, dtype: int64

## Load Deep Learning model for Sentiment Analysis

In [92]:
from transformers import (
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
    pipeline,
)

# Checkpoint used for both the tokenizer and the classifier
model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# from_pt=True: the TensorFlow model class is initialised from the checkpoint's
# PyTorch weights (this is what triggers the conversion messages shown below).
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Bundle tokenizer + model into a single callable that maps text to label/score
sentiment_pipeline = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


For model_name = "cardiffnlp/twitter-roberta-base-sentiment", the labels are as follows:
* 0 -> Negative
* 1 -> Neutral
* 2 -> Positive

In [93]:
# Pick one row index at random, then show the index, the tweet text and the
# pipeline's prediction for that text.
i = np.random.randint(len(df))
i, df['Tweet'][i], sentiment_pipeline(df['Tweet'][i])

(58,
 'RT @PeterSweden7: Greta Thunberg says that she wants to overthrow the Western capitalist system.\n\nI always told you it was Climate Communis…',
 [{'label': 'LABEL_1', 'score': 0.4945134222507477}])

## Calculate sentiment of tweets
... and save as a column in the dataframe

In [94]:
# Run the pipeline once per tweet and keep the raw result in a new column.
# This cell is slow: every tweet is a separate pass through the deep learning model.
df['sentiment'] = df['Tweet'].apply(sentiment_pipeline)

In [95]:
# Show 5 random rows, now including the raw pipeline output in 'sentiment'
df.sample(5)

Unnamed: 0,TweetId,created_at,User,Tweet,sentiment
366,1589268065283629056,2022-11-06,PressD,RT @PeterSweden7: Greta Thunberg says that she...,"[{'label': 'LABEL_1', 'score': 0.4945134222507..."
460,1589265569546895360,2022-11-06,Rachel_Claveau,RT @PeterSweden7: Greta Thunberg says that she...,"[{'label': 'LABEL_1', 'score': 0.4945134222507..."
156,1589272712111063041,2022-11-06,AmericanThinks,RT @PeterSweden7: Greta Thunberg says that she...,"[{'label': 'LABEL_1', 'score': 0.4945134222507..."
133,1589273321686073344,2022-11-06,itsmmaha,RT @willfulchaos: 77 degrees in november.. GRE...,"[{'label': 'LABEL_1', 'score': 0.6030967831611..."
36,1589275213845479430,2022-11-06,Eleaccount,RT @PeterSweden7: Greta Thunberg says that she...,"[{'label': 'LABEL_1', 'score': 0.4945134222507..."


Tidy up the sentiment analysis output: split each result into a separate label column and score column.

In [96]:
# Each 'sentiment' entry is a list whose first element is a dict with the keys
# 'label' and 'score'; pull that first element out once and reuse it.
top_predictions = [result[0] for result in df['sentiment']]
df['sentiment_polarity'] = [prediction['label'] for prediction in top_predictions]
df['sentiment_score'] = [prediction['score'] for prediction in top_predictions]

# The raw pipeline output is no longer needed once it has been split up
df = df.drop(columns=['sentiment'])
df.sample(5)

Unnamed: 0,TweetId,created_at,User,Tweet,sentiment_polarity,sentiment_score
155,1589272782206291968,2022-11-06,rodriigc296,RT @willfulchaos: 77 degrees in november.. GRE...,LABEL_1,0.603097
294,1589269353161453569,2022-11-06,danielcapitaooo,RT @willfulchaos: 77 degrees in november.. GRE...,LABEL_1,0.603097
458,1589265635271651328,2022-11-06,Hanauta_Br00k,RT @PeterSweden7: Greta Thunberg says that she...,LABEL_1,0.494513
3,1589275965167128576,2022-11-06,mbarminski,RT @PeterSweden7: Greta Thunberg says that she...,LABEL_1,0.494513
27,1589275460474900481,2022-11-06,TheBillyBatson,RT @PeterSweden7: Greta Thunberg says that she...,LABEL_1,0.494513


In [97]:
# Translate the model's generic label ids into readable polarity names
label_names = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive',
}
df['sentiment_polarity'] = df['sentiment_polarity'].replace(label_names)
df.sample(5)

Unnamed: 0,TweetId,created_at,User,Tweet,sentiment_polarity,sentiment_score
171,1589272441884327937,2022-11-06,wakanakaflame,RT @willfulchaos: 77 degrees in november.. GRE...,neutral,0.603097
443,1589266031507570688,2022-11-06,tcastandrew,RT @PeterSweden7: Greta Thunberg says that she...,neutral,0.494513
367,1589268032236711937,2022-11-06,RomadXYZ,RT @PeterSweden7: Greta Thunberg says that she...,neutral,0.494513
170,1589272479041998849,2022-11-06,HollyTa00354591,RT @PeterSweden7: Greta Thunberg says that she...,neutral,0.494513
274,1589269802249760770,2022-11-06,hugh_mcfadyen,RT @PeterSweden7: Greta Thunberg says that she...,neutral,0.494513


In [98]:
# Number of tweets in each polarity class
df['sentiment_polarity'].value_counts()

neutral     474
negative     18
positive      8
Name: sentiment_polarity, dtype: int64