In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Twitter Sentiment Analysis

The goal of this project is to use ML models to predict the sentiment of tweets.

Data Source: https://www.kaggle.com/datasets/kazanova/sentiment140/data

Sentiment140: Go, A., Bhayani, R. and Huang, L., 2009. Twitter sentiment classification using distant supervision. CS224N Project Report, Stanford, 1(2009), p.12.

## Examining Initial Dataset

We are not limited to this dataset for this project. Additional data might be needed.

According to the authors of this dataset, the tweets were collected by scraping tweets that contain emoticons and labeling each tweet according to its emoticons. The emoticons were removed from the text after the data was labeled.

In [4]:
# Load the Sentiment140 CSV into a DataFrame.
# `names=` supplies the six column labels explicitly instead of reading them from the file.
# NOTE(review): the next line is a commented-out load of a different CSV with another
# column layout — delete it if that dataset is no longer needed.
# df = pd.read_csv('datasets/twitter_training.csv',names=['ID', 'entity', 'sentiment', 'tweet'])
df = pd.read_csv('datasets/sentiment140.csv', names=['sentiment', 'id', 'date', 'query', 'user', 'text'])

In [5]:
# Summarize the columns: dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   id         1600000 non-null  int64 
 2   date       1600000 non-null  object
 3   query      1600000 non-null  object
 4   user       1600000 non-null  object
 5   text       1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [11]:
# Preview the first 10 rows of the raw data
df.head(10)

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [41]:
# Count how often each tweet id occurs; a count above 1 means the id appears in more than one row
df['id'].value_counts() 

2190457769    2
1972193428    2
1989776729    2
1989776908    2
1564543229    2
             ..
2197311196    1
2197311146    1
2197310899    1
2197310477    1
2193602129    1
Name: id, Length: 1598315, dtype: int64

## Pre-trained model

We use a pre-trained model as a reference point for the models that follow, and to demonstrate how easy to use and how accurate pre-trained models are for well-researched tasks.

HF link: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

In [1]:
# Use a pipeline as a high-level helper
# NOTE(review): this import sits in the middle of the notebook; consider moving it
# to the import cell at the top so a fresh "Run All" shows every dependency up front.
from transformers import pipeline

# Build a text-classification pipeline around the pre-trained checkpoint named below.
# The cell output shows the model files being downloaded when this cell was run.
sentiment_analysis = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [40]:
# Example usage: for a single string the pipeline returned a one-element list
# holding a dict with the predicted 'label' and its 'score'
sentiment_analysis("I love you")

[{'label': 'positive', 'score': 0.8594695329666138}]

In [30]:
# Work on a copy of the first 30 rows so the full DataFrame is left untouched
df_example = df.head(30).copy()
# Classify each tweet text individually; every cell of the new column stores the
# pipeline's raw return value (a list containing a label/score dict)
df_example['predicted_sentiment'] = df_example['text'].apply(sentiment_analysis)

In [31]:
# Show the dataset label next to the tweet text and the model's prediction
df_example[['sentiment', 'text', 'predicted_sentiment']]

Unnamed: 0,sentiment,text,predicted_sentiment
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[{'label': 'negative', 'score': 0.745751440525..."
1,0,is upset that he can't update his Facebook by ...,"[{'label': 'negative', 'score': 0.900523424148..."
2,0,@Kenichan I dived many times for the ball. Man...,"[{'label': 'neutral', 'score': 0.6559894084930..."
3,0,my whole body feels itchy and like its on fire,"[{'label': 'negative', 'score': 0.897633254528..."
4,0,"@nationwideclass no, it's not behaving at all....","[{'label': 'negative', 'score': 0.931198000907..."
5,0,@Kwesidei not the whole crew,"[{'label': 'neutral', 'score': 0.6053249835968..."
6,0,Need a hug,"[{'label': 'neutral', 'score': 0.6586580872535..."
7,0,@LOLTrish hey long time no see! Yes.. Rains a...,"[{'label': 'positive', 'score': 0.922187566757..."
8,0,@Tatiana_K nope they didn't have it,"[{'label': 'neutral', 'score': 0.5189148783683..."
9,0,@twittera que me muera ?,"[{'label': 'neutral', 'score': 0.8834701180458..."


## Possible pre-processing steps

In [32]:
# One straightforward step is to remove the @username mentions.
# A username could strongly influence the model, but it is not relevant to the actual sentiment of the tweet.

def remove_username(text):
    """Return ``text`` without the whitespace-separated tokens that begin with '@'.

    The surviving tokens are re-joined with single spaces, so leading/trailing
    whitespace and runs of whitespace in the input are collapsed as a side effect.
    Tokens that merely contain '@' (for example an e-mail address) are kept.
    """
    kept_tokens = []
    for token in text.split():
        if token.startswith('@'):
            continue  # mention-style token: drop it
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip the @mentions from the sample tweets, classify the cleaned text again,
# then display the dataset label, the cleaned text and the new prediction side by side
df_example['text'] = df_example['text'].apply(remove_username)
df_example['predicted_sentiment'] = df_example['text'].apply(sentiment_analysis)
df_example[['sentiment', 'text', 'predicted_sentiment']]

Unnamed: 0,sentiment,text,predicted_sentiment
0,0,"http://twitpic.com/2y1zl - Awww, that's a bumm...","[{'label': 'negative', 'score': 0.771735787391..."
1,0,is upset that he can't update his Facebook by ...,"[{'label': 'negative', 'score': 0.905853092670..."
2,0,I dived many times for the ball. Managed to sa...,"[{'label': 'neutral', 'score': 0.6504483222961..."
3,0,my whole body feels itchy and like its on fire,"[{'label': 'negative', 'score': 0.880418777465..."
4,0,"no, it's not behaving at all. i'm mad. why am ...","[{'label': 'negative', 'score': 0.912394821643..."
5,0,not the whole crew,"[{'label': 'neutral', 'score': 0.5887722969055..."
6,0,Need a hug,"[{'label': 'neutral', 'score': 0.4998500347137..."
7,0,"hey long time no see! Yes.. Rains a bit ,only ...","[{'label': 'positive', 'score': 0.906500339508..."
8,0,nope they didn't have it,"[{'label': 'neutral', 'score': 0.4911064505577..."
9,0,que me muera ?,"[{'label': 'neutral', 'score': 0.8560315370559..."


In [37]:
# Depending on the model used, it could be useful to remove stop words as well. The nltk library has a list of stop words
import nltk
from nltk.corpus import stopwords

# Cache for the NLTK stop-word set so the corpus is read once, not once per tweet.
_STOP_WORDS_CACHE = {}


def _load_english_stop_words():
    """Return NLTK's English stop words as a frozenset, downloading the corpus if missing."""
    if 'english' not in _STOP_WORDS_CACHE:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not installed locally yet: fetch it once and retry.
            nltk.download('stopwords')
            words = stopwords.words('english')
        _STOP_WORDS_CACHE['english'] = frozenset(words)
    return _STOP_WORDS_CACHE['english']


def remove_stopwords(text, stop_words=None):
    """Remove stop words from ``text``.

    The text is split on whitespace, every token whose lower-cased form is a
    stop word is dropped, and the remaining tokens are re-joined with single
    spaces (so runs of whitespace are collapsed).

    Parameters
    ----------
    text : str
        The text to filter.
    stop_words : collection of str, optional
        Lower-case words to remove. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached for later calls.

    Returns
    -------
    str
        ``text`` without the stop-word tokens.

    Notes
    -----
    Matching is case-insensitive because the stop-word list is lower-case;
    a case-sensitive comparison would leave tokens such as "The" or "I" in
    place. Punctuation attached to a token (e.g. "it,") is not stripped here.
    """
    if stop_words is None:
        stop_words = _load_english_stop_words()
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

In [42]:
# The text needs to be tokenized and lemmatized before it can be used to train most models