# Sentiment Analysis for Text Messages

1. Import required libraries

In [55]:
import nltk.corpus
#nltk.download('stopwords')
import re
import pandas as pd
import plotly.express as px
import transformers

from nltk.corpus import stopwords
from tqdm import tqdm
from transformers import pipeline

2. Define the helper functions used in the analysis

In [56]:
def clean_string(text: str) -> str:
    """Normalise a raw message before sentiment analysis.

    Steps, in order:

    1. Delete @mentions, ``scheme://`` URLs, a leading retweet marker
       ("rt" in any letter case, as a whole word) and every remaining
       character that is not an ASCII letter, digit, space or tab.
    2. Delete every token that contains a digit (e.g. "5pm", "2023").
    3. Lower-case the result.

    Whitespace is not collapsed, so removed tokens can leave runs of
    spaces behind.

    Args:
        text: The raw message.

    Returns:
        The cleaned, lower-cased message.
    """
    # Alternatives are tried left to right at each position, so an @mention
    # or URL is removed as a unit rather than being stripped of punctuation
    # only.
    # NOTE(review): the trailing "http.+?" is lazy, so it removes only "http"
    # plus the single character after it; kept unchanged -- confirm whether
    # "http\S*" was intended.
    text = re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^[Rr][Tt]\b|http.+?",
        "",
        text,
    )
    # Drop any token containing a digit. After the pass above only ASCII
    # letters, digits, spaces and tabs remain, so \w here is effectively
    # [0-9A-Za-z].
    text = re.sub(r"\w*\d\w*", "", text)
    return text.lower()

_ENGLISH_STOPWORDS = None  # lazily-built frozenset of NLTK's English stop words


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, every token found in ``stop_words`` is
    dropped, and the remaining tokens are re-joined with single spaces.
    Matching is exact (case-sensitive).

    Args:
        text: The message to filter.
        stop_words: Optional container of words to drop. Defaults to NLTK's
            English stop-word list, which is loaded once and reused so that
            applying this function to many rows does not re-read the corpus
            or scan a list for every token.

    Returns:
        The filtered message as a single space-separated string.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stop_words)

def sentiment_label(text):
    """Run the module-level ``model`` on ``text`` and return its raw output.

    The result is passed through unchanged; get_label() later reads
    ``[0]['label']`` from it, so it is presumably a list of prediction dicts.
    """
    return model(text)

def get_label(list):
    """Return the ``'label'`` value of the first prediction in ``list``.

    The parameter name shadows the ``list`` builtin; it is kept because it is
    part of the function's existing signature.
    """
    return list[0]['label']

3. Instantiate model to be used for the sentiment analysis

In [57]:
# Build the sentiment-analysis pipeline once; sentiment_label() calls this
# module-level object for every message.
model = pipeline(
    task="sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)

4. Read in the sample data and select the columns to keep

In [58]:
# Load only the columns this notebook uses from the SMS dataset.
df = pd.read_csv(
    "clean_nus_sms.csv",
    usecols=['id', 'Message', 'length', 'Date'],
)
# Force every message to str so the text helpers can call string methods on it.
# NOTE(review): astype(str) turns missing values into the literal "nan" --
# confirm the CSV contains no empty messages.
df['Message'] = df['Message'].astype(str)

5. Clean the text

In [59]:
# Register tqdm's progress_apply on pandas objects, labelled for this step,
# then store the cleaned text in a new column.
tqdm.pandas(desc="Clean string")
df['cleanedMessage'] = df['Message'].progress_apply(clean_string)

Clean string: 100%|████████████████████| 48598/48598 [00:00<00:00, 50178.68it/s]


6. Remove stopwords 

In [60]:
# Re-label the progress bar and overwrite the cleaned column with the
# stop-word-free version of each message.
tqdm.pandas(desc="Remove stopwords")
df['cleanedMessage'] = df['cleanedMessage'].progress_apply(remove_stopwords)

Remove stopwords: 100%|████████████████| 48598/48598 [00:02<00:00, 20569.17it/s]


7. Run the model on the cleaned text to get sentiment scores

In [None]:
# Run the model on each cleaned message, one row at a time, and keep the raw
# prediction in its own column.
tqdm.pandas(desc="Get sentiment")
df['sentiment'] = df['cleanedMessage'].progress_apply(sentiment_label)

Get sentiment:  12%|██▍                  | 5766/48598 [11:23<1:21:42,  8.74it/s]

8. Extract the sentiment label from each model prediction

In [None]:
# Pull the label out of each stored prediction into a plain column.
tqdm.pandas(desc="Get label value")
df['sentiment_label'] = df['sentiment'].progress_apply(get_label)

9. Display histogram of assigned values

In [None]:
# Plot one bar per sentiment label, coloured by label.
fig = px.histogram(
    data_frame=df,
    x="sentiment_label",
    color='sentiment_label',
    histfunc='sum',
    title="Text message sentiment",
)
fig.show()

10. Further considerations: compare the results of different pre-trained models, or train your own sentiment model.