# Data Cleaning

The goal of this notebook is to preprocess the tweets so that they are better suited to sentiment analysis. To this end, we aim to strip the "#" character from hashtags while keeping the word that follows it, because hashtagged words can themselves affect the sentiment.

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the raw tweet export (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="data/tesla-tweets.csv")

In [9]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Date & Time,Profile Picture Link,Twitter ID,Tweet Text,Tweet Link
0,"April 10, 2022 at 07:44PM",http://pbs.twimg.com/profile_images/15120745...,@Jessica1988kk,"RT @invest_answers: Crypto news, #Bitcoin Whal...",https://twitter.com/Jessica1988kk/status/15131...
1,"April 10, 2022 at 07:45PM",http://pbs.twimg.com/profile_images/87878355348773...,@JotaGe2014,#Tesla tiene récord de autos vendidos. Es impr...,https://twitter.com/JotaGe2014/status/15131737...
2,"April 10, 2022 at 07:45PM",http://pbs.twimg.com/profile_images/936422368...,@MmeCallas,RT @CottonCodes: 🐒 #love in my #MariaCallas I ...,https://twitter.com/MmeCallas/status/151317374...
3,"April 10, 2022 at 07:45PM",http://pbs.twimg.com/profile_images/146366591...,@BotSecx,RT @CottonCodes: 🐒 #love in my #MariaCallas I ...,https://twitter.com/BotSecx/status/15131737626...
4,"April 10, 2022 at 07:45PM",http://pbs.twimg.com/profile_images/111675859...,@agseh,RT @RupiReportero_: 🙆‍♂️🚘 Al que le robaron la...,https://twitter.com/agseh/status/1513173864829...


In [11]:
# Preview the last five rows of the raw frame.
df.tail(n=5)

Unnamed: 0,Date & Time,Profile Picture Link,Twitter ID,Tweet Text,Tweet Link
151995,"November 12, 2022 at 02:20PM",http://pbs.twimg.com/profile_images/153320879...,@FolgendenFolgen,"RT @unheilbargut: Erinnert ihr euch, als @elon...",https://twitter.com/FolgendenFolgen/status/159...
151996,"November 12, 2022 at 03:10PM",http://abs.twimg.com/sticky/default_profile_i...,@kerntdennis,"RT @unheilbargut: Erinnert ihr euch, als @elon...",https://twitter.com/kerntdennis/status/1591395...
151997,"November 12, 2022 at 03:10PM",http://pbs.twimg.com/profile_images/133838713...,@kadiff651,"RT @unheilbargut: Erinnert ihr euch, als @elon...",https://twitter.com/kadiff651/status/159139556...
151998,"November 12, 2022 at 03:10PM",http://pbs.twimg.com/profile_images/121896452...,@TeslaradarB,Nice! skatebambi 🇸🇪 just spotted a 2021 Tesla ...,https://twitter.com/TeslaradarB/status/1591395...
151999,"November 12, 2022 at 03:11PM",http://pbs.twimg.com/profile_images/157116418...,@JandTContent,Crash and burn EVERYWHERE... \n\nAnother one b...,https://twitter.com/JandTContent/status/159139...


In [12]:
# Keep only the tweet text; this is the input to the sentiment model.
tweets = df["Tweet Text"]

# Preview the first five tweets.
tweets.head(n=5)

0    RT @invest_answers: Crypto news, #Bitcoin Whal...
1    #Tesla tiene récord de autos vendidos. Es impr...
2    RT @CottonCodes: 🐒 #love in my #MariaCallas I ...
3    RT @CottonCodes: 🐒 #love in my #MariaCallas I ...
4    RT @RupiReportero_: 🙆‍♂️🚘 Al que le robaron la...
Name: Tweet Text, dtype: object

## Sample test of Sentiment Analysis model

Using the Hugging Face Inference API, following [this tutorial](https://huggingface.co/blog/sentiment-analysis-twitter).

In [25]:
# Imports and configuration for the Hugging Face Inference API client.

import os

import requests

# Sentiment model that is queried through the hosted Inference API.
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Read the access token from the environment instead of hardcoding it in the
# notebook: anything committed here is visible to everyone with access to the
# file or its history. A token that was ever stored in this cell should be
# treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set: export a Hugging Face access token "
        "in the environment before running this cell."
    )

API_URL = "https://api-inference.huggingface.co/models/" + model
headers = {"Authorization": "Bearer %s" % (hf_token)}

def analysis(data, timeout=120):
    """Send `data` to the configured Inference API model and return its JSON reply.

    Args:
        data: Value sent as the ``inputs`` field of the request payload
            (the notebook passes a single tweet string).
        timeout: Seconds to wait for the HTTP request before giving up.
            The default is generous because ``wait_for_model=True`` makes the
            request block while the model is loading. Pass ``None`` to wait
            indefinitely, which was the previous behaviour.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the API answers with an error status.
            The response body is included in the message so the cause (rate
            limit, bad token, ...) is visible.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    payload = dict(inputs=data, options=dict(wait_for_model=True))
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if not response.ok:
        # Fail loudly here instead of handing an error payload to the caller,
        # which would otherwise break later with an unrelated-looking error.
        raise requests.exceptions.HTTPError(
            "Inference API request failed with status %s: %s"
            % (response.status_code, response.text),
            response=response,
        )
    return response.json()


In [34]:

# Score every tweet and keep the label with the highest score.
# The run is best-effort: a tweet that cannot be scored is recorded in
# `tweets_failed` (position, error description) and the loop moves on.
tweets_analysis = []
tweets_failed = []

for position, tweet in enumerate(tweets):
    try:
        api_response = analysis(tweet)
        # A successful reply is indexed with [0] below; anything else
        # (e.g. an error payload) is reported as such instead of surfacing
        # as an opaque KeyError/IndexError.
        if not isinstance(api_response, list) or not api_response:
            raise ValueError("unexpected API response: %r" % (api_response,))
        sentiment_result = api_response[0]
        top_sentiment = max(sentiment_result, key=lambda x: x['score'])  # label with the highest score
        tweets_analysis.append({'tweet': tweet, 'sentiment': top_sentiment['label']})
    except Exception as e:
        tweets_failed.append((position, "%s: %s" % (type(e).__name__, e)))

# One summary line instead of one line per failure keeps the output readable.
print("%d tweets scored, %d failed (details in tweets_failed)"
      % (len(tweets_analysis), len(tweets_failed)))

In [35]:
# `sentiment_result` is the variable assigned inside the scoring loop, so this
# shows the raw per-label scores from the most recent successful assignment there.
print(sentiment_result)

[{'label': 'neutral', 'score': 0.8032652139663696}, {'label': 'positive', 'score': 0.14004221558570862}, {'label': 'negative', 'score': 0.05669258534908295}]


In [47]:

# Show the full, untruncated text of the tweet with index label 1.
print(tweets.loc[1])

#Tesla tiene récord de autos vendidos. Es impresionante, pero no deja de sorprenderme que vendiendo 6 veces menos unidades que Toyota, por ejemplo, Tesla valga 3 o 4 veces más. https://t.co/u7Jm8oS54t vía @Inoreader


In [50]:
# Sanity check: score an English translation of the Spanish tweet at label 1
# and print the model's scores for it.
translated_tweet = "Tesla has a record for cars sold. It's impressive, but it never ceases to amaze me that by selling 6 times less units than Toyota, for example, Tesla is worth 3 or 4 times more. https://t.co/u7Jm8oS54t via @Inoreader"

# Take an explicit copy of the slice so that adding a row below can never
# write through to `tweets`.
tweets2 = tweets[0:1].copy()
tweets2[1] = translated_tweet
print(tweets2)

print(analysis(tweets2[1])[0])

0    RT @invest_answers: Crypto news, #Bitcoin Whal...
1    Tesla has a record for cars sold. It's impress...
Name: Tweet Text, dtype: object
[{'label': 'positive', 'score': 0.8890513181686401}, {'label': 'neutral', 'score': 0.09520529210567474}, {'label': 'negative', 'score': 0.015743352472782135}]
