# Clean raw data

## Libraries

Data wrangling and visualization.

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
import requests
from credentials import hf_token


pd.options.display.max_columns = 50
%matplotlib inline
plt.style.use("bmh")
sns.set()

## Import data

Query the data from Elasticsearch. Alternatively you can load the *tweets.csv* file.

In [3]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

In [4]:
from elasticsearch_dsl import Search
from elasticsearch_dsl import connections

connections.create_connection(hosts=["localhost"])

s = Search(using = es, index = "tweets")
response = s.scan()

count = 0
records = []
for hit in response:
    records.append(hit.to_dict())
    #print(hit.to_dict())  # be careful, it will printout every hit in your index
    count += 1

print(count)

1002


## DataFrame modifications

In [5]:
records_df = pd.DataFrame.from_dict(records)

In [6]:
records_df.head(10)

Unnamed: 0,Tweet_Id,Datetime,Text,Likes,Retweets,Location
0,1598466952309211136,2022-12-01T23:59:54+00:00,"If y’all are throwing out any Balenciaga, espe...",2,0,"Los Angeles, CA"
1,1598466908008980480,2022-12-01T23:59:43+00:00,#75 live at 8pm EST on this page. So many thin...,0,0,Montreal
2,1598466875947880448,2022-12-01T23:59:35+00:00,@BaileyUnspoken @RealTalkPerson @KimKardashian...,0,0,Riverside Estates
3,1598466874303463426,2022-12-01T23:59:35+00:00,@kanyewest DEMNA! Lead Designer of Balenciaga!...,0,0,"Houston, TX"
4,1598466848319750144,2022-12-01T23:59:29+00:00,Kanye out here burying Balenciaga news tired 😴,0,0,"Manchester, England"
5,1598466846080262144,2022-12-01T23:59:28+00:00,Thank God I didn’t end up buying them balencia...,0,0,Sydney / Nairobi / Harare
6,1598466840992579584,2022-12-01T23:59:27+00:00,@mmpadellan With the drama at Balenciaga and D...,3,0,
7,1598466839293599745,2022-12-01T23:59:27+00:00,@kanyewest We love Demna. He's not Balenciaga ...,0,0,
8,1598466829797687298,2022-12-01T23:59:24+00:00,@vikare06 Looks like a balenciaga photo shoot,2,0,Hyde Park
9,1598466786244317184,2022-12-01T23:59:14+00:00,@DestinyVaughn @M4D3R0 @ksenijapavlovic @Mikha...,0,0,"New York, NY / Manahatta"


### Datetime & Location

In [7]:
records_df["Datetime"] = pd.to_datetime(records_df["Datetime"], format = "%Y-%m-%dT%H:%M:%S%z")

In [8]:
records_df["Location"] = records_df["Location"].replace('',np.nan,regex = True)

In [9]:
#records_df.head(10)

### Custom handles as columns

In [10]:
#custom_handles = "pedophilia", "#cancelBalenciaga", "#QAnon"

In [11]:
cancel_handle = ["Cancel Balenciaga", "Cancel", "#cancelBalenciaga"]
pedophilia_handle = ["pedophilia", "pedo", "#pedophilia"]
QAnon_handle = ["QAnon", "#QAnon"]

In [12]:
def identify_subject(tweet, refs):
    flag = 0 
    for ref in refs:
        if tweet.find(ref) != -1:
            flag = 1
    return flag

In [13]:
records_df["Cancel"] = records_df["Text"].apply(lambda x: identify_subject(x, cancel_handle)) 
records_df["Pedophilia"] = records_df["Text"].apply(lambda x: identify_subject(x, pedophilia_handle))
records_df["QAnon"] = records_df["Text"].apply(lambda x: identify_subject(x, QAnon_handle))

In [14]:
#records_df.head(10)

## Clean function

In [15]:
def clean_text(text):
    
    """
    Make text lowercase, remove numbers, whitespaces, punctuation, usernames, RT
    and httptco.
    """
    
    # make text lowercase
    text = text.lower()
    # removing numbers
    text = re.sub("\w*\d\w*", "", text)
    # if there's more than 1 whitespace, then make it just 1
    text = re.sub("\s+", " ", text)
    # if there's a new line, then make it a whitespace
    text = re.sub("\n", " ", text)
    # removing &amp;
    text = re.sub("(\&amp\;)", "", text)
    # removing any usernames
    text = re.sub("(@[^\s]+)", "", text)
    # remove `rt` for retweet
    text = re.sub("(rt)", "", text)
    # string.punctuation is a string of all punctuation marks
    # so this gets rid of all punctuation
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    # getting rid of `httptco`
    text = re.sub("(httptco)", "", text)

    return text

cleaning = lambda x: clean_text(x)

In [16]:
records_df["Clean_tweets"] = records_df["Text"].apply(cleaning)

In [17]:
#records_df.head()

In [18]:
#records_df.isnull().sum()

## Preprocessing function

In [19]:
from textblob import Word

In [20]:
#nltk.download("stopwords")
#nltk.download("wordnet")

In [21]:
#nltk.download('omw-1.4')

In [22]:
stop_words = stopwords.words("english")
custom_stopwords = ["balenciaga"]

In [23]:
def preprocess_tweets(tweet, custom_stopwords):
    processed_tweet = tweet
    processed_tweet.replace('[^\w\s]', '')
    processed_tweet = " ".join(word for word in processed_tweet.split() if word not in stop_words)
    processed_tweet = " ".join(word for word in processed_tweet.split() if word not in custom_stopwords)
    processed_tweet = " ".join(Word(word).lemmatize() for word in processed_tweet.split())
    return(processed_tweet)

In [24]:
records_df["Processed_tweets"] = records_df["Clean_tweets"].apply(lambda x: preprocess_tweets(x, custom_stopwords))
#records_df.head()

## RoBERTa model

In [25]:
#pip install transformers

In [26]:
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"

In [27]:
hf_token = hf_token

In [28]:
API_URL = "https://api-inference.huggingface.co/models/" + model
headers = {"Authorization": "Bearer %s" % (hf_token)}

In [29]:
def analysis(data):
    payload = dict(inputs=data, options=dict(wait_for_model=True))
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

In [30]:
tweets = []

In [31]:
for item in records_df["Processed_tweets"]:
    tweets.append(item)

In [32]:
#tweets

In [33]:
tweets_analysis = []
for tweet in tweets:
    try:
        sentiment_result = analysis(tweet)[0]
        top_sentiment = max(sentiment_result, key=lambda x: x['score']) # Get the sentiment with the higher score
        tweets_analysis.append({'Processed_tweets': tweet, 'Sentiment': top_sentiment['label']})
 
    except Exception as e:
        print(e)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [34]:
#tweets_analysis

In [35]:
# Load the data in a dataframe
pd.set_option('max_colwidth', None)
pd.set_option('display.width', 3000)
df = pd.DataFrame(tweets_analysis)

In [38]:
df.head()

Unnamed: 0,Processed_tweets,Sentiment
0,y’all throwing especially size l shis size shoe please bring dispose,negative
1,live est page many thing talk including climate change activism etc httpstcotjkfjnvykg,neutral
2,literally wear done something le serious could agree literal child pornography campaign would approved many people involves whole company,negative
3,demna lead designer get,neutral
4,kanye burying news tired 😴,negative


In [39]:
df = pd.merge(records_df, df, how = "left", on = ["Processed_tweets"])

In [40]:
df.head(10)

Unnamed: 0,Tweet_Id,Datetime,Text,Likes,Retweets,Location,Cancel,Pedophilia,QAnon,Clean_tweets,Processed_tweets,Sentiment
0,1598466952309211136,2022-12-01 23:59:54+00:00,"If y’all are throwing out any Balenciaga, especially size L shirts or a size 12 for shoes, please bring it to me so I can dispose of it",2,0,"Los Angeles, CA",0,0,0,if y’all are throwing out any balenciaga especially size l shis or a size for shoes please bring it to me so i can dispose of it,y’all throwing especially size l shis size shoe please bring dispose,negative
1,1598466908008980480,2022-12-01 23:59:43+00:00,"#75 live at 8pm EST on this page. So many things to talk about including Balenciaga, climate change activism etc. https://t.co/tJkfjNvyKg",0,0,Montreal,0,0,0,live at est on this page so many things to talk about including balenciaga climate change activism etc httpstcotjkfjnvykg,live est page many thing talk including climate change activism etc httpstcotjkfjnvykg,neutral
2,1598466875947880448,2022-12-01 23:59:35+00:00,"@BaileyUnspoken @RealTalkPerson @KimKardashian @RealTristan13 But she literally ONLY wears balenciaga. If Balenciaga had done something less serious I could agree with you, but this is literal CHILD PORNOGRAPHY. The campaign would have had to be approved by so many people at Balenciaga. This involves the whole company.",0,0,Riverside Estates,0,0,0,but she literally only wears balenciaga if balenciaga had done something less serious i could agree with you but this is literal child pornography the campaign would have had to be approved by so many people at balenciaga this involves the whole company,literally wear done something le serious could agree literal child pornography campaign would approved many people involves whole company,negative
3,1598466874303463426,2022-12-01 23:59:35+00:00,@kanyewest DEMNA! Lead Designer of Balenciaga! Get him out.,0,0,"Houston, TX",0,0,0,demna lead designer of balenciaga get him out,demna lead designer get,neutral
4,1598466848319750144,2022-12-01 23:59:29+00:00,Kanye out here burying Balenciaga news tired 😴,0,0,"Manchester, England",0,0,0,kanye out here burying balenciaga news tired 😴,kanye burying news tired 😴,negative
5,1598466846080262144,2022-12-01 23:59:28+00:00,Thank God I didn’t end up buying them balenciaga sneakers last year,0,0,Sydney / Nairobi / Harare,0,0,0,thank god i didn’t end up buying them balenciaga sneakers last year,thank god didn’t end buying sneaker last year,neutral
6,1598466840992579584,2022-12-01 23:59:27+00:00,@mmpadellan With the drama at Balenciaga and Democrats mad at Elon Musk removing pedo sights. It like the Democrats are trying to normalize pedophilia. I’m so disgusted right now!!,3,0,,0,1,0,with the drama at balenciaga and democrats mad at elon musk removing pedo sights it like the democrats are trying to normalize pedophilia i’m so disgusted right now,drama democrat mad elon musk removing pedo sight like democrat trying normalize pedophilia i’m disgusted right,negative
7,1598466839293599745,2022-12-01 23:59:27+00:00,@kanyewest We love Demna. He's not Balenciaga CEO.,0,0,,0,0,0,we love demna hes not balenciaga ceo,love demna he ceo,positive
8,1598466829797687298,2022-12-01 23:59:24+00:00,@vikare06 Looks like a balenciaga photo shoot,2,0,Hyde Park,0,0,0,looks like a balenciaga photo shoot,look like photo shoot,neutral
9,1598466786244317184,2022-12-01 23:59:14+00:00,@DestinyVaughn @M4D3R0 @ksenijapavlovic @MikhailaFuller @BALENCIAGA @KimKardashian The photos are above,0,0,"New York, NY / Manahatta",0,0,0,the photos are above,photo,neutral


In [41]:
df.columns

Index(['Tweet_Id', 'Datetime', 'Text', 'Likes', 'Retweets', 'Location', 'Cancel', 'Pedophilia', 'QAnon', 'Clean_tweets', 'Processed_tweets', 'Sentiment'], dtype='object')

In [51]:
df.isnull().sum()

Tweet_Id            0
Datetime            0
Text                0
Likes               0
Retweets            0
Location            0
Cancel              0
Pedophilia          0
QAnon               0
Clean_tweets        0
Processed_tweets    0
Sentiment           0
dtype: int64

In [46]:
def safe_value(field_val):
    return field_val if not pd.isna(field_val) else "Other"

df["Location"] = df["Location"].apply(safe_value)
df["Sentiment"] = df["Sentiment"].apply(safe_value)

In [47]:
df.to_csv("processed_tweets.csv", sep=",", index = False)

## Store processed tweets in Elasticsearch

In [48]:
from elasticsearch.helpers import bulk

In [49]:
bulk_data = []
for i,row in df.iterrows():
    bulk_data.append(
        {
            "_index": "tweets_sentiment",
            "_id": i,
            "_source": {
                "Tweet_Id": row["Tweet_Id"],
                "Datetime": row["Datetime"],
                "Text": row["Text"],
                "Likes": row["Likes"],
                "Retweets": row["Retweets"],
                "Location": row["Location"],
                "Cancel": row["Cancel"],
                "Pedophilia": row["Pedophilia"],
                "QAnon": row["QAnon"],
                "Clean_tweets": row["Clean_tweets"],
                "Processed_tweets": row["Processed_tweets"],
                "Sentiment": row["Sentiment"]
            }
        }
    )
bulk(es, bulk_data)

(1018, [])

In [50]:
es.indices.refresh(index="tweets_sentiment")
es.cat.count(index="tweets_sentiment", format="json")

[{'epoch': '1675116021', 'timestamp': '22:00:21', 'count': '1018'}]