In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project code under
# src/ are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
import json
from src.coordinator import Coordinator
from src.utils.preprocessing import TextPreprocessor
import pandas as pd

In [7]:
# Project coordinator object; the cells below read its `data_raw` and
# `data_interim` path attributes.
coord = Coordinator()

In [8]:
# Read every per-user JSON Lines file from the raw-data directory and stack
# them into a single frame.
#
# The per-user frames are collected in a list and concatenated once at the
# end: `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0,
# and calling it inside a loop re-copies the accumulated frame on every
# iteration.
user_frames = []
for file_path in coord.data_raw.iterdir():
    if file_path.name.endswith('jsonl'):
        user = file_path.stem  # file name without extension, printed as the account label
        user_df = pd.read_json(file_path, lines=True)
        user_frames.append(user_df)
        print(user, user_df.shape[0])

# `pd.concat` raises ValueError on an empty list; fall back to an empty frame
# so a directory without any matching file still yields `TOTAL: 0`.
dataset = pd.concat(user_frames, ignore_index=True) if user_frames else pd.DataFrame()
print('TOTAL:', dataset.shape[0])

britneyspears 5116
arianagrande 14018
rihanna 9623
justinbieber 14000
katyperry 8880
theellenshow 14000
selenagomez 4276
cnnbrk 14000
twitter 10730
kimkardashian 14000
taylorswift13 420
ladygaga 7615
realdonaldtrump 13999
barackobama 7059
narendramodi 14000
jtimberlake 3320
shakira 5408
youtube 14000
jimmyfallon 11632
cristiano 3251
TOTAL: 189347


In [9]:
# Peek at the first three rows of the combined frame.
dataset.head(3)

Unnamed: 0,tweet_id,user,time_epoch,tweet,n_likes,n_retweets,n_replies,n_emojies,hashtags,mentions
0,1247643096596111362,britneyspears,1586296268,Enough said 😜🧼🧼🌸🌸🌸🌸 and thank you to all of th...,86426,25499,960,5,"[#worldhealthday, #thankshealthheroes]",[]
1,1243706495793074182,britneyspears,1585357710,Oops!…how did 20 years go by so fast 😅😅🙄 ?! I ...,122851,16256,2212,8,[],[]
2,1243668424674459648,britneyspears,1585348633,Happy Birthday @MariahCarey !! You are one of ...,61079,7634,1345,1,[],[mariahcarey]


In [10]:
# Column dtypes, non-null counts and memory footprint of the combined frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189347 entries, 0 to 189346
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   tweet_id    189347 non-null  int64 
 1   user        189347 non-null  object
 2   time_epoch  189347 non-null  int64 
 3   tweet       189347 non-null  object
 4   n_likes     189347 non-null  int64 
 5   n_retweets  189347 non-null  int64 
 6   n_replies   189347 non-null  int64 
 7   n_emojies   189347 non-null  int64 
 8   hashtags    189347 non-null  object
 9   mentions    189347 non-null  object
dtypes: int64(6), object(4)
memory usage: 14.4+ MB


## 1. Remove all links and pictures in a tweet

In [11]:
# Text preprocessor instance; the cells below use its `url_rgx`, `pic_rgx` and
# `emoji_rgx` patterns and its `clean_links` / `clean_emojis` methods.
preprocessor = TextPreprocessor()

In [12]:
def count_urls(text):
    """Return how many `preprocessor.url_rgx` matches one tweet contains."""
    return len(preprocessor.url_rgx.findall(text))


# Distribution of the number of URL matches per tweet.
dataset['tweet'].apply(count_urls).value_counts()

0    110729
1     77069
2      1435
3        98
4        13
5         3
Name: tweet, dtype: int64

In [13]:
def count_pics(text):
    """Return how many `preprocessor.pic_rgx` matches one tweet contains."""
    return len(preprocessor.pic_rgx.findall(text))


# Distribution of the number of picture-link matches per tweet.
dataset['tweet'].apply(count_pics).value_counts()

0    132941
1     56401
2         5
Name: tweet, dtype: int64

**Drop links from tweet**

In [14]:
# Run every tweet through `preprocessor.clean_links`, then re-count matches of
# both patterns as a sanity check: each value_counts should contain only 0.
dataset['tweet'] = dataset['tweet'].apply(preprocessor.clean_links)

pics_left = dataset['tweet'].apply(lambda text: len(preprocessor.pic_rgx.findall(text)))
urls_left = dataset['tweet'].apply(lambda text: len(preprocessor.url_rgx.findall(text)))
pics_left.value_counts(), urls_left.value_counts()

(0    189347
 Name: tweet, dtype: int64,
 0    189347
 Name: tweet, dtype: int64)

## 2. Remove emojis in a tweet

In [22]:
# Run every tweet through `preprocessor.clean_emojis`, then count remaining
# `emoji_rgx` matches per tweet (expected: only 0).
# NOTE(review): this cell's execution count (22) jumps from 14, and its recorded
# output totals 117162 rows rather than the 189347 loaded above. Rows appear to
# have been dropped by cells that are no longer in the notebook; confirm with
# Restart Kernel -> Run All.
# NOTE(review): the later `dataset.head(3)` output still shows 🧼 in the first
# tweet, so `emoji_rgx` may not cover every emoji; verify the pattern.
dataset['tweet'] = dataset['tweet'].apply(preprocessor.clean_emojis)

emojis_left = dataset['tweet'].apply(lambda text: len(preprocessor.emoji_rgx.findall(text)))
emojis_left.value_counts()

0    117162
Name: tweet, dtype: int64

## 3. Drop tweets with fewer than 50 non-whitespace characters

**Number of tweets with fewer than 50 non-whitespace characters (first cell) and with at least 50 (second cell, after filtering)**

In [23]:
def is_too_short(text):
    """True when the tweet has fewer than 50 characters, whitespace excluded."""
    return len(''.join(text.split())) < 50


# Number of tweets that the next cell will drop.
len(dataset[dataset['tweet'].apply(is_too_short)])

287

In [24]:
def is_long_enough(text):
    """True when the tweet has at least 50 characters, whitespace excluded."""
    return len(''.join(text.split())) >= 50


# Keep only the sufficiently long tweets and report how many remain.
# The original row index is kept (no reset).
dataset = dataset.loc[dataset['tweet'].apply(is_long_enough)]
len(dataset)

116875

In [25]:
# Number of remaining tweets per account after the length filter.
dataset.groupby('user').size()

user
arianagrande        4308
barackobama         6357
britneyspears       3159
cnnbrk             13814
cristiano           2016
jimmyfallon         8110
jtimberlake         2195
justinbieber        5084
katyperry           5585
kimkardashian       5676
ladygaga            5461
narendramodi       12636
realdonaldtrump    11616
rihanna             4890
selenagomez         2281
shakira             4387
taylorswift13        218
theellenshow       10141
twitter             1840
youtube             7101
dtype: int64

In [26]:
# First three rows after link/emoji cleaning and the length filter.
dataset.head(3)

Unnamed: 0,tweet_id,user,time_epoch,tweet,n_likes,n_retweets,n_replies,n_emojies,hashtags,mentions
0,1247643096596111362,britneyspears,1586296268,Enough said 🧼🧼 and thank you to all of the hea...,86426,25499,960,5,"[#worldhealthday, #thankshealthheroes]",[]
1,1243706495793074182,britneyspears,1585357710,Oops!…how did 20 years go by so fast ?! I can...,122851,16256,2212,8,[],[]
2,1243668424674459648,britneyspears,1585348633,Happy Birthday @MariahCarey !! You are one of ...,61079,7634,1345,1,[],[mariahcarey]


In [31]:
# Persist the cleaned tweets as JSON Lines (one record per line) in the
# interim-data directory.
# NOTE(review): the execution count (31) is higher than that of the cells
# placed after this one, i.e. the notebook was run out of order; re-run top to
# bottom before relying on the saved file.
interim_path = coord.data_interim / 'dataset_v1.jsonl'
dataset.to_json(interim_path, orient='records', lines=True)

## 4. Preprocessing demonstration on a few examples

In [27]:
# Fresh preprocessor instance for the demonstration below.
# NOTE(review): a TextPreprocessor is already created in section 1; this second
# instantiation looks redundant unless it is meant to pick up autoreloaded
# changes to the class. Confirm and drop if unnecessary.
preprocessor = TextPreprocessor()

In [28]:
# Take the first ten cleaned tweets as a NumPy array of demo inputs.
examples = dataset['tweet'].head(10).to_numpy()
examples

array(['Enough said\xa0🧼🧼\xa0and thank you to all of the healthcare workers tirelessly working to keep us safe during this time !!!! #WorldHealthDay #ThanksHealthHeroes',
       'Oops!…how did 20 years go by so fast\xa0\xa0?! I can’t believe it. I remember that red suit was so freaking hot … but the dance was fun\xa0\xa0and it made the shoot fly by !!! You have all shown so much support for this song & I thank you for it … sending love to you all\xa0️️️\xa0!!',
       'Happy Birthday @MariahCarey !! You are one of the main reasons I started singing ... your Butterfly 🦋 album never gets old even after 20+ years … it’s simply a classic and I will be listening to it today as I work out in the gym !!! Have a wonderful birthday\xa0🦋\xa0!!! God bless.',
       'Listen to the new @y2k2y and @_AlexanderLewis remix of #Toxic, released today in celebration of the opening of @BritneyTheZone!   \xa0 @RCARecords',
       'Coming soon !!!!  I can’t wait to see all of the pics you guys take at @britn

In [29]:
# Print each raw example followed by the result of calling the preprocessor on
# it, with a blank line separating consecutive examples.
for example in examples:
    print(example)
    print(preprocessor(example), end='\n\n')

Enough said 🧼🧼 and thank you to all of the healthcare workers tirelessly working to keep us safe during this time !!!! #WorldHealthDay #ThanksHealthHeroes
['healthcare', 'worker', 'tirelessly', 'work', 'safe', 'time', 'worldhealthday', 'thankshealthheroes']

Oops!…how did 20 years go by so fast  ?! I can’t believe it. I remember that red suit was so freaking hot … but the dance was fun  and it made the shoot fly by !!! You have all shown so much support for this song & I thank you for it … sending love to you all ️️️ !!
['oops', 'year', 'fast', 'remember', 'red', 'suit', 'freak', 'hot', 'dance', 'fun', 'shoot', 'fly', 'support', 'song', 'send', 'love']

Happy Birthday @MariahCarey !! You are one of the main reasons I started singing ... your Butterfly 🦋 album never gets old even after 20+ years … it’s simply a classic and I will be listening to it today as I work out in the gym !!! Have a wonderful birthday 🦋 !!! God bless.
['happy', 'birthday', 'mariahcarey', 'main', 'reason', 'start'