# DM - Timeseries analysis [TASK 4.1]

Library imports and initial settings.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
from pandarallel import pandarallel
import pandas as pd

from utils import fetch_preprocessed_dataset, store_preprocessed_dataset, build_grid_plot

import os
import sys
import logging as lg
import warnings

# Silence every warning for the whole session so library notices do not flood
# the cell outputs.
# NOTE(review): this hides all warnings, including ones that may point at real
# problems - consider narrowing the filter.
warnings.filterwarnings("ignore")

# Root logger: let INFO records and above through.
root = lg.getLogger()
root.setLevel(lg.INFO)

# Attach exactly one stdout handler to the root logger. An already-attached
# stdout handler is reused, so re-running this cell does not stack duplicate
# handlers (which would print every log record several times).
handler = next(
    (
        h for h in root.handlers
        if isinstance(h, lg.StreamHandler) and getattr(h, "stream", None) is sys.stdout
    ),
    None,
)
if handler is None:
    handler = lg.StreamHandler(sys.stdout)
    root.addHandler(handler)
handler.setLevel(lg.DEBUG)
formatter = lg.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)

# Half of the logical CPUs plus one. os.cpu_count() returns None when the
# count cannot be determined, so fall back to a single CPU in that case.
nb_workers = (os.cpu_count() or 1) // 2 + 1

# Initialise pandarallel with the worker count computed above and enable its
# progress bar for the parallel_* calls.
pandarallel.initialize(nb_workers=nb_workers, progress_bar=True)

INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Fetch the previously saved users and tweets datasets.

In [2]:
# Fetch what was stored for the "outlier_detection" preprocessing step and pick
# the two entries used in this notebook (keyed by their pickle file names).
dataset = fetch_preprocessed_dataset(step_name="outlier_detection")
users, tweets = dataset['users.pickle'], dataset['tweets.pickle']

In [3]:
# Schema check of the loaded users: column dtypes, non-null counts, memory usage.
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11508 entries, 0 to 11507
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              11508 non-null  int64         
 1   name            11508 non-null  string        
 2   lang            11508 non-null  string        
 3   bot             11508 non-null  bool          
 4   created_at      11508 non-null  datetime64[ns]
 5   statuses_count  11508 non-null  int64         
dtypes: bool(1), datetime64[ns](1), int64(2), string(2)
memory usage: 550.7 KB


In [4]:
# Schema check of the loaded tweets: index range, column dtypes, memory usage.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10266779 entries, 0 to 13664695
Data columns (total 9 columns):
 #   Column          Dtype         
---  ------          -----         
 0   user_id         object        
 1   retweet_count   float64       
 2   reply_count     float64       
 3   favorite_count  float64       
 4   num_hashtags    float64       
 5   num_urls        float64       
 6   num_mentions    float64       
 7   created_at      datetime64[ns]
 8   text            string        
dtypes: datetime64[ns](1), float64(6), object(1), string(1)
memory usage: 783.3+ MB


## Preprocessing

Remove tweets whose `user_id` does not match the `id` of any user in `users`.

In [5]:
# Ids of the known users, rendered as strings so they can be compared with the
# raw (object-typed) `user_id` values of the tweets.
user_ids = {str(user_id) for user_id in users['id'].tolist()}

# Keep only the tweets written by a known user, then store `user_id` as int64.
# The ids are compared through their string form with a vectorised `isin`, so
# the cell gives the same result when it is re-run after `user_id` has already
# been converted to int64 (a plain `value in user_ids` test would then never
# match and would silently empty the frame).
has_known_user = tweets['user_id'].astype(str).isin(user_ids)
tweets = tweets[has_known_user].astype({'user_id': 'int64'})

Filter tweets by date: only tweets created during 2019 are kept.

In [6]:
INIT_2019_TIMESTAMP = pd.Timestamp('2019-01-01')
INIT_2020_TIMESTAMP = pd.Timestamp('2020-01-01')

# Half-open interval [2019-01-01, 2020-01-01). The comparison is vectorised on
# the datetime column, so no per-row Python call (and no worker processes) is
# needed; missing timestamps compare as False and are dropped, as before.
tweets_created_at = tweets['created_at']
mask_tweets_2019 = (tweets_created_at >= INIT_2019_TIMESTAMP) & (tweets_created_at < INIT_2020_TIMESTAMP)

# Chronological order matters: the per-user score lists built later rely on the
# row order. A stable sort keeps tweets sharing the same timestamp in a
# deterministic relative order across runs.
tweets = tweets[mask_tweets_2019].sort_values(by=['created_at'], kind='mergesort')

tweets

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2010486), Label(value='0 / 2010486…

Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
8384098,2240858066,0.0,0.0,0.0,0.0,0.0,0.0,2019-01-01 00:21:18,tava me sentindo super mal esses dias
6210422,2240858066,0.0,0.0,0.0,0.0,0.0,0.0,2019-01-01 00:23:50,Ãs vezes tudo o que a gente precisa pra se se...
6852918,2240858066,4.0,0.0,0.0,0.0,0.0,1.0,2019-01-01 00:24:30,RT @myh3ro: TO FAZENDO TODO MUNDO ASSISTIR SHE...
192212,494302461,1.0,0.0,0.0,0.0,0.0,0.0,2019-01-01 00:47:32,"Ã cosÃ¬, mente e cuore sono anarchici nella l..."
7159403,494302461,8.0,0.0,7.0,0.0,0.0,0.0,2019-01-01 00:48:03,"Le cose piÃ¹ belle non sono perfette, sono spe..."
...,...,...,...,...,...,...,...,...,...
7460655,2275740517,1.0,0.0,1.0,0.0,0.0,0.0,2019-11-15 22:02:30,Anche la mia mamma quando ero piccolo mi sgrid...
12234385,2273349732,0.0,0.0,0.0,0.0,0.0,0.0,2019-11-15 22:03:55,LE DIFFICOLTA' SONO COME LA CARTAIGIENICA... N...
9099401,2275248397,0.0,0.0,0.0,0.0,0.0,0.0,2019-11-15 22:05:08,Tra 20 anni sarai deluso per ciÃ² che non hai ...
9440580,2274254095,0.0,0.0,0.0,0.0,0.0,0.0,2019-11-15 22:10:43,"Vuoi sapere chi sei? Non chiedertelo, agisci: ..."


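Compute the `acceptance_score` and `diffusion_score` indexes and combine them into the `success_score`, defined as `acceptance_score / (diffusion_score + 0.1)`. A `success_score` equal to 0 is replaced with -1.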

In [7]:
# Reactions received by the tweet: retweets + replies + favourites.
acceptance_score = tweets['retweet_count'] + tweets['reply_count'] + tweets['favorite_count']
# Hashtags, mentions and URLs carried by the tweet.
diffusion_score = tweets['num_hashtags'] + tweets['num_mentions'] + tweets['num_urls']

# The 0.1 offset keeps the denominator away from zero for tweets that carry no
# hashtag, mention or URL.
success_score = acceptance_score / (diffusion_score + 0.1)

# A score of exactly 0 is replaced by the sentinel -1. `mask` performs the
# substitution in a single vectorised pass (instead of one Python call per
# tweet) and always yields a float64 column; missing scores stay missing.
tweets["success_score"] = success_score.mask(success_score == 0, -1)

tweets["success_score"].describe()

count    4.474637e+06
mean     1.330052e+02
std      4.083036e+03
min     -1.000000e+00
25%     -1.000000e+00
50%     -1.000000e+00
75%      9.090909e-01
max      1.823150e+06
Name: success_score, dtype: float64

Drop the columns that are no longer needed.

In [8]:
# From here on only the user id and bot label are used from `users`, and only
# the author id and the score from `tweets`.
users = users.loc[:, ["id", "bot"]]
tweets = tweets.loc[:, ["user_id", "success_score"]]

Add the success-score time series column: for each user, a new column holds the list of that user's `success_score` values, ordered by the tweets' `created_at` date.

In [13]:
# Collect, for every user, the list of success scores of their tweets.
# `tweets` was sorted by `created_at` in the date-filtering cell and groupby
# keeps the row order inside each group, so each list is in chronological order.
tweets_grouped_by_users = tweets.groupby('user_id')['success_score'].apply(list)

# Drop any `success_score` column left by a previous run of this cell, so a
# re-run replaces the column instead of producing `success_score_x` /
# `success_score_y` duplicates.
users = users.drop(columns=['success_score'], errors='ignore').merge(
    tweets_grouped_by_users.to_frame('success_score'),
    how='inner',  # users without any tweet in the selected period are dropped
    left_on='id',
    right_index=True,
)

The preprocessed `users` dataframe.

In [15]:
# Schema check after preprocessing: remaining rows, columns and dtypes of `users`.
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6787 entries, 0 to 11506
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             6787 non-null   int64 
 1   bot            6787 non-null   bool  
 2   success_score  6787 non-null   object
dtypes: bool(1), int64(1), object(1)
memory usage: 165.7+ KB


In [14]:
# Display the preprocessed `users` frame (pandas renders a truncated view of it).
users

Unnamed: 0,id,bot,success_score
0,2353593986,True,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,2358850842,False,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,466124818,True,"[-1.0, 70.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
5,2199062688,False,"[72.72727272727272, -1.0, 60.0, 10.0, 14499.99..."
9,2357425536,True,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...
11496,39220893,False,"[1.8181818181818181, 0.9090909090909091, 17.09..."
11497,2364069194,True,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
11501,2364683192,True,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
11504,1378532629,False,"[-1.0, 20.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


## Timeseries clustering

## Shapelets discovery