# DM - Timeseries analysis [TASK 4.1]

Library imports and initial settings.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
from pandarallel import pandarallel
import pandas as pd

from utils import fetch_preprocessed_dataset, store_preprocessed_dataset, build_grid_plot

import os
import sys
import logging as lg
import warnings

warnings.filterwarnings("ignore")

root = lg.getLogger()
root.setLevel(lg.INFO)

handler = lg.StreamHandler(sys.stdout)
handler.setLevel(lg.DEBUG)
formatter = lg.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
root.addHandler(handler)

nb_workers = int(os.cpu_count() / 2 + 1)

pandarallel.initialize(
    progress_bar=True,
    nb_workers=nb_workers,
)

INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Fetching users and tweets dataset saved.

In [2]:
# Load the users_num dataset from csv
dataset = fetch_preprocessed_dataset(step_name="outlier_detection")
users = dataset['users.pickle']
tweets = dataset['tweets.pickle']

In [3]:
users.info

<bound method DataFrame.info of                id                name lang    bot          created_at  \
0      2353593986     Lamonica Raborn   en   True 2019-02-22 18:00:42   
1      2358850842       Lourie Botton   en  False 2019-02-26 03:02:32   
2       137959629    Dadan Syarifudin   en   True 2015-04-30 07:09:56   
3       466124818      Carletto Focia   it   True 2017-01-18 02:49:18   
4      2571493866           MBK Ebook   en  False 2019-06-18 19:30:21   
...           ...                 ...  ...    ...                 ...   
11503  2911861962  Madrid Lae Maika .   en  False 2019-11-29 13:16:02   
11504  1378532629           Clau Sato   en  False 2018-04-27 03:01:58   
11505   126984069  ALMA LETICIA NUÑO    es  False 2015-03-29 17:01:24   
11506  2383025796     Minnie Guadagno   en   True 2019-03-13 02:44:13   
11507   933183398           Corvanna    en  False 2017-11-09 23:24:16   

       statuses_count  
0                  76  
1                  54  
2                  

In [4]:
tweets.info

<bound method DataFrame.info of              user_id  retweet_count  reply_count  favorite_count  \
0          327746321            0.0          0.0             0.0   
1          333722906            1.0          0.0             0.0   
2         2379755827            0.0          0.0             0.0   
3          466226882            0.0          0.0             0.0   
4         1355537995          114.0          0.0             0.0   
...              ...            ...          ...             ...   
13664687  2272611686            1.0          0.0             3.0   
13664689     8657052            0.0          0.0             1.0   
13664691   587491046            0.0          0.0             0.0   
13664694   127895572            0.0          0.0             1.0   
13664695   465421036            3.0          0.0             4.0   

          num_hashtags  num_urls  num_mentions          created_at  \
0                  0.0       0.0           0.0 2019-09-11 14:53:55   
1          

In [5]:
tweets.info

<bound method DataFrame.info of              user_id  retweet_count  reply_count  favorite_count  \
0          327746321            0.0          0.0             0.0   
1          333722906            1.0          0.0             0.0   
2         2379755827            0.0          0.0             0.0   
3          466226882            0.0          0.0             0.0   
4         1355537995          114.0          0.0             0.0   
...              ...            ...          ...             ...   
13664687  2272611686            1.0          0.0             3.0   
13664689     8657052            0.0          0.0             1.0   
13664691   587491046            0.0          0.0             0.0   
13664694   127895572            0.0          0.0             1.0   
13664695   465421036            3.0          0.0             4.0   

          num_hashtags  num_urls  num_mentions          created_at  \
0                  0.0       0.0           0.0 2019-09-11 14:53:55   
1          

## Preprocessing

Tweets filtering by date. Here are used tweets of year 2019.

In [6]:
INIT_2019_TIMESTAMP = pd.Timestamp('2019-01-01')
INIT_2020_TIMESTAMP = pd.Timestamp('2020-01-01')

mask_tweets_2019 = tweets['created_at'].parallel_map(lambda t: INIT_2019_TIMESTAMP <= t < INIT_2020_TIMESTAMP)
tweets2019 = tweets[mask_tweets_2019]

tweets2019

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2053356), Label(value='0 / 2053356…

Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
0,327746321,0.0,0.0,0.0,0.0,0.0,0.0,2019-09-11 14:53:55,"If man is a little lower than angels, then ang..."
2,2379755827,0.0,0.0,0.0,0.0,0.0,1.0,2019-05-02 13:34:31,@LOLatComedy awsome
3,466226882,0.0,0.0,0.0,0.0,0.0,0.0,2019-11-04 07:17:37,Stephen Hawkins: i buchi neri non esistono se ...
9,5812422,0.0,0.0,0.0,0.0,0.0,3.0,2019-01-20 16:13:04,“@darrenrovell: Denver businesses keep taking ...
10,2599346388,1.0,0.0,1.0,0.0,0.0,0.0,2019-07-28 09:38:49,Where df @Sassy_ileana at man. When she's awak...
...,...,...,...,...,...,...,...,...,...
13664685,466207205,0.0,0.0,0.0,0.0,0.0,0.0,2019-11-03 16:59:57,A volte mi piacerebbe sdoppiarmi per potermi a...
13664687,2272611686,1.0,0.0,3.0,0.0,0.0,0.0,2019-01-27 16:01:54,C'Ã¨ un momento in cui sentire ed ascoltare si...
13664689,8657052,0.0,0.0,1.0,0.0,0.0,1.0,2019-04-26 02:53:34,@_victoriamm twoot.
13664694,127895572,0.0,0.0,1.0,1.0,0.0,0.0,2019-03-07 19:56:55,Shooting crew of porn movies. #TheWorstJobToHave


Computation of indexes `acceptance_score` and `diffusion_score` to combine them in the `success_score`.

In [7]:
acceptance_score = tweets['retweet_count'] + tweets['reply_count'] + tweets['favorite_count']
diffusion_score = tweets['num_hashtags'] + tweets['num_mentions'] + tweets['num_urls']

success_score = acceptance_score / (diffusion_score + 0.1)

success_score = success_score.map(lambda s: -1 if s == 0 else s)

success_score

0           -1.000000
1            0.909091
2           -1.000000
3           -1.000000
4           54.285714
              ...    
13664687    40.000000
13664689     0.909091
13664691    -1.000000
13664694     0.909091
13664695    70.000000
Length: 10266779, dtype: float64

## Timeseries clustering

## Shapelets discovery