## Data Mining

### RUN Only with COLAB

The cells below set up the notebook for running on the Google Colab platform.

In [4]:
# SECURITY: a GitHub personal access token was hardcoded in this clone URL.
# That token must be revoked; never commit credentials to a notebook.
# Ask for the token at run time instead (e.g. `token = getpass.getpass()`) and
# let IPython interpolate it into the shell command with the `{token}` syntax.
#!git clone https://{token}@github.com/federicosilvestri/data-mining.git

In [5]:
#%cd data-mining

In [6]:
import json
import math
import re
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import defaultdict
from scipy.stats.stats import pearsonr

import sys
import logging as lg

# Configure the root logger so that INFO-and-above records are printed to stdout.
root = lg.getLogger()
root.setLevel(lg.INFO)

# Re-running this cell must not stack a second stdout handler (every record
# would then be printed twice), so handlers installed by a previous run of the
# cell are removed first. They are recognised by the marker attribute set below.
for stale_handler in [h for h in root.handlers if getattr(h, "_notebook_stdout", False)]:
    root.removeHandler(stale_handler)

handler = lg.StreamHandler(sys.stdout)
handler._notebook_stdout = True  # marker used by the clean-up loop above
# The handler itself lets DEBUG through; the root level (INFO) does the filtering.
handler.setLevel(lg.DEBUG)
formatter = lg.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
root.addHandler(handler)

  from scipy.stats.stats import pearsonr


## Dataset

Fetching the dataset

In [7]:
# `fetch_dataset` is provided by the project-local `utils` module.
from utils import fetch_dataset

# The result is indexed by CSV file name further down in this notebook
# ('users.csv' and 'tweets.csv').
dataset = fetch_dataset()

2022-10-29 10:09:51,232 - root - INFO - Pandas reading dataset tweets.csv...
2022-10-29 10:10:16,284 - root - INFO - Pandas reading dataset users.csv...


### Users

In [8]:
# Select the users table from the fetched dataset.
users = dataset['users.csv']

# Print the column dtypes and non-null counts of the raw table.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11508 entries, 0 to 11507
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              11508 non-null  int64  
 1   name            11507 non-null  object 
 2   lang            11508 non-null  object 
 3   bot             11508 non-null  int64  
 4   created_at      11508 non-null  object 
 5   statuses_count  11109 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 539.6+ KB


#### Data cleaning


In [9]:
# Accepted range for `created_at` values: `check_date` and `filter_datetime`
# treat any date outside [min_date, max_date] as invalid.
# NOTE(review): presumably the platform launch date and the crawl date — confirm.
min_date = pd.Timestamp('2006-03-21')
max_date = pd.Timestamp('2022-09-28')

def clean_null_attributes(df, column_validators, ratio=0.5):
    """Drop, in place, the rows of `df` that have too many invalid attributes.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is modified in place. Index labels are assumed to be
        unique, because rows are dropped by label.
    column_validators : list of (str, callable or None)
        One `(column name, validator)` pair per inspected column. A validator
        receives a single non-missing cell value and returns True when that
        value is INVALID. `None` means that only missing values are counted.
    ratio : float, default 0.5
        A row is dropped when its number of invalid attributes is strictly
        greater than `int(len(column_validators) * ratio)`.

    Returns
    -------
    list
        Index labels of the dropped rows.
    """
    n_null_items = int(len(column_validators) * ratio)

    # Invalid attributes per row, accumulated column by column: iterating the
    # rows with `iterrows` does not scale to tables with millions of rows.
    invalid_counts = np.zeros(len(df), dtype=int)
    for head, validator in column_validators:
        column = df[head]
        # `value == np.nan` is always False, so missing values must be detected
        # with `isna`. The copy keeps the mask writable.
        invalid = column.isna().to_numpy(copy=True)
        if validator is not None:
            # Validators only ever see values that are actually present.
            present = ~invalid
            invalid[present] = [bool(validator(value)) for value in column[present]]
        invalid_counts += invalid

    rows = df.index[invalid_counts > n_null_items].tolist()
    # Drop by label: `rows` holds index labels, not positions.
    df.drop(index=rows, inplace=True)
    return rows

# Validators used with `clean_null_attributes`: each one returns True when the
# value it receives is INVALID.

def check_int(label):
    """Return True when `label` is not written as a non-negative integer.

    The text form of the value must be made of digits only, optionally
    followed by an all-zero decimal part (e.g. '12' or '12.0').
    """
    return not bool(re.search(r'^(\d)+(\.0+)?$', str(label)))


def check_positive_int(label):
    """Return True when `label` is not an integer or is negative."""
    return check_int(label) or float(label) < 0


def check_date(label):
    """Return True when `label` is not a date inside [min_date, max_date].

    Values that cannot be parsed, and missing values (which parse to NaT),
    are reported as invalid instead of raising or passing silently.
    """
    try:
        timestamp = pd.Timestamp(label)
    except (ValueError, TypeError):
        return True
    if pd.isna(timestamp):
        return True
    return timestamp < min_date or timestamp > max_date

In [10]:
# Validators return True for INVALID values (see `check_int`); `None` means
# that only missing values are counted for that column.
column_validators = [
    ('id', check_int),
    ('name', None),
    ('lang', None),
    # `bot` must be 0 or 1. The comparison goes through `str` because the column
    # holds integers; the previous lambda flagged the valid values instead of
    # the invalid ones and compared integers with strings.
    ('bot', lambda label: str(label) not in ('0', '1')),
    ('statuses_count', check_int),
    ('created_at', check_date),
]

deleted_rows = clean_null_attributes(users, column_validators)
print(f'deleted rows: {len(deleted_rows)}')

deleted rows: 0


In [11]:
# Missing names become empty strings. The result is assigned back to the column:
# an in-place `replace` on `users['name']` acts on an intermediate object and is
# not guaranteed to update `users` (it does not under pandas Copy-on-Write).
users['name'] = users['name'].fillna('')

In [12]:
# Frequency of each declared language.
# NOTE(review): the output below counts case variants ('en-gb' / 'en-GB',
# 'zh-tw' / 'zh-TW') and the placeholder 'Select Language...' as separate
# values; consider normalising them before using this column.
users['lang'].value_counts()

en                    9970
it                     906
es                     319
pt                      65
en-gb                   50
ru                      42
fr                      36
ja                      33
zh-tw                   17
tr                      14
id                      12
ko                       9
de                       8
nl                       6
en-GB                    4
ar                       3
zh-TW                    3
da                       2
Select Language...       2
en-AU                    1
zh-cn                    1
pl                       1
el                       1
fil                      1
sv                       1
xx-lc                    1
Name: lang, dtype: int64

In [13]:
# Number of rows for each value of the `bot` flag.
users['bot'].value_counts()

1    6116
0    5392
Name: bot, dtype: int64

In [14]:
# Sentinel written in place of dates that are missing, unparsable or out of range.
outlier_data = pd.Timestamp('1800-01-01')

def filter_datetime(df, att, lower=None, upper=None, fill=None):
    """Convert column `att` of `df` to `datetime64[ns]`, replacing invalid dates.

    A value is invalid when it is missing, cannot be parsed by `pd.Timestamp`,
    or falls outside [lower, upper]. Invalid values are replaced by `fill`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column; the column is overwritten in place.
    att : str
        Name of the column to convert.
    lower, upper : pandas.Timestamp, optional
        Inclusive bounds; default to the module-level `min_date` / `max_date`.
    fill : pandas.Timestamp, optional
        Replacement for invalid values; defaults to `outlier_data`.

    Returns
    -------
    pandas.DataFrame
        `df` with `att` cast to `datetime64[ns]`.
    """
    lower = min_date if lower is None else lower
    upper = max_date if upper is None else upper
    fill = outlier_data if fill is None else fill

    def date_callback(el):
        try:
            parsed = pd.Timestamp(el)
            # Missing inputs parse to NaT, which compares False against both
            # bounds, so it has to be tested explicitly.
            if pd.isna(parsed) or parsed < lower or parsed > upper:
                return fill
        except (ValueError, TypeError):
            # Unparsable text, out-of-bounds dates, or values that cannot be
            # compared with the bounds.
            return fill
        return parsed

    df[att] = df[att].map(date_callback)
    return df.astype({att: 'datetime64[ns]'})

In [15]:
# Parse `created_at` into a datetime column (see `filter_datetime` for how
# invalid dates are handled).
users = filter_datetime(users, 'created_at')

In [16]:
# Keep only the users whose `statuses_count` is known, then store it as an integer.
users = users.dropna(subset=['statuses_count']).astype({'statuses_count': 'int64'})

In [17]:
# Report the row count before and after removing fully duplicated rows.
print(f'start length: {len(users)}')
users = users.drop_duplicates()
print(f'end length: {len(users)}')

start length: 11109
end length: 11109


In [18]:
# Column dtypes and non-null counts after cleaning.
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11109 entries, 0 to 11507
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              11109 non-null  int64         
 1   name            11109 non-null  object        
 2   lang            11109 non-null  object        
 3   bot             11109 non-null  int64         
 4   created_at      11109 non-null  datetime64[ns]
 5   statuses_count  11109 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 607.5+ KB


In [19]:
# Summary statistics of the numeric columns.
users.describe()

Unnamed: 0,id,bot,statuses_count
count,11109.0,11109.0,11109.0
mean,1263638000.0,0.550545,5883.312359
std,936290900.0,0.497461,19073.007305
min,678033.0,0.0,0.0
25%,466289800.0,0.0,41.0
50%,1127892000.0,1.0,68.0
75%,2356956000.0,1.0,2916.0
max,3164942000.0,1.0,399555.0


### Tweets

In [20]:
# Select the tweets table from the fetched dataset.
tweets = dataset['tweets.csv']

# Print the column dtypes of the raw table.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13664696 entries, 0 to 13664695
Data columns (total 10 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   id              object
 1   user_id         object
 2   retweet_count   object
 3   reply_count     object
 4   favorite_count  object
 5   num_hashtags    object
 6   num_urls        object
 7   num_mentions    object
 8   created_at      object
 9   text            object
dtypes: object(10)
memory usage: 1.0+ GB


#### Data cleaning

In [21]:
def filter_int(df, att):
    """Keep only the rows of `df` whose `att` value is a digits-only integer.

    Side effect: in `df` itself, every cell of `att` that contains a non-digit
    character is overwritten with NaN (`replace_with_median` relies on this).

    Returns a new DataFrame without the rows where `att` is missing, with
    `att` cast to `int64`.
    """
    # With a non-string replacement value, a regex match replaces the whole cell.
    df[att] = df[att].replace(r'\D', np.nan, regex=True)
    return df[df[att].notna()].astype({att: 'int64'})


def replace_with_median(df, att):
    """Replace the invalid values of column `att` with the median of the valid ones.

    The column of `df` is overwritten in place.

    Returns `df` with `att` cast to `int64`.

    Raises
    ------
    ValueError
        If the column holds no valid integer, so no median can be computed.
    """
    # `filter_int` also blanks the invalid cells of `df[att]` to NaN.
    filtered = filter_int(df, att)
    if filtered.empty:
        raise ValueError(f'column {att!r} has no valid integer value to compute a median from')
    # The median of integers may end in .5; it is truncated explicitly because
    # the column is stored as int64.
    median = int(filtered[att].median())
    # Assign the filled column back: an in-place `replace` on `df[att]` acts on
    # an intermediate object and is not guaranteed to update `df`.
    df[att] = df[att].fillna(median)
    return df.astype({att: 'int64'})

In [None]:
# Columns that must hold non-negative integers.
counter_columns = [
    'retweet_count',
    'reply_count',
    'favorite_count',
    'num_hashtags',
    'num_urls',
    'num_mentions',
]

# Same `(column, validator)` layout as for the users table; `None` means that
# only missing values are counted for that column.
column_validators = (
    [('id', None), ('user_id', check_int)]
    + [(name, check_positive_int) for name in counter_columns]
    + [('created_at', check_date), ('text', None)]
)

deleted_rows = clean_null_attributes(tweets, column_validators, ratio=0.5)
print(f'deleted rows: {len(deleted_rows)}')

In [None]:
# Tweets without a valid integer `user_id` are discarded.
tweets = filter_int(tweets, 'user_id')

# For the counters, invalid values are replaced by the column median, one
# column after the other in the order listed here.
for counter in (
    'retweet_count',
    'reply_count',
    'favorite_count',
    'num_hashtags',
    'num_urls',
    'num_mentions',
):
    tweets = replace_with_median(tweets, counter)

In [None]:
# Missing texts become empty strings. The result is assigned back to the column:
# an in-place `replace` on `tweets['text']` acts on an intermediate object and
# is not guaranteed to update `tweets` (it does not under pandas Copy-on-Write).
tweets['text'] = tweets['text'].fillna('')

In [None]:
# Parse `created_at` into a datetime column (see `filter_datetime` for how
# invalid dates are handled).
tweets = filter_datetime(tweets, 'created_at')

In [None]:
# Report the row count before and after removing fully duplicated rows.
print(f'start length: {len(tweets)}')
tweets = tweets.drop_duplicates()
print(f'end length: {len(tweets)}')

In [None]:
# Column dtypes after cleaning.
tweets.info()

In [None]:
# Summary statistics of the numeric columns.
tweets.describe()

### Correlation

In [None]:
# Pairwise correlation of the numeric columns only. The table also holds text
# and datetime columns, which `corr()` cannot handle (recent pandas versions
# raise instead of silently skipping them).
tweets.select_dtypes(include='number').corr()

## Distribution

In [None]:
def build_grid_plot(configs):
    """Draw one plot per entry of `configs` on a grid and show the figure.

    The grid has 2 columns for up to 4 plots and 3 columns otherwise. When the
    last row of a 3-column grid holds a single plot, that plot is centred.

    Each config is a dict with a 'type' key:

    - 'hist': histogram of the Series in 'column', titled with 'title'; the
      number of bins is `int(log2(n) + 1)` for a Series of length n.
    - 'bar': bar chart of the value counts of the Series in 'column', titled
      with 'title'; a truthy 'rotation' key keeps the x tick labels horizontal.
    - 'boxplot': box plot of the columns listed in 'columns' of the DataFrame
      in 'df'; an optional 'title' key sets the title.

    Raises
    ------
    ValueError
        If a config has an unknown 'type'.
    """
    if not configs:
        # Nothing to draw; a grid with zero rows cannot be rendered.
        return

    cols = 2 if len(configs) <= 4 else 3
    rows = math.ceil(len(configs) / cols)
    fig_dims = (rows, cols)
    fig = plt.figure(figsize=(20, rows * 5))
    fig.subplots_adjust(hspace=0.2, wspace=0.2)

    last = len(configs) - 1
    centre_last = len(configs) % cols == 1 and cols % 2 == 1

    for i, config in enumerate(configs):
        col = cols // 2 if (i == last and centre_last) else i % cols
        # Every plot gets its axes passed explicitly instead of relying on
        # matplotlib's implicit "current axes".
        ax = plt.subplot2grid(fig_dims, (i // cols, col), fig=fig)

        kind = config['type']
        if kind == 'hist':
            n_values = len(config['column'])
            # log2(0) is undefined: fall back to a single bin for empty data.
            bins = int(math.log2(n_values) + 1) if n_values > 0 else 1
            config['column'].hist(bins=bins, ax=ax)
            ax.set_title(config['title'])
        elif kind == 'bar':
            config['column'].value_counts().plot(kind='bar', title=config['title'], ax=ax)
            if config.get('rotation'):
                for tick_label in ax.get_xticklabels():
                    tick_label.set_rotation(0)
        elif kind == 'boxplot':
            config['df'].boxplot(column=config['columns'], ax=ax)
            if 'title' in config:
                ax.set_title(config['title'])
        else:
            raise ValueError(f'unknown plot type: {kind!r}')
    plt.show()

In [None]:
# One histogram per tweet attribute: (column name, plot title).
tweet_histograms = [
    ('retweet_count', 'Retweet Counts'),
    ('reply_count', 'Reply Counts'),  # title typo fixed (was 'Replay Counts')
    ('favorite_count', 'Favorite Counts'),
    ('num_hashtags', 'Hashtag Counts'),
    ('num_urls', 'Url Counts'),
    ('num_mentions', 'Mentions Counts'),
    ('created_at', 'Tweets Creation Date Distribution'),
]

configs = [
    {'type': 'hist', 'column': tweets[column_name], 'title': title}
    for column_name, title in tweet_histograms
]

build_grid_plot(configs=configs)

In [None]:
# Distribution plots for the users table.
configs = [
    {
        'type': 'hist',
        'column': users['statuses_count'],
        'title': 'Statuses Counts'  # title typo fixed (was 'Statues Counts')
    },
    {
        'type': 'bar',
        # Turn the 0/1 flag into readable category names for the x axis.
        'column': users['bot'].map(lambda v: 'Bot' if v else 'User'),
        'title': 'Bot and User Counts',
        'rotation': True
    },
    {
        'type': 'bar',
        'column': users['lang'],
        'title': 'Languages Counts'
    },
    {
        'type': 'hist',
        'column': users['created_at'],
        'title': 'User Creation Date Distribution'
    }
]

build_grid_plot(configs=configs)

### Outlier detection

In [None]:
def replace_outliers(df, column_name, threshold):
    """Replace, in place, the values of a column above `threshold` with its median.

    The median is computed on the whole column, outliers included. The number
    and percentage of replaced values are logged at INFO level.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column; it is modified in place.
    column_name : str
        Name of the column to process.
    threshold : number
        Values strictly greater than this are replaced.

    Returns
    -------
    None
    """
    column = df[column_name]
    if len(column) == 0:
        # Nothing to replace, and the percentage below would divide by zero.
        return

    outlier_mask = (column > threshold).to_numpy()
    to_replace = int(outlier_mask.sum())
    perc_to_replace = round((to_replace / len(column) * 100), 2)
    lg.info(f'{to_replace} ({perc_to_replace}%) element replaced for column {column_name}')
    if to_replace == 0:
        return

    median = column.median()
    if pd.api.types.is_integer_dtype(column.dtype):
        # Keep integer columns integer: a median ending in .5 is truncated.
        median = int(median)
    # Write through `df.loc`: assigning into the extracted Series is a chained
    # assignment and is not guaranteed to update `df`.
    df.loc[outlier_mask, column_name] = median

In [None]:
def boxplot_tweets_show():
    """Show a grid with one box plot for each numeric counter of `tweets`."""
    counter_names = (
        'retweet_count',
        'reply_count',
        'favorite_count',
        'num_hashtags',
        'num_urls',
        'num_mentions',
    )
    # One single-column box plot per counter, in the order listed above.
    boxplot_configs = [
        {'type': 'boxplot', 'df': tweets, 'columns': [name]}
        for name in counter_names
    ]

    build_grid_plot(configs=boxplot_configs)

In [None]:
# Box plots of the tweet counters before any outlier replacement.
boxplot_tweets_show()

In [None]:
# Upper bound per counter; values above it are replaced by the column median.
# NOTE(review): presumably read off the box plots above — confirm the choice.
outlier_thresholds = {
    'retweet_count': 6e5,
    'reply_count': 6e4,
    'favorite_count': 1.2e5,
    'num_hashtags': 1e4,
    'num_urls': 1e4,
    'num_mentions': 1e5,
}

for counter_name, upper_bound in outlier_thresholds.items():
    replace_outliers(tweets, counter_name, upper_bound)

# Same box plots again, now on the cleaned counters.
boxplot_tweets_show()

In [None]:
# The size is passed to the plotting call itself: `DataFrame.plot.scatter`
# creates its own figure, so a preceding `plt.figure(figsize=...)` only left an
# empty figure behind while the scatter was drawn at the default size.
tweets.plot.scatter(x='reply_count', y='favorite_count', figsize=(20, 10))
plt.show()

## Data preparation

Indicators to extract for each user:

- How many tweets were published by the user?
- How many tweets were published by the user in a given period of time?
- Total number of tweets
- Total number of likes and comments
- Ratio between the number of tweets and the number of likes
- Entropy of the user
- Average length of the tweets per user
- Average number of special characters in the tweets per user

In [None]:
# Number of tweets written by each user, indexed by `user_id`.
tweets_grouped_by_users = tweets.groupby('user_id').size()

# Look the counts up through the `id` column. Assigning the grouped Series
# directly would align it on the row index of `users`, which is unrelated to
# the user ids. Users without any tweet get 0.
users['tweets_num'] = (
    users['id']
    .map(tweets_grouped_by_users)
    .fillna(0)
    .astype('int64')
)

# Show a preview rather than the whole table.
users.head()