In [48]:
import glob
import vaex
import vaex.ml
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [49]:
# Collect the raw HDF5 shards. glob.glob() returns paths in arbitrary,
# filesystem-dependent order, so sort them to keep the row order of the
# concatenated dataset reproducible between runs and machines.
hdf5_list = np.array(sorted(glob.glob('data/01_raw/hdf5/*.hdf5')))

In [50]:
# Open every file listed in hdf5_list through vaex.open_many and bind the
# result to `tweets`.
tweets = vaex.open_many(hdf5_list)

In [None]:
# Bare attribute access: this evaluates to the bound method without calling
# it, so nothing is exported by this cell.
tweets.export_hdf5

In [4]:
# Write the `tweets` frame out as a single intermediate HDF5 file.
tweets.export_hdf5(
    path='data/02_intermediate/tweets.hdf5',
)


In [45]:
# Open the intermediate HDF5 file and bind it to `df`, which the remaining
# cells work from.
df = vaex.open('data/02_intermediate/tweets.hdf5')

In [46]:
# Per-user aggregate features: group the tweets by user_id and summarise
# hashtag usage, reply/quote activity and the number of distinct sources.
# NOTE: a per-user tweet count (vaex.agg.count("id") under the key "tweets")
# is currently disabled.
features = df.groupby(
    by=df["user_id"],
    agg=dict(
        # Distribution of hashtags per tweet.
        hashtags_mean=vaex.agg.mean("hashtags_count"),
        hashtags_max=vaex.agg.max("hashtags_count"),
        hashtags_min=vaex.agg.min("hashtags_count"),
        hashtags_std=vaex.agg.std("hashtags_count"),
        # Distinct reply_to values per user.
        reply_to_unique=vaex.agg.nunique("reply_to"),
        # Sums over the boolean flags.
        is_reply=vaex.agg.sum("is_reply"),
        is_quote=vaex.agg.sum("is_quote"),
        # Distinct source values per user.
        unique_sources=vaex.agg.nunique("source"),
    ),
)

In [47]:
# Convert the aggregated vaex frame to pandas, key it by user_id, and
# persist the result as a pickle for later use.
features = features.to_pandas_df().set_index("user_id")
features.to_pickle('data/04_features/tweets_features.pkl')

In [5]:
# Standard deviation of hashtags_count across df (displayed as the cell output).
df["hashtags_count"].std()

1.159120669529411

In [29]:
# Scale hashtags_count with a MaxAbsScaler; the scaled values land in a new
# column carrying the 'scaled_' prefix.
scaler = vaex.ml.MaxAbsScaler(
    features=["hashtags_count"],
    prefix='scaled_',
)
# Fit on df and apply the transformation in a single step.
df_trans = scaler.fit_transform(df)
# Show the raw and scaled columns side by side.
df_trans[["hashtags_count", "scaled_hashtags_count"]]

#,hashtags_count,scaled_hashtags_count
0,0,0.0
1,3,0.1
2,3,0.1
3,1,0.03333333333333333
4,1,0.03333333333333333
...,...,...
80019240,0,0.0
80019241,6,0.2
80019242,2,0.06666666666666667
80019243,2,0.06666666666666667


In [26]:
# Spot check: filter df to the hard-coded user_id "3378113121" and show at
# most the first 1000 hashtags_count values for that user.
df[df["user_id"] == "3378113121"]["hashtags_count"][:1000].values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
# Spot check: standard deviation of the `hour` column for the hard-coded
# user_id "3378113121".
df[df["user_id"] == "3378113121"]["hour"].std()

0.4926312996394547

In [6]:
# Display the dtype of every column in df.
df.dtypes

id                         <class 'str'>
user_id                    <class 'str'>
created_at                datetime64[ns]
source                     <class 'str'>
is_reply                            bool
is_quote                            bool
hashtags_count                      int8
mentions_count                      int8
urls_count                          int8
symbols_count                       int8
sensitive                           bool
truncated                           bool
lang                       <class 'str'>
is_retweet                          bool
text                       <class 'str'>
retweeted_author           <class 'str'>
media_count                         int8
reply_to                   <class 'str'>
quote_of                   <class 'str'>
retweet_count                      int32
favorite_count                     int32
year                               int64
month                              int64
day                                int64
hour            