In [3]:
import pandas as pd
from twitter_bot_detection.helpers import log_running_time
from kedro.io import PickleLocalDataSet
from catboost.text_processing import Tokenizer

In [4]:
# Load the primary users table; the feature cells below read their inputs from it.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
users = pd.read_pickle('data/03_primary/users.pkl')

In [5]:
def count_char_types(string):
    """Count how many distinct character classes occur in ``string``.

    Every character is assigned to exactly one of four classes, tested in
    this order: lowercase letter, uppercase letter, digit, anything else.

    Args:
        string: Iterable of single-character strings (normally a ``str``).

    Returns:
        int: Number of different classes seen, from 0 (empty input) to 4.
    """
    def _classify(ch):
        # The order of the checks mirrors an if/elif chain: first match wins.
        if ch.islower():
            return "lower"
        if ch.isupper():
            return "upper"
        if ch.isdigit():
            return "digit"
        return "other"

    # A set keeps one entry per class, so its size is the number of classes.
    return len({_classify(ch) for ch in string})

In [6]:
# Build the per-user feature table. It is rebuilt from `users` on every run,
# so re-executing this cell is idempotent.
features = users[[
    "protected", "followers_count", "friends_count",  "listed_count", "favourites_count", "statuses_count",
    "verified", "default_profile", "default_profile_image",
]].copy()

# Encode the boolean account flags as 0/1 integers.
for flag_column in ("protected", "verified", "default_profile", "default_profile_image"):
    features[flag_column] = features[flag_column].astype(int)

# Number of character classes (lower/upper/digit/other) used in the screen name.
features["char_types"] = users["screen_name"].apply(count_char_types)

# A missing value (NaN/None) compares `!= ''` as True, which would wrongly mark
# the field as present, so nulls are excluded explicitly.
features["has_location"] = (users["location"].notnull() & (users["location"] != '')).astype(int)
features["has_description"] = (users["description"].notnull() & (users["description"] != '')).astype(int)

# Bucket the account-creation hour (bins are right-closed):
# 0-6 night, 7-11 morning, 12-18 day, 19-23 evening.
features["created_at_time"] = pd.cut(users["created_at"].dt.hour, [-1, 6, 11, 18, 23], labels=["night", "morning", "day", "evening"])
features["account_active_for_days"] = (users["last_status_date"] - users["created_at"]).dt.days

# `~series.isnull().astype(int)` applies `~` AFTER the int cast (attribute access
# and calls bind tighter than unary `~`), producing -1/-2 instead of 1/0.
# `notnull()` gives the intended 1 = present, 0 = missing encoding.
features["has_banner"] = users["profile_banner_url"].notnull().astype(int)
features["has_profile_url"] = users["profile_url"].notnull().astype(int)
features["description_urls_count"] = users["description_urls"].str.len()
features["label"] = (users["label"] == 'bot').astype(int)

# Ratio features. 0.001 is added to numerator and denominator so that a zero
# denominator does not raise/produce inf and 0/0 evaluates to 1.
features["tweets_per_day"] = (users["statuses_count"] + 0.001) / (features["account_active_for_days"] + 0.001)
features["favourites_per_day"] = (users["favourites_count"] + 0.001) / (features["account_active_for_days"] + 0.001)
features["fr_to_flw_ratio"] = (users["friends_count"] + 0.001) / (users["followers_count"] + 0.001)
features["faw_to_tweets_ratio"] = (users["favourites_count"] + 0.001) / (users["statuses_count"] + 0.001)
features["tweets_to_faw_ratio"] = (users["statuses_count"] + 0.001) / (users["favourites_count"] + 0.001)
# features["listed_count_cat"] = pd.cut(users["listed_count"], [-1, 100, 500, 2000, 5000, 10000, float('inf')], labels=['100', '500', '2000', '5000', '10000', 'inf']).astype(str)

# One-hot encode the time-of-day bucket into created_at_time_<label> columns.
features = pd.get_dummies(features, columns=["created_at_time"])


2020-03-13 16:06:11,403 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


In [45]:
# Persist the `features` frame. Note this writes 'users.pkl' under 04_features,
# a different file from the 'user_features.pkl' written from extract_user_features.
features.to_pickle('data/04_features/users.pkl')

In [46]:
@log_running_time
def extract_user_features(users: pd.DataFrame) -> pd.DataFrame:
    """Derive per-account features from the users table.

    The annotations are ``pd.DataFrame`` because the body column-indexes the
    argument and returns a frame (callers invoke ``.to_pickle`` on the result).

    Args:
        users: Frame with the columns read below: the count columns,
            ``verified``, ``default_profile``, ``default_profile_image``,
            ``screen_name``, ``location``, ``description``, ``created_at``,
            ``last_status_date``, ``profile_banner_url``, ``profile_url``,
            ``description_urls`` and ``label``.

    Returns:
        A new frame (the input is not modified) holding the raw counts and
        flags, the derived presence flags (as booleans), the categorical
        ``created_at_time`` bucket, the ratio features and the 0/1 ``label``.
        The ``protected`` column is deliberately not part of the output.
    """
    # `protected` is excluded from the result, so it is simply not selected
    # (previously it was copied and then dropped on return).
    features = users[[
        "followers_count", "friends_count", "listed_count", "favourites_count", "statuses_count",
        "verified", "default_profile", "default_profile_image",
    ]].copy()

    # Number of character classes (lower/upper/digit/other) in the screen name.
    features["char_types"] = users["screen_name"].apply(count_char_types)

    # A missing value (NaN/None) compares `!= ''` as True, which would wrongly
    # mark the field as present, so nulls are excluded explicitly.
    features["has_location"] = users["location"].notnull() & (users["location"] != '')
    features["has_description"] = users["description"].notnull() & (users["description"] != '')

    # Bucket the account-creation hour (bins are right-closed):
    # 0-6 night, 7-11 morning, 12-18 day, 19-23 evening.
    features["created_at_time"] = pd.cut(users["created_at"].dt.hour, [-1, 6, 11, 18, 23], labels=["night", "morning", "day", "evening"])
    features["account_active_for_days"] = (users["last_status_date"] - users["created_at"]).dt.days
    features["has_banner"] = users["profile_banner_url"].notnull()
    features["has_profile_url"] = users["profile_url"].notnull()
    features["description_urls_count"] = users["description_urls"].str.len()
    features["label"] = (users["label"] == 'bot').astype(int)

    # Ratio features; the tiny epsilon only guards against division by zero.
    # NOTE(review): with a zero denominator and a non-zero numerator these
    # become very large values (count / 1e-7) — confirm that is intended.
    features["tweets_per_day"] = users["statuses_count"] / (features["account_active_for_days"] + 0.0000001)
    features["favourites_per_day"] = users["favourites_count"] / (features["account_active_for_days"] + 0.0000001)
    features["fr_to_flw_ratio"] = users["friends_count"] / (users["followers_count"] + 0.0000001)
    features["faw_to_tweets_ratio"] = users["favourites_count"] / (users["statuses_count"] + 0.0000001)
    features["tweets_to_faw_ratio"] = users["statuses_count"] / (users["favourites_count"] + 0.0000001)
    # features["listed_count_cat"] = pd.cut(users["listed_count"], [-1, 100, 500, 2000, 5000, 10000, float('inf')], labels=['100', '500', '2000', '5000', '10000', 'inf']).astype(str)

    return features

In [22]:
# Run the feature extractor on the loaded users frame and pickle its result.
extract_user_features(users).to_pickle('data/04_features/user_features.pkl')

2020-03-10 23:09:28,816 - twitter_bot_detection.helpers - INFO - Running 'extract_user_features' took 0.04 seconds
