# Features Games

In [1]:
import os
import re

# Find the repository root in the current working directory and switch to it
# (presumably so the `src...` imports and project-relative paths used below
# resolve wherever the notebook was launched from — TODO confirm).
_cwd = os.path.abspath(os.getcwd())
# The greedy ".+" keeps everything up to the last occurrence of the folder name.
_root_match = re.search(pattern=r".+monday-vip-consulting", string=_cwd)
if _root_match is None:
    # Indexing an empty findall() result used to raise a bare
    # "IndexError: list index out of range" with no hint about the cause.
    raise RuntimeError(
        f"Could not find a 'monday-vip-consulting' directory in {_cwd!r}; "
        "start the notebook from inside the repository."
    )
project_path = _root_match.group(0)
os.chdir(project_path)
# Load the jupyter_black IPython extension (presumably auto-formats code cells
# with Black — TODO confirm).
%load_ext jupyter_black

In [2]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px


from src.utils.pather import Pather
from src.data.make_features_dataset import FeaturesDataset

# Single Pather instance; later cells pass its `interim_accounts`,
# `interim_users` and `features` attributes to `pd.read_csv`.
pather = Pather()

## Accounts Features

In [None]:
# Read the interim accounts CSV from the location given by `pather.interim_accounts`.
accounts = pd.read_csv(pather.interim_accounts)
# (rows, columns) of the loaded table.
accounts.shape

In [None]:
# Preview the first 10 rows of the accounts table.
accounts.head(10)

In [None]:
# Number of distinct values in each accounts column.
accounts.nunique()

In [None]:
# Columns kept from the raw accounts table; "account_id" becomes the index.
features = (
    "account_id paying collection_21_days max_team_size min_team_size "
    "industry payment_currency lead_score"
).split()

# Select those columns, index the result by account, and store the two
# free-text columns as pandas categoricals.
# ("region" and "country" are not selected and therefore not converted.)
account_features = accounts.loc[:, features].set_index("account_id")
account_features = account_features.astype(
    {column: "category" for column in ("industry", "payment_currency")}
)

In [None]:
# Display the account-level feature frame (indexed by account_id).
account_features

## Users Features

In [None]:
# Read the interim users CSV from the location given by `pather.interim_users`.
users = pd.read_csv(pather.interim_users)
# (rows, columns) of the loaded table.
users.shape

In [None]:
# Collapse the users table to one row per account: the number of distinct
# user ids plus the sums of the is_admin / pending / enabled columns.
# Named aggregation sets the output column names directly.
users_features = users.groupby("account_id").agg(
    registered_users=("user_id", "nunique"),
    number_of_admins=("is_admin", "sum"),
    number_of_pending_users=("pending", "sum"),
    number_of_enabled_users=("enabled", "sum"),
)
users_features

In [None]:
# Number of distinct values in each users column.
users.nunique()

In [None]:
# Spot-check: every user row that belongs to account 602168.
users.loc[users["account_id"].eq(602168)]

# All Features

In [3]:
# Build the accounts, events and users feature frames through the project's
# FeaturesDataset class.
# NOTE(review): all three calls use underscore-prefixed (private-by-convention)
# methods; whether they must run in this order is not visible from here.
# NOTE: `users_features` rebinds the name assigned in the "Users Features"
# section above.
features_dataseter = FeaturesDataset()
accounts_features = features_dataseter._create_accounts_features()
events_features = features_dataseter._create_events_features()
users_features = features_dataseter._create_users_features()

Loading Data...


In [None]:
# Anti-join: account rows whose index value is absent from the users features.
# Despite the name, this holds the full rows, not only the ids.
accounts_ids_not_in_users = accounts_features.loc[
    lambda frame: ~frame.index.isin(users_features.index)
]

In [None]:
# events_features_features_1 = events_features.drop(accounts_ids_not_in_users)

In [4]:
# Report the (rows, columns) of each feature frame, one per line.
print(
    f"account shape: {accounts_features.shape}",
    f"events shape: {events_features.shape}",
    f"users shape: {users_features.shape}",
    sep="\n",
)

account shape: (716828, 8)
events shape: (716628, 22)
users shape: (716099, 4)


In [None]:
# Anti-joins from the accounts side: account rows whose index value has no
# counterpart in the users frame and in the events frame, respectively.
# Despite the names, both hold full rows rather than bare ids.
accounts_ids_not_in_users, accounts_ids_not_in_events = (
    accounts_features.loc[~accounts_features.index.isin(other.index)]
    for other in (users_features, events_features)
)

In [None]:
# Reverse anti-joins: users / events feature rows whose index value is absent
# from the accounts feature frame.
# The comparison now uses `accounts_features` (built by FeaturesDataset), like
# the sibling cells in this section. It previously referenced
# `account_features`, the exploratory frame from the "Accounts Features"
# section, which is a different object.
in_users_not_in_accounts = users_features[
    ~users_features.index.isin(accounts_features.index)
]
in_events_not_in_accounts = events_features[
    ~events_features.index.isin(accounts_features.index)
]

In [None]:
# Display the event-feature rows that have no matching account row.
in_events_not_in_accounts

In [28]:
# Outer-join the three per-account frames on their index: accounts with users
# first, then that result with events.
features = pd.merge(
    left=pd.merge(
        left=accounts_features,
        right=users_features,
        how="outer",
        left_index=True,
        right_index=True,
    ),
    right=events_features,
    how="outer",
    left_index=True,
    right_index=True,
)
# `pop` removes the "lead_score" column from `features` and returns it.
target = features.pop("lead_score")

In [6]:
# Preview the first five rows of the merged feature frame.
features.head()

Unnamed: 0_level_0,paying,collection_21_days,max_team_size,min_team_size,team_size,industry,payment_currency,registered_users,number_of_admins,number_of_pending_users,...,new_entry_events,payment_events,inbox_events,communicating_events,non_communicating_events,web_events,ios_events,android_events,desktop_app_events,empty_events
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0,0,,,,,USD,1.0,1.0,0.0,...,2.0,0.0,0.0,0.0,2.0,0.0,0.0,42.0,0.0,7.0
6,0,0,5.0,2.0,2-5,Finance,USD,1.0,1.0,0.0,...,1.0,0.0,1.0,1.0,2.0,56.0,0.0,0.0,0.0,7.0
7,0,0,1.0,1.0,1-1,Business Owner,USD,1.0,1.0,0.0,...,1.0,0.0,1.0,1.0,75.0,292.0,0.0,0.0,0.0,17.0
8,0,0,,,,,USD,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,19.0,0.0,3.0
9,0,0,0.0,0.0,,,USD,1.0,1.0,0.0,...,3.0,0.0,3.0,1.0,6.0,155.0,92.0,0.0,0.0,21.0


In [None]:
# Flag rows whose values across all columns repeat an earlier row; the index
# is not part of the comparison. The bare `features` expression that used to
# open this cell was dropped because only a cell's last expression is
# displayed, so it had no effect.
duplicate_mask = features.duplicated()
# Keep only the flagged rows and show them.
duplicate_rows = features.loc[duplicate_mask]
duplicate_rows

In [None]:
# Frequency of each category, most common first.
# Both tables are passed to `display` because a cell only renders its last
# bare expression, so the industry counts used to be silently discarded.
# `value_counts` already sorts in descending order, so no extra sort is needed.
display(
    features["industry"].value_counts(),
    features["payment_currency"].value_counts(),
)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# One-hot encode the two categorical feature columns; every other column of
# `features` is passed through unchanged (remainder="passthrough").
categorical_cols = ["payment_currency", "industry"]
# drop="first" omits the first category of each encoded column.
# NOTE(review): scikit-learn 1.2 renamed `sparse=` to `sparse_output=` and
# removed the old keyword in 1.4 — confirm against the pinned version.
one_hot_encoder = OneHotEncoder(sparse=False, drop="first")
preprocessor = ColumnTransformer(
    transformers=[("cat", one_hot_encoder, categorical_cols)], remainder="passthrough"
)
# Fit on `features` and keep the transformed result.
# NOTE(review): `features` still holds the string-valued `team_size` column at
# this point, which passthrough would copy as-is — verify this is intended.
X_encoded = preprocessor.fit_transform(features)

In [None]:
# Display the encoded result returned by the ColumnTransformer.
X_encoded

## Missing values (NaNs)

In [53]:
# Localised "only me" answers in `team_size` describe a one-person team, so
# both team-size bounds are set to 1 for those rows.
solo_team_labels = ["Solo yo", "Moi uniquement", "Apenas eu", "Nur ich"]
one_person_mask = features["team_size"].isin(solo_team_labels)
features.loc[one_person_mask, "max_team_size"] = 1
features.loc[one_person_mask, "min_team_size"] = 1

# Fill the remaining gaps with each column's mean.
# The result is assigned back to the column instead of calling
# `features[col].fillna(..., inplace=True)`: that form is chained assignment,
# which emits a FutureWarning in pandas 2.2 and leaves `features` untouched
# under Copy-on-Write.
# NOTE(review): the saved output of this cell is a TypeError about a
# Categorical column; which statement raised it is not visible here, so
# re-run on a fresh kernel to confirm.
features["max_team_size"] = features["max_team_size"].fillna(
    features["max_team_size"].mean()
)
features["min_team_size"] = features["min_team_size"].fillna(
    features["min_team_size"].mean()
)

TypeError: Cannot setitem on a Categorical with a new category (nan), set the categories first

In [52]:
# Number of rows with a missing `industry` value.
features["industry"].isna().sum()

258138

In [43]:
# Number of rows with a missing `min_team_size` value.
features["min_team_size"].isna().sum()

0

In [10]:
# (rows, columns) of the feature frame.
features.shape

(716828, 33)

In [47]:
# Re-attach the label column to the features by matching index values
# (inner join, the `merge` default).
data = pd.merge(
    left=features,
    right=target,
    how="inner",
    left_index=True,
    right_index=True,
)

In [50]:
# Keep only rows where both `registered_users` and `active_users` are present.
data_1 = data.loc[data[["registered_users", "active_users"]].notna().all(axis=1)]

In [51]:
# Missing-value count for each column of `data_1`.
data_1.isna().sum()

paying                           0
collection_21_days               0
max_team_size                    0
min_team_size                    0
team_size                   213175
industry                    257470
payment_currency              3581
registered_users                 0
number_of_admins                 0
number_of_pending_users          0
number_of_enabled_users          0
active_users                     0
active_days                      0
total_events                     0
column_events                    0
board_events                     0
num_of_boards                    0
count_kind_columns               0
content_events                   0
group_events                     0
invite_events                    0
import_events                    0
notification_events              0
new_entry_events                 0
payment_events                   0
inbox_events                     0
communicating_events             0
non_communicating_events         0
web_events          

# Finished Features

In [54]:
# Load the finished feature set from `pather.features`, indexed by account_id.
# This rebinds `features`, replacing the frame built in memory above.
features = pd.read_csv(pather.features, index_col="account_id")

In [55]:
# Missing-value count for each column of the loaded feature set.
features.isna().sum()

paying                           0
collection_21_days               0
max_team_size                    0
min_team_size                    0
industry                    219495
payment_currency              3574
registered_users                 0
number_of_admins                 0
number_of_pending_users          0
number_of_enabled_users          0
active_users                     0
active_days                      0
total_events                     0
column_events                    0
board_events                     0
num_of_boards                    0
count_kind_columns               0
content_events                   0
group_events                     0
invite_events                    0
import_events                    0
notification_events              0
new_entry_events                 0
payment_events                   0
inbox_events                     0
communicating_events             0
non_communicating_events         0
web_events                       0
ios_events          