# Logistic Regression

In [27]:
import os
import re

project_path = re.findall(
    pattern=r".+monday-vip-consulting",
    string=os.path.abspath(os.getcwd()),
)[0]
os.chdir(os.path.join(project_path))
%load_ext jupyter_black

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [28]:
import pandas as pd
from src.utils.pather import Pather
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import preprocessing

In [30]:
pather = Pather()
features = pd.read_csv(pather.features).set_index("account_id")
y = pd.read_csv(pather.target).set_index("account_id")["lead_score"].values

use_categorical = False
categorical_cols = ["payment_currency", "industry", "country"]
features = features.drop(categorical_cols, axis=1)
X = features.values

In [31]:
features.columns

Index(['paying', 'collection_21_days', 'max_team_size', 'min_team_size',
       'registered_users', 'number_of_admins', 'number_of_pending_users',
       'number_of_enabled_users', 'active_users', 'active_days',
       'total_events', 'column_events', 'board_events', 'num_of_boards',
       'count_kind_columns', 'content_events', 'group_events', 'invite_events',
       'import_events', 'notification_events', 'new_entry_events',
       'payment_events', 'inbox_events', 'communicating_events',
       'non_communicating_events', 'web_events', 'ios_events',
       'android_events', 'desktop_app_events', 'empty_events'],
      dtype='object')

In [32]:
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [35]:
n_estimators = 10
max_depth = 10

model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_f1 = f1_score(y_train, y_train_pred)
test_f1 = f1_score(y_test, y_test_pred)
print(f"Train f1 score: {train_f1}")
print(f"Test f1 score: {test_f1}")

train_precision = precision_score(y_train, y_train_pred)
test_precision = precision_score(y_test, y_test_pred)
print(f"Train precision score: {train_precision}")
print(f"Test precision score: {test_precision}")

train_recall = recall_score(y_train, y_train_pred)
test_recall = recall_score(y_test, y_test_pred)
print(f"Train recall score: {train_recall}")
print(f"Test recall score: {test_recall}")

Train f1 score: 0.1220118179367427
Test f1 score: 0.12235075816879765
Train precision score: 0.0680377689657414
Test precision score: 0.06818549038616901
Train recall score: 0.5902714134649277
Test recall score: 0.5950366610265088
