# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import pickle
import time

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE

from category_encoders import TargetEncoder

In [2]:
X_train = pd.read_csv('Data/X_train.csv', index_col='building_id')
X_test = pd.read_csv('Data/X_test.csv', index_col='building_id')
y_train = pd.read_csv('Data/y_train.csv', index_col='building_id')
y_test = pd.read_csv('Data/y_test.csv', index_col='building_id')

In [3]:
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

### Divide Columns

In [4]:
binary_cols = []
for col in X_train.columns:
    if col.startswith('has'):
        binary_cols.append(col)

cat_cols = list(X_train.select_dtypes(include='object').columns)

integer_cols = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'count_families']

geo_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

all_cols = geo_cols + cat_cols + integer_cols + binary_cols

In [5]:
score_dict = {}
def print_scores():
    for key in score_dict.keys():
        print(f'{key}, f1_micro_score: {round(score_dict[key][0],4)}, Run time: {round(score_dict[key][1],0)}')

### logreg0:  Basline Logistic Regression with no feature engineering

In [6]:
ohe_pipe = imbPipeline([('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

transformer = ColumnTransformer([
    ('binary', 'passthrough', binary_cols),
    ('categorical', ohe_pipe, cat_cols),
    ('integer', 'passthrough', integer_cols),
    ('geo', 'passthrough', geo_cols)
])

In [7]:
logreg0_pipe = imbPipeline([
    ('trans', transformer), 
    ('logreg', LogisticRegression(n_jobs=-1, random_state=42))
])

In [8]:
start = time.time()
f1_micro_logreg0 = cross_val_score(logreg0_pipe, X_train, y_train, scoring='f1_micro')
end = time.time()
print(f'Run time: {end-start}')
run_time_logreg0 = (end-start)

Run time: 70.11193609237671


In [9]:
f1_micro_logreg0 = f1_micro_logreg0.mean()
f1_micro_logreg0

0.5691174213353799

In [10]:
score_dict['logreg0'] = [f1_micro_logreg0, run_time_logreg0]
print_scores()

logreg0, f1_micro_score: 0.5691, Run time: 70.0


### logreg1: Try normalizing and scaling integer columns and dropping some features

I will drop the same features that I did for the random forest model after checking feature importances.

In [11]:
binary_cols_dropped = binary_cols.copy()
for col in binary_cols_dropped:
    if col.startswith('has_secondary'):
        binary_cols_dropped.remove(col)
binary_cols_dropped.append('has_secondary_use')

cat_cols_dropped = cat_cols.copy()
cat_cols_dropped.remove('legal_ownership_status')
cat_cols_dropped.remove('plan_configuration')

In [12]:
def log_transform(x):
    return np.log(x+1)

In [16]:
function_transformer = FunctionTransformer(log_transform)

ohe_pipe = imbPipeline([('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
integer_pipe = imbPipeline([
    ('function', function_transformer),
    ('ss', StandardScaler())
])

transformer = ColumnTransformer([
    ('binary', 'passthrough', binary_cols_dropped),
    ('categorical', ohe_pipe, cat_cols_dropped),
    ('geo', 'passthrough', geo_cols),
    ('integer', integer_pipe, integer_cols),
])

In [17]:
logreg1_pipe = imbPipeline([
    ('transformer', transformer),
    ('logreg', LogisticRegression(n_jobs=-1, random_state=42))
])

In [18]:
start = time.time()
f1_micro_logreg1 = cross_val_score(logreg1_pipe, X_train, y_train, scoring='f1_micro')
end = time.time()
print(f'Run time: {end-start}')
run_time_logreg1 = end-start

Run time: 62.303799867630005


In [19]:
f1_micro_logreg1 = f1_micro_logreg1.mean()
f1_micro_logreg1

0.5723151701202354

In [20]:
score_dict['logreg1'] = [f1_micro_logreg1, run_time_logreg1]
print_scores()

logreg0, f1_micro_score: 0.5691, Run time: 70.0
logreg1, f1_micro_score: 0.5723, Run time: 62.0


### logreg2:  Try Target Encoding