### Notes
- Should I also scale one-hot encoded features so that they don't tip the balance

In [4]:
from typing import Dict, List

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## for parallel processing
from threading import Thread, Lock

In [5]:
## CONSTANTS
RANDOM_STATE = 42

## Function to add features

In [6]:
class FeatureGenerator(BaseEstimator, TransformerMixin):
    '''
    Function for generating features given from expressions
    - expressions: a list of [feature_name, expression], where df[feature_name] = df.eval(expression)
    '''
    def __init__(self, expressions:Dict[str, str]=None) -> None:
        super().__init__()

        self.expressions = expressions
        self.mutex = Lock() ## creating a mutex lock

    ## particular features
    def total_customer_loans(self, df):
        df["total_customer_loans"] = df["customer_id"].map(df.groupby("customer_id").count()["lender_id"])
        return df

    def add_feature_from_expression(self, df, feature_name, expression):
        self.mutex.acquire()
        df[feature_name] = df.eval(expression)
        self.mutex.release()

    def transform(self, X:pd.DataFrame, y=None) -> None:
        threads = []
        if self.expressions:
            for feature_name, expression in self.expressions.items():
                # X[feature_name] = X.eval(expression)
                thread = Thread(target=self.add_feature_from_expression, args=(X, feature_name, expression))
                thread.start()
                threads.append(thread)

        [thread.join() for thread in threads]

        X = self.total_customer_loans(X)

        return X

    def fit(self, X:pd.DataFrame, y=None) -> None:
        return self

In [7]:
## Getting data

In [8]:
train_df = pd.read_csv("../data/Train.csv")

In [9]:
train_df.head()

Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,duration,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid,target
0,ID_266671248032267278,266671,Kenya,248032,267278,Type_1,8448.0,8448.0,2022-08-30,2022-09-06,7,Repeat Loan,120.85,0.014305,121.0,0
1,ID_248919228515267278,248919,Kenya,228515,267278,Type_1,25895.0,25979.0,2022-07-30,2022-08-06,7,Repeat Loan,7768.5,0.3,7794.0,0
2,ID_308486370501251804,308486,Kenya,370501,251804,Type_7,6900.0,7142.0,2024-09-06,2024-09-13,7,Repeat Loan,1380.0,0.2,1428.0,0
3,ID_266004285009267278,266004,Kenya,285009,267278,Type_1,8958.0,9233.0,2022-10-20,2022-10-27,7,Repeat Loan,2687.4,0.3,2770.0,0
4,ID_253803305312267278,253803,Kenya,305312,267278,Type_1,4564.0,4728.0,2022-11-28,2022-12-05,7,Repeat Loan,1369.2,0.3,1418.0,0


## Adding new features

In [10]:
## feature expressions
expressions = {
    "interest": "Total_Amount_to_Repay-Total_Amount",
    "interest_rate": "(Total_Amount_to_Repay-Total_Amount)/Total_Amount"
}

feature_generator = FeatureGenerator(expressions)
feature_generator.transform(train_df)

Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,duration,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid,target,interest,interest_rate,total_customer_loans
0,ID_266671248032267278,266671,Kenya,248032,267278,Type_1,8448.0,8448.0,2022-08-30,2022-09-06,7,Repeat Loan,120.85,0.014305,121.0,0,0.0,0.000000,61
1,ID_248919228515267278,248919,Kenya,228515,267278,Type_1,25895.0,25979.0,2022-07-30,2022-08-06,7,Repeat Loan,7768.50,0.300000,7794.0,0,84.0,0.003244,22
2,ID_308486370501251804,308486,Kenya,370501,251804,Type_7,6900.0,7142.0,2024-09-06,2024-09-13,7,Repeat Loan,1380.00,0.200000,1428.0,0,242.0,0.035072,9
3,ID_266004285009267278,266004,Kenya,285009,267278,Type_1,8958.0,9233.0,2022-10-20,2022-10-27,7,Repeat Loan,2687.40,0.300000,2770.0,0,275.0,0.030699,17
4,ID_253803305312267278,253803,Kenya,305312,267278,Type_1,4564.0,4728.0,2022-11-28,2022-12-05,7,Repeat Loan,1369.20,0.300000,1418.0,0,164.0,0.035933,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68649,ID_244559228408267278,244559,Kenya,228408,267278,Type_1,1460.0,1515.0,2022-07-30,2022-08-06,7,Repeat Loan,438.00,0.300000,455.0,0,55.0,0.037671,10
68650,ID_260062217784267278,260062,Kenya,217784,267278,Type_1,5029.0,5116.0,2022-07-16,2022-07-23,7,Repeat Loan,1508.70,0.300000,1535.0,0,87.0,0.017300,16
68651,ID_259137216701267278,259137,Kenya,216701,267278,Type_1,5289.0,5289.0,2022-07-15,2022-07-22,7,Repeat Loan,1586.70,0.300000,1587.0,0,0.0,0.000000,52
68652,ID_266801303201267278,266801,Kenya,303201,267278,Type_1,3334.0,3334.0,2022-11-23,2022-11-30,7,Repeat Loan,741.09,0.222283,741.0,0,0.0,0.000000,46


## Preparing data for preprocessing

- Remove target
- Get categorical and numerical feature names into separate lists
- pass to column transformer

In [11]:
# ## categorical columns
## don't include country_id because only Kenya
## removed lender_id because test data has lenders that have not been seen
cat_cols = [
            # "loan_type",
            "New_versus_Repeat"
           ]

## numerical columns
num_cols = ["Total_Amount",
            "Total_Amount_to_Repay",
            "duration",
            "Amount_Funded_By_Lender",
            "Lender_portion_Funded",
            "Lender_portion_to_be_repaid",
            "interest",
            "interest_rate"
           ]
## categorical columns
# cat_cols = [
#             "lender_id",
#             "loan_type",
#            ]

# ## numerical columns
# num_cols = [
#             "duration",
#            ]

#### Preparing feature type transformers

In [12]:
num_transformer = Pipeline([("scaling", StandardScaler())])
cat_transformer = OneHotEncoder()

#### Preparing column transformer

In [13]:
column_transformer = ColumnTransformer([("num_transformer", num_transformer, num_cols),
                                        ("cat_transformer", cat_transformer, cat_cols),
                                       ],
                                       remainder="drop"
                                      )

## Preparing Pipeline

In [14]:
X = column_transformer.fit_transform(train_df)
y = train_df["target"]

## Splitting Data

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, stratify=y, random_state=RANDOM_STATE)

## Grid Search

In [None]:
params = {
    "n_estimators": [100, 200, 800],
    "class_weight": [None, "balanced", "balanced_subsample"],
}

grid_cv = GridSearchCV(RandomForestClassifier(),
                       params,
                       cv=KFold(n_splits=7, shuffle=True,random_state=RANDOM_STATE),
                       n_jobs=-1,
                       scoring="f1",
                       verbose=10
                      )

grid_cv.fit(X_train, y_train)

Fitting 7 folds for each of 27 candidates, totalling 189 fits


In [None]:
grid_cv.best_score_

In [None]:
grid_cv.best_params_

#### XGboost

In [157]:
params = {
    "n_estimators": [100, 200, 800],
    "max_depth": [2, 4, 8]
}

grid_xg = GridSearchCV(XGBClassifier(),
                       params,
                       cv=KFold(n_splits=7, shuffle=True,random_state=RANDOM_STATE),
                       n_jobs=-1,
                       scoring="f1",
                       verbose=10
                      )

grid_xg.fit(X_train, y_train)

AttributeError: 'super' object has no attribute '__sklearn_tags__'

## Training

In [133]:
lr = LogisticRegression(
    random_state=42,
    class_weight="balanced",
    max_iter=1000
)
lr.fit(X_train,y_train)

In [134]:
f1_score(y_test, lr.predict(X_test))

0.475020475020475

In [135]:
lr.coef_

array([[ 0.56679432,  0.35854713,  0.17979777,  0.00912267, -0.15126375,
        -0.08950339, -0.79715378,  3.33033767,  0.40479873, -1.42581063]])

## Predicting

In [145]:
test_df = pd.read_csv("../data/Test.csv")

ids = test_df["ID"].values

test_df = feature_generator.transform(test_df)

test_df = column_transformer.transform(test_df)

preds = grid_cv.predict(test_df)

In [153]:
pd.DataFrame({"ID":ids, "target": preds}).to_csv("../submissions/submission_1.csv", index=False)