In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils.data import (
    load_loan_payments_dataset_scoring, 
    load_loan_agencies, 
    load_loan_paymets,
    load_loan_funding_info,
    load_dindex_dataset
)
from pathlib import Path

# Directory the input CSV files are read from. "../data" is turned into an
# absolute path, resolved against the working directory at the time this cell runs.
DATA_DIR = Path("../data").resolve()

In [16]:
# Read every input table from DATA_DIR with the project loaders.
# Rows to score (the frame the final predictions are written into).
prediction_samples = load_loan_payments_dataset_scoring(
    filepath=DATA_DIR.joinpath("loan_payments_dataset_scoring.csv")
)

# Historical payments; the PaymentPrincipal target is taken from here later on.
payments = load_loan_paymets(filepath=DATA_DIR.joinpath("loan_payments_dataset.csv"))

# Loan-level funding / origination information.
funding = load_loan_funding_info(
    filepath=DATA_DIR.joinpath("loan_funding_origination_info.csv")
)

dindex = load_dindex_dataset(filepath=DATA_DIR.joinpath("loan_dindexedto_dataset.csv"))

# Agency / product names, restricted to agencies that appear in `funding`.
# NOTE(review): `just_naboo=True` presumably limits the rows to a single
# agency/product family -- confirm against utils.data.
agencies = load_loan_agencies(
    filepath=DATA_DIR.joinpath("loan_agency_product_name.csv"),
    just_naboo=True,
).loc[lambda frame: frame["AgencyId"].isin(funding["AgencyId"])]

In [17]:
# Assemble the modelling frame: inner-join funding -> agencies -> dindex ->
# payments, order the payments within each loan, then derive per-payment features.
df = (
    funding
    .merge(agencies, on="AgencyId")
    .merge(dindex, on="LoanId", suffixes=("_agency", "_dinx"))
    .merge(payments, on="LoanId")
    .sort_values(by=["LoanId", "PaymentProcessingDate", "PaymentId"])
    .assign(
        # Share of each payment going to principal / interest / VAT.
        PaymentPrincipalRelative=lambda t: t["PaymentPrincipal"] / t["PaymentAmount"],
        PaymentInterestRelative=lambda t: t["PaymentInterest"] / t["PaymentAmount"],
        PaymentVATRelative=lambda t: t["PaymentVAT"] / t["PaymentAmount"],
        # 1-based position of the payment within its loan; relies on the sort above.
        PaymentTerm=lambda t: t.groupby("LoanId").cumcount() + 1,
        # Payment position as a fraction of the loan's Term, rounded to 2 decimals.
        PaymentTermRelative=lambda t: (t["PaymentTerm"] / t["Term"]).round(2),
    )
)

In [18]:
from sklearn.dummy import DummyRegressor
from datetime import datetime

# Un-split target and feature frame.
# NOTE(review): the feature frames built here still contain
# PaymentPrincipalRelative, which is computed from the target -- drop it
# before fitting any model that actually looks at X.
y = df["PaymentPrincipal"]
X = df.drop(columns="PaymentPrincipal")

# Time-based split: payments processed up to and including split_date are used
# for training, strictly later ones for testing.
split_date = datetime(2019, 12, 31)

df_train = df.loc[df["PaymentProcessingDate"] <= split_date]
df_test = df.loc[df["PaymentProcessingDate"] > split_date]

X_train, y_train = df_train.drop(columns="PaymentPrincipal"), df_train["PaymentPrincipal"]
X_test, y_test = df_test.drop(columns="PaymentPrincipal"), df_test["PaymentPrincipal"]

# Report the size of each partition.
print(f"""
- X_train: {len(X_train)} records
- X_test: {len(X_test)} records
- y_train: {len(y_train)} records
- y_test: {len(y_test)} records
""")


- X_train: 3068363 records
- X_test: 1234476 records
- y_train: 3068363 records
- y_test: 1234476 records



In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Baseline model: always predicts the mean of the training target.
# `fit` returns the estimator itself, so construction and fitting are chained.
dummy_reg = DummyRegressor(strategy="mean").fit(X_train, y_train)

# Baseline predictions for the held-out period.
y_pred = dummy_reg.predict(X_test)

# Mean absolute error of the baseline on the test set.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print("Dummy MAE:", mae)

Dummy MAE: 479.66421795282133


In [23]:
from tqdm import tqdm

# Score the submission rows with the fitted baseline and store the result in
# the PaymentPrincipal column.
# Bracket assignment is used on purpose: attribute assignment
# (`frame.PaymentPrincipal = ...`) only updates a column that already exists and
# would otherwise set a plain Python attribute, leaving the frame without the
# predictions.
prediction_samples["PaymentPrincipal"] = dummy_reg.predict(prediction_samples)


In [24]:
# Display the scoring frame to check PaymentPrincipal after the prediction step.
prediction_samples

Unnamed: 0,PaymentCode,PaymentPrincipal,PaymentTypeId,PaymentDate,LoanId
0,022102049402357,387.751361,2,2021-02-04,9402357
1,00201030668519,387.751361,0,2020-10-30,668519
2,002011249339435,387.751361,0,2020-11-24,9339435
3,002104209148631,387.751361,0,2021-04-20,9148631
4,00210607131505,387.751361,0,2021-06-07,131505
...,...,...,...,...,...
1848384,002111239354177,387.751361,0,2021-11-23,9354177
1848385,002108319394900,387.751361,0,2021-08-31,9394900
1848386,002112079415809,387.751361,0,2021-12-07,9415809
1848387,002106089347465,387.751361,0,2021-06-08,9347465


In [25]:
# Export the submission: PaymentCode and its predicted PaymentPrincipal, written
# without a header row or index column to "first-model.csv" in the working directory.
prediction_samples[["PaymentCode", "PaymentPrincipal"]].to_csv(
    "first-model.csv", header=False, index=False
)