In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from utils.data import (
    ensure_dataset_exists,
    load_loan_payments_dataset_scoring,
    load_loan_paymets,
    load_loan_funding_info,
    load_dindex_dataset,
    load_loan_agencies,
)

### Grab data

In [None]:
# Local folder for the raw CSV files, resolved to an absolute path.
DATA_DIR = Path("../data").resolve()

# Registry of the datasets used by this notebook, keyed by the file stem they
# are stored under. `drive_id` is presumably the remote file identifier that
# `ensure_dataset_exists` uses to fetch a missing file -- TODO confirm in utils.data.
DATASETS = {
    "loan_payments_dataset": {"drive_id": "1qdfApOnVp2Gq2PMHlUrT8rIBx6IVUDYj"},
    "loan_payments_dataset_scoring": {"drive_id": "1vrvlUYT_bLFnb-sC93xo4x6dAKoUCphW"},
    "loan_funding_origination_info": {"drive_id": "1YYs_QLCruTAxtM86ZNMQb7fwuTj2OmYw"},
    "loan_dindexedto_dataset": {"drive_id": "1jonMnGDAzN0LqrLU9_2aVH9LDc0jej0U"},
    "loan_agency_product_name": {"drive_id": "1bQvEugQDh3B0bEepbGdJkfFwztW2XeX7"},
}

# Every dataset lives at `<DATA_DIR>/<name>.csv`.
for name, info in DATASETS.items():
    drive_id, path = info["drive_id"], DATA_DIR / f"{name}.csv"
    ensure_dataset_exists(id=drive_id, dest_path=path)

In [11]:
# Read every table from the local CSV folder.
prediction_samples = load_loan_payments_dataset_scoring(filepath=DATA_DIR / "loan_payments_dataset_scoring.csv")
payments = load_loan_paymets(filepath=DATA_DIR / "loan_payments_dataset.csv")
funding = load_loan_funding_info(filepath=DATA_DIR / "loan_funding_origination_info.csv")
dindex = load_dindex_dataset(filepath=DATA_DIR / "loan_dindexedto_dataset.csv")
agencies = load_loan_agencies(filepath=DATA_DIR / "loan_agency_product_name.csv", just_naboo=True)

# Keep only the agencies that appear in at least one funding record.
agencies = agencies.loc[agencies["AgencyId"].isin(funding["AgencyId"])]

# Payment columns that are not carried into the rest of the notebook.
payments = payments.drop(
    columns=[
        "PaymentId",
        "PaymentAmount",
        "PaymentInterest",
        "PaymentVAT",
        "PaymentDueDate",
        "PaymentTransferDate",
    ]
)


### Preprocessing

In [None]:
from datetime import timedelta, datetime


def base_preprocessing(
    payments: pd.DataFrame,
    funding: pd.DataFrame,
    agencies: pd.DataFrame,
    dindex: pd.DataFrame
) -> pd.DataFrame:
    """Join payment rows with the loan-level tables and order the result.

    Steps, in order:

    - inner-join `payments` with `funding` on `LoanId`
    - inner-join the result with `agencies` on `AgencyId`
    - left-join the result with `dindex` on `LoanId`; column names present on
      both sides of this join get the `_agency` / `_dinx` suffixes
    - sort by `LoanId`, then by `PaymentProcessingDate`

    The input frames are not modified; a new frame is returned.

    Arguments:
        payments: Payments dataframe to preprocess
        funding: Loan origination info dataset
        agencies: Loan agencies dataset
        dindex: D-indexed-to dataset

    Returns:
        The merged and sorted dataframe. Rows without a match in `funding`
        or `agencies` are dropped by the inner joins.
    """
    merged = (
        payments
        .merge(funding, on="LoanId")
        .merge(agencies, on="AgencyId")
        .merge(dindex, on="LoanId", how="left", suffixes=("_agency", "_dinx"))
    )

    return merged.sort_values(by=["LoanId", "PaymentProcessingDate"])


def preprocessing_v1(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the default flag and clean up the merged payments frame.

    Steps, in order:

    - Removes entries with Periodicity NaN (cancelled loans)
    - Computes `ClosingDate` as `OriginationDate + Term * days-per-period`,
      using 7 / 14 / 30 days for Weekly / Biweekly / Monthly loans
    - Sets `InDefault` to True when `PaymentProcessingDate` is after `ClosingDate`
    - Drops `Country` and `ProductName_dinx` features
    - Fills NaN with 0

    The input frame is not modified; a new frame is returned.

    Arguments:
        df: Merged payments dataframe. Must contain `Periodicity`, `Term`,
            `OriginationDate`, `PaymentProcessingDate`, `Country` and
            `ProductName_dinx`.

    Raises:
        ValueError: if a non-null `Periodicity` value is not one of
            Weekly / Biweekly / Monthly.
    """
    # Length in days of one installment period, per periodicity label.
    days_per_period = {"Weekly": 7, "Biweekly": 14, "Monthly": 30}

    # Boolean indexing yields a derived frame; the explicit copy makes adding
    # columns below well-defined and keeps the caller's frame untouched.
    df = df[df["Periodicity"].notna()].copy()

    # Validate up front so an unknown label fails loudly instead of producing NaT.
    unmapped = ~df["Periodicity"].isin(list(days_per_period))
    if unmapped.any():
        offending = df.loc[unmapped, "Periodicity"].iloc[0]
        raise ValueError(f"Unmapped periodicity! {offending}")

    # Vectorised replacement for a row-by-row apply: one timedelta per row.
    period_days = df["Periodicity"].map(days_per_period).astype("int64")
    df["ClosingDate"] = df["OriginationDate"] + pd.to_timedelta(df["Term"] * period_days, unit="D")

    df["InDefault"] = df["PaymentProcessingDate"] > df["ClosingDate"]

    return df.drop(columns=["Country", "ProductName_dinx"]).fillna(0)


# Join the payments with the loan-level tables, then derive ClosingDate /
# InDefault and clean up the result.
payments = preprocessing_v1(
    base_preprocessing(payments=payments, funding=funding, agencies=agencies, dindex=dindex)
)

def preprocessing_v2(df: pd.DataFrame) -> pd.DataFrame:
    """Keep non-negative principals and move `PaymentPrincipal` to log scale.

    Rows with a negative `PaymentPrincipal` are removed, then the column is
    replaced by `log(1 + PaymentPrincipal)`. The input frame is not modified;
    a new frame is returned.

    Arguments:
        df: Dataframe with a numeric `PaymentPrincipal` column.
    """
    # Explicit copy: the filtered frame is then safe to assign into.
    df = df[df["PaymentPrincipal"] >= 0].copy()

    # Replace the whole column (rather than writing into it with .loc) so an
    # integer column becomes float cleanly; log1p is the numerically stable
    # form of log(x + 1).
    df["PaymentPrincipal"] = np.log1p(df["PaymentPrincipal"])

    return df

# Drop negative principals and put the target on a log scale; the modelling
# cells undo this with exp(...) - 1 before reporting amounts.
payments = preprocessing_v2(payments)

### Encoding

In [45]:
# One-hot encode the frame; with no `columns` argument pandas chooses which
# columns to encode based on their dtype.
payments = pd.get_dummies(payments)

### Train

In [46]:
from datetime import datetime

# Full-dataset feature/target views. The model below is fitted on the
# time-based split instead, so these are not used by the cells that follow.
X = payments.drop("PaymentPrincipal", axis=1)
y = payments.PaymentPrincipal

# Time-based hold-out: payments processed up to and including this date are
# used for training, later ones for testing.
split_date = datetime(2020, 3, 31)

df_train = payments[payments.PaymentProcessingDate <= split_date]
df_test = payments[payments.PaymentProcessingDate > split_date]

# The target plus the raw date columns are excluded from the feature matrix.
NON_FEATURE_COLUMNS = ["PaymentPrincipal", "OriginationDate", "ClosingDate", "PaymentProcessingDate"]

X_train = df_train.drop(columns=NON_FEATURE_COLUMNS)
X_test = df_test.drop(columns=NON_FEATURE_COLUMNS)

y_train = df_train.PaymentPrincipal
y_test = df_test.PaymentPrincipal

# Round the train share and derive the test share from it so the two numbers
# always add up to 100 (flooring each one independently can report e.g. 84-15).
n_train, n_test = len(X_train), len(X_test)
n_total = n_train + n_test
train_pct = round(100 * n_train / n_total) if n_total else 0
test_pct = 100 - train_pct if n_total else 0

print(
    f"""
    Dataset splitted {train_pct}-{test_pct} train/test.
    """
)


    Dataset splitted 84/15 test.
    


In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Linear regression fitted on the log-scale target.
linear_model = LinearRegression().fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

# Undo the log(x + 1) transform so the error is reported in the original
# units of PaymentPrincipal.
actual_principal = np.exp(y_test) - 1
predicted_principal = np.exp(y_pred) - 1
mae = mean_absolute_error(actual_principal, predicted_principal)

print("Linear model MAE:", mae)

Linear model MAE: 351.27850978140003


### Predictions

In [48]:
# Run the scoring rows through the same merge and clean-up steps as the
# training payments.
prediction_samples = preprocessing_v1(
    base_preprocessing(payments=prediction_samples, funding=funding, agencies=agencies, dindex=dindex)
)


In [49]:
# The raw date columns are not model features.
prediction_samples = prediction_samples.drop(
    columns=["OriginationDate", "ClosingDate", "PaymentProcessingDate"]
)

# One-hot encode every object-typed column except PaymentCode, which is kept
# as-is because it is written to the exported file.
categorical_features = [
    column
    for column in prediction_samples.select_dtypes(include="object").columns
    if column != "PaymentCode"
]

prediction_samples = pd.get_dummies(prediction_samples, columns=categorical_features)

In [50]:
# Build the feature matrix from exactly the columns the model was fitted on,
# in the same order. This replaces a hand-maintained list of columns to drop:
# columns the model never saw (PaymentPrincipal, PaymentCode, extra dummy
# columns) are left out, and dummy columns absent from the scoring data are
# added as all-zero.
scoring_features = prediction_samples.reindex(columns=X_train.columns, fill_value=0)

# Predictions are still on the log scale at this point.
prediction_samples["PaymentPrincipal"] = linear_model.predict(scoring_features)

In [51]:
# Convert the predictions from log scale back to principal amounts.
# Note: re-running this cell alone would apply the exponential a second time.
prediction_samples["PaymentPrincipal"] = np.exp(prediction_samples["PaymentPrincipal"]) - 1

prediction_samples["PaymentPrincipal"].head(15)

1720031    21.785947
1720032    21.785947
1718821     0.767849
1718814     0.767849
1718813     0.767849
1718826     0.767849
1718823     0.767849
1718828     0.767849
1718827     0.767849
1718831     0.767849
1718830     0.767849
1718820     0.767849
1718817     0.767849
1718825     0.767849
1718818     0.767849
Name: PaymentPrincipal, dtype: float64

In [53]:
# Manually fixed principal amounts, keyed by payment code.
# NOTE(review): the origin of these values is not documented in this notebook.
manual_principal_by_code = {
    "002107235403": 1474,
    "002107205403": 1474,
    "002110225404": 591,
    "002110295405": 488,
    "002112285406": 1247,
    "002112305409": 1253,
}

# Same columns as the prediction frame; everything except PaymentCode and
# PaymentPrincipal is left empty for these rows.
hardcoded = pd.DataFrame(
    data=[
        {"PaymentCode": code, "PaymentPrincipal": principal}
        for code, principal in manual_principal_by_code.items()
    ],
    columns=prediction_samples.columns
)

# The manual rows go ahead of the model predictions.
prediction_samples = pd.concat([hardcoded, prediction_samples])

  prediction_samples = pd.concat(


In [54]:
# Preview the first rows; the manually added rows come first because they
# were concatenated ahead of the model predictions.
prediction_samples.head()

Unnamed: 0,LoanId,PaymentType,PaymentPrincipal,PaymentCode,AgencyId,FundingID,InstallmentAmount,VAT,Term,InDefault,...,ProductName_agency_BGL,ProductName_agency_PDL,Type_0,Type_Base,Type_D-Indexed,D-IndexedTo_0,D-IndexedTo_Freeman,D-IndexedTo_Hopp,D-IndexedTo_Ringu,D-IndexedTo_Zoltan
0,,,1474.0,2107235403,,,,,,,...,,,,,,,,,,
1,,,1474.0,2107205403,,,,,,,...,,,,,,,,,,
2,,,591.0,2110225404,,,,,,,...,,,,,,,,,,
3,,,488.0,2110295405,,,,,,,...,,,,,,,,,,
4,,,1247.0,2112285406,,,,,,,...,,,,,,,,,,


In [55]:
PREDICTIONS_DIR = Path("../predictions").resolve()

# to_csv does not create missing parent directories, so make sure the output
# folder exists (e.g. on a fresh checkout) before writing.
PREDICTIONS_DIR.mkdir(parents=True, exist_ok=True)

# Export only the payment code and its principal, with no header row and no
# index column.
prediction_samples.to_csv(
    PREDICTIONS_DIR / "predicciones.csv",
    columns=["PaymentCode", "PaymentPrincipal"],
    header=False,
    index=False,
)

In [56]:
# Final size of the prediction frame as (rows, columns).
prediction_samples.shape

(1848389, 36)