In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the source CSV into `df`; every later cell derives its data from it.
DATA_CSV = "synthetic_payment_data_past_6_months.csv"
df = pd.read_csv(DATA_CSV)

In [2]:
# Build the feature matrix X and target y from the raw frame.

# Parse the due dates so the date arithmetic below operates on timestamps.
df["due_date"] = pd.to_datetime(df["due_date"])

# Take an explicit copy of the feature columns. Assigning into a bare column
# selection of `df` raises SettingWithCopyWarning, and whether the write
# lands on `df` or on a temporary depends on the pandas version.
X = df[["customer_name", "amount", "due_date"]].copy()
y = df["label"]

# Encode customer names as integer codes. Whole-column assignment replaces
# the column outright, so it takes the integer dtype of the encoded values;
# `X.loc[:, col] = ...` may instead write in place and keep object dtype.
# `le_customer` is kept so the same mapping can be applied to new data.
le_customer = LabelEncoder()
X["customer_name"] = le_customer.fit_transform(X["customer_name"])

# Express each due date as a whole number of days from a fixed reference
# date, giving the model a numeric feature (negative if before 2024-01-01).
reference_date = pd.Timestamp("2024-01-01")
X["days_since_reference"] = (X["due_date"] - reference_date).dt.days

# The raw timestamp column is no longer needed once the numeric feature exists.
X = X.drop(columns=["due_date"])

X.head()

Unnamed: 0,customer_name,amount,days_since_reference
0,42,8040.78,64
1,17,1618.62,64
2,7,6245.31,64
3,95,7418.82,64
4,5,4040.13,64


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the
# shuffle (and therefore the split) reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows ended up on each side of the split.
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 40000
Test set size: 10000


In [4]:
# Display the training features as a sanity check: the row index keeps the
# original (shuffled) positions from `X`, and the columns are the encoded
# customer, the amount, and the days-since-reference feature.
X_train

Unnamed: 0,customer_name,amount,days_since_reference
39087,42,6907.69,204
30893,79,9192.43,175
45278,73,4660.65,227
16398,41,570.73,123
13653,14,4362.14,113
...,...,...,...
11284,62,1900.27,104
44732,20,9080.48,225
38158,65,6440.24,201
860,96,3895.61,67


In [None]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training split. `fit` returns the estimator
# itself, so `model` is the fitted classifier; the fixed seed keeps
# tie-breaking inside the tree deterministic.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows and print the per-class precision/recall/F1 report.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
import joblib

# Persist the fitted classifier to disk so it can be reloaded without retraining.
# NOTE(review): only `model` is saved here. The `le_customer` encoder and the
# `reference_date` used to build the features are not written by this cell;
# confirm they are persisted elsewhere, since inference needs the same mapping.
model_filename = 'payment_classification_model.pkl'
joblib.dump(value=model, filename=model_filename)
print(f"Model saved to {model_filename}")