In [33]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [34]:
df= pd.read_csv("master table.csv",encoding='latin1')
df.head()

Unnamed: 0,invoice_id,order_id,invoice_date,due_date,paid_date,amount,total_paid,outstanding_payment,invoice_age_days,customer_id,name,seg,signup_date,region,credit_score,lifetime_amount,total_orders,total_invoices,over_due_invoices,overall_refund_amount
0,I9000,O5000,2023-03-01,2023-03-15,2023-03-23,6478.09,6478.09,0.0,983,C1110,Casey-Brown,SMB,2024-07-20,North,515,18725.21,5,5,4,7723.18
1,I9001,O5001,2023-04-17,2023-05-17,2023-04-24,492.08,492.08,0.0,920,C1027,Wilson-Campbell,Enterprise,2023-03-07,West,676,12734.9,3,3,2,0.0
2,I9002,O5002,2024-11-24,2024-12-24,2024-12-01,4521.6,4521.6,0.0,333,C1032,Lutz Group,Individual,2024-05-20,East,649,7618.2,2,2,2,0.0
3,I9003,O5003,2023-02-16,2023-02-23,2023-02-25,3243.41,3243.41,0.0,1003,C1083,Davies-Morales,Enterprise,2024-08-07,West,613,3243.41,1,1,1,0.0
4,I9004,O5004,2023-03-28,2023-04-11,2023-04-24,2977.38,2977.38,0.0,956,C1107,Nelson Inc,SMB,2024-12-03,East,662,2977.38,1,1,1,0.0


In [35]:
df["will_default"] = (df["outstanding_payment"] > 0).astype(int)
print(df["will_default"].value_counts())

will_default
0    116
1     34
Name: count, dtype: int64


In [36]:
df.isnull().sum()

invoice_id               0
order_id                 0
invoice_date             0
due_date                 0
paid_date                5
amount                   0
total_paid               0
outstanding_payment      0
invoice_age_days         0
customer_id              0
name                     0
seg                      0
signup_date              0
region                   0
credit_score             0
lifetime_amount          0
total_orders             0
total_invoices           0
over_due_invoices        0
overall_refund_amount    0
will_default             0
dtype: int64

In [37]:
df.duplicated().sum()

np.int64(0)

In [38]:
date_cols = ["invoice_date", "due_date", "paid_date", "signup_date"]

for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# Tenure at invoice date
df["customer_tenure_days"] = (df["invoice_date"] - df["signup_date"]).dt.days
df["customer_tenure_days"] = df["customer_tenure_days"].clip(lower=0)
df["tenure_years"] = df["customer_tenure_days"] / 365


In [39]:
# Ratio of amount customer paid
df["paid_ratio"] = df["total_paid"] / df["amount"].replace(0, pd.NA)
df["paid_ratio"] = df["paid_ratio"].fillna(0)

# Refund ratio
df["refund_ratio"] = df["overall_refund_amount"] / df["lifetime_amount"].replace(0, pd.NA)
df["refund_ratio"] = df["refund_ratio"].fillna(0)

# Customer overdue ratio
df["overdue_ratio"] = df["over_due_invoices"] / df["total_invoices"].replace(0, pd.NA)
df["overdue_ratio"] = df["overdue_ratio"].fillna(0)

# Average order value
df["avg_order_value"] = df["lifetime_amount"] / df["total_orders"].replace(0, pd.NA)
df["avg_order_value"] = df["avg_order_value"].fillna(0)

# Invoice relative to customer's spending power
df["inv_to_lifetime_ratio"] = df["amount"] / df["lifetime_amount"].replace(0, pd.NA)
df["inv_to_lifetime_ratio"] = df["inv_to_lifetime_ratio"].fillna(0)


In [40]:
# Define features and target variable
y = df["will_default"]

feature_cols = [

    "amount",
    "paid_ratio",
    "refund_ratio",
    "inv_to_lifetime_ratio",

    "credit_score",
    "lifetime_amount",
    "total_orders",
    "total_invoices",
    "over_due_invoices",
    "overdue_ratio",
    "overall_refund_amount",

    "customer_tenure_days",
    "avg_order_value",
    "tenure_years",

    "seg",
    "region",
]

X = df[feature_cols]


In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [42]:
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


preprocess=ColumnTransformer(
    transformers=[
        ('num','passthrough',numeric_cols),
        ('cat',OneHotEncoder(handle_unknown='ignore',drop='first'),cat_cols)
    ]
)

model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced"
)   
clf=Pipeline(
    steps=[
        ('preprocess',preprocess),
        ('model',model)
    ]
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      1.00      0.92        23
           1       1.00      0.43      0.60         7

    accuracy                           0.87        30
   macro avg       0.93      0.71      0.76        30
weighted avg       0.89      0.87      0.85        30

[[23  0]
 [ 4  3]]


In [43]:
rf = clf.named_steps["model"]
ct = clf.named_steps["preprocess"]

numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

cat_feature_names = ct.named_transformers_["cat"].get_feature_names_out(cat_cols)

feature_names = list(numeric_cols) + list(cat_feature_names)

importances = rf.feature_importances_

feat_imp = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

feat_imp.head(15)


Unnamed: 0,feature,importance
1,paid_ratio,0.303299
4,credit_score,0.104629
0,amount,0.095035
12,avg_order_value,0.07832
5,lifetime_amount,0.067001
13,tenure_years,0.066775
3,inv_to_lifetime_ratio,0.06171
11,customer_tenure_days,0.058528
9,overdue_ratio,0.021167
10,overall_refund_amount,0.018674


In [44]:

# Probability of default (class 1)
df["default_prob"] = clf.predict_proba(X)[:, 1]

# Predicted class (0 = paid, 1 = will_default)
df["predicted_default"] = clf.predict(X)


In [45]:
df['predicted_default'].value_counts()

predicted_default
0    120
1     30
Name: count, dtype: int64

In [46]:
df.to_csv("invoices_with_scores.csv", index=False)