In [None]:
import pandas as pd

In [None]:
# Load the previously processed transactions (presumably a DataFrame, given how
# it is used in later cells) from the local "processed" directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by a trusted step of this project.
matched_transactions_locs = pd.read_pickle("processed/matched_transactions_locs.pkl")

# Model to predict whether a transaction is personal or shared

1. Tag expenses as personal or shared using Splitwise
   - 1a. Tag deleted expenses as feedback that the transaction was not shared


In [None]:
# Work on a copy so the loaded data stays untouched by later cells.
df = matched_transactions_locs.copy()

In [None]:
# Show the available columns.
df.columns

In [None]:
# Columns kept for modelling. Note that the list also contains the target
# column "is_shared"; it is dropped again when the predictors are built.
input_features = [
    "transaction_cost",
    "transaction_date",
    "transaction_description",
    "is_shared",
]

# Preview the rows flagged as shared, restricted to the modelling columns.
df.loc[df["is_shared"], input_features]

In [None]:
# Keep only the modelling columns (predictors plus the "is_shared" target).
df = df[input_features]

In [None]:
# List the distinct transaction descriptions present in the data.
df["transaction_description"].unique()

In [None]:
from sklearn.pipeline import make_pipeline

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
from splitwise_sync.ml.preprocessing import build_preprocess

# Preprocessing step supplied by the project's own preprocessing module.
preprocess_step = build_preprocess()

# Shallow tree: depth capped at 6, at least 5 samples required to split a node
# and to form a leaf; the seed is fixed so repeated fits are reproducible.
tree_step = DecisionTreeClassifier(
    random_state=42,
    max_depth=6,
    min_samples_split=5,
    min_samples_leaf=5,
)

# Chain preprocessing and classifier into a single estimator.
pipeline = make_pipeline(preprocess_step, tree_step)
pipeline

In [None]:
# Predictors are every modelling column except the target; the target is the
# "is_shared" flag itself.
X = df[[col for col in input_features if col != "is_shared"]].copy()
y = df["is_shared"].copy()

# Hold out 30% of the rows for testing, stratified on the label so both splits
# keep the same class balance; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

pipeline.fit(X_train, y_train)

# Feature names and transformed training data, kept for the dataviz cells
feature_names = pipeline[-2].get_feature_names_out()
transformed_data = pipeline[:-1].transform(X_train)

# Predict on the held-out rows
y_pred = pipeline.predict(X_test)

# Report accuracy and the per-class metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", cm, sep="\n")

In [None]:
# Visualize the decision tree
dt_classifier = pipeline[-1]

# Keep only the part of each feature name after the last "__" so the node
# labels in the plot stay short.
feature_names_simple = [full_name.split("__")[-1] for full_name in feature_names]

tree_fig, tree_ax = plt.subplots(figsize=(20, 10))
plot_tree(
    dt_classifier,
    feature_names=feature_names_simple,
    class_names=["Personal", "Shared"],
    filled=True,
    rounded=True,
    fontsize=7,
    ax=tree_ax,
)
tree_ax.set_title("Decision Tree for Transaction Classification")
plt.show()

# Feature importance, most important first
feature_importance = (
    pd.DataFrame(
        {"Feature": feature_names, "Importance": dt_classifier.feature_importances_}
    )
    .sort_values("Importance", ascending=False)
    .reset_index(drop=True)
)
top_features = feature_importance.head(15)
print("Feature Importance:")
display(top_features)

# Horizontal bar chart of the top 15 features
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
importance_ax.barh(top_features["Feature"], top_features["Importance"])
importance_ax.set_xlabel("Importance")
importance_ax.set_title("Top 15 Feature Importance")
importance_ax.invert_yaxis()  # highest importance at the top
plt.show()

In [None]:
from pathlib import Path

import joblib

MODEL_PATH = "models/decision_tree_model.pkl"

# Refit on the complete dataset (train + test) before persisting, so the saved
# model has seen every labelled transaction. After this cell `pipeline` no
# longer corresponds to the train-only fit evaluated earlier.
pipeline.fit(X, y)  # with all the data

# joblib.dump does not create missing parent directories; make sure the
# output directory exists so the cell also works on a fresh checkout.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, MODEL_PATH)