In [None]:
import pandas as pd
import geopandas as gpd

In [None]:
# Load the transactions frame that the rest of the notebook works from.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# this project produced itself.
MATCHED_TRANSACTIONS_PATH = "processed/matched_transactions_locs.pkl"
matched_transactions_locs = pd.read_pickle(MATCHED_TRANSACTIONS_PATH)

# Model to predict whether a transaction is personal or shared

1. Tag expenses as personal or shared using Splitwise.
1a. Tag deleted expenses as feedback that the transaction was not shared.


In [None]:
# Work on an independent (deep) copy so the columns added below never touch
# the frame that was loaded from disk.
df = matched_transactions_locs.copy(deep=True)

In [None]:
# Show the column labels of the working frame.
df.columns

In [None]:
# pandas numbers weekdays Monday=0 .. Sunday=6, so 5 and 6 are the weekend.
weekday_number = df["transaction_date"].dt.dayofweek
df["is_weekend"] = weekday_number >= 5

In [None]:
# Extra descriptive columns displayed next to the feature columns when
# previewing rows.
desc_cols = ["location_address"]

In [None]:
# Columns to eyeball for the transactions flagged as shared.
features = [
    "transaction_cost",
    "transaction_date",
    "transaction_description",
    "is_weekend",
]

# Keep only the rows where is_shared is True, then show the selected columns.
shared_transactions = df[df["is_shared"]]
shared_transactions[features + desc_cols]

In [None]:
# The description is sliced as a fixed-width record:
#   characters  0-22 -> merchant_name
#   characters 23-36 -> description_comuna
#   characters 37-44 -> description_pais
# Surrounding whitespace is stripped from every field.
raw_description = df["transaction_description"]
df["description_comuna"] = raw_description.str[23:37].str.strip()
df["description_pais"] = raw_description.str[37:45].str.strip()
df["merchant_name"] = raw_description.str[0:23].str.strip()

# Preview the extracted merchant names.
df["merchant_name"]

In [None]:
# Frequency of each value sliced into description_pais; dropna=False keeps
# missing values in the tally so failed extractions are visible.
df["description_pais"].value_counts(dropna=False)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Feature preparation for the decision tree model
#
# 1. transaction_cost is used as-is.
#
# 2. Calendar parts of transaction_date become their own columns:
#    transaction_month, transaction_hour, transaction_dayofweek.
#    NOTE(review): the hour is only informative if transaction_date carries a
#    time-of-day component — confirm against the source data.
for date_part in ("month", "hour", "dayofweek"):
    df[f"transaction_{date_part}"] = getattr(df["transaction_date"].dt, date_part)

In [None]:
# Inspect the distinct raw transaction descriptions.
df["transaction_description"].unique()

In [None]:
def features_to_df(features, vectorizer, suffix):
    """Densify a vectorizer's sparse output into a labelled DataFrame.

    Args:
        features: Sparse matrix exposing ``toarray()`` (e.g. the result of
            ``vectorizer.fit_transform``).
        vectorizer: Fitted object exposing ``get_feature_names_out()``; one
            name per column of ``features``.
        suffix: Text placed in front of every feature name (despite the
            parameter's name it is used as a prefix): ``"{suffix}_{name}"``.

    Returns:
        DataFrame with a default integer index and one column per feature.
    """
    column_labels = []
    for term in vectorizer.get_feature_names_out():
        column_labels.append(f"{suffix}_{term}")

    dense_counts = features.toarray()
    return pd.DataFrame(dense_counts, columns=column_labels)

In [None]:
# 3. Text vectorization
# CountVectorizer turns each string into counts of word n-grams (2 to 10
# tokens here, so single-word strings contribute no features); only the 200
# most frequent n-grams are kept.
# NOTE(review): despite the "description" naming, this cell vectorizes
# df["merchant_name"] and labels the columns "merchant_*", i.e. the same inputs
# as the vectorizer_merchant cell. Confirm whether transaction_description
# was the intended input.
stop_words = ["sumup", "merpago", "mercadopago", "spa"]  # use [] to keep all tokens
vectorizer_description = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(2, 10),
    max_features=200,
)

merchant_names_filled = df["merchant_name"].fillna("")
description_features = vectorizer_description.fit_transform(merchant_names_filled)
description_df = features_to_df(
    description_features, vectorizer_description, "merchant"
)

# Show the learned vocabulary.
vectorizer_description.get_feature_names_out()

In [None]:
# Word n-gram counts (2 to 10 tokens, top 200) over the merchant name;
# missing names are vectorized as empty strings.
merchant_text = df["merchant_name"].fillna("")
vectorizer_merchant = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(2, 10),
    max_features=200,
)
merchant_features = vectorizer_merchant.fit_transform(merchant_text)
merchant_df = features_to_df(
    merchant_features, vectorizer_merchant, suffix="merchant"
)

# Show the learned vocabulary.
vectorizer_merchant.get_feature_names_out()

In [None]:
# Word n-gram counts (1 to 10 tokens, top 200) over description_comuna;
# missing values are vectorized as empty strings.
comuna_text = df["description_comuna"].fillna("")
vectorizer_comuna = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 10),
    max_features=200,
)
comuna_features = vectorizer_comuna.fit_transform(comuna_text)
description_df_comuna = features_to_df(
    comuna_features, vectorizer_comuna, suffix="comuna"
)

# Show the learned vocabulary.
vectorizer_comuna.get_feature_names_out()

In [None]:
# Word n-gram counts (1 to 10 tokens, top 200) over description_pais;
# missing values are vectorized as empty strings.
pais_text = df["description_pais"].fillna("")
vectorizer_pais = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 10),
    max_features=200,
)
pais_features = vectorizer_pais.fit_transform(pais_text)
description_df_pais = features_to_df(pais_features, vectorizer_pais, suffix="pais")

# Show the learned vocabulary.
vectorizer_pais.get_feature_names_out()

In [None]:
# 4. is_weekend is already boolean, so it enters the model unchanged.

# Non-text model inputs.
numeric_columns = [
    "transaction_cost",
    "transaction_month",
    "transaction_hour",
    "transaction_dayofweek",
    "is_weekend",
]
numeric_features = df[numeric_columns].copy()

# Frames stacked side by side to form the feature matrix. The other
# vectorized frames (merchant_df, description_df_comuna, description_df_pais)
# are currently left out; append them here to include them.
feature_blocks = [numeric_features, description_df]

# Each block gets a fresh 0..n-1 index so concat pairs rows by position.
# Note that y keeps df's own index.
X = pd.concat(
    [block.reset_index(drop=True) for block in feature_blocks],
    axis=1,
)
y = df["is_shared"].copy()

# Summarize what will be fed to the model.
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Class distribution: {y.value_counts()}")

In [None]:
# Hold out 30% of the rows for evaluation; stratify=y keeps the class ratio
# the same in both parts, and the fixed seed makes the split repeatable.
# NOTE(review): the text vectorizers above were fitted on the full frame
# before this split, so their vocabulary has seen the test rows. Confirm this
# is acceptable or fit them on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Shallow tree (depth <= 6, at least 5 samples to split a node), fitted on the
# training part.
dt_classifier = DecisionTreeClassifier(
    random_state=42,
    max_depth=6,
    min_samples_split=5,
).fit(X_train, y_train)

# Score the held-out rows.
y_pred = dt_classifier.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

In [None]:
# Visualize the fitted decision tree.
plt.figure(figsize=(20, 10))
plot_tree(
    dt_classifier,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate this argument
    # strictly and reject a pandas Index, while a list works in all of them.
    feature_names=list(X.columns),
    # Labels are matched to the classifier's classes in ascending order.
    # NOTE(review): assumes is_shared is boolean, i.e. [False, True] — confirm.
    class_names=["Personal", "Shared"],
    rounded=True,
)
plt.title("Decision Tree for Transaction Classification")
plt.show()

# Importance of every input column, most important first.
feature_importance = (
    pd.DataFrame(
        {"Feature": X.columns, "Importance": dt_classifier.feature_importances_}
    )
    .sort_values("Importance", ascending=False)
    .reset_index(drop=True)
)

# The table and the chart below both show the same top-15 slice.
top_features = feature_importance.head(15)
print("Feature Importance:")
display(top_features)

# Horizontal bar chart of the top-15 importances.
plt.figure(figsize=(12, 8))
plt.barh(top_features["Feature"], top_features["Importance"])
plt.xlabel("Importance")
plt.title("Top 15 Feature Importance")
plt.gca().invert_yaxis()  # highest importance at the top
plt.show()