In [1]:
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

In [2]:
inflows = pd.read_parquet('data/ucsd-inflows.pqt')
outflows = pd.read_parquet('data/ucsd-outflows.pqt')

In [3]:
relevant_of = outflows[outflows.category != outflows.memo].reset_index(drop=True)
relevant_of.posted_date = pd.to_datetime(relevant_of.posted_date)
relevant_of.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
1,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
2,0,acc_0,Oculus CA 04/16,11.73,2022-04-18,GENERAL_MERCHANDISE
3,0,acc_0,LOS GIRASOLES STOW OH 03/08,30.04,2022-03-09,FOOD_AND_BEVERAGES
4,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,4.16,2022-03-29,GENERAL_MERCHANDISE


# Memo Cleaning

In [4]:
def clean_memo(memo):
    sw = ['payment', 'transaction', 'deposit', 'withdrawal', 'transfer', 'credit', 'debit', 'refund', 'fee', 'charge', 'purchase', 'atm', 'checkcard']

    memo = memo.lower()

    memo = re.sub(r'[0-9]{3,}', '', memo) # remove instances of numbers more than 3 repeated
    
    memo = re.sub(r'x{3,}', '', memo) # remove the X's (ex. #XXXX)
    memo = re.sub(r'^#[a-z0-9]+', '', memo) # remove the #smth @ start of memo
    memo = re.sub(r'(www\.|\.com)', '', memo) # removing any links or urls

    memo = re.sub(r'[0-9x]{2}((-|/)[0-9x]{2,4}){1,2}', '', memo) # remove dates

    memo = re.sub(r'[^\w\s-]', ' ', memo)  # replace special characters with single space
    memo = re.sub(r'([a-z]+)\s{0,1}-\s{0,1}([a-z]+)', r'\1\2', memo) # replace '-' w/ ''
    memo = re.sub(r'\s+', ' ', memo)  # remove multiple spaces

    # location
    state_abbreviations = [
        "al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", 
        "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", 
        "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", 
        "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", 
        "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy"
    ]
    state_pattern = r' (' + '|'.join(state_abbreviations) + r')\b'
    
    memo = re.sub(r'(street|st|road|rd|blvd|avenue|ave|highway|hwy)\b', '', memo)
    memo = memo.strip()
    memo = re.sub(state_pattern, '', memo)
    memo = memo.strip()

    merchant_map = {'wal walmart': 'walmart', 'walmart walmart': 'walmart', 'wal mart': 'walmart'}

    if memo in merchant_map: return merchant_map[memo]
    return memo

In [5]:
relevant_of['clean_memo'] = relevant_of.apply(lambda row: clean_memo(row.memo) if row.memo != row.category else row.memo, axis=1)

relevant_of.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,clean_memo
0,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES,t casa del rioexp fairlawn
1,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES,buffalo wild wings
2,0,acc_0,Oculus CA 04/16,11.73,2022-04-18,GENERAL_MERCHANDISE,oculus
3,0,acc_0,LOS GIRASOLES STOW OH 03/08,30.04,2022-03-09,FOOD_AND_BEVERAGES,los girasoles stow
4,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,4.16,2022-03-29,GENERAL_MERCHANDISE,buzzis laundry 1


In [6]:
# vectorizer = TfidfVectorizer(max_features=1000, max_df=0.85)
# tfidf = vectorizer.fit_transform(clean_data.clean_memo)
# tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf)
# tfidf_df.columns = vectorizer.get_feature_names_out()

In [7]:
def ftr_generation(data, num_bins=10, max_features=1000, max_df=0.85):
    to_remove = ['prism_consumer_id', 'prism_account_id', 'memo', 'amount', 'posted_date', 'clean_memo']

    # Binary data
    data['is_weekend'] = (data.posted_date.dt.weekday >= 5).astype(int)
    data['is_even_amount'] = data.amount.apply(lambda x: 1 if x % 1 == 0 and x % 5 == 0 else 0)

    # tifidf data
    vectorizer = TfidfVectorizer(max_features=max_features, max_df=max_df)
    tfidf = vectorizer.fit_transform(data.clean_memo)
    tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf)
    tfidf_df.columns = 'tfidf_' + vectorizer.get_feature_names_out()

    # date data
    date_data = pd.DataFrame()
    date_data['month'] = 'month_' + data.posted_date.dt.month.astype(str)
    date_data['weekday'] = 'weekday_' + data.posted_date.dt.weekday.astype(str)

    date_enc = OneHotEncoder(drop='first', handle_unknown='ignore')
    date_oh = date_enc.fit_transform(date_data)

    cols = np.concatenate([typ[1:] for typ in date_enc.categories_])
    date_ftrs = pd.DataFrame.sparse.from_spmatrix(date_oh, columns=cols)

    # amount features
    amount_data = pd.DataFrame()
    amount_data['decile_amounts'] = pd.qcut(data.amount, q=10, labels=[f'bin_{num}' for num in range(1, num_bins + 1)])
    
    amount_enc = OneHotEncoder(drop='first', handle_unknown='ignore')
    amount_oh = amount_enc.fit_transform(amount_data)

    cols = np.concatenate([typ[1:] for typ in amount_enc.categories_])
    amount_ftrs = pd.DataFrame.sparse.from_spmatrix(amount_oh, columns=cols)
    

    return pd.concat([data, amount_ftrs, date_ftrs, tfidf_df], axis=1).drop(columns=to_remove)

In [8]:
model_ftrs = ftr_generation(relevant_of)
model_ftrs.head() 

# 9 seconds

Unnamed: 0,category,is_weekend,is_even_amount,bin_10,bin_2,bin_3,bin_4,bin_5,bin_6,bin_7,...,tfidf_world,tfidf_worth,tfidf_xfer,tfidf_xsolla,tfidf_york,tfidf_your,tfidf_youtube,tfidf_zaxby,tfidf_zelle,tfidf_zip
0,FOOD_AND_BEVERAGES,0,0,0,0.0,0,0.0,1.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,FOOD_AND_BEVERAGES,0,0,0,0.0,0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,GENERAL_MERCHANDISE,0,0,0,0.0,0,1.0,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,FOOD_AND_BEVERAGES,0,0,0,0.0,0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,GENERAL_MERCHANDISE,0,0,0,1.0,0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
model_ftrs.shape # 28 + 1 for category (daniel 57)

(1306452, 1029)

In [10]:
# is_weekend vs day_of_month

# Sanity Check

- \# of rows should be **1306452**
- 1028 ftrs = 1000 tfidf + 9 bins + 11 months + 6 days + is_even + is_weekend

In [11]:
model_ftrs.columns[:28].values

array(['category', 'is_weekend', 'is_even_amount', 'bin_10', 'bin_2',
       'bin_3', 'bin_4', 'bin_5', 'bin_6', 'bin_7', 'bin_8', 'bin_9',
       'month_10', 'month_11', 'month_12', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5'],
      dtype=object)

# Train / Test Sets (by Consumer Sampling)

In [12]:
training_size = int(model_ftrs.shape[0] * 0.75)
training_size

training_size = 100

In [13]:
# test_df = pd.concat([relevant_of[['prism_consumer_id']], model_ftrs], axis=1)
# train_df = pd.DataFrame(columns=test_df.columns)
# to_add = []

# ids = test_df.prism_consumer_id.unique()

# for _ in range(training_size):
#     curr_id = np.random.choice(ids)

#     temp = test_df[test_df.prism_consumer_id == curr_id]
#     if temp.shape[0] > 0:
#         row = temp.sample(n=1,)
#         test_df.drop(row.index, inplace=True)

#     else:
#         row = train_df[train_df.prism_consumer_id == curr_id].sample(n=1, replace=True)

#     to_add.append(row)
        

# Model Training

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [15]:
model_ftrs.to_csv('model_ftrs_df.csv')

In [16]:
X = model_ftrs.drop(columns=['category'])  # Features
y = model_ftrs['category']                 # Target

In [17]:
# del model_ftrs

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 104 seconds

In [None]:
lr = LogisticRegression(max_iter=100, random_state=42, n_jobs=2)
lr.fit(X_train, y_train) # Increase max_iter if needed -- never converges




In [None]:
y_pred = lr.predict(X_test)
y_pred_train = lr.predict(X_train)

In [None]:
test_acc = (y_test == y_pred).mean()
test_acc

In [None]:
train_acc = (y_train == y_pred_train).mean()
train_acc