In [None]:
import os
import pandas as pd
import numpy as np
import re
import json
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn import preprocessing

pd.set_option('display.max_columns', None)

In [None]:
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords


In [None]:
# NOTE(review): indexing sys.path by position is fragile -- which directory sits
# at index 3 depends on how the kernel was started. Confirm it is the data
# directory, or replace it with an explicit, configurable path.
PATH = sys.path[3]

# Load the account holder's KYC names. They are used further down to keep the
# holder's own name out of the merged transaction descriptions.
# The encoding is explicit so non-ASCII names decode the same on every platform.
with open(f"{PATH}/data_to_work_account_info.json", encoding="utf-8") as f:
    account_data = json.load(f)

FIRST_NAME = account_data["kycFirstName"].lower()
# Split the last name once: the first token is the surname, the last token the
# second surname (both are the same token when the last name is a single word).
_last_name_tokens = account_data["kycLastName"].split()
SURNAME = _last_name_tokens[0].lower()
SECOND_SURNAME = _last_name_tokens[-1].lower()
FULL_NAME_LONG = f"{FIRST_NAME} {SURNAME} {SECOND_SURNAME}"
FULL_NAME_SHORT = f"{FIRST_NAME} {SURNAME}"

# English stop words (e.g. 'me', 'we', 'our', 'be', 'so', 'than', 'too', 'very',
# 'can', 'will', 'just', "don't", 'should') that carry no value as features.
WORDS = stopwords.words('english')


In [None]:

def label_transaction(transactions: dict):
    """
    Tag every transaction that has an "amount" as a deposit or an expense.

    Despite the ``dict`` annotation, the body enumerates ``transactions`` and
    indexes it by position, so callers pass a list of transaction dicts.
    A strictly positive amount is tagged "deposit"; any other amount
    (zero included) is tagged "expense". Transactions without an "amount"
    key are left untouched.

    The dicts are modified in place and the same container is returned.
    """
    for idx, record in enumerate(transactions):
        if "amount" not in record:
            continue
        is_deposit = record["amount"] > 0
        transactions[idx]["expense_type"] = "deposit" if is_deposit else "expense"
    return transactions

def remove_wording_in_transaction_label(transactions: dict):
    """
    Derive a readable "category_preprocessed" from each transaction's "category".

    Everything up to and including the last "micro-v2-" marker is dropped and the
    remaining hyphens are turned into spaces, e.g. "micro-v2-food-groceries"
    becomes "food groceries". Transactions without a "category" key are skipped.

    Despite the ``dict`` annotation, ``transactions`` is enumerated and indexed
    by position, i.e. it is a list of dicts. It is modified in place and returned.
    """
    for idx, record in enumerate(transactions):
        if "category" not in record:
            continue
        category_tail = record["category"].split("micro-v2-")[-1]
        transactions[idx]["category_preprocessed"] = category_tail.replace("-", " ")
    return transactions


def remove_commonly_used_words_and_chars(transactions: list, col_names: list, stop_words=None, owner_names=None):
    """
    Clean the free-text fields of every transaction and merge them into one description.

    For each key in ``col_names`` that holds a string, every character that is not
    an ASCII letter (regex ``[^a-zA-Z]``) is replaced by a space, e.g. "-netto"
    becomes " netto". Tokens found in the stop-word list are dropped and the rest
    is lowercased. The result is stored under ``"<col_name>_preprocessed"``.

    Cleaned texts that do not contain any of the owner names are joined with
    spaces and stored under ``"transaction_description_merged"``. When nothing is
    left and the transaction's ``paymentScheme`` is "SEPA", the merged
    description is set to "transfer".

    Args:
        transactions: list of transaction dicts; the dicts are modified in place.
        col_names: keys of the text fields to clean.
        stop_words: iterable of words to drop (tokens are compared in lowercase).
            Defaults to the notebook-level ``WORDS``.
        owner_names: iterable of strings; a cleaned text containing any of them
            is kept out of the merged description. Defaults to
            ``(FULL_NAME_LONG, FULL_NAME_SHORT)``.

    Returns:
        The same ``transactions`` list.
    """
    # Fall back to the notebook-level configuration when not given explicitly.
    stop_word_set = set(WORDS if stop_words is None else stop_words)  # set: constant-time lookups
    if owner_names is None:
        owner_names = (FULL_NAME_LONG, FULL_NAME_SHORT)

    for transaction_operation in transactions:
        merged_parts = []
        for col_name in col_names:
            raw_text = transaction_operation.get(col_name)
            if not isinstance(raw_text, str):
                # Missing key or a non-text value (e.g. JSON null): nothing to clean.
                continue
            tokens = re.sub("[^a-zA-Z]", " ", raw_text).split()
            text_preprocessed = " ".join(
                token for token in tokens if token.lower() not in stop_word_set
            ).lower()
            transaction_operation[f"{col_name}_preprocessed"] = text_preprocessed
            if not any(name in text_preprocessed for name in owner_names):
                merged_parts.append(text_preprocessed)

        # Strip before the emptiness check so fields that are present but empty
        # do not hide the "transfer" fallback behind a stray separator space.
        text_merged = " ".join(merged_parts).strip()
        # .get(): a transaction without a paymentScheme must not raise KeyError.
        if not text_merged and transaction_operation.get("paymentScheme") == "SEPA":
            text_merged = "transfer"
        transaction_operation["transaction_description_merged"] = text_merged
    return transactions

In [None]:
# Load the raw transactions (a JSON array of transaction objects).
# The encoding is explicit so the file decodes the same on every platform.
with open(f"{PATH}/data_to_work_transactions.json", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Each helper modifies the transaction dicts inside `data` in place (and returns
# the same list), so the calls are made purely for their side effects.
remove_commonly_used_words_and_chars(data, ["partnerName", "merchantName", "referenceText"])
remove_wording_in_transaction_label(data)
# Trailing semicolon: keeps the notebook from echoing the entire transaction
# list (which may contain personal data) as the cell output.
label_transaction(data);

In [None]:
# Tabular view of the preprocessed transactions: one row per transaction dict.
df = pd.DataFrame(data)

In [None]:
# Raw text fields next to their preprocessed counterparts, for a visual check.
df.loc[
    :,
    [
        "amount",
        "merchantCity",
        "merchantName",
        "merchantName_preprocessed",
        "category",
        "category_preprocessed",
        "expense_type",
        "referenceText",
        "referenceText_preprocessed",
        "transaction_description_merged",
    ],
]

## Training

In [None]:
df_training = pd.read_csv(f"{PATH}/training_set.csv")


def _preprocess_training_description(text):
    """Keep letters only, drop stop words and one-letter tokens, lowercase the rest."""
    tokens = re.sub("[^a-zA-Z]", " ", text).split()
    # Compare in lowercase so capitalised stop words ("The", "And") are removed
    # as well, matching what remove_commonly_used_words_and_chars does.
    kept = [token for token in tokens if token.lower() not in WORDS and len(token) > 1]
    return " ".join(kept).lower()


df_training['description_preprocessed'] = df_training['description'].apply(_preprocess_training_description)
df_training

In [None]:
vectorizer = CountVectorizer()  # bag-of-words model: every distinct token becomes one feature column
# fit_transform learns the vocabulary and builds the count matrix in a single
# pass, so a separate fit() call beforehand is redundant. Each row is one
# training description; each column holds how many times the corresponding
# vocabulary word occurs in that description.
# NOTE(review): the vocabulary is learned from the raw 'description' column,
# although 'description_preprocessed' exists and prediction runs on preprocessed
# text -- confirm which column is intended.
X_counts_train = vectorizer.fit_transform(df_training['description']).toarray()

######################################### ASSIGN LABELS TO CATEGORIES OF THE TRAINING & TESTING SET #########################################

le = preprocessing.LabelEncoder()  # encodes target labels as integers between 0 and n_classes-1
Y_train = le.fit_transform(df_training['label'])  # integer id for the category of each training row

# Classifier

In [None]:
# Dense word-count matrix is the feature matrix; encoded categories are the target.
X_train = X_counts_train
gnb = GaussianNB()  # Gaussian Naive Bayes classifier
gnb.fit(X=X_train, y=Y_train)  # learn per-class feature statistics from the training set

# Testing

In [None]:
# Project the merged transaction descriptions onto the vocabulary learned from
# the training set (words unseen during training are ignored), then classify.
test_descriptions = df['transaction_description_merged']
X_counts_test = vectorizer.transform(test_descriptions).toarray()
Y_predicted = gnb.predict(X_counts_test)


# Results of the model

In [None]:
# Translate every predicted class id back into its category name.
class_names = list(le.classes_)  # built once instead of once per prediction
predictions = [class_names[class_id] for class_id in Y_predicted]
predictions_label = list(Y_predicted)  # the raw integer class ids, in the same order

In [None]:
# Attach the predicted category name and its integer id to every transaction.
df = df.assign(
    category_model=predictions,
    label=predictions_label,
)
df

In [None]:
# Original category and text fields next to the model's predicted category.
df.loc[
    :,
    [
        "amount",
        "merchantCity",
        "merchantName",
        "merchantName_preprocessed",
        "category",
        "partnerName",
        "category_preprocessed",
        "expense_type",
        "referenceText",
        "referenceText_preprocessed",
        "transaction_description_merged",
        "category_model",
    ],
]

# Not relevant

In [None]:
# pd.read_csv(f"{PATH}/df.csv")

In [None]:
# Frequency table of the distinct values in every column.
for col_name, column in df.items():
    print(col_name, "\n", column.value_counts(), "\n")

In [None]:
# How often each merged description occurs across the transactions.
df.loc[:, "transaction_description_merged"].value_counts()

In [None]:
# Distribution of the values in the "type" column.
df.loc[:, "type"].value_counts()

In [None]:
"https://stackoverflow.com/questions/15078519/python-dictionary-passed-as-an-input-to-a-function-acts-like-a-global-in-that-fu"