## Import Libraries

In [None]:
import pandas as pd
import nltk as lp
import streamlit as st
import seaborn as sns
import string
import re

## Data processing

### Load in data

In [None]:
# Read the bank export. header=0 discards the file's own header row and the
# `names` list supplies the column labels used throughout this notebook.
df = pd.read_csv(
    "expenses.csv",
    sep=',',
    header=0,
    names=[
        'date',
        'account_name',
        'from_acc_num',
        'to_acc_num',
        'code',
        'af_bij',
        'amount_(eur)',
        'type',
        'description',
    ],
    parse_dates=['date'],
)
df.head()

### Convert EUR (str) into float datatype (only need to do it once)

In [None]:
# Convert comma-decimal amount strings (e.g. "12,34") to floats.
# The dtype guard makes this cell safe to re-run: once the column is numeric
# there is nothing left to convert, so the cell becomes a no-op instead of
# failing on float values that have no .replace method.
if not pd.api.types.is_numeric_dtype(df['amount_(eur)']):
    df['amount_(eur)'] = (
        df['amount_(eur)'].str.replace(',', '.', regex=False).astype(float)
    )

In [None]:
# Show the full description text of the row labelled 4 as a sanity check.
df.at[4, 'description']

#### Fill NA values

In [None]:
# Replace every missing cell with the integer 0. This applies to all columns,
# so text columns with gaps end up holding a mix of strings and 0.
df = df.fillna(0)

#### Filtering for simple overview of all payments made for tuition

In [None]:
# Rows flagged 'Af' with an amount of at least 2000 whose account name is not
# 'Hr ZY Lian', reduced to the three columns used by the plots below.
df_tuition = df.loc[
    (df['af_bij'] == 'Af')
    & (df['amount_(eur)'] >= 2000)
    & (df['account_name'] != 'Hr ZY Lian'),
    ['date', 'account_name', 'amount_(eur)'],
]

df_tuition

##### Lineplot

In [None]:
# Amount of each selected payment over time.
df_tuition.plot.line(x='date', y='amount_(eur)')

##### Boxplot

In [None]:
# Box plot of the selected payment amounts.
df_tuition['amount_(eur)'].plot.box(
    vert=True,
    subplots=True,
    layout=(2, 2),
    figsize=(14, 8),
)

##### Density plot

In [None]:
# Density estimate of the selected amounts, with the mean (red) and the
# median (green) marked as vertical lines.
ax = df_tuition['amount_(eur)'].plot.density()
ax.axvline(x=df_tuition['amount_(eur)'].mean(), color='red')
ax.axvline(x=df_tuition['amount_(eur)'].median(), color='green')

##### Histogram

In [None]:
# Histogram of the selected payment amounts.
df_tuition['amount_(eur)'].plot.hist()

##### Total Income

In [None]:
# Rows flagged "Bij" whose account name mentions RIJK, UBER or EUROFINS
# (case-insensitive). The three names are matched with one alternation pattern.
# na=False keeps the mask strictly boolean: str.contains returns NaN for
# non-string cells (such as the 0 placeholders written by fillna), and those
# rows must simply not match.
total_income = df[
    (df['af_bij'] == "Bij")
    & df['account_name'].str.contains("RIJK|UBER|EUROFINS", case=False, na=False)
]
total_income

In [None]:
# Sum of the amounts over every row selected into total_income.
total_income['amount_(eur)'].sum()

In [None]:
# Split total_income by source, matching the account name case-insensitively.
# na=False makes every str.contains result a plain boolean Series. Without it a
# non-string cell produces NaN, and `~` applied to such an object-dtype result
# is not boolean negation.
eurofins_income = total_income[
    total_income['account_name'].str.contains('EUROFINS', case=False, na=False)
]
university_income = total_income[
    total_income['account_name'].str.contains('RIJK', case=False, na=False)
]
# Uber rows whose description mentions "refund" are excluded.
uber_income = total_income[
    total_income['account_name'].str.contains('UBER', case=False, na=False)
    & ~total_income['description'].str.contains('refund', case=False, na=False)
]

In [None]:
# Density estimate of all selected income amounts, with the mean amount of each
# source drawn as a labelled vertical line.
ax1 = total_income['amount_(eur)'].plot.density()
ax1.axvline(x=university_income['amount_(eur)'].mean(), color='red', label='university_income')
ax1.axvline(x=uber_income['amount_(eur)'].mean(), color='green', label='uber_income')
ax1.axvline(x=eurofins_income['amount_(eur)'].mean(), color='blue', label='eurofins_income')
ax1.legend()

### General DataFrame info

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# How often each counterparty account number occurs.
df[['to_acc_num']].value_counts()

In [None]:
# All transactions whose counterparty account is NL55INGB0001979576.
df.loc[df['to_acc_num'] == 'NL55INGB0001979576']

In [None]:
# Transactions whose description mentions "rent" (case-insensitive) and that
# have a counterparty account (to_acc_num is 0 where the value was missing).
# na=False keeps the mask strictly boolean for non-string descriptions.
df.loc[
    df['description'].str.contains('rent', case=False, na=False)
    & (df['to_acc_num'] != 0),
    ['account_name', 'to_acc_num', 'description'],
]


In [None]:
# First 20 distinct account names, in order of appearance.
df['account_name'].unique()[:20]

In [None]:
# Ten distinct descriptions (positions 500-509 of the unique values) as a sample.
df['description'].unique()[500:510]

### Define categories

In [None]:
# Top-level budget categories. The order matters: entry i of `sub_categories`
# holds the sub-categories that belong to entry i of this list.
categories = [
    'entertainment',
    'food',
    'gifts_donation',
    'housing',
    'income',
    'insurance',
    'legal',
    'loans',
    'personal',
    'savings_investments',
    'subscriptions',
    'taxes',
    'transportation',
]
# One list per category, in the same order as `categories`.
sub_categories = [
    ['video/dvd', 'games', 'movies', 'concerts', 'sporting_events', 'live_theater', 'other'],  # entertainment
    ['groceries', 'dine_out', 'other'],  # food
    ['birthday'],  # gifts_donation
    ['mortgage_rent', 'phone', 'energy', 'water', 'gas', 'cable', 'waste_removal',
     'maintenance_repairs', 'supplies', 'other'],  # housing
    ['investments', 'salary', 'stocks', 'allowance', 'commission', 'interest',
     'government_payments', 'gifts', 'payback'],  # income
    ['home', 'health', 'life', 'other'],  # insurance
    ['attorney', 'alimony', 'other'],  # legal
    ['personal', 'credit_card', 'student', 'housing', 'other'],  # loans
    ['medical', 'personal_care', 'clothing', 'tech', 'music', 'sports', 'other'],  # personal
    ['retirement', 'investment', 'bank'],  # savings_investments
    ['phone', 'entertainment', 'fitness', 'learning', 'other'],  # subscriptions
    ['federal', 'state', 'local', 'income', 'other'],  # taxes
    ['vehicle_payment', 'public_transport', 'licensing', 'fuel', 'maintenance', 'other'],  # transportation
]

#### Combine lists into dictionary

In [None]:
# Map each category name to its list of sub-categories by pairing the two
# lists position by position. zip() would silently truncate to the shorter
# list, so a length mismatch is reported explicitly instead.
if len(categories) != len(sub_categories):
    raise ValueError(
        f"categories has {len(categories)} entries but sub_categories has {len(sub_categories)}"
    )
categories_dict = dict(zip(categories, sub_categories))
categories_dict

### Convert text to numbers

#### Text preprocessing

##### Define columns with text as object_columns

In [None]:
# Labels of every column whose dtype is 'object' (the text columns).
object_columns = df.columns[(df.dtypes == 'object').to_numpy()]

# Cast each of those columns to str so the string operations in later cells
# can be applied to every cell, including non-string placeholders.
for column in object_columns:
    df[column] = df[column].astype(str)


##### Remove acc numbers columns

In [None]:
# Account numbers are identifiers, not free text, so exclude them from the
# text-processing column list. errors='ignore' makes the cell safe to re-run
# once the labels have already been removed.
object_columns = object_columns.drop(['from_acc_num', 'to_acc_num'], errors='ignore')

##### Convert all strings to lowercase, remove all punctuation, and remove words containing digits

In [None]:
# Normalise every column in object_columns: lowercase the text, strip all
# punctuation, then drop any word that contains a digit (e.g. "m34f1f31").
# Both patterns are compiled once, outside the loop.
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
# Raw string: '\w' and '\d' are regex classes, not valid str escape sequences.
digit_word_pattern = re.compile(r'\w*\d\w*')

for column in object_columns:
    df[column] = (
        df[column]
        .str.lower()
        .apply(lambda text: digit_word_pattern.sub('', punctuation_pattern.sub('', text)))
    )

In [None]:
# Display the frame after text cleaning.
df

##### Import NLTK functions for next steps

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

##### Tokenization of words

In [None]:
# Split the text of each column in object_columns into word tokens and store
# the resulting lists in a new '<column>_tokens' column.
for column in object_columns:
    df[f'{column}_tokens'] = df[column].apply(word_tokenize)

In [None]:
# Display every column from 'account_name_tokens' to the last column.
df.loc[:,'account_name_tokens':]

##### Stopword removal

In [None]:
# Stopword lists for both languages, taken from the NLTK stopwords corpus.
# NOTE(review): presumably requires the corpus to be downloaded beforehand
# (nltk.download('stopwords')) - confirm for this environment.
english_stopwords = stopwords.words('english')
dutch_stopwords = stopwords.words('dutch')

In [None]:
# Remove English and Dutch stopwords from every '<column>_tokens' list.
# A token is dropped when it appears in either language's list, and the
# filtered lists are assigned back so the change is stored in df.
stop_words = set(english_stopwords) | set(dutch_stopwords)  # set: O(1) membership tests
for column in object_columns:
    token_column = f'{column}_tokens'
    df[token_column] = df[token_column].apply(
        lambda tokens: [token for token in tokens if token not in stop_words]
    )

In [None]:
# Display the token columns again to inspect the result of stopword removal.
df.loc[:,'account_name_tokens':]

##### Stemming and Lemmatization

##### Part of Speech Tagging

##### Information Retrieval