In [10]:
#for dataframe manipulation
import numpy as np 
import pandas as pd

#regular expression toolkit
import re

#NLP toolkits
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

#for plotting expense categories later
import matplotlib.pyplot as plt 
plt.style.use('ggplot')
import seaborn as sns
import matplotlib
import matplotlib.ticker as ticker # for formatting major units on x-y axis

#for downloading BERT
from sentence_transformers import SentenceTransformer

#for finding most similar text vectors
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /Users/fstrauf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import re

def clean_text_BERT(text):
    """Normalise a transaction description before it is embedded.

    The value is coerced to ``str``, lower-cased and stripped of surrounding
    whitespace; punctuation, URLs, non-ASCII characters and digits are then
    removed, and the remainder is tokenised with ``word_tokenize`` and
    re-joined with single spaces.

    Parameters
    ----------
    text : object
        Raw description. Non-string values are converted with ``str``.
        ``None`` and float NaN (e.g. an empty cell loaded by pandas) are
        treated as missing.

    Returns
    -------
    str
        The cleaned, space-separated text, or ``''`` for missing input.
    """
    # Missing values carry no text; return early instead of embedding the
    # literal words "nan" / "none". `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Coerce to str *before* lower-casing. Previously `.lower()` was called on
    # the raw value, so any non-string input raised AttributeError even though
    # the value was wrapped in str() one line later.
    text = str(text).lower()

    # Remove special characters and numbers. This also removes the dates
    # which are not important in classifying expenses.
    # Alternatives, in order: any non-word/non-space character, http(s) URLs,
    # "www." URLs, malformed single-slash URLs, non-ASCII runs, digit runs.
    text = re.sub(r'[^\w\s]|https?://\S+|www\.\S+|https?:/\S+|[^\x00-\x7F]+|\d+', '', text.strip())

    # Tokenise, then re-join so tokens are separated by exactly one space
    text_list = word_tokenize(text)
    result = ' '.join(text_list)
    return result

In [3]:
import pandas as pd

# Input CSV locations (a file path or URL)
new_file_path, trained_file_path = 'new_data.csv', 'trained_data.csv'

# Load each CSV into its own DataFrame
new_df = pd.read_csv(filepath_or_buffer=new_file_path)
trained_df = pd.read_csv(filepath_or_buffer=trained_file_path)

# Print the DataFrame holding the new transactions
print(new_df)

      id                            description    category
0   1300       Ben Jerrys Noosa Noosa Heads QLD  DinnerBars
1   1302         LIFELINE NOOSA NOOSA HEADS QLD    Shopping
2   1305      LIVELIFE PHRMCY NOOSA NOOSA HEADS   Groceries
3   1306                      COLES NOOSA HEADS   Groceries
4   1307          FRASER COAST REGIONAL TORQUAY      Travel
5   1308      THE DECK SEA SALT RAINBOW BEACQLD  DinnerBars
6   1309          RAINBOW BEACH RAINBOW BEACQLD      Travel
7   1312                  TELSTRA MELBOURNE VIC     Utility
8   1313       COOLUM HOLIDAY PARK COOLUM BEACH      Travel
9   1314             GELATO MIO COOLUM BEACHQLD  DinnerBars
10  1315    BROKEN HEAD HOLIDAY BROKEN HEAD NSW      Travel
11  1316     COFFEE CLUB COOLUM COOLUM BEACHQLD  DinnerBars
12  1317                     COLES COOLUM BEACH   Groceries
13  1318       FRASER ISLAND FUELS RAINBOW BEAC   Transport
14  1319                       KMART LOGANHOLME    Shopping
15  1320      KINGSLCIFF NORTH HOLID KIN

In [12]:
# Clean every description in the training data with clean_text_BERT
text_raw = trained_df['description']
trained_text_BERT = text_raw.apply(clean_text_BERT)

In [20]:
# Cleaned training descriptions as a plain Python list for the encoder
bert_input = trained_text_BERT.tolist()

# Load the 'all-mpnet-base-v2' sentence-transformer and encode the list
model = SentenceTransformer('all-mpnet-base-v2')
embeddings = model.encode(bert_input, show_progress_bar=True)

# Keep a separate NumPy copy of the embeddings for the similarity step
embedding_BERT = np.array(embeddings)

Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.75MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 574kB/s]
Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 24.9MB/s]
Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<00:00, 1.83MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 294kB/s]
Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 522kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:42<00:00, 10.3MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 116kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 863kB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.57MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 2.24MB/s]
Downloading (…)8e1d/train_script.py: 100%|████

In [21]:
# Raw descriptions of the new transactions
text_test_raw = new_df['description']

# Clean them with the same function that was applied to the training data
text_test_BERT = text_test_raw.apply(clean_text_BERT)

# Encode with the already-loaded `model` so both sets share one embedding space
bert_input_test = text_test_BERT.tolist()
#model = SentenceTransformer('paraphrase-mpnet-base-v2') 
embeddings_test = model.encode(bert_input_test, show_progress_bar=True)
embedding_BERT_test = np.array(embeddings_test)

# The same embeddings wrapped in a DataFrame
df_embedding_bert_test = pd.DataFrame(embeddings_test)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  6.12it/s]


In [22]:
# Cosine similarity between every new-transaction embedding (rows) and every
# training embedding (columns)
similarity_new_data = cosine_similarity(embedding_BERT_test, embedding_BERT)
similarity_df = pd.DataFrame(similarity_new_data)

# For each new transaction, the column label of its highest similarity score,
# i.e. the position of the best-matching training embedding
index_similarity = similarity_df.idxmax(axis=1)

# Look up the matching training rows by position; the index is reset so the
# result is numbered 0..n-1
data_inspect = trained_df.iloc[index_similarity].reset_index(drop=True)

# Columns of the final report
unseen_verbatim = text_test_raw
matched_verbatim = data_inspect['description']
annotation = data_inspect['category']

d_output = {
    'unseen_transaction': unseen_verbatim,
    'matched_transaction': matched_verbatim,
    'matched_class': annotation,
}

In [23]:
# Show the dictionary of result columns
print(d_output)

# Combine the columns into a single DataFrame and write it to CSV without
# the index column
df = pd.DataFrame(d_output)
df.to_csv('output.csv', index=False)

{'unseen_transaction': 0          Ben Jerrys Noosa Noosa Heads QLD
1            LIFELINE NOOSA NOOSA HEADS QLD
2         LIVELIFE PHRMCY NOOSA NOOSA HEADS
3                         COLES NOOSA HEADS
4             FRASER COAST REGIONAL TORQUAY
5         THE DECK SEA SALT RAINBOW BEACQLD
6             RAINBOW BEACH RAINBOW BEACQLD
7                     TELSTRA MELBOURNE VIC
8          COOLUM HOLIDAY PARK COOLUM BEACH
9                GELATO MIO COOLUM BEACHQLD
10      BROKEN HEAD HOLIDAY BROKEN HEAD NSW
11       COFFEE CLUB COOLUM COOLUM BEACHQLD
12                       COLES COOLUM BEACH
13         FRASER ISLAND FUELS RAINBOW BEAC
14                         KMART LOGANHOLME
15        KINGSLCIFF NORTH HOLID KINGSCLIFF
16    ORIGINKEBABS HYPERDOME LOGANHOLME QLD
17    The Coffee Club Loganh Loganholme QLD
18      EARTH MARKETS LOGANQPS SHAILER PARK
19           CAESARS BARBER SHOP LOGANHOLME
20           FREEDOM FUELS TRQPS KINGSCLIFF
21         REFLECTIONS HOLIDAY NAMBUCCA HEA
22       