In [1]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score

In [15]:
# Load the transaction data; the first CSV column is used as the row index.
df = pd.read_csv(
    "sample_data.csv",
    index_col=0,
)

In [10]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# NLTK resources needed by the preprocessing step below:
#   punkt     - pre-trained Punkt tokenizer models, used to split text into
#               sentences before it is broken into words
#   stopwords - lists of low-information words ("is", "at", "which", "on")
#               that are filtered out of the text
#   wordnet   - English lexical database that supplies the knowledge needed
#               to lemmatize words (reduce them to their base form)
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when tokenizing - confirm against the installed NLTK version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

def _english_stopwords() -> frozenset:
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text: str) -> str:
    """Tokenize ``text``, drop English stopwords and lemmatize the remaining tokens.

    Args:
        text: Raw text to clean (e.g. a transaction description).

    Returns:
        The surviving tokens, lemmatized and joined by single spaces.
    """
    # Look stopwords up in a cached set instead of rebuilding the list per token.
    stop_words = _english_stopwords()
    lemmatizer = WordNetLemmatizer()
    # The stopword list is lowercase, so compare case-insensitively; otherwise
    # capitalized stopwords such as "The" would slip through the filter.
    kept = [token for token in word_tokenize(text) if token.lower() not in stop_words]
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

# Store the cleaned description alongside the raw one as a new column.
raw_descriptions = df['Transaction Description']
df['Processed Transaction'] = raw_descriptions.apply(preprocess_text)

[nltk_data] Downloading package punkt to /home/figgeous/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/figgeous/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/figgeous/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the split the same on every run.
features = df['Processed Transaction']
labels = df['Category']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# TF-IDF features feeding a multinomial Naive Bayes classifier,
# fitted on the training split (fit returns the pipeline itself).
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(X_train, y_train)

# Evaluate on the held-out rows.
predictions = model.predict(X_test)
accuracy = round(accuracy_score(y_test, predictions), 2)
print("Accuracy: ", accuracy)
report = classification_report(y_test, predictions)
print("Classification Report:\n", report)

# Classify unseen descriptions, preprocessing them the same way as the training text.
new_transactions = ['buying movie ticket', 'paying credit card bill']
processed_transactions = list(map(preprocess_text, new_transactions))
predicted_categories = model.predict(processed_transactions)

print("Example of usage:")
for position, trans in enumerate(new_transactions):
    cat = predicted_categories[position]
    print(f"{trans} => {cat}")

Accuracy:  0.93
Classification Report:
                precision    recall  f1-score   support

Entertainment       1.00      0.75      0.86         4
    Groceries       1.00      1.00      1.00         6
    Utilities       0.83      1.00      0.91         5

     accuracy                           0.93        15
    macro avg       0.94      0.92      0.92        15
 weighted avg       0.94      0.93      0.93        15

Example of usage:
buying movie ticket => Entertainment
paying credit card bill => Utilities
