In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import os
import time

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/gok2s/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/gok2s/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data Loading and Preprocessing Functions
Data sets had to go through a process. Because some tuples contained more than one semicolon, which made it difficult to extract the data. Some tuples had a "null" value or no value at all. These also had to be resolved.

In [25]:
# Cell 2: Data Loading and Preprocessing Functions
def load_data():
    def read_file(file_path):
        data = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                split_line = line.split(';', 1)  # Split only at the first semicolon
                if len(split_line) == 2:
                    data.append(split_line)
        return data

    categories = read_file('data/Product_Categories.txt')
    explanations = read_file('data/Product_Explanation.txt')

    categories_df = pd.DataFrame(categories, columns=['Product_ID', 'Category'])
    explanations_df = pd.DataFrame(explanations, columns=['Product_ID', 'Description'])

    data = pd.merge(explanations_df, categories_df, on='Product_ID')
    data.dropna(subset=['Category', 'Description'], inplace=True)  # Drop rows with null values in either Category or Description
    return data

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))  # Remove punctuation and digits
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [word for word in tokens if word not in stopwords.words('turkish')]  # Remove stop words
    return ' '.join(tokens)

def preprocess_data(data):
    data['Cleaned_Description'] = data['Description'].apply(clean_text)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data['Cleaned_Description'])
    y = data['Category']
    return X, y, vectorizer

def save_preprocessed_data(X, y, vectorizer, file_prefix='preprocessed'):
    with open(f'{file_prefix}_X.pkl', 'wb') as f:
        pickle.dump(X, f)
    with open(f'{file_prefix}_y.pkl', 'wb') as f:
        pickle.dump(y, f)
    with open(f'{file_prefix}_vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)

def load_preprocessed_data(file_prefix='preprocessed'):
    with open(f'{file_prefix}_X.pkl', 'rb') as f:
        X = pickle.load(f)
    with open(f'{file_prefix}_y.pkl', 'rb') as f:
        y = pickle.load(f)
    with open(f'{file_prefix}_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    return X, y, vectorizer

In [26]:
# Cell 3: Data Loading and Preprocessing
# Check if preprocessed data exists
# While testing designed for efficiency if you want you can delete files to execute "else" part
if os.path.exists('preprocessed_X.pkl') and os.path.exists('preprocessed_y.pkl') and os.path.exists('preprocessed_vectorizer.pkl'):
    print("Loading preprocessed data...")
    X, y, vectorizer = load_preprocessed_data()
else:
    print("Loading and preprocessing data...")
    data = load_data()
    X, y, vectorizer = preprocess_data(data)
    save_preprocessed_data(X, y, vectorizer)
print("Data loaded and preprocessed successfully.")

# Scaling the data
# Scaling the data ensures that all features are on a similar scale, which helps the models to learn more effectively and efficiently. 
# This is especially important when using algorithms sensitive to feature scaling.
scaler = StandardScaler(with_mean=False)  # with_mean=False to avoid issues with sparse matrices
X = scaler.fit_transform(X)

Loading preprocessed data...
Data loaded and preprocessed successfully.


In [5]:
# Cell 4: Handling Class Imbalance with SMOTE
# Examine the distribution of classes
class_distribution = pd.Series(y).value_counts()
print("Class distribution before balancing:")
print(class_distribution)

Class distribution before balancing:
Category
\n                                                              29
Bilgisayar Ürünleri > Tüketim Malzemeleri > Kartuş\n            12
Telefon > Cep Telefonu\n                                        11
Toner Kartuş Şerit > Kartuş\n                                   11
Yazılım > Oyunlar > PC > ARAL\n                                 10
                                                                ..
BİLGİSAYAR > AĞ/MODEM > MODEM\n                                  1
OEM Ürünleri > Soğutma Sistemleri > İşlemci > ZALMAN\n           1
Fotoğraf Makinesi Aksesuarı > Addison\n                          1
OEM Ürünleri > Soğutma Sistemleri > Sıvı Soğutma > CORSAIR\n     1
Corsair Force GS 240 GB SSD Disk CSSD-F240GBGS-BK\n              1
Name: count, Length: 1165, dtype: int64


I am removing classes each containing less than 5 examples in order to avoid memorizing these less frequent classes by a model and to make the dataset more balanced.

In [22]:
# Remove rare classes
min_samples = 5  # Minimum number of samples required for each class
filtered_classes = class_distribution[class_distribution >= min_samples].index
X = X[y.isin(filtered_classes)]
y = y[y.isin(filtered_classes)]

To create artificial sample sizes for the minority segments, SMOTE is used. The result of this would be an improvement of model performance as well as generalization while at the same time ensuring that each training instance sufficiently represents every category through attaining balance in the distribution of classes.

In [23]:
print("Handling class imbalance using SMOTE...")
smote = SMOTE(k_neighbors=1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Class distribution after balancing:")
print(pd.Series(y_resampled).value_counts())

Handling class imbalance using SMOTE...
Class distribution after balancing:
Category
Bilgisayar > Taşınabilir Bilgisayarlar\n                                             29
Bilgisayar Ürünleri > Tüketim Malzemeleri > Kartuş\n                                 29
Kamera\n                                                                             29
Elektronik Televizyon > Televizyon > LED Televizyon\n                                29
Bilgisayar Bileşenleri > Ses Kartları > ASUS\n                                       29
Bilgisayar Bileşenleri > Bellekler > CORSAIR\n                                       29
Bilgisayar > Yazıcılar > Kartuş-Toner-Drum\n                                         29
Aksesuar Ürünleri > Kablolar > Görüntü Kabloları > S-LINK\n                          29
OYUN &AMP HOBİ > OYUN & YAZILIM > PC\n                                               29
Oyun ve Oyun Konsolu > Oyunlar\n                                                     29
Tüketim Malzemeleri > Kartuş > HP\n

In [7]:
# Cell 5: Splitting Data into Training and Testing Sets
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
print("Data split successfully.")

Splitting data into training and testing sets...
Data split successfully.


In [8]:
# Cell 6: Helper Function to Print Classification Report
def print_classification_report(report):
    print("accuracy: ", report['accuracy'])
    print("macro avg: ", report['macro avg'])
    print("weighted avg: ", report['weighted avg'])

# Approach
I used many different types of machine learning models such as Logistic Regression, Support Vector Machine (SVM), Random Forest, Decision Tree, Naive Bayes, and K-Nearest Neighbors (KNN) models to forecast product categories grounded in descriptions. I prepared the dataset after balancing class distribution with SMOTE and scaling the features with StandardScaler. I performed hyperparameter tuning using GridSearchCV for each model to optimize performance.

# Logistic Regression

In [20]:
print("Training Logistic Regression...")
logreg = LogisticRegression(class_weight='balanced', max_iter=2000, solver='liblinear', tol=1e-3)
param_grid_logreg = {'C': [0.01, 0.1, 1, 10, 100]}
grid_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5)
grid_logreg.fit(X_train, y_train)
y_pred_logreg = grid_logreg.predict(X_test)
report_logreg = classification_report(y_test, y_pred_logreg, output_dict=True, zero_division=0)
print_classification_report(report_logreg)
print("Best Score for Logistic Regression: ", grid_logreg.best_score_)

Training Logistic Regression...
accuracy:  0.8072916666666666
macro avg:  {'precision': 0.8153679653679653, 'recall': 0.8098965848965849, 'f1-score': 0.7793150150129555, 'support': 192.0}
weighted avg:  {'precision': 0.8525669642857142, 'recall': 0.8072916666666666, 'f1-score': 0.7981469344813167, 'support': 192.0}
Best Score for Logistic Regression:  0.8013071895424837


Balanced performance with a good precision-recall balance, effective for interpretable models.

# Support Vector Machine

In [13]:
print("Training Support Vector Machine...")
svm = SVC(class_weight='balanced')
param_grid_svm = {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}
grid_svm = GridSearchCV(svm, param_grid_svm, cv=5)
grid_svm.fit(X_train, y_train)
y_pred_svm = grid_svm.predict(X_test)
report_svm = classification_report(y_test, y_pred_svm, output_dict=True, zero_division=0)
print_classification_report(report_svm)
print("Best Score for Support Vector Machine: ", grid_svm.best_score_)

Training Support Vector Machine...
accuracy:  0.8125
macro avg:  {'precision': 0.8257798646814666, 'recall': 0.8098965848965849, 'f1-score': 0.7858900358900359, 'support': 192.0}
weighted avg:  {'precision': 0.8638018279394138, 'recall': 0.8125, 'f1-score': 0.8063647868335369, 'support': 192.0}
Best Score for Support Vector Machine:  0.7830065359477123


Best individual performer in terms of accuracy and precision, handling the complexity of the dataset well.

# Random Forest

In [14]:
print("Training Random Forest...")
rf = RandomForestClassifier(class_weight='balanced')
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 20]}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5)
grid_rf.fit(X_train, y_train)
y_pred_rf = grid_rf.predict(X_test)
report_rf = classification_report(y_test, y_pred_rf, output_dict=True, zero_division=0)
print_classification_report(report_rf)
print("Best Score for Random Forest: ", grid_rf.best_score_)

Training Random Forest...
accuracy:  0.8072916666666666
macro avg:  {'precision': 0.813746487430698, 'recall': 0.8048460798460798, 'f1-score': 0.777829383092541, 'support': 192.0}
weighted avg:  {'precision': 0.8488917606516292, 'recall': 0.8072916666666666, 'f1-score': 0.7965859828524303, 'support': 192.0}
Best Score for Random Forest:  0.7908496732026145


Robust performance, useful for datasets with varied feature importance, though slightly less accurate than SVM.

# Decision Tree

In [15]:
print("Training Decision Tree...")
dt = DecisionTreeClassifier(class_weight='balanced')
param_grid_dt = {'max_depth': [5, 10, 20, 30]}
grid_dt = GridSearchCV(dt, param_grid_dt, cv=5)
grid_dt.fit(X_train, y_train)
y_pred_dt = grid_dt.predict(X_test)
report_dt = classification_report(y_test, y_pred_dt, output_dict=True, zero_division=0)
print_classification_report(report_dt)
print("Best Score for Decision Tree: ", grid_dt.best_score_)

Training Decision Tree...
accuracy:  0.8072916666666666
macro avg:  {'precision': 0.8068421052631579, 'recall': 0.7823713323713324, 'f1-score': 0.7575994829566848, 'support': 192.0}
weighted avg:  {'precision': 0.860076754385965, 'recall': 0.8072916666666666, 'f1-score': 0.7982516120827144, 'support': 192.0}
Best Score for Decision Tree:  0.7450980392156863


Comparable to Logistic Regression, simpler to interpret but can overfit without proper tuning.

# Naive Bayes

In [17]:
print("Training Naive Bayes...")
nb = MultinomialNB()
param_grid_nb = {'alpha': [0.01, 0.1, 1, 10]}
grid_nb = GridSearchCV(nb, param_grid_nb, cv=5)
grid_nb.fit(X_train, y_train)
y_pred_nb = grid_nb.predict(X_test)
report_nb = classification_report(y_test, y_pred_nb, output_dict=True, zero_division=0)
print_classification_report(report_nb)
print("Best Score for Naive Bayes: ", grid_nb.best_score_)

Training Naive Bayes...
accuracy:  0.7760416666666666
macro avg:  {'precision': 0.7981240981240981, 'recall': 0.7678571428571428, 'f1-score': 0.7443008201522134, 'support': 192.0}
weighted avg:  {'precision': 0.8564980158730159, 'recall': 0.7760416666666666, 'f1-score': 0.7854802075003313, 'support': 192.0}
Best Score for Naive Bayes:  0.7660130718954248


Consistent but generally lower performance, suited for simpler, more linear problems.

# K-Nearest Neighbors

In [18]:
print("Training K-Nearest Neighbors...")
knn = KNeighborsClassifier()
param_grid_knn = {'n_neighbors': [1, 3, 5, 7, 9]}
grid_knn = GridSearchCV(knn, param_grid_knn, cv=5)
grid_knn.fit(X_train, y_train)
y_pred_knn = grid_knn.predict(X_test)
report_knn = classification_report(y_test, y_pred_knn, output_dict=True, zero_division=0)
print_classification_report(report_knn)
print("Best Score for K-Nearest Neighbors: ", grid_knn.best_score_)

Training K-Nearest Neighbors...
accuracy:  0.8229166666666666
macro avg:  {'precision': 0.8318834275772075, 'recall': 0.8136844636844637, 'f1-score': 0.7927287527287528, 'support': 192.0}
weighted avg:  {'precision': 0.8713741028708134, 'recall': 0.8229166666666666, 'f1-score': 0.8162367724867724, 'support': 192.0}
Best Score for K-Nearest Neighbors:  0.7725490196078431


High accuracy, effective for capturing local structure in the data, but computationally intensive for large datasets.

# Ensemble Methods - Stacking Classifier

In [19]:
print("Training Stacking Classifier...")
estimators = [
    ('logreg', LogisticRegression(class_weight='balanced', C=grid_logreg.best_params_['C'], max_iter=2000, solver='liblinear', tol=1e-3)),
    ('svm', SVC(class_weight='balanced', C=grid_svm.best_params_['C'], kernel=grid_svm.best_params_['kernel'])),
    ('rf', RandomForestClassifier(class_weight='balanced', n_estimators=grid_rf.best_params_['n_estimators'], max_depth=grid_rf.best_params_['max_depth']))
]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())
stacking_clf.fit(X_train, y_train)
y_pred_stacking = stacking_clf.predict(X_test)
report_stacking = classification_report(y_test, y_pred_stacking, output_dict=True, zero_division=0)
print_classification_report(report_stacking)
print("Best Score for Stacking Classifier: ", stacking_clf.score(X_test, y_test))

Training Stacking Classifier...
accuracy:  0.8020833333333334
macro avg:  {'precision': 0.8228964018437702, 'recall': 0.7917147667147668, 'f1-score': 0.7765171269007586, 'support': 192.0}
weighted avg:  {'precision': 0.8488227780990938, 'recall': 0.8020833333333334, 'f1-score': 0.790623402835679, 'support': 192.0}
Best Score for Stacking Classifier:  0.8020833333333334


Leveraged the strengths of multiple models, providing robust performance and demonstrating the power of ensemble methods.

# Conclusion
The machine learning models tested illustrated that SVM and KNN were strong in terms of accuracy. Nevertheless, the Stacking Classifier amalgamated multiple models in a complementary but equally powerful way. Quality improvement for data, feature improvement, and advanced ensemble techniques are recommended for future studies that could build on the findings of this task. Upon examining various machine learning models, it was revealed that SVM and KNN excelled in terms of accuracy. Nonetheless, the Stacking Classifier managed to intermix several models thus allowing for a stronger and more balanced model at once. In order to improve the model further, I need to focus on data quality enhancement, feature engineering as well as exploring more advanced ensemble techniques.

In [52]:
# Cell 14: Saving the Best Model and Vectorizer
best_model = None
best_score = 0
for grid in [grid_logreg, grid_svm, grid_rf, grid_dt, grid_nb, grid_knn, stacking_clf]:
    if hasattr(grid, 'best_score_') and grid.best_score_ > best_score:
        best_model = grid.best_estimator_
        best_score = grid.best_score_
    elif hasattr(grid, 'score') and grid.score(X_test, y_test) > best_score:
        best_model = grid
        best_score = grid.score(X_test, y_test)

print("Saving the best model and vectorizer...")
pickle.dump(best_model, open('model/best_model.pkl', 'wb'))
pickle.dump(vectorizer, open('model/vectorizer.pkl', 'wb'))
print("Model and vectorizer saved successfully.")

print("Training completed and model saved successfully.")

Saving the best model and vectorizer...
Model and vectorizer saved successfully.
Training completed and model saved successfully.
