In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
import pickle

In [2]:
import sys
import os

PROJECT_ROOT = os.path.abspath("E:/aleksa_praksa/nlp_internship/data-internship")

if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print("Project root added:", PROJECT_ROOT)

Project root added: E:\aleksa_praksa\nlp_internship\data-internship


In [None]:
from experiments.models.vectorizer import vectorizer
from experiments.models.Hyperparametar_tuning import run_grid_search, run_random_search

In [None]:
import mlflow
import json

In [None]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

In [6]:
def log_results_per_model(language, vectorizer_name, results_dict):

    experiment_name = f"Star_Prediction_{language}_{vectorizer_name}"
    mlflow.set_experiment(experiment_name)

    for model_name, model_info in results_dict.items():

        with mlflow.start_run(run_name=model_name):

            for p, v in model_info["best_params"].items():
                mlflow.log_param(p, v)

            mlflow.log_metric("cv_accuracy", model_info["best_score"])
            mlflow.log_metric("test_accuracy", model_info["test_accuracy"])

            # Save classification report as JSON
            report_json = json.dumps(model_info["classification_report"], indent=2)
            mlflow.log_text(
                report_json,
                f"{model_name}_classification_report.json"
            )

            mlflow.sklearn.log_model(
                model_info["trained_model"],
                name=f"{model_name}_model"
            )

    print(f"Logged all models for {language} - {vectorizer_name}")


In [None]:
df = pd.read_csv("E:/aleksa_praksa/nlp_internship/data-internship/data/tokenized_dataset.csv") 

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,stars,review_body,review_title,language,product_category,review_length,title_length,punctuation_count,all_caps_count,text,preprocessed_text,product_category_star_pred,category_book,category_digital_ebook_purchase,category_other,tokens
0,0,1,Armband ist leider nach 1 Jahr kaputt gegangen,Leider nach 1 Jahr kaputt,de,sports,46,25,0,0,Leider nach 1 Jahr kaputt Armband ist leider n...,Leider nach 1 Jahr kaputt Armband ist leider n...,other,0,0,1,"['Leider', 'nach', '1', 'Jahr', 'kaputt', 'Arm..."
1,1,1,In der Lieferung war nur Ein Akku!,EINS statt ZWEI Akkus!!!,de,home_improvement,34,24,1,0,EINS statt ZWEI Akkus!!! In der Lieferung war ...,EINS statt ZWEI Akkus!!! In der Lieferung war ...,other,0,0,1,"['EINS', 'statt', 'ZWEI', 'Akkus', '!', '!', '..."
2,2,1,"Ein Stern, weil gar keine geht nicht. Es hande...",Achtung Abzocke,de,drugstore,384,15,0,0,"Achtung Abzocke Ein Stern, weil gar keine geht...","Achtung Abzocke Ein Stern, weil gar keine geht...",other,0,0,1,"['Achtung', 'Abzocke', 'Ein', 'Stern', 'weil',..."
3,3,1,"Dachte, das w√§ren einfach etwas festere Binden...",Zu viel des Guten,de,drugstore,205,17,0,0,"Zu viel des Guten Dachte, das w√§ren einfach et...","Zu viel des Guten Dachte, das w√§ren einfach et...",other,0,0,1,"['Zu', 'viel', 'des', 'Guten', 'Dachte', 'das'..."
4,4,1,Meine Kinder haben kaum damit gespielt und nac...,Qualit√§t sehr schlecht,de,toy,114,22,0,0,Qualit√§t sehr schlecht Meine Kinder haben kaum...,Qualit√§t sehr schlecht Meine Kinder haben kaum...,other,0,0,1,"['Qualit√§t', 'sehr', 'schlecht', 'Meine', 'Kin..."


In [47]:
type(df['tokens'].iloc[0])

str

In [9]:
import ast

df['tokens'] = df['tokens'].apply(ast.literal_eval) # str to list


In [49]:
type(df['tokens'].iloc[0])

list

In [10]:

df['text_joined'] = df['tokens'].apply(lambda x: " ".join(x))


In [51]:
df.head()

Unnamed: 0.1,Unnamed: 0,stars,review_body,review_title,language,product_category,review_length,title_length,punctuation_count,all_caps_count,text,preprocessed_text,product_category_star_pred,category_book,category_digital_ebook_purchase,category_other,tokens,text_joined
0,0,1,Armband ist leider nach 1 Jahr kaputt gegangen,Leider nach 1 Jahr kaputt,de,sports,46,25,0,0,Leider nach 1 Jahr kaputt Armband ist leider n...,Leider nach 1 Jahr kaputt Armband ist leider n...,other,0,0,1,"[Leider, nach, 1, Jahr, kaputt, Armband, ist, ...",Leider nach 1 Jahr kaputt Armband ist leider n...
1,1,1,In der Lieferung war nur Ein Akku!,EINS statt ZWEI Akkus!!!,de,home_improvement,34,24,1,0,EINS statt ZWEI Akkus!!! In der Lieferung war ...,EINS statt ZWEI Akkus!!! In der Lieferung war ...,other,0,0,1,"[EINS, statt, ZWEI, Akkus, !, !, !, In, der, L...",EINS statt ZWEI Akkus ! ! ! In der Lieferung w...
2,2,1,"Ein Stern, weil gar keine geht nicht. Es hande...",Achtung Abzocke,de,drugstore,384,15,0,0,"Achtung Abzocke Ein Stern, weil gar keine geht...","Achtung Abzocke Ein Stern, weil gar keine geht...",other,0,0,1,"[Achtung, Abzocke, Ein, Stern, weil, gar, kein...",Achtung Abzocke Ein Stern weil gar keine geht ...
3,3,1,"Dachte, das w√§ren einfach etwas festere Binden...",Zu viel des Guten,de,drugstore,205,17,0,0,"Zu viel des Guten Dachte, das w√§ren einfach et...","Zu viel des Guten Dachte, das w√§ren einfach et...",other,0,0,1,"[Zu, viel, des, Guten, Dachte, das, w√§ren, ein...",Zu viel des Guten Dachte das w√§ren einfach etw...
4,4,1,Meine Kinder haben kaum damit gespielt und nac...,Qualit√§t sehr schlecht,de,toy,114,22,0,0,Qualit√§t sehr schlecht Meine Kinder haben kaum...,Qualit√§t sehr schlecht Meine Kinder haben kaum...,other,0,0,1,"[Qualit√§t, sehr, schlecht, Meine, Kinder, habe...",Qualit√§t sehr schlecht Meine Kinder haben kaum...


In [52]:
df['text_joined'].iloc[0]

'Leider nach 1 Jahr kaputt Armband ist leider nach 1 Jahr kaputt gegangen'

## Train, test, split

In [11]:
def split_language_df(df_one_language):
    return train_test_split(
        df_one_language[['review_length', 'category_book',	'category_digital_ebook_purchase',	'category_other',	'text_joined']],     # X
        df_one_language['stars'],     # Y
        test_size = 0.2,
        stratify = df_one_language['stars'],
        random_state = 42,

    )

In [12]:
# English
X_train_en, X_test_en, y_train_en, y_test_en = split_language_df(df[df['language'] == 'en'])

# German
X_train_de, X_test_de, y_train_de, y_test_de = split_language_df(df[df['language'] == 'de'])

# France
X_train_fr, X_test_fr, y_train_fr, y_test_fr = split_language_df(df[df['language'] == 'fr'])

# Spansih
X_train_es, X_test_es, y_train_es, y_test_es = split_language_df(df[df['language'] == 'es'])

In [55]:
X_train_en.head()

Unnamed: 0,review_length,category_book,category_digital_ebook_purchase,category_other,text_joined
309124,215,0,0,1,Great price for a great looking case ! Great p...
259047,159,0,0,1,Okay monitor It is not the best it just looks ...
213294,42,0,0,1,No taste No taste Might as well be eating the bag
282352,37,0,0,1,Three Stars This was a lot smaller than expected
184049,160,0,0,1,Fell off first few hours of wear Lost the prod...


In [56]:
print(y_train_en)

309124    4
259047    3
213294    2
282352    3
184049    1
         ..
231570    2
227320    2
340120    5
288092    4
215719    2
Name: stars, Length: 159330, dtype: int64


In [57]:
X_train_en['text_joined'].iloc[0]

'Great price for a great looking case ! Great price for the money ! Definitely not as protective as other cases but it looks great Glows in the dark just as it says ! Seems to be well built however bubbles did appear as with all cases with liquid inside'

## Vectorization

convert list of tokens into a vector: TF-IDF (large df, spam, sentiment) and BoW (small df, topic modeling), Word Embeddings (Word2Vec)

In [13]:
# English
bow_train_en, bow_test_en, bow_vec_en = vectorizer(X_train_en, X_test_en, "bow")
tfidf_train_en, tfidf_test_en, tfidf_vec_en = vectorizer(X_train_en, X_test_en, "tfidf")



In [37]:
def pickle_save(path, obj):
    
    with open(path, "wb") as f:
        pickle.dump(obj, f)

In [None]:
path = r"E:\aleksa_praksa\nlp_internship\data-internship\saved_models\vectorizer\vec_star\tfidf_vec_en.pkl"
# pickle_save(path, tfidf_vec_en)

In [59]:
bow_train_en.shape # samples, unique tokens in vocabulary

(159330, 57942)

In [60]:
bow_vec_en.vocabulary_ # vocabulary dictionary.

{'Great': 9757,
 'price': 46580,
 'for': 35978,
 'a': 24549,
 'great': 37265,
 'looking': 41402,
 'case': 28812,
 '!': 0,
 'the': 53979,
 'money': 42887,
 'Definitely': 6395,
 'not': 43785,
 'as': 25978,
 'protective': 46902,
 'other': 44440,
 'cases': 28818,
 'but': 28335,
 'it': 39917,
 'looks': 41404,
 'Glows': 9564,
 'in': 39042,
 'dark': 31676,
 'just': 40244,
 'says': 49636,
 'Seems': 19514,
 'to': 54408,
 'be': 26810,
 'well': 56974,
 'built': 28173,
 'however': 38533,
 'bubbles': 28105,
 'did': 32492,
 'appear': 25722,
 'with': 57342,
 'all': 25287,
 'liquid': 41202,
 'inside': 39460,
 'Okay': 15308,
 'monitor': 42893,
 'It': 11493,
 'is': 39887,
 'best': 27059,
 'blurry': 27503,
 'and': 25528,
 'i': 38724,
 'have': 37880,
 'settings': 50239,
 'good': 37030,
 'I': 10824,
 'would': 57550,
 'highly': 38194,
 'recommend': 47838,
 'msi': 43107,
 'its': 39948,
 'little': 41255,
 'bit': 27252,
 'more': 42968,
 'way': 56846,
 'worth': 57534,
 'No': 14931,
 'taste': 53640,
 'Might': 14

In [61]:
print(bow_vec_en.get_feature_names_out()) # tokens

['!' '+1' '+10' ... '√≠' '√≠tem' '–∞']


In [14]:
# German
bow_train_de, bow_test_de, bow_vec_de = vectorizer(X_train_de, X_test_de, "bow")
tfidf_train_de, tfidf_test_de, tfidf_vec_de = vectorizer(X_train_de, X_test_de, "tfidf")

In [None]:
path = r"E:\aleksa_praksa\nlp_internship\data-internship\saved_models\vectorizer\vec_star\tfidf_vec_de.pkl"
# pickle_save(path, tfidf_vec_de)

In [15]:
# French
bow_train_fr, bow_test_fr, bow_vec_fr = vectorizer(X_train_fr, X_test_fr, "bow")
tfidf_train_fr, tfidf_test_fr, tfidf_vec_fr = vectorizer(X_train_fr, X_test_fr, "tfidf")

In [None]:
path = r"E:\aleksa_praksa\nlp_internship\data-internship\saved_models\vectorizer\vec_star\tfidf_vec_fr.pkl"
# pickle_save(path, tfidf_vec_fr)

In [16]:
# Spanish
bow_train_es, bow_test_es, bow_vec_es = vectorizer(X_train_es, X_test_es, "bow")
tfidf_train_es, tfidf_test_es, tfidf_vec_es = vectorizer(X_train_es, X_test_es, "tfidf")


In [None]:
path = r"E:\aleksa_praksa\nlp_internship\data-internship\saved_models\vectorizer\vec_star\bow_vec_es.pkl"
# pickle_save(path, bow_vec_es)

# Building Model

## BoW

In [65]:
param_grid_logreg = {
    "C": [0.1, 1, 3],
    "penalty": ["l2"],
    "solver": ["liblinear"],     
}

param_grid_linsvc = {
    "C": [0.1, 1, 3],
    "loss": ["hinge", "squared_hinge"],
}

param_grid_nb = {
    "alpha": [0.1, 0.5, 1.0],
}

In [66]:
param_grids = [param_grid_logreg, param_grid_linsvc, param_grid_nb]

In [67]:
models = [
    ("Logistic Regression", LogisticRegression(max_iter=300)),
    ("Linear SVM", LinearSVC()),
    ("Naive Bayes", MultinomialNB())
]

In [None]:

results_en_bow = run_grid_search(bow_train_en, y_train_en, bow_test_en, y_test_en, models, param_grids, lang_name="English")


 Grid Search for language: English


Training model: Logistic Regression

Fitting 3 folds for each of 3 candidates, totalling 9 fits





Best params for Logistic Regression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV accuracy: 0.5857






Training model: Linear SVM

Fitting 3 folds for each of 6 candidates, totalling 18 fits





Best params for Linear SVM: {'C': 0.1, 'loss': 'hinge'}
Best CV accuracy: 0.5673






Training model: Naive Bayes

Fitting 3 folds for each of 3 candidates, totalling 9 fits

Best params for Naive Bayes: {'alpha': 1.0}
Best CV accuracy: 0.5739



In [None]:
results_de_bow = run_grid_search(bow_train_de, y_train_de, bow_test_de, y_test_de, models, param_grids, lang_name="German")


 Grid Search for language: German


Training model: Logistic Regression

Fitting 3 folds for each of 3 candidates, totalling 9 fits





Best params for Logistic Regression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV accuracy: 0.5698






Training model: Linear SVM

Fitting 3 folds for each of 6 candidates, totalling 18 fits





Best params for Linear SVM: {'C': 0.1, 'loss': 'hinge'}
Best CV accuracy: 0.5489






Training model: Naive Bayes

Fitting 3 folds for each of 3 candidates, totalling 9 fits

Best params for Naive Bayes: {'alpha': 1.0}
Best CV accuracy: 0.5476



In [70]:
results_fr_bow = run_grid_search(bow_train_fr, y_train_fr,bow_test_fr, y_test_fr, models, param_grids, lang_name="French")


 Grid Search for language: French


Training model: Logistic Regression

Fitting 3 folds for each of 3 candidates, totalling 9 fits





Best params for Logistic Regression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV accuracy: 0.5444






Training model: Linear SVM

Fitting 3 folds for each of 6 candidates, totalling 18 fits





Best params for Linear SVM: {'C': 0.1, 'loss': 'hinge'}
Best CV accuracy: 0.5250






Training model: Naive Bayes

Fitting 3 folds for each of 3 candidates, totalling 9 fits

Best params for Naive Bayes: {'alpha': 1.0}
Best CV accuracy: 0.5270



In [71]:
results_es_bow = run_grid_search(bow_train_es, y_train_es,bow_test_es, y_test_es,models, param_grids,lang_name="Spanish")


 Grid Search for language: Spanish


Training model: Logistic Regression

Fitting 3 folds for each of 3 candidates, totalling 9 fits





Best params for Logistic Regression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV accuracy: 0.5354






Training model: Linear SVM

Fitting 3 folds for each of 6 candidates, totalling 18 fits





Best params for Linear SVM: {'C': 0.1, 'loss': 'hinge'}
Best CV accuracy: 0.5139






Training model: Naive Bayes

Fitting 3 folds for each of 3 candidates, totalling 9 fits

Best params for Naive Bayes: {'alpha': 1.0}
Best CV accuracy: 0.5202



In [None]:
path = r"E:\aleksa_praksa\nlp_internship\data-internship\saved_models\models\star_pred_models\lr_es_bow.pkl"
# pickle_save(path, results_es_bow["LogisticRegression"]["trained_model"])

## Tf-IDF

In [72]:
param_grid_logreg = {
    "solver": ["saga"],             
    "penalty": ["l1", "l2"],        
    "C": [0.5, 1.0, 3.0],           
    "max_iter": [300, 600],         
}


param_grid_linsvc = {
    "loss": ["hinge", "squared_hinge"],     
    "C": [0.5, 1.0, 2.0, 5.0],     
    "multi_class": ["ovr"],
}

param_grid_nb = {
    "alpha": [0.1, 0.5, 1.0, 2.0, 5.0],    
    "fit_prior": [True, False]           
}

param_grids = [param_grid_logreg, param_grid_linsvc, param_grid_nb]

In [73]:
models = [
    ('LogisticRegression', LogisticRegression()),
    ('LinearSVC', LinearSVC()),
    ("MultinomialNB", MultinomialNB())
]

In [74]:
n_iters = [8, 8, 8]

In [75]:
results_en_tfidf = run_random_search(tfidf_train_en, y_train_en, tfidf_test_en, y_test_en, models, param_grids, n_iters, lang_name = 'English')


 Random Search for language: English 


Training model: LogisticRegression

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for LogisticRegression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 300, 'C': 0.5}
Best CV accuracy: 0.5882


Training model: LinearSVC

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for LinearSVC: {'multi_class': 'ovr', 'loss': 'squared_hinge', 'C': 0.5}
Best CV accuracy: 0.5687


Training model: MultinomialNB

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for MultinomialNB: {'fit_prior': True, 'alpha': 5.0}
Best CV accuracy: 0.5644



In [None]:
path = r"E:\aleksa_praksa\nlp_internship\data-internship\saved_models\models\star_pred_models\lr_en_tfidf.pkl"
# pickle_save(path, results_en_tfidf["LogisticRegression"]["trained_model"])

In [76]:
results_de_tfidf = run_random_search(tfidf_train_de, y_train_de,tfidf_test_de, y_test_de,models, param_grids, n_iters,lang_name="German")


 Random Search for language: German 


Training model: LogisticRegression

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for LogisticRegression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 600, 'C': 0.5}
Best CV accuracy: 0.5683


Training model: LinearSVC

Fitting 2 folds for each of 8 candidates, totalling 16 fits





Best params for LinearSVC: {'multi_class': 'ovr', 'loss': 'hinge', 'C': 1.0}
Best CV accuracy: 0.5443






Training model: MultinomialNB

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for MultinomialNB: {'fit_prior': False, 'alpha': 2.0}
Best CV accuracy: 0.5308



In [None]:
path = r"E:\aleksa_praksa\nlp_internship\data-internship\saved_models\models\star_pred_models\lr_de_tfidf.pkl"
# pickle_save(path, results_de_tfidf["LogisticRegression"]["trained_model"])

In [77]:
results_fr_tfidf = run_random_search(tfidf_train_fr, y_train_fr, tfidf_test_fr, y_test_fr, models, param_grids, n_iters,lang_name="French")


 Random Search for language: French 


Training model: LogisticRegression

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for LogisticRegression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 300, 'C': 0.5}
Best CV accuracy: 0.5436


Training model: LinearSVC

Fitting 2 folds for each of 8 candidates, totalling 16 fits





Best params for LinearSVC: {'multi_class': 'ovr', 'loss': 'hinge', 'C': 1.0}
Best CV accuracy: 0.5214






Training model: MultinomialNB

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for MultinomialNB: {'fit_prior': False, 'alpha': 5.0}
Best CV accuracy: 0.5215



In [None]:
path = r"E:\aleksa_praksa\nlp_internship\data-internship\saved_models\models\star_pred_models\lr_fr_tfidf.pkl"
# pickle_save(path, results_fr_tfidf["LogisticRegression"]["trained_model"])

In [78]:
results_es_tfidf = run_random_search(tfidf_train_es, y_train_es, tfidf_test_es, y_test_es, models, param_grids, n_iters,lang_name="Spanish")


 Random Search for language: Spanish 


Training model: LogisticRegression

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for LogisticRegression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 300, 'C': 0.5}
Best CV accuracy: 0.5335


Training model: LinearSVC

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for LinearSVC: {'multi_class': 'ovr', 'loss': 'squared_hinge', 'C': 0.5}
Best CV accuracy: 0.5137


Training model: MultinomialNB

Fitting 2 folds for each of 8 candidates, totalling 16 fits

Best params for MultinomialNB: {'fit_prior': False, 'alpha': 2.0}
Best CV accuracy: 0.5142



In [79]:
print(results_en_bow['Logistic Regression'])

{'best_params': {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}, 'best_score': 0.585746563735643, 'test_accuracy': 0.5930007782491904, 'classification_report': {'1': {'precision': 0.647683290943271, 'recall': 0.7357115940208516, 'f1-score': 0.6888967301811338, 'support': 7961.0}, '2': {'precision': 0.4947930939983557, 'recall': 0.45318775100401604, 'f1-score': 0.47307742696187605, 'support': 7968.0}, '3': {'precision': 0.5092605683585466, 'recall': 0.4525125628140704, 'f1-score': 0.479212399388013, 'support': 7960.0}, '4': {'precision': 0.5744499645138396, 'recall': 0.507588109870814, 'f1-score': 0.5389532560926887, 'support': 7973.0}, '5': {'precision': 0.6938340089609558, 'recall': 0.8159578471960858, 'f1-score': 0.7499567598731623, 'support': 7971.0}, 'accuracy': 0.5930007782491904, 'macro avg': {'precision': 0.5840041853549938, 'recall': 0.5929915729811676, 'f1-score': 0.5860193144993748, 'support': 39833.0}, 'weighted avg': {'precision': 0.584015078673686, 'recall': 0.593000778

In [None]:
log_results_per_model("English_", "BoW",  results_en_bow)
log_results_per_model("English", "TFIDF", results_en_tfidf)


log_results_per_model("German",  "BoW",   results_de_bow)
log_results_per_model("German",  "TFIDF", results_de_tfidf)


log_results_per_model("French",  "BoW",   results_fr_bow)
log_results_per_model("French",  "TFIDF", results_fr_tfidf)


log_results_per_model("Spanish", "BoW",   results_es_bow)
log_results_per_model("Spanish", "TFIDF", results_es_tfidf)


2025/11/21 15:02:43 INFO mlflow.tracking.fluent: Experiment with name 'Star_Prediction_English__BoW' does not exist. Creating a new experiment.


üèÉ View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/391554181688874518/runs/647f693cc18045b1adf32085132d90e3
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/391554181688874518




üèÉ View run Linear SVM at: http://127.0.0.1:5000/#/experiments/391554181688874518/runs/1d5aee3e123c4b0996de0e2cfe9c9999
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/391554181688874518


2025/11/21 15:02:57 INFO mlflow.tracking.fluent: Experiment with name 'Star_Prediction_English_TFIDF' does not exist. Creating a new experiment.


üèÉ View run Naive Bayes at: http://127.0.0.1:5000/#/experiments/391554181688874518/runs/2514a17e925f48ecb5681c077d6fe97c
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/391554181688874518
Logged all models for English_ - BoW




üèÉ View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/889529606865708664/runs/a41a7535d97f4a6fa6314185c65983a4
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/889529606865708664




üèÉ View run LinearSVC at: http://127.0.0.1:5000/#/experiments/889529606865708664/runs/74f26bff77a24de1a53b0d8712f07cfd
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/889529606865708664


2025/11/21 15:03:09 INFO mlflow.tracking.fluent: Experiment with name 'Star_Prediction_German_BoW' does not exist. Creating a new experiment.


üèÉ View run MultinomialNB at: http://127.0.0.1:5000/#/experiments/889529606865708664/runs/dc7d38db88ce4ab2a6fb65ec10d1641c
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/889529606865708664
Logged all models for English - TFIDF




üèÉ View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/961458323604230828/runs/6c0650a7e80d4d0ca9621eba605fbf70
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/961458323604230828




üèÉ View run Linear SVM at: http://127.0.0.1:5000/#/experiments/961458323604230828/runs/292e9ade0162475694fedac660c17b77
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/961458323604230828


2025/11/21 15:03:22 INFO mlflow.tracking.fluent: Experiment with name 'Star_Prediction_German_TFIDF' does not exist. Creating a new experiment.


üèÉ View run Naive Bayes at: http://127.0.0.1:5000/#/experiments/961458323604230828/runs/c105b7f150124dc1967902988dd25705
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/961458323604230828
Logged all models for German - BoW




üèÉ View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/784009203988991698/runs/79b4dd638a294ae4b974f1662c72b9a8
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/784009203988991698




üèÉ View run LinearSVC at: http://127.0.0.1:5000/#/experiments/784009203988991698/runs/93bce599b10244c1ad701d1766d84b84
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/784009203988991698


2025/11/21 15:05:35 INFO mlflow.tracking.fluent: Experiment with name 'Star_Prediction_French_BoW' does not exist. Creating a new experiment.


üèÉ View run MultinomialNB at: http://127.0.0.1:5000/#/experiments/784009203988991698/runs/2174e095bb234a3db0e53dbef4d1a107
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/784009203988991698
Logged all models for German - TFIDF




üèÉ View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/611871693143042925/runs/cea06b4c91fd4ce089f4e63ea2b9bdc7
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/611871693143042925




üèÉ View run Linear SVM at: http://127.0.0.1:5000/#/experiments/611871693143042925/runs/11ccfb8405fa4f91a00bb83215d2d376
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/611871693143042925


2025/11/21 15:05:46 INFO mlflow.tracking.fluent: Experiment with name 'Star_Prediction_French_TFIDF' does not exist. Creating a new experiment.


üèÉ View run Naive Bayes at: http://127.0.0.1:5000/#/experiments/611871693143042925/runs/848d9b4b038c4f0f958e276b4592dee0
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/611871693143042925
Logged all models for French - BoW




üèÉ View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/716947097386304286/runs/8233179a8e2b4363b94ebe00319468ea
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/716947097386304286




üèÉ View run LinearSVC at: http://127.0.0.1:5000/#/experiments/716947097386304286/runs/93c94d7f1c7146c986ad743d5745bd92
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/716947097386304286


2025/11/21 15:05:58 INFO mlflow.tracking.fluent: Experiment with name 'Star_Prediction_Spanish_BoW' does not exist. Creating a new experiment.


üèÉ View run MultinomialNB at: http://127.0.0.1:5000/#/experiments/716947097386304286/runs/2caea7f3c53d4d389e7d84f5d3c3d4a4
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/716947097386304286
Logged all models for French - TFIDF




üèÉ View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/148784029907323926/runs/6104047490df4d81a539723ce0113639
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/148784029907323926




üèÉ View run Linear SVM at: http://127.0.0.1:5000/#/experiments/148784029907323926/runs/19876f30a73041e88c1f054d98fefabe
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/148784029907323926


2025/11/21 15:06:11 INFO mlflow.tracking.fluent: Experiment with name 'Star_Prediction_Spanish_TFIDF' does not exist. Creating a new experiment.


üèÉ View run Naive Bayes at: http://127.0.0.1:5000/#/experiments/148784029907323926/runs/4f4a2fc0f6f04937ad11dff82137aa41
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/148784029907323926
Logged all models for Spanish - BoW




üèÉ View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/506771660971723247/runs/6daee4e610134b42b0459bf406d68c25
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/506771660971723247




üèÉ View run LinearSVC at: http://127.0.0.1:5000/#/experiments/506771660971723247/runs/9c02f2732a4c4024b6a814274164f2b6
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/506771660971723247




üèÉ View run MultinomialNB at: http://127.0.0.1:5000/#/experiments/506771660971723247/runs/f77c7c57dbbe4b07807c96e2b756361c
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/506771660971723247
Logged all models for Spanish - TFIDF


In [17]:
from xgboost import XGBClassifier

In [18]:
param_grid_xgb = {
    'max_depth': [4, 6],
    'min_child_weight': [1, 3, 5],
    'n_estimators': [200, 400],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

In [19]:
model = XGBClassifier(objective="multi:softmax", num_class=5, eval_metric="mlogloss")

In [20]:
grid = RandomizedSearchCV(model, param_grid_xgb, n_iter = 20, cv = 2, scoring = 'accuracy', n_jobs=-1, verbose=1)

In [21]:
y_train_xgb = y_train_en.values - 1
y_test_xgb = y_test_en.values - 1

In [22]:
grid.fit(tfidf_train_en, y_train_xgb)

Fitting 2 folds for each of 20 candidates, totalling 40 fits


0,1,2
,estimator,"XGBClassifier..._class=5, ...)"
,param_distributions,"{'colsample_bytree': [0.6, 0.8, ...], 'learning_rate': [0.05, 0.1], 'max_depth': [4, 6], 'min_child_weight': [1, 3, ...], ...}"
,n_iter,20
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,2
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,
,enable_categorical,False


In [23]:
best_params = grid.best_params_


In [24]:
print(f"\nBest params for: {best_params}")
print(f"Best CV accuracy: {grid.best_score_:.4f}\n")


Best params for: {'subsample': 0.8, 'n_estimators': 400, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Best CV accuracy: 0.5768



In [25]:
final_model = XGBClassifier(**best_params)


In [26]:
final_model.fit(tfidf_train_en, y_train_xgb)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,
,enable_categorical,False


In [27]:
preds = final_model.predict(tfidf_test_en)

In [None]:
preds = preds + 1 # jedino za ovaj mora da ide od 0

In [30]:
class_report = classification_report(y_test_en, preds)
print(class_report)

              precision    recall  f1-score   support

           1       0.64      0.70      0.67      7961
           2       0.48      0.49      0.48      7968
           3       0.51      0.46      0.48      7960
           4       0.57      0.51      0.54      7973
           5       0.69      0.76      0.72      7971

    accuracy                           0.58     39833
   macro avg       0.58      0.58      0.58     39833
weighted avg       0.58      0.58      0.58     39833

