## Packages

In [13]:
import pickle
import pandas as pd

## Classes and Functions

## Dependencies

In [14]:
# Read the preprocessed training and validation splits into separate DataFrames.
train = pd.read_csv(filepath_or_buffer='../../assets/data/splits/train/preprocessed.csv')
val = pd.read_csv(filepath_or_buffer='../../assets/data/splits/val/preprocessed.csv')

In [15]:
# Pull the target column ('label') out of each split.
y_train, y_val = train['label'], val['label']

In [16]:
# Load the pickled `cv_vec` object; it is used further down through
# `.transform` and `.get_feature_names_out` (presumably a fitted count vectorizer).
# NOTE(review): pickle.load can execute arbitrary code — only load trusted artifacts.
with open('../../assets/traditional_assets/cv_vec.pkl', mode='rb') as cv_vec_file:
    cv_vec = pickle.load(cv_vec_file)


In [17]:
# Load the pickled `tfidf_vec` object; it is used further down through
# `.transform` and `.get_feature_names_out` (presumably a fitted TF-IDF vectorizer).
# NOTE(review): pickle.load can execute arbitrary code — only load trusted artifacts.
with open('../../assets/traditional_assets/tfidf_vec.pkl', mode='rb') as tfidf_vec_file:
    tfidf_vec = pickle.load(tfidf_vec_file)

## Grid Params

### KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter grid searched for KNeighborsClassifier.
knn_params = dict(
    n_neighbors=[11, 21, 40, 60, 80, 100],
    metric=['cosine'],
    weights=['uniform', 'distance'],
)

### SVM

In [19]:
from sklearn.svm import SVC

# Hyper-parameter grid searched for SVC.
svm_params = dict(
    C=[10, 50, 100],
    kernel=['rbf', 'linear', 'poly'],
)

### Naive Bayes

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Hyper-parameter grid searched for MultinomialNB.
nb_params = dict(
    alpha=[0.1, 1, 10],
    fit_prior=[True, False],
)

### Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameter grid searched for LogisticRegression.
# NOTE(review): per the scikit-learn docs the 'liblinear' solver does not accept
# penalty=None, so those grid combinations are expected to fail to fit and be
# scored via GridSearchCV's `error_score` rather than being real candidates.
lr_params = {
    'penalty': ['l2', None],
    'C': [0.1, 1, 10],
    # 'newton-cg' was misspelled as 'newtgon-cg', which is not a valid solver
    # name, so that solver was never actually evaluated.
    'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],
}

## Tuning and Evaluation

In [22]:
import numpy as np
from sklearn.model_selection import PredefinedSplit

# Stack the training rows on top of the validation rows so one array can be
# handed to a search, with a fresh 0..n-1 index on the combined frame.
X = pd.concat([train, val], axis=0, ignore_index=True)
y = np.concatenate((y_train, y_val), axis=0)

# One fold id per row of X, in the same order as the concat above.
# PredefinedSplit convention: -1 = never used as a test row, 0 = member of test fold 0.
split_index = [-1 for _ in range(len(train))] + [0 for _ in range(len(val))]
pds = PredefinedSplit(test_fold=split_index)





### Count Vectorizer

In [23]:
# Vectorize every title with the loaded count vectorizer and densify the result,
# then preview it as a frame whose columns are the vectorizer's feature names.
X_cv = cv_vec.transform(X['title']).toarray()
pd.DataFrame(data=X_cv, columns=cv_vec.get_feature_names_out())

Unnamed: 0,014cota,015l,033litro,100g,10h30,10x,110mil,13barril,13bi,14h,...,zero,zetta,zhoushan,ziviani,zombou,zona,zoom,zoox,zuckerberg,zup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16205,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16207,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.model_selection import GridSearchCV

# from hypopt import GridSearch

# Candidate estimators (first tuple element) paired position-by-position with
# the hyper-parameter grid to search for each (second tuple element).
model_params = ([KNeighborsClassifier(), SVC(), MultinomialNB(), LogisticRegression()],
                [knn_params, svm_params, nb_params, lr_params])

# One summary dict per model family, collected for the results table below.
list_best_models_params = []
for model, params in zip(model_params[0], model_params[1]):
    # cv=pds makes the search train on the rows marked -1 and score on the rows
    # marked 0 in `split_index`. Without it GridSearchCV silently falls back to
    # its default 5-fold CV over the concatenated train+val rows, and the
    # predefined split built earlier is never used.
    # With the default refit=True, best_estimator_ is refit on all of X_cv.
    gs = GridSearchCV(model,
                      param_grid=params,
                      cv=pds,
                      )

    gs.fit(X_cv, y)
    print(f"Best CV results for {model.__class__.__name__}")
    # best_score_ is the score on the held-out fold, not on the training rows.
    print("Best score on validation fold: " + str(gs.best_score_))
    print("Best estimator: " + str(gs.best_estimator_))
    print("Best parameter set: " + str(gs.best_params_))

    store_best_model_configs = {
        'model_name': model.__class__.__name__,
        'best_score': gs.best_score_,
        'best_estimator': gs.best_estimator_,
        'best_params': gs.best_params_
    }

    list_best_models_params.append(store_best_model_configs)

# Persist the per-model summary (estimators are written as their string repr).
df_best_models_params = pd.DataFrame(list_best_models_params)
df_best_models_params.to_csv('../../assets/traditional_assets/best_models_params_cv.csv', index=False)

df_best_models_params

In [None]:

# Choose the estimator with the highest best_score across all searched model
# families. `gs` only holds the search for the LAST model in the loop, so
# `gs.best_estimator_` is not necessarily the overall winner.
# This must run before the TF-IDF search cell reassigns `df_best_models_params`.
cv_best_model = df_best_models_params.loc[
    df_best_models_params['best_score'].idxmax(), 'best_estimator'
]
cv_best_model

### TF-IDF

In [None]:
# Vectorize every title with the loaded TF-IDF vectorizer and densify the result.
X_tfidf = tfidf_vec.transform(X['title']).toarray()
# Preview the TF-IDF matrix itself (the preview previously wrapped X_cv, i.e. the
# count-vectorizer matrix, under the TF-IDF feature names).
pd.DataFrame(X_tfidf, columns=tfidf_vec.get_feature_names_out())

In [None]:
# Candidate estimators (first tuple element) paired position-by-position with
# the hyper-parameter grid to search for each (second tuple element).
model_params = ([KNeighborsClassifier(), SVC(), MultinomialNB(), LogisticRegression()],
                [knn_params, svm_params, nb_params, lr_params])

# One summary dict per model family, collected for the results table below.
list_best_models_params = []
for model, params in zip(model_params[0], model_params[1]):
    # cv=pds makes the search train on the rows marked -1 and score on the rows
    # marked 0 in `split_index`. Without it GridSearchCV silently falls back to
    # its default 5-fold CV over the concatenated train+val rows, and the
    # predefined split built earlier is never used.
    # With the default refit=True, best_estimator_ is refit on all of X_tfidf.
    gs = GridSearchCV(model,
                      param_grid=params,
                      cv=pds,
                      )
    gs.fit(X_tfidf, y)
    print(f"Best TF-IDF results for {model.__class__.__name__}")
    # best_score_ is the score on the held-out fold, not on the training rows.
    print("Best score on validation fold: " + str(gs.best_score_))
    print("Best estimator: " + str(gs.best_estimator_))
    print("Best parameter set: " + str(gs.best_params_) + "\n")
    store_best_model_configs = {
        'model_name': model.__class__.__name__,
        'best_score': gs.best_score_,
        'best_estimator': gs.best_estimator_,
        'best_params': gs.best_params_
    }

    list_best_models_params.append(store_best_model_configs)

# Persist the per-model summary (estimators are written as their string repr).
df_best_models_params = pd.DataFrame(list_best_models_params)
df_best_models_params.to_csv('../../assets/traditional_assets/best_models_params_tfidf.csv', index=False)
df_best_models_params


In [None]:
# Choose the estimator with the highest best_score across all searched model
# families. `gs` only holds the search for the LAST model in the loop, so
# `gs.best_estimator_` is not necessarily the overall winner.
# Relies on `df_best_models_params` still holding the TF-IDF search results.
tfidf_best_model = df_best_models_params.loc[
    df_best_models_params['best_score'].idxmax(), 'best_estimator'
]
tfidf_best_model

## Outputs

In [None]:
# Persist each vectorizer together with the model selected on its features, as
# a (vectorizer, model) tuple.

# Filename fixed: it was 'cv_set,pkl' (comma instead of dot), which produced a
# file without a .pkl extension.
with open('../../assets/traditional_assets/cv_set.pkl', 'wb') as fout:
    pickle.dump((cv_vec, cv_best_model), fout)

# Pair the TF-IDF vectorizer with the TF-IDF model; this previously dumped
# `cv_best_model`, a model fitted on count-vectorizer features.
with open('../../assets/traditional_assets/tfidf_set.pkl', 'wb') as fout:
    pickle.dump((tfidf_vec, tfidf_best_model), fout)
