## Packages

In [None]:
import pickle
import pandas as pd

## Classes and Functions

## Dependencies

In [None]:
# Bag-of-words feature matrices: one CSV per (split, vectoriser) combination.
SPLITS_DIR = '../assets/data/splits'

X_train_cv = pd.read_csv(f'{SPLITS_DIR}/train/bow_cv.csv')
X_val_cv = pd.read_csv(f'{SPLITS_DIR}/val/bow_cv.csv')
X_train_tfidf = pd.read_csv(f'{SPLITS_DIR}/train/bow_tfidf.csv')
X_val_tfidf = pd.read_csv(f'{SPLITS_DIR}/val/bow_tfidf.csv')

In [None]:
# Targets: the 'label' column of each split's raw.csv (train first, then val).
y_train, y_val = (
    pd.read_csv(f'../assets/data/splits/{split}/raw.csv')['label']
    for split in ('train', 'val')
)

In [None]:
# Load the pickled count-vectoriser artefact into `cv_vec`
# (presumably the fitted vectoriser behind bow_cv.csv — TODO confirm).
# NOTE: unpickling can execute arbitrary code; only load trusted files.
cv_vec_path = '../../assets/traditional_assets/cv_vec.pkl'
with open(cv_vec_path, 'rb') as fin:
    cv_vec = pickle.load(fin)


In [None]:
# Load the pickled TF-IDF vectoriser artefact into `tfidf_vec`
# (presumably the fitted vectoriser behind bow_tfidf.csv — TODO confirm).
# NOTE: unpickling can execute arbitrary code; only load trusted files.
tfidf_vec_path = '../../assets/traditional_assets/tfidf_vec.pkl'
with open(tfidf_vec_path, 'rb') as fin:
    tfidf_vec = pickle.load(fin)

## Grid Params

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Search grid for KNeighborsClassifier: neighbourhood size and vote weighting
# are varied, while the distance metric is fixed to cosine.
knn_params = dict(
    n_neighbors=[11, 21, 40, 60, 80, 100],
    metric=['cosine'],
    weights=['uniform', 'distance'],
)

### SVM

In [None]:
from sklearn.svm import SVC

# Search grid for SVC: regularisation strength C crossed with kernel type.
svm_params = dict(
    C=[10, 50, 100],
    kernel=['rbf', 'linear', 'poly'],
)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Search grid for MultinomialNB: smoothing parameter alpha crossed with
# whether class priors are learned (fit_prior).
nb_params = dict(
    alpha=[0.1, 1, 10],
    fit_prior=[True, False],
)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Search grid for LogisticRegression, expressed as two sub-grids so that only
# valid penalty/solver combinations are tried:
#   * 'liblinear' does not support penalty=None, so it appears only with 'l2';
#   * C has no effect when penalty=None, so it is not varied in that sub-grid.
# The Newton conjugate-gradient solver is spelled 'newton-cg' in scikit-learn.
lr_params = [
    {
        'penalty': ['l2'],
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],
    },
    {
        'penalty': [None],
        'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
    },
]

## Tuning and Evaluation

### Count Vectorizer

In [None]:
# Reference: hypopt usage pattern, tuning against an explicit validation split.
# opt = GridSearch(model = SVR(), param_grid = param_grid)
# opt.fit(X_train, y_train, X_val, y_val)

In [None]:
from hypopt import GridSearch

# Candidate estimators (first list) and their hyper-parameter grids (second
# list), matched by position.
model_params = ([KNeighborsClassifier(), SVC(), MultinomialNB(), LogisticRegression()],
                [knn_params, svm_params, nb_params, lr_params])

# One summary record per model family, collected for the comparison table.
list_best_models_params = []
for model, params in zip(*model_params):
    model_name = model.__class__.__name__
    gs = GridSearch(model, param_grid=params)

    # The validation split is passed explicitly, so every parameter setting is
    # scored on (X_val_cv, y_val), not on the training data.
    # NOTE(review): confirm the installed hypopt exposes the sklearn-style
    # `best_score_` / `best_params_` attribute names used below.
    gs.fit(X_train_cv, y_train, X_val_cv, y_val, scoring='accuracy')

    print(f"Best CV results for {model_name}")
    print("Best score on validation set: " + str(gs.best_score_))
    print("Best estimator: " + str(gs.best_estimator_))
    print("Best parameter set: " + str(gs.best_params_))

    store_best_model_configs = {
        'model_name': model_name,
        'best_score': gs.best_score_,
        'best_estimator': gs.best_estimator_,
        'best_params': gs.best_params_,
    }
    list_best_models_params.append(store_best_model_configs)

# Persist the per-model summary, then show it as the cell output.
df_best_models_params = pd.DataFrame(list_best_models_params)
df_best_models_params.to_csv('../../assets/traditional_assets/best_models_params_cv.csv', index=False)

df_best_models_params

In [None]:

# Select the estimator with the highest recorded score across all tuned model
# families. After the loop, `gs` refers only to the last search that ran
# (LogisticRegression), so `gs.best_estimator_` is not necessarily the best.
# Run this before the TF-IDF section, which rebinds `list_best_models_params`.
cv_best_model = max(
    list_best_models_params,
    key=lambda record: record['best_score'],
)['best_estimator']
cv_best_model

### TF-IDF

In [None]:
# Candidate estimators (first list) and their hyper-parameter grids (second
# list), matched by position. Fresh estimator instances are created for this
# TF-IDF run.
model_params = ([KNeighborsClassifier(), SVC(), MultinomialNB(), LogisticRegression()],
                [knn_params, svm_params, nb_params, lr_params])

# One summary record per model family, collected for the comparison table.
list_best_models_params = []
for model, params in zip(*model_params):
    model_name = model.__class__.__name__
    gs = GridSearch(model, param_grid=params)

    # The validation split is passed explicitly, so every parameter setting is
    # scored on (X_val_tfidf, y_val), not on the training data.
    # NOTE(review): confirm the installed hypopt exposes the sklearn-style
    # `best_score_` / `best_params_` attribute names used below.
    gs.fit(X_train_tfidf, y_train, X_val_tfidf, y_val, scoring='accuracy')

    print(f"Best TF-IDF results for {model_name}")
    print("Best score on validation set: " + str(gs.best_score_))
    print("Best estimator: " + str(gs.best_estimator_))
    print("Best parameter set: " + str(gs.best_params_) + "\n")

    store_best_model_configs = {
        'model_name': model_name,
        'best_score': gs.best_score_,
        'best_estimator': gs.best_estimator_,
        'best_params': gs.best_params_,
    }
    list_best_models_params.append(store_best_model_configs)

# Persist the per-model summary, then show it as the cell output.
df_best_models_params = pd.DataFrame(list_best_models_params)
df_best_models_params.to_csv('../../assets/traditional_assets/best_models_params_tfidf.csv', index=False)
df_best_models_params


In [None]:
# Select the estimator with the highest recorded score across all tuned model
# families. After the loop, `gs` refers only to the last search that ran
# (LogisticRegression), so `gs.best_estimator_` is not necessarily the best.
tfidf_best_model = max(
    list_best_models_params,
    key=lambda record: record['best_score'],
)['best_estimator']
tfidf_best_model

## Outputs

In [None]:
# Persist each vectoriser together with the model selected for its features,
# so a consumer can unpickle a matching (vectoriser, model) pair.
with open('../../assets/traditional_assets/cv_set.pkl', 'wb') as fout:
    pickle.dump((cv_vec, cv_best_model), fout)

# The TF-IDF bundle must carry the model tuned on TF-IDF features, not the
# count-vectoriser one.
with open('../../assets/traditional_assets/tfidf_set.pkl', 'wb') as fout:
    pickle.dump((tfidf_vec, tfidf_best_model), fout)
