In [1]:
import pickle
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
import pandas as pd



## Packages and Assets

## Classes and Functions

## Dependencies

In [2]:
# Read the preprocessed training split and set its target column aside.
train = pd.read_csv('../../assets/data/splits/train/preprocessed.csv')
y_train = train.loc[:, 'label']
# The X_train / X_val feature matrices are not built here; they are created later.

## Models and Params

### KNN

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# Search grid for KNeighborsClassifier: every odd k from 3 to 21,
# both neighbour-weighting schemes, cosine distance only.
knn_params = dict(
    n_neighbors=list(range(3, 22, 2)),
    weights=['uniform', 'distance'],
    metric=['cosine'],
)

### SVM

In [4]:
from sklearn.svm import SVC

# Search grid for SVC: three regularisation strengths crossed with two kernels.
svm_params = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
)

### Naive Bayes

In [5]:
from sklearn.naive_bayes import MultinomialNB

# Search grid for MultinomialNB: three smoothing values, with and without fit_prior.
nb_params = dict(
    alpha=[0.1, 1, 10],
    fit_prior=[True, False],
)

### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

# Search grid for LogisticRegression: L1 and L2 penalties at three values of C,
# always with the liblinear solver.
lr_params = dict(
    penalty=['l1', 'l2'],
    C=[0.1, 1, 10],
    solver=['liblinear'],
)

## CountVectorizer

### Declaration and fit

In [7]:
# Unigram bag-of-words over the titles. fit_transform learns the vocabulary and
# returns the count matrix, which is then densified into a plain array.
cv_vec = CountVectorizer(ngram_range=(1, 1))
title_counts_sparse = cv_vec.fit_transform(train['title'])
X_train_cv = title_counts_sparse.toarray()



### Features visualization

In [8]:
# Label each count column with its vocabulary term so the matrix can be read as a table.
cv_feature_names = cv_vec.get_feature_names_out()
X_train_names = pd.DataFrame(data=X_train_cv, columns=cv_feature_names)
X_train_names

Unnamed: 0,00945litro,014cota,015l,033litro,10h30,110mil,13barril,13bi,14h,16h30,...,zeram,zerar,zero,zerou,ziviani,zona,zoom,zuckerberg,zuckerman,zup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16205,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16207,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Evaluate models

In [9]:
from sklearn.model_selection import GridSearchCV

# Grid-search every candidate classifier on the CountVectorizer features and
# collect the best configuration of each into a summary table.

# Estimators and their hyper-parameter grids, kept in matching order.
model_params = ([KNeighborsClassifier(), SVC(), MultinomialNB(), LogisticRegression()],
                [knn_params, svm_params, nb_params, lr_params])

# One summary record per model; converted to a DataFrame after the loop.
list_best_models_params = []
for model, params in zip(model_params[0], model_params[1]):
    # cv must be at least 2: scikit-learn's k-fold splitters reject n_splits=1,
    # so the previous cv=1 made fit() raise a ValueError before training anything.
    # 5 folds is scikit-learn's own default.
    gs = GridSearchCV(model,
                      param_grid=params,
                      scoring='accuracy',
                      cv=5)

    gs.fit(X_train_cv, y_train)

    # Note: best_score_ is the mean cross-validated accuracy of the best
    # parameter combination, not a score on the full training set.
    print(f"Best CV results for {model.__class__.__name__}")
    print(f"Best Score of train set: {gs.best_score_}")
    print(f"Best estimator: {gs.best_estimator_}")
    print(f"Best parameter set: {gs.best_params_}")

    store_best_model_configs = {
        'model_name': model.__class__.__name__,
        'best_score': gs.best_score_,
        'best_estimator': gs.best_estimator_,
        'best_params': gs.best_params_
    }
    list_best_models_params.append(store_best_model_configs)

# TODO: once a validation split (X_val_cv, y_val) exists, also report
# gs.score(X_val_cv, y_val) per model and keep the winner as cv_best_model.
df_best_models_params = pd.DataFrame(list_best_models_params)
df_best_models_params

KeyboardInterrupt: 

## TF-IDF

### Declaration and fit

In [None]:
# TfidfTransformer re-weights an existing term-count matrix; it does not tokenise
# text, so fitting it directly on the raw title strings fails. Feed it the counts
# from the already-fitted CountVectorizer (cv_vec) instead. This also keeps the
# tf-idf columns in the same order as cv_vec.get_feature_names_out().
tfidf_vec = TfidfTransformer()
X_train_tfidf = tfidf_vec.fit_transform(cv_vec.transform(train['title'])).toarray()

### Features visualization

In [None]:
# Show the tf-idf matrix as a table. Column labels are taken from cv_vec's vocabulary.
# This rebinds X_train_names, which previously held the raw-count table.
tfidf_feature_names = cv_vec.get_feature_names_out()
X_train_names = pd.DataFrame(data=X_train_tfidf, columns=tfidf_feature_names)
X_train_names

### Evaluate models

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid-search every candidate classifier on the TF-IDF features and collect the
# best configuration of each into a summary table.

# Estimators and their hyper-parameter grids, kept in matching order.
model_params = ([KNeighborsClassifier(), SVC(), MultinomialNB(), LogisticRegression()],
                [knn_params, svm_params, nb_params, lr_params])

# One summary record per model; converted to a DataFrame after the loop.
list_best_models_params = []
for model, params in zip(model_params[0], model_params[1]):
    # cv must be at least 2: scikit-learn's k-fold splitters reject n_splits=1,
    # so the previous cv=1 made fit() raise a ValueError before training anything.
    # 5 folds is scikit-learn's own default.
    gs = GridSearchCV(model,
                      param_grid=params,
                      scoring='accuracy',
                      cv=5)

    gs.fit(X_train_tfidf, y_train)

    # Note: best_score_ is the mean cross-validated accuracy of the best
    # parameter combination, not a score on the full training set.
    print(f"Best TF-IDF results for {model.__class__.__name__}")
    print(f"Best Score on train set: {gs.best_score_}")
    print(f"Best estimator: {gs.best_estimator_}")
    print(f"Best parameter set: {gs.best_params_}\n")

    store_best_model_configs = {
        'model_name': model.__class__.__name__,
        'best_score': gs.best_score_,
        'best_estimator': gs.best_estimator_,
        'best_params': gs.best_params_
    }
    list_best_models_params.append(store_best_model_configs)

# TODO: once a validation split (X_val, y_val) exists, also report
# gs.score(X_val, y_val) per model, decide on the best model and keep it as
# tfidf_best_model.
df_best_models_params = pd.DataFrame(list_best_models_params)
df_best_models_params


## Outputs

In [None]:



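# NOTE: left disabled because cv_best_model / tfidf_best_model are never assigned yet.
# pickle.dump needs a file opened for binary writing ('wb'); with 'rb' the dump would fail.
# with open('../../assets/traditional_assets/cv_set.pkl', 'wb') as fout:
#     pickle.dump((cv_vec, cv_best_model), fout)

# with open('../../assets/traditional_assets/tfidf_set.pkl', 'wb') as fout:
#     pickle.dump((tfidf_vec, tfidf_best_model), fout)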
