In [19]:
import pickle
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
import pandas as pd



## Packages and Asset

## Classes and Functions

## Dependencies

In [20]:
# Read the preprocessed training split from disk and keep the target column
# in its own variable for the grid searches further down.
train_csv_path = '../../assets/data/splits/train/preprocessed.csv'
train = pd.read_csv(train_csv_path)
y_train = train['label']
# Feature matrices (X_train_*) are built later, once per vectorizer.

## Models and Params

### KNN

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter grid for the k-nearest-neighbours classifier:
# several neighbourhood sizes, cosine distance only, both weighting schemes.
knn_params = dict(
    n_neighbors=[11, 21, 40, 60, 80, 100],
    metric=['cosine'],
    weights=['uniform', 'distance'],
)

### SVM

In [22]:
from sklearn.svm import SVC

# Hyper-parameter grid for the support-vector classifier:
# three regularisation strengths crossed with three kernels.
svm_params = dict(
    C=[10, 50, 100],
    kernel=['rbf', 'linear', 'poly'],
)

### Naive Bayes

In [23]:
from sklearn.naive_bayes import MultinomialNB

# Hyper-parameter grid for multinomial Naive Bayes:
# three smoothing values, with and without learned class priors.
nb_params = dict(
    alpha=[0.1, 1, 10],
    fit_prior=[True, False],
)

### Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameter grid for logistic regression.
# Fix: the solver is spelled 'newton-cg'; the previous 'newtgon-cg' is not a
# valid solver name, so that solver was never actually evaluated.
# NOTE(review): not every solver/penalty pair is accepted by LogisticRegression
# (e.g. liblinear with penalty=None) — confirm how failed fits should be scored.
lr_params = {
    'penalty': ['l2', None],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga']
}

## CountVectorizer

### Declaration and fit

In [25]:
# Unigram bag-of-words over the 'title' column: learn the vocabulary and
# count terms in one pass, then expand the sparse result to a dense array.
cv_vec = CountVectorizer(ngram_range=(1, 1))
title_counts_sparse = cv_vec.fit_transform(train['title'])
X_train_cv = title_counts_sparse.toarray()



### Features visualization

In [26]:
# Label each count column with the vocabulary term it represents, then show
# the table as the cell output.
cv_vocabulary = cv_vec.get_feature_names_out()
X_train_names = pd.DataFrame(data=X_train_cv, columns=cv_vocabulary)
X_train_names

Unnamed: 0,00945litro,014cota,015l,033litro,10h30,110mil,13barril,13bi,14h,16h30,...,zeram,zerar,zero,zerou,ziviani,zona,zoom,zuckerberg,zuckerman,zup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16205,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16207,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Evaluate models

In [27]:
from sklearn.model_selection import GridSearchCV

# Candidate estimators and their hyper-parameter grids, paired by position.
model_params = ([KNeighborsClassifier(), SVC(), MultinomialNB(), LogisticRegression()],
                [knn_params, svm_params, nb_params, lr_params])

# One summary record per estimator, gathered for the comparison table below.
list_best_models_params = []
for model, params in zip(model_params[0], model_params[1]):
    # Fix: scikit-learn scorer names use underscores ('f1_micro'); the
    # hyphenated 'f1-micro' is not a registered scorer and is rejected.
    gs = GridSearchCV(model,
                      param_grid=params,
                      scoring='f1_micro',
                      cv=2)

    # Search over the CountVectorizer features.
    gs.fit(X_train_cv, y_train)

    model_name = model.__class__.__name__
    print(f"Best CV results for {model_name}")
    print("Best Score of train set: " + str(gs.best_score_))
    print("Best estimator: " + str(gs.best_estimator_))
    print("Best parameter set: " + str(gs.best_params_))

    store_best_model_configs = {
        'model_name': model_name,
        'best_score': gs.best_score_,
        'best_estimator': gs.best_estimator_,
        'best_params': gs.best_params_
    }
    list_best_models_params.append(store_best_model_configs)

# Build the table once from the collected records and display it.
df_best_models_params = pd.DataFrame(list_best_models_params)
df_best_models_params

Best CV results for KNeighborsClassifier
Best Score of train set: 0.8398519432449105
Best estimator: KNeighborsClassifier(metric='cosine', n_neighbors=21, weights='distance')
Best parameter set: {'metric': 'cosine', 'n_neighbors': 21, 'weights': 'distance'}
Best CV results for SVC
Best Score of train set: 0.8841455891425046
Best estimator: SVC(C=10)
Best parameter set: {'C': 10, 'kernel': 'rbf'}
Best CV results for MultinomialNB
Best Score of train set: 0.8576804441702652
Best estimator: MultinomialNB(alpha=1)
Best parameter set: {'alpha': 1, 'fit_prior': True}
Best CV results for LogisticRegression
Best Score of train set: 0.8818630475015423
Best estimator: LogisticRegression(C=1, penalty='l1', solver='liblinear')
Best parameter set: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


Unnamed: 0,model_name,best_score,best_estimator,best_params
0,KNeighborsClassifier,0.839852,"KNeighborsClassifier(metric='cosine', n_neighb...","{'metric': 'cosine', 'n_neighbors': 21, 'weigh..."
1,SVC,0.884146,SVC(C=10),"{'C': 10, 'kernel': 'rbf'}"
2,MultinomialNB,0.85768,MultinomialNB(alpha=1),"{'alpha': 1, 'fit_prior': True}"
3,LogisticRegression,0.881863,"LogisticRegression(C=1, penalty='l1', solver='...","{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}"


In [32]:
# Write the current grid-search summary table to disk, leaving out the row index.
best_params_csv_path = '../../assets/traditional_assets/best_models_params_1.csv'
df_best_models_params.to_csv(best_params_csv_path, index=False)



## TF-IDF

### Declaration and fit

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF over the 'title' column: fit the vectorizer and transform in
# one pass, then expand the sparse result to a dense array.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 1))
title_tfidf_sparse = tfidf_vec.fit_transform(train['title'])
X_train_tfidf = title_tfidf_sparse.toarray()

### Features visualization

In [None]:
# Label each TF-IDF column with the term it represents and show the table.
# Fix: the column names come from the TF-IDF vectorizer that produced
# X_train_tfidf, not from the CountVectorizer fitted in an earlier cell.
X_train_names = pd.DataFrame(X_train_tfidf, columns=tfidf_vec.get_feature_names_out())
X_train_names

### Evaluate models

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate estimators and their hyper-parameter grids, paired by position.
model_params = ([KNeighborsClassifier(), SVC(), MultinomialNB(), LogisticRegression()],
                [knn_params, svm_params, nb_params, lr_params])

# One summary record per estimator, gathered for the comparison table below.
list_best_models_params = []
for model, params in zip(model_params[0], model_params[1]):
    # Fix: scikit-learn scorer names use underscores ('f1_micro'); the
    # hyphenated 'f1-micro' is not a registered scorer and is rejected.
    gs = GridSearchCV(model,
                      param_grid=params,
                      scoring='f1_micro',
                      cv=2)

    # Search over the TF-IDF features.
    gs.fit(X_train_tfidf, y_train)

    model_name = model.__class__.__name__
    print(f"Best TF-IDF results for {model_name}")
    print("Best Score on train set: " + str(gs.best_score_))
    print("Best estimator: " + str(gs.best_estimator_))
    print("Best parameter set: " + str(gs.best_params_) + "\n")

    store_best_model_configs = {
        'model_name': model_name,
        'best_score': gs.best_score_,
        'best_estimator': gs.best_estimator_,
        'best_params': gs.best_params_
    }
    list_best_models_params.append(store_best_model_configs)

# TODO(review): the original cell left an unfinished `decide_best_model`
# placeholder here — choosing a final model is still pending.

# Build the table once from the collected records and display it.
df_best_models_params = pd.DataFrame(list_best_models_params)
df_best_models_params


## Outputs

In [None]:



# with open('../../assets/traditional_assets/cv_set.pkl', 'wb') as fout:
#     pickle.dump((cv_vec, cv_best_model), fout)

# with open('../../assets/traditional_assets/tfidf_set.pkl', 'wb') as fout:
#     pickle.dump((tfidf_vec, tfidf_best_model), fout)
