## Packages and Assets

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import json

In [2]:
# Read the preprocessed training and validation splits from the assets folder.
train_csv = '../../assets/data/splits/train/preprocessed.csv'
val_csv = '../../assets/data/splits/val/preprocessed.csv'
train = pd.read_csv(train_csv)
val = pd.read_csv(val_csv)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [4]:
# Unigram count features learned from the training titles.
cv = CountVectorizer(ngram_range=(1, 1))
title_counts = cv.fit_transform(train['title'])

# Densify the count matrix; later cells pass X_train straight to the classifiers.
X_train = title_counts.toarray()
y_train = train['label']

# Same matrix with one named column per vocabulary term, for inspection only.
X_train_names = pd.DataFrame(X_train, columns=cv.get_feature_names_out())
X_train_names

Unnamed: 0,015l,10h30,10x,110mil,13barril,14h,15menos,15tri,17a,18h,...,zero,zerou,zona,zoom,zoox,zte,zto,zuckerberg,zuckerman,zup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14359,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14360,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14361,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Validation labels, plus validation titles encoded with the vectorizer that
# was fitted on the training titles (transform only, no refit).
y_val = val['label']
X_val = cv.transform(val['title']).toarray()

## Functions

In [6]:
def evaluateModels(X_train, y_train, models, n_splits):
    """Cross-validate each model with stratified k-fold and print a summary line.

    Parameters
    ----------
    X_train : feature matrix forwarded to ``cross_val_score``.
    y_train : target labels forwarded to ``cross_val_score``.
    models : iterable of ``(name, estimator)`` pairs.
    n_splits : number of folds.

    Returns
    -------
    dict
        Maps each model name to the array of per-fold accuracy scores.
        If a name appears more than once, the last entry wins.
    """
    print(f"{n_splits}-Fold Cross validation")

    # The splitter does not depend on the model, so build it once. With a fixed
    # integer seed it produces the same folds for every model.
    kfold = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)

    scores_by_model = {}
    for name, model in models:
        cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
        scores_by_model[name] = cv_results
        print(f"{name}: Mean Accuracy={cv_results.mean():.5f}, Standard Deviation={cv_results.std():.5f}")

    # Return the scores so callers can compare or plot them, not just read the printout.
    return scores_by_model

In [7]:
def viewPredictedRows(X_test, y_test, y_pred):
    """Build a per-row comparison table of true vs. predicted labels.

    Parameters
    ----------
    X_test : values stored in the ``title`` column (e.g. the raw titles).
    y_test : true labels.
    y_pred : predicted labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``y_test``, ``y_pred``, ``correct`` ('Correct' / 'Incorrect')
        and ``title``, in that order.
    """
    verdicts = {True: 'Correct', False: 'Incorrect'}

    table = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    is_match = table['y_test'] == table['y_pred']
    table['correct'] = is_match.map(verdicts)
    table['title'] = X_test
    return table


In [8]:

def evaluateModelsWithoutKfold(X_train, y_train, X_test, y_test, models):
    """Fit each model on the training split and report accuracy on a held-out split.

    Parameters
    ----------
    X_train, y_train : data each model is fitted on.
    X_test, y_test : held-out data the fitted model is scored on.
    models : iterable of ``(name, estimator)`` pairs; each estimator is
        fitted in place via ``model.fit``.

    Returns
    -------
    dict
        Maps each model name to its accuracy on ``(X_test, y_test)``.
        If a name appears more than once, the last entry wins.
    """
    accuracy_by_model = {}
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_by_model[name] = accuracy
        print(f"{name}: Accuracy={accuracy:.5f}")

    # Return the scores so callers can reuse them, not just read the printout.
    return accuracy_by_model

## Tuning Models

### Logistic Regression

In [9]:
# Parameter grid, split by penalty so that only solver/penalty pairs that
# LogisticRegression supports are searched: 'newton-cg' and 'lbfgs' handle
# only the 'l2' penalty, while 'liblinear' handles both 'l1' and 'l2'.
# A single cross-product grid would also generate l1 + newton-cg and
# l1 + lbfgs candidates, whose fits can never succeed and only add run time.
c_values = np.logspace(-3, 3, 7)
parameters = [
    {'penalty': ['l2'], 'C': c_values, 'solver': ['newton-cg', 'lbfgs', 'liblinear']},
    {'penalty': ['l1'], 'C': c_values, 'solver': ['liblinear']},
]

logreg = LogisticRegression(max_iter=10000, multi_class='ovr')
clf = model_selection.GridSearchCV(logreg,  # model
                                   param_grid=parameters,  # hyperparameters
                                   scoring='accuracy',  # metric for scoring
                                   cv=10)

clf.fit(X_train, y_train)
print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

KeyboardInterrupt: 

### Naive Bayes

In [None]:
# Grid search for Multinomial Naive Bayes: smoothing strength (alpha) and
# whether class priors are learned from the data (fit_prior).
nb = MultinomialNB()
parameters = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
    'fit_prior': [True, False],
}

# 10-fold CV scored on accuracy; fit() returns the fitted search object.
clf = model_selection.GridSearchCV(
    estimator=nb,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
).fit(X_train, y_train)

print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

### KNN

In [None]:
# Grid search for k-nearest neighbours: odd k from 3 to 21, both weighting
# schemes, and three distance metrics.
knn = KNeighborsClassifier()
parameters = {
    'n_neighbors': list(range(3, 22, 2)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'cosine'],
}

# 10-fold CV scored on accuracy; fit() returns the fitted search object.
clf = model_selection.GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
).fit(X_train, y_train)

print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

### SVM

In [None]:
# parameters = [
#     {'kernel': ['linear', 'poly']}
# ]
# svm = SVC(C=1)
# clf = model_selection.GridSearchCV(svm,
#                                    param_grid=parameters,
#                                    scoring='accuracy',
#                                    cv=10)
#
# clf.fit(X_train, y_train)
# print("Tuned Hyperparameters :", clf.best_params_)
# print("Accuracy :", clf.best_score_)

## Evaluating Models

In [None]:
# models = []
#
# models.append(('LR', LogisticRegression(max_iter=10000, multi_class='ovr', C=0.001, penalty='l2', solver='newton-cg')))
# models.append(('SVM', SVC(C=1, kernel='linear')))
# models.append(('KNN', KNeighborsClassifier(metric='cosine', n_neighbors=7, weights='distance')))
# models.append(('NB', MultinomialNB(alpha=0.1, fit_prior=True)))
#
# evaluateModelsWithoutKfold(X_train, y_train, X_val, y_val, models)

In [10]:
# Candidate classifiers with their chosen hyperparameters, as (name, estimator) pairs.
models = [
    ('LR', LogisticRegression(max_iter=10000, multi_class='ovr', C=0.001, penalty='l2', solver='newton-cg')),
    ('SVM', SVC(C=1, kernel='linear')),
    ('KNN', KNeighborsClassifier(metric='cosine', n_neighbors=7, weights='distance')),
    ('NB', MultinomialNB(alpha=0.1, fit_prior=True)),
]

# Fit each model on the training split and print its accuracy on the validation split.
evaluateModelsWithoutKfold(X_train, y_train, X_val, y_val, models)

LR: Accuracy=0.70156
SVM: Accuracy=0.88140
KNN: Accuracy=0.84298
NB: Accuracy=0.84076


In [11]:
from sklearn.metrics import classification_report

# Train the Naive Bayes model that is exported below; fit() returns the estimator itself.
model = MultinomialNB(alpha=0.1, fit_prior=True).fit(X_train, y_train)

# Per-class precision / recall / F1 on the validation split.
y_pred = model.predict(X_val)
val_report = classification_report(y_val, y_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       593
           1       0.92      0.86      0.89       636
           2       0.81      0.88      0.84       347
           3       0.64      0.78      0.70       220

    accuracy                           0.84      1796
   macro avg       0.81      0.83      0.82      1796
weighted avg       0.85      0.84      0.84      1796



In [12]:
# Row-by-row view of the validation predictions next to the original titles.
df_results = viewPredictedRows(X_test=val['title'], y_test=y_val, y_pred=y_pred)
df_results


Unnamed: 0,y_test,y_pred,correct,title
0,1,1,Correct,ira india podem dar maozinha petrobras
1,3,3,Correct,itau itub4 ativa investimentos atualiza projec...
2,0,0,Correct,em crise kraft heinz poe venda marca cafe maxw...
3,1,1,Correct,santander mantem cautela acoes petrobras
4,0,0,Correct,financiamento imoveis cresce <NUM> cento janei...
...,...,...,...,...
1791,2,2,Correct,marinha informa inicio retirada combustivel na...
1792,1,1,Correct,ibovespa sobe <NUM> cento exterior puxado petr...
1793,1,1,Correct,bolsonaro diz preco alto combustiveis heranca ...
1794,0,0,Correct,unitedhealthcare dona amil registra lucro <NUM...


## Exports

In [13]:
# Persist the fitted vectorizer vocabulary as JSON.
# The json module cannot serialise NumPy integer scalars, and the vocabulary
# values may be NumPy integers depending on the vectorizer settings and
# scikit-learn version, so coerce every value to a plain int first.
vocabulary = {term: int(index) for term, index in cv.vocabulary_.items()}
with open('../../assets/traditional_assets/count_vectorizer_vocab.json', 'w', encoding='utf-8') as f:
    json.dump(vocabulary, f)

In [14]:
import pickle

# Serialise the fitted Naive Bayes model next to the exported vocabulary.
# The `with` block guarantees the file is flushed and closed even if
# pickling raises; a bare open(...) would leave the handle open.
filename = 'naive_bayes_model.sav'
with open(f"../../assets/traditional_assets/{filename}", 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import SGDClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import accuracy_score

In [None]:
# text_clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
#                                            alpha=1e-3, random_state=42,
#                                            max_iter=5, tol=None)),
#                      ])
#


In [None]:
# parameters = {
#     'vect__ngram_range': [(1, 1), (1, 2)],
#     'tfidf__use_idf': (True, False),
#     'clf__alpha': (1e-2, 1e-3),
# }

In [None]:
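# NOTE: the `iid` argument was removed from GridSearchCV in scikit-learn 0.24; drop it before re-enabling this cell.
# gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)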

In [None]:
# gs_clf = gs_clf.fit(train['text'], train['label'])