## Packages and Assets

In [21]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn import model_selection

In [11]:
# Read the preprocessed training and validation splits from the assets folder.
train = pd.read_csv(filepath_or_buffer='../../assets/data/train/preprocessed.csv')
val = pd.read_csv(filepath_or_buffer='../../assets/data/val/preprocessed.csv')

Unnamed: 0,title,tags,link,label
0,itau fica tech deve acelerar mercado agentes a...,"['Ágora Investimentos', 'Bancos', 'Corretoras'...",https://www.moneytimes.com.br/itau-fica-mais-t...,2
1,ativa renova aposta renner petro rio vale itau...,"['Ações', 'Ativa Investimentos', 'Carteira Rec...",https://www.moneytimes.com.br/ativa-renova-apo...,2
2,dividendos planner acrescenta <NUM> novos pape...,"['Ações', 'Alupar', 'Bradespar', 'Carteira Rec...",https://www.moneytimes.com.br/dividendos-plann...,0
3,anp amplia prazo petrobras concluir vendas cam...,"['ANP', 'Biocombustível', 'Combustíveis', 'Emp...",https://www.moneytimes.com.br/anp-amplia-prazo...,0
4,juca defende partilha cessao onerosa impacto t...,"['Petrobras', 'Pré-Sal', 'Romero Jucá']",https://www.moneytimes.com.br/juca-defende-par...,0
...,...,...,...,...
9559,cade julga negocio itau unibanco citi quarta v...,"['Bancos', 'Cade', 'Citi', 'Empresas', 'Fusões...",https://www.moneytimes.com.br/cade-julga-negoc...,2
9560,vale vale3 resgata <NUM> <NUM> empregados mina...,['Vale (VALE3)'],https://www.suno.com.br/noticias/acidente-vale...,1
9561,itau vende acoes multiplan,"['Ações', 'Bancos', 'Empresas', 'Itaú Unibanco...",https://www.moneytimes.com.br/itau-vende-acoes...,2
9562,inflacao brasil alta bastante disseminada diz ...,"['Alimentos', 'Brasil', 'Combustíveis', 'Econo...",https://www.moneytimes.com.br/inflacao-no-bras...,0


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [13]:
# Target labels for the training split.
y_train = train.loc[:, 'label']

In [14]:
# Bag-of-words features: the unigram vocabulary is learned from the training
# titles and the count matrix is converted to a dense array.
cv = CountVectorizer(ngram_range=(1, 1))
X_train = cv.fit_transform(raw_documents=train['title']).toarray()

# Same counts as a DataFrame with one column per vocabulary term, for inspection.
X_train_names = pd.DataFrame(data=X_train, columns=cv.get_feature_names_out())
X_train_names

Unnamed: 0,015l,10h30,10x,13bi,14h,16h30,198l,1bi,1t,1t19,...,zera,zerada,zerado,zeram,zerar,zero,zhoushan,ziviani,zona,zup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9560,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9562,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Validation labels and features. Only `transform` is called here, so the
# vocabulary already fitted on the training titles is reused as-is.
y_val = val['label']
X_val = cv.transform(raw_documents=val['title']).toarray()

## Functions

In [15]:
def evaluateModels(X_train, y_train, models, n_splits):
    """Score each model with stratified k-fold cross-validation.

    For every ``(name, estimator)`` pair the per-fold accuracies are computed
    with ``cross_val_score``, and their mean and standard deviation are
    printed.

    Parameters
    ----------
    X_train : feature matrix handed to ``cross_val_score``.
    y_train : target labels handed to ``cross_val_score``.
    models : iterable of ``(name, estimator)`` pairs.
    n_splits : number of folds for the stratified splitter.

    Returns
    -------
    tuple
        ``(names, results)`` where ``names`` is the list of model names in
        evaluation order and ``results`` is the list of per-fold accuracy
        arrays, one per model. (Previously nothing was returned, so the
        collected scores were lost after printing.)
    """
    print(f"{n_splits}-Fold Cross validation")

    # One seeded splitter shared by all models: with an integer random_state
    # every model is scored on the same folds, so there is no need to rebuild
    # it on each iteration.
    kfold = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)

    results = []
    names = []
    for name, model in models:
        cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
        results.append(cv_results)
        names.append(name)
        print(f"{name}: Mean Accuracy={cv_results.mean():.5f}, Standard Deviation={cv_results.std():.5f}")

    return names, results

In [None]:
def viewPredictedRows(X_test, y_test, y_pred):
    """Build a one-row-per-sample table comparing true and predicted labels.

    The three inputs are paired by position rather than by pandas index, so a
    Series carrying a non-default index can be combined with plain arrays or
    lists (or with a Series indexed differently) without producing NaN cells
    or extra rows.

    Parameters
    ----------
    X_test : sequence or Series of sample texts; stored in the ``title`` column.
    y_test : sequence or Series of true labels.
    y_pred : sequence or Series of predicted labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``y_test``, ``y_pred``, ``correct`` (``'Correct'`` or
        ``'Incorrect'``) and ``title``. The frame is indexed like ``y_test``
        when it is a Series, otherwise like ``y_pred`` when that is a Series,
        otherwise with a default range index.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    # Keep the labels' own index (if any) so rows can still be traced back.
    row_index = next(
        (labels.index for labels in (y_test, y_pred) if isinstance(labels, pd.Series)),
        None,
    )

    # Strip indices so the columns are matched purely by position.
    true_labels = np.asarray(y_test)
    predicted_labels = np.asarray(y_pred)
    titles = np.asarray(X_test)
    if not (len(true_labels) == len(predicted_labels) == len(titles)):
        raise ValueError(
            "X_test, y_test and y_pred must have the same length, got "
            f"{len(titles)}, {len(true_labels)} and {len(predicted_labels)}"
        )

    df = pd.DataFrame({'y_test': true_labels, 'y_pred': predicted_labels}, index=row_index)
    df['correct'] = np.where(df['y_test'] == df['y_pred'], 'Correct', 'Incorrect')
    df['title'] = titles
    return df


In [19]:

def evaluateModelsWithoutKfold(X_train, y_train, X_test, y_test, models):
    """Fit each model on the training data and score it on a held-out set.

    Every estimator in ``models`` is fitted in place on
    ``(X_train, y_train)``, used to predict ``X_test``, and its accuracy
    against ``y_test`` is printed.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features passed to ``predict`` and the labels
        the predictions are compared with.
    models : iterable of ``(name, estimator)`` pairs.

    Returns
    -------
    tuple
        ``(names, results)`` where ``names`` is the list of model names in
        evaluation order and ``results`` the matching list of accuracies.
        (Previously nothing was returned, so the scores were only available
        as printed text.)
    """
    results = []
    names = []
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results.append(accuracy)
        names.append(name)
        print(f"{name}: Accuracy={accuracy:.5f}")

    return names, results

## Tuning Models

### Logistic Regression

In [16]:
# Parameter grid for logistic regression.
# The grid is split in two because not every solver supports every penalty:
# 'newton-cg' and 'lbfgs' only handle l2, so l1 is searched with 'liblinear'
# alone. A single crossed grid would schedule l1 fits for the other two
# solvers, all of which fail and only add warnings and wasted time.
parameters = [
    {
        'penalty': ['l2'],
        'C': np.logspace(-3, 3, 7),
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    },
    {
        'penalty': ['l1'],
        'C': np.logspace(-3, 3, 7),
        'solver': ['liblinear'],
    },
]

# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases; kept here to preserve the one-vs-rest behaviour — confirm against
# the installed version.
logreg = LogisticRegression(max_iter=10000, multi_class='ovr')
clf = model_selection.GridSearchCV(logreg,  # model
                                   param_grid=parameters,  # hyperparameters
                                   scoring='accuracy',  # metric for scoring
                                   cv=10,
                                   n_jobs=-1)  # run the fits in parallel; the serial search was interrupted

clf.fit(X_train, y_train)
print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

KeyboardInterrupt: 

### Naive Bayes

In [None]:
# Grid search for Multinomial Naive Bayes over the `alpha` smoothing value
# and whether class priors are fitted.
nb = MultinomialNB()
parameters = dict(
    alpha=[0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
    fit_prior=[True, False],
)

# 10-fold cross-validated search, ranked by accuracy.
clf = model_selection.GridSearchCV(estimator=nb, param_grid=parameters, cv=10, scoring='accuracy')
clf.fit(X_train, y_train)

print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

### KNN

In [None]:
# Grid search for k-nearest neighbours: odd k from 3 to 21, both weighting
# schemes and two distance metrics.
knn = KNeighborsClassifier()
parameters = dict(
    n_neighbors=list(range(3, 22, 2)),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# 10-fold cross-validated search, ranked by accuracy.
clf = model_selection.GridSearchCV(estimator=knn, param_grid=parameters, cv=10, scoring='accuracy')
clf.fit(X_train, y_train)

print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

### SVM

In [None]:
# Kernels to compare for the SVM.
# 'precomputed' is intentionally left out: that option expects X to be a
# square kernel (Gram) matrix, whereas X_train here is the document-term count
# matrix, so every 'precomputed' fit would fail inside the grid search.
parameters = [
    {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
]
svm = SVC(C=1)
clf = model_selection.GridSearchCV(svm,
                                   param_grid=parameters,
                                   scoring='accuracy',
                                   cv=10)

clf.fit(X_train, y_train)
print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

## Evaluating Models

In [22]:
# Candidate classifiers with fixed hyperparameters, as (name, estimator) pairs.
models = [
    ('LR', LogisticRegression(max_iter=10000, multi_class='ovr', C=0.001, penalty='l2', solver='newton-cg')),
    ('SVM', SVC(C=1, kernel='linear')),
    ('KNN', KNeighborsClassifier(metric='euclidean', n_neighbors=7, weights='distance')),
    ('NB', MultinomialNB(alpha=0.1, fit_prior=True)),
]

# Fit on the training split and report accuracy on the validation split.
evaluateModelsWithoutKfold(X_train, y_train, X_val, y_val, models)

LR: Accuracy=0.77341
SVM: Accuracy=0.92140
KNN: Accuracy=0.89047
NB: Accuracy=0.90050


In [24]:
from sklearn.metrics import classification_report

# Refit the linear-kernel SVM on the training split, predict the validation
# split and print the per-class precision / recall / F1 table.
model = SVC(kernel='linear', C=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       622
           1       0.94      0.90      0.92       397
           2       0.84      0.85      0.85       177

    accuracy                           0.92      1196
   macro avg       0.90      0.90      0.90      1196
weighted avg       0.92      0.92      0.92      1196



Unnamed: 0,y_test,y_pred,correct,title
0,2,2,Correct,elekeiroz cai <NUM> cento decisao fundo invest...
1,1,1,Correct,brumadinho anatel disponibiliza localizacao ce...
2,0,0,Correct,novo presidente petrobras petr4 <NUM> pontos e...
3,2,2,Correct,lucro itausa cai <NUM> cento <NUM> bilhao terc...
4,1,1,Correct,vale faz primeira venda minerio ferro blockchain
...,...,...,...,...
1191,2,2,Correct,meta climatica brasil poe risco investimentos ...
1192,2,2,Correct,itau itub4 lucra <NUM> bi 2t21 altade <NUM> ce...
1193,1,1,Correct,minerio ferro dispara china expectativa invest...
1194,1,1,Correct,ministerio publico ira acusar vale colapso bar...


In [None]:
# Per-title table of true vs. predicted validation labels.
df_results = viewPredictedRows(X_test=val['title'], y_test=y_val, y_pred=y_pred)
df_results


In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import SGDClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import accuracy_score

In [None]:
# text_clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
#                                            alpha=1e-3, random_state=42,
#                                            max_iter=5, tol=None)),
#                      ])
#


In [None]:
# parameters = {
#     'vect__ngram_range': [(1, 1), (1, 2)],
#     'tfidf__use_idf': (True, False),
#     'clf__alpha': (1e-2, 1e-3),
# }

In [None]:
# gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)  # `iid=False` dropped: the argument no longer exists in current scikit-learn

In [None]:
# gs_clf = gs_clf.fit(train['title'], train['label'])  # the text column in this notebook is 'title', not 'text'