# Seleção e treinamento

## Importações

In [1]:
import pandas as pd
import sklearn
from hydra import initialize, compose
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## Configurações iniciais

In [2]:
with initialize(version_base=None, config_path="../config/"):
    cfg = compose(config_name='main')
    
df = pd.read_csv(f"../{cfg.data.processed}")

##### Importância de variáveis

In [26]:
feature_to_coef = {
    word: coef for word, coef in zip(
        count_vec.get_feature_names_out(), final_model.coef_[0]
    )
}

for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('decreased', 0.6952588143032427)
('short', 0.6427959626245119)
('lower', 0.608285701216592)
('fell', 0.5383668164959778)
('fall', 0.4560871326187892)
('rose', -0.3875909393045954)
('increase', -0.34741050013696123)
('business', -0.33609153845255824)
('increased', -0.29286778602207486)
('services', -0.289972809240041)


##### Importância de variáveis

In [93]:
feature_to_coef = {
    word: coef for word, coef in zip(
        wc_vectorizer.get_feature_names_out(), final_model.coef_[0]
    )
}

for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('disclose', 1.3281169301357885)
('technology', 1.130759237038491)
('full', 1.0355184771440253)
('moving', 1.0135198546436532)
('founders', 0.9892759356925733)
('split', -0.7412451981975544)
('karara', -0.6219360372942364)
('soullor', -0.6069452890438308)
('karczewicz', -0.5794196327184595)
('synosia', -0.5318054468327866)


### Importância de variáveis

In [31]:
feature_to_coef = {
    word: coef for word, coef in zip(
        count_vec.get_feature_names_out(), final_model.coef_[0]
    )
}

for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('decreas', 1.2934759873309167)
('short', 1.141798721397832)
('fell', 1.0398384296482561)
('lower', 0.9974760458442409)
('fall', 0.9862907535166991)
('rose', -0.712635314491691)
('increas', -0.6735653604109106)
('servic', -0.5325790836249364)
('offer', -0.4861027444871953)
('strong', -0.4798587861220686)


In [59]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(cleaned_text)

X = ngram_vectorizer.transform(cleaned_text)

### Treino e teste

In [60]:
y = df['encoded_sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 42)

### Classificador

#### Regressão Logística

In [43]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c, max_iter=1000)
    lr.fit(X_train, y_train)
    print (f"Accuracy for C={c}: {accuracy_score(y_test, lr.predict(X_test))}")

Accuracy for C=0.01: 0.6219303255282695
Accuracy for C=0.05: 0.6693318103940605
Accuracy for C=0.25: 0.6670474014848657
Accuracy for C=0.5: 0.6647629925756711
Accuracy for C=1: 0.6653340948029697


#### SVM

In [63]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print (f"Accuracy for C={c}: {accuracy_score(y_test, svm.predict(X_test))}") 



Accuracy for C=0.01: 0.6699029126213593
Accuracy for C=0.05: 0.6693318103940605
Accuracy for C=0.25: 0.6721873215305539




Accuracy for C=0.5: 0.6721873215305539




Accuracy for C=1: 0.6704740148486579


### Importância de variáveis

In [45]:
feature_to_coef = {
    word: coef for word, coef in zip(
        ngram_vectorizer.get_feature_names_out(), final_model.coef_[0]
    )
}

for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('announced', 1.3281169301357885)
('cncv reported', 1.130759237038491)
('astrazenecas iressa', 1.0355184771440253)
('bought more', 1.0135198546436532)
('association', 0.9892759356925733)
('china no', -0.7412451981975544)
('beer bought', -0.6219360372942364)
('chic', -0.6069452890438308)
('beer brand', -0.5794196327184595)
('close ready', -0.5318054468327866)


## Divisória

In [10]:
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}
predictions = {}

In [12]:
for name, classifier in classifiers.items():
    if name == 'Logistic Regression':
        for c in [0.01, 0.05, 0.25, 0.5, 1]:
            l = LogisticRegression(C=c, max_iter=1000)
            l.fit(X_train_scaled, y_train)
            y_pred = l.predict(X_test_scaled)
            predictions[f'{name} (C={c})'] = y_pred
    else:
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        predictions[name] = y_pred

### Representações

In [8]:
tfidf = TfidfTransformer()
tfidf.fit(X)
tfidf.transform(X)

<5836x9607 sparse matrix of type '<class 'numpy.float64'>'
	with 79167 stored elements in Compressed Sparse Row format>

In [9]:
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
kernel_list = ["linear", "poly", "rbf", "sigmoid"]
for kernel_item in kernel_list:
    svm = SVC(kernel=kernel_item)
    svm.fit(X_train_scaled, y_train)
    y_pred_svm = svm.predict(X_test_scaled)
    classifiers[f'SVM ({kernel_item} kernel)'] = svm
    predictions[f'SVM ({kernel_item} kernel)'] = y_pred_svm

In [13]:
metrics_data = []

for name, y_pred in predictions.items():
    y_true = y_test  # Substitua por seus rótulos reais

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    metrics_data.append({
        'Classifier': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })
metrics_df = pd.DataFrame(metrics_data)
metrics_df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score
0,SVM (linear kernel),0.661412,0.654569,0.661412,0.657494
1,SVM (poly kernel),0.550377,0.572524,0.550377,0.452333
2,SVM (rbf kernel),0.70048,0.677315,0.70048,0.667049
3,SVM (sigmoid kernel),0.656614,0.642783,0.656614,0.633146
4,Multinomial Naive Bayes,0.690199,0.684019,0.690199,0.685917
5,K-Nearest Neighbors,0.571624,0.548693,0.571624,0.556115
6,Decision Tree,0.601097,0.60364,0.601097,0.601667
7,Random Forest,0.647019,0.644738,0.647019,0.625991
8,Logistic Regression (C=0.01),0.675805,0.660464,0.675805,0.663659
9,Logistic Regression (C=0.05),0.667581,0.652132,0.667581,0.656899
