In [1]:
import pandas as pd
import sklearn
from hydra import initialize, compose
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
with initialize(version_base=None, config_path="../config/"):
    cfg = compose(config_name='main')

In [3]:
df = pd.read_csv(f"../{cfg.data.processed}")

In [4]:
# encoder = OneHotEncoder()
# y = pd.DataFrame(encoder.fit_transform(df[['Sentiment']]).toarray())
df = df.replace({"positive":1,"neutral":0,"negative":-1})
y = df['Sentiment']

In [5]:
y

0       1
1      -1
2       1
3       0
4       0
       ..
5831   -1
5832    0
5833    0
5834    0
5835    1
Name: Sentiment, Length: 5836, dtype: int64

In [6]:
count_vec = CountVectorizer()
X = df['text']
count_vec.fit(X)
X = count_vec.transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [8]:
tfidf = TfidfTransformer()
tfidf.fit(X)
tfidf.transform(X)

<5836x9607 sparse matrix of type '<class 'numpy.float64'>'
	with 79167 stored elements in Compressed Sparse Row format>

In [9]:
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}
predictions = {}

In [11]:
kernel_list = ["linear", "poly", "rbf", "sigmoid"]
for kernel_item in kernel_list:
    svm = SVC(kernel=kernel_item)
    svm.fit(X_train_scaled, y_train)
    y_pred_svm = svm.predict(X_test_scaled)
    classifiers[f'SVM ({kernel_item} kernel)'] = svm
    predictions[f'SVM ({kernel_item} kernel)'] = y_pred_svm

In [12]:
for name, classifier in classifiers.items():
    if name == 'Logistic Regression':
        for c in [0.01, 0.05, 0.25, 0.5, 1]:
            l = LogisticRegression(C=c, max_iter=1000)
            l.fit(X_train_scaled, y_train)
            y_pred = l.predict(X_test_scaled)
            predictions[f'{name} (C={c})'] = y_pred
    else:
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        predictions[name] = y_pred

In [13]:
metrics_data = []

for name, y_pred in predictions.items():
    y_true = y_test  # Substitua por seus r√≥tulos reais

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    metrics_data.append({
        'Classifier': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })
metrics_df = pd.DataFrame(metrics_data)
metrics_df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score
0,SVM (linear kernel),0.661412,0.654569,0.661412,0.657494
1,SVM (poly kernel),0.550377,0.572524,0.550377,0.452333
2,SVM (rbf kernel),0.70048,0.677315,0.70048,0.667049
3,SVM (sigmoid kernel),0.656614,0.642783,0.656614,0.633146
4,Multinomial Naive Bayes,0.690199,0.684019,0.690199,0.685917
5,K-Nearest Neighbors,0.571624,0.548693,0.571624,0.556115
6,Decision Tree,0.601097,0.60364,0.601097,0.601667
7,Random Forest,0.647019,0.644738,0.647019,0.625991
8,Logistic Regression (C=0.01),0.675805,0.660464,0.675805,0.663659
9,Logistic Regression (C=0.05),0.667581,0.652132,0.667581,0.656899
