In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from pre_process import preprocess_text

# Carregar os dados do CSV
data = pd.read_csv("../DataSetForSCR_Final.csv", encoding="unicode_escape")

In [21]:

# . Funcao que retorna o tipo de ngrama
def get_ngram_description(ngram_range):
    if ngram_range == (1,1):
        return "Unigrama"
    elif ngram_range == (1,2):
        return "Unigrama + Bigrama"
    elif ngram_range == (2,2):
        return "Bigrama"
    return "N/A"

# . Funcao que retorna o tipo de de limpeza
def cleaning_type_description(cleaning_type):
    if cleaning_type == 1:
        return "Nenhuma"
    elif cleaning_type == 2:
        return "Parcial"
    elif cleaning_type == 3:
        return "Total"
    return "N/A"

# . Criando array para armazenar os resultados
resultados = []

### ! melhorar o countvectorizer (usar unigrama e bigrama, buscar o vocabulario para ver as palavras que sobraram, tentar definir um min_df (5,10,15), testar o binary=True)
min_df_arr = [1,5,10,15]
ngram_range_arr = [(1,1), (1,2), (2,2)] # Unigrama , Unigrama + Bigrama, Bigrama
binary_arr = [True, False]
cleaning_type_arr = [1,2,3]

# Iterar sobre os 3 arrays (todas as combinações)
for min_df in min_df_arr:
    for ngram_range in ngram_range_arr:
        for binary in binary_arr:
            for cleaning_type in cleaning_type_arr:
                # Limpar os dados
                if cleaning_type == 1:
                    # !! SEM LIMPEZA
                    X = data["Sentence"]
                    Y = data["Label"]
                elif cleaning_type == 2:
                    # !! C LIMPEZA PARCIAL
                    X = data["Sentence"].str.lower()
                    X = X.str.replace("[^a-zA-Z0-9 ]", " ")
                    Y = data["Label"]
                elif cleaning_type == 3:
                    # !! C LIMPEZA
                    X = data["Sentence"].apply(preprocess_text)
                    Y = data["Label"]
                    
                # Dividir os dados em treino e teste
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

                # Criar um objeto CountVectorizer
                count_vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range, binary=binary)

                # Ajustar e transformar os dados de treino
                X_train_count = count_vectorizer.fit_transform(X_train)

                # Transformar os dados de teste
                X_test_count = count_vectorizer.transform(X_test)

                # Treinar os modelos
                class_NB = MultinomialNB()
                class_SVM = SVC()
                class_LR_l1 = LogisticRegression(penalty='l1', solver='liblinear')
                class_LR_l2 = LogisticRegression(penalty='l2')

                # Treinar os modelos
                class_NB.fit(X_train_count, Y_train)
                class_SVM.fit(X_train_count, Y_train)
                class_LR_l1.fit(X_train_count, Y_train)
                class_LR_l2.fit(X_train_count, Y_train)


                # Fazer previsões
                Y_pred_NB = class_NB.predict(X_test_count)
                Y_pred_SVM = class_SVM.predict(X_test_count)
                Y_pred_LR_l1 = class_LR_l1.predict(X_test_count)
                Y_pred_LR_l2 = class_LR_l2.predict(X_test_count)

                # Calcular a acurácia
                acc_NB = accuracy_score(Y_test, Y_pred_NB)
                acc_SVM = accuracy_score(Y_test, Y_pred_SVM)
                acc_LR_l1 = accuracy_score(Y_test, Y_pred_LR_l1)
                acc_LR_l2 = accuracy_score(Y_test, Y_pred_LR_l2)

                # Calcular a precisão, revocação e f1-score
                cr_NB = classification_report(Y_test, Y_pred_NB, output_dict=True)
                cr_SVM = classification_report(Y_test, Y_pred_SVM, output_dict=True)
                cr_LR_l1 = classification_report(Y_test, Y_pred_LR_l1, output_dict=True)
                cr_LR_l2 = classification_report(Y_test, Y_pred_LR_l2, output_dict=True)

                # Calcular a matriz de confusão
                cm_NB = pd.DataFrame(confusion_matrix(Y_test, Y_pred_NB), 
                  index=["No-SCR", "SCR"]).to_dict()
                cm_SVM = pd.DataFrame(confusion_matrix(Y_test, Y_pred_SVM), 
                  index=["No-SCR", "SCR"]).to_dict()
                cm_LR_l1 = pd.DataFrame(confusion_matrix(Y_test, Y_pred_LR_l1),
                    index=["No-SCR", "SCR"]).to_dict()
                cm_LR_l2 = pd.DataFrame(confusion_matrix(Y_test, Y_pred_LR_l2),
                    index=["No-SCR", "SCR"]).to_dict()
                

                # Armazenar os resultados
                resultados.append({
                    "model": "Naive Bayes",
                    "min_df": min_df,
                    "ngram_range": get_ngram_description(ngram_range),
                    "binary": binary,
                    "cleaning_type": cleaning_type_description(cleaning_type),
                    "accuracy": acc_NB,
                    "classification_report": cr_NB,
                    "confusion_matrix": cm_NB,
                    "vocabulary_size": len(count_vectorizer.vocabulary_),
                })

                resultados.append({
                    "model": "SVM",
                    "min_df": min_df,
                    "ngram_range": get_ngram_description(ngram_range),
                    "binary": binary,
                    "cleaning_type": cleaning_type_description(cleaning_type),
                    "accuracy": acc_SVM,
                    "classification_report": cr_SVM,
                    "confusion_matrix": cm_SVM,
                    "vocabulary_size": len(count_vectorizer.vocabulary_),
                })

                resultados.append({
                    "model": "Logistic Regression (L1)",
                    "min_df": min_df,
                    "ngram_range": get_ngram_description(ngram_range),
                    "binary": binary,
                    "cleaning_type": cleaning_type_description(cleaning_type),
                    "accuracy": acc_LR_l1,
                    "classification_report": cr_LR_l1,
                    "confusion_matrix": cm_LR_l1,
                    "vocabulary_size": len(count_vectorizer.vocabulary_),
                })

                resultados.append({
                    "model": "Logistic Regression (L2)",
                    "min_df": min_df,
                    "ngram_range": get_ngram_description(ngram_range),
                    "binary": binary,
                    "cleaning_type": cleaning_type_description(cleaning_type),
                    "accuracy": acc_LR_l2,
                    "classification_report": cr_LR_l2,
                    "confusion_matrix": cm_LR_l2,
                    "vocabulary_size": len(count_vectorizer.vocabulary_),
                })
                


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [22]:
# Criar um DataFrame com os resultados
df_resultados = pd.DataFrame(resultados)

# Salvar os resultados em um arquivo CSV
df_resultados.to_csv("results.csv", index=False)