In [None]:
import nltk
# Fetch the NLTK data packages requested by this notebook
# (tokenizer models, stop-word lists and the WordNet corpus).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import class_weight

Pré-processamento

In [None]:
def preprocess_text(text, lemmatizer=None):
    """Normalise one comment: lowercase it, strip punctuation, lemmatize each word.

    Args:
        text: Raw comment text. ``None`` and NaN (the value pandas uses for a
            missing cell) are treated as an empty comment; any other
            non-string value is converted with ``str()``.
        lemmatizer: Optional object exposing ``lemmatize(word) -> str``.
            When omitted, a ``WordNetLemmatizer`` is created.

    Returns:
        The lemmatized tokens joined by single spaces ('' for empty input).
    """
    # Missing cells arrive as None / float NaN and have no .translate();
    # map them to an empty comment instead of raising AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Lowercase first: the WordNet lookup is case-sensitive, so capitalised
    # words would otherwise pass through unlemmatized.
    text = text.lower()
    # Remove punctuation characters.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Whitespace tokenization followed by per-token lemmatization.
    tokens = [lemmatizer.lemmatize(word) for word in text.split()]
    return ' '.join(tokens)

Carregar os dados + pré-processamento

In [None]:
# Load the labelled training set. Missing comments are replaced with ''
# before cleaning so that preprocess_text always receives a string.
train_data = pd.read_csv('train.csv')
X_train = train_data['comment_text'].fillna('').apply(preprocess_text)
# The six label columns used as prediction targets.
y_train = train_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

# Load the test set and clean its comments the same way.
test_data = pd.read_csv('test.csv')
X_test = test_data['comment_text'].fillna('').apply(preprocess_text)

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
# NOTE(review): this rebinds X_train / y_train to the training portion, so
# re-running this cell without re-running the loading cell splits the
# already-reduced data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


Vetorização do texto

In [None]:
# TF-IDF features with NLTK's English stop-word list filtered out.
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
# The vocabulary and IDF weights are learned from the training split only ...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ... and then reused unchanged for the validation and test comments.
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(split) for split in (X_val, X_test)
)

Ajustando os pesos

In [None]:
# 'balanced' weights for the 0/1 classes, computed over all label columns
# flattened into a single vector (i.e. pooled across labels, not per label).
# `classes` is passed as an ndarray, which is the type compute_class_weight
# documents and validates; a plain list can be rejected.
# NOTE(review): class_weights is not referenced by any later cell in this
# notebook, so it currently has no effect on the trained models.
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.array([0, 1]),
    y=y_train.to_numpy().ravel(),
)


Modelo Naive Bayes

In [None]:
# Train one independent MultinomialNB model per label column, keyed by the
# column name. fit() returns the fitted estimator, so it can be stored directly.
classifiers = {
    label: MultinomialNB().fit(X_train_tfidf, y_train[label])
    for label in y_train.columns
}

In [None]:
# Predict every label for the validation comments, one column per classifier.
# The frame reuses X_val's index: after the shuffled split y_val no longer has
# a 0..n-1 index, so a default RangeIndex would misalign any pandas-level
# comparison between y_val and y_pred_val.
y_pred_val = pd.DataFrame(
    {label: clf.predict(X_val_tfidf) for label, clf in classifiers.items()},
    index=X_val.index,
)


Avaliação do Modelo

In [None]:
# With a multi-column (multilabel) target, accuracy_score is subset accuracy:
# a row counts as correct only if every label matches.
print("Accuracy on Validation Set:", accuracy_score(y_val, y_pred_val))
# target_names shows the label column names instead of positional indices;
# zero_division=0 reports 0 (without a warning) for labels that received no
# positive predictions.
print(
    "\nClassification Report on Validation Set:\n",
    classification_report(
        y_val,
        y_pred_val,
        target_names=list(y_val.columns),
        zero_division=0,
    ),
)


Previsões

In [None]:
# Predict every label for the test comments, one column per classifier.
# The frame reuses X_test's index (inherited from test_data) so that a later
# column-wise concat with test_data columns aligns row for row.
y_pred_test = pd.DataFrame(
    {label: clf.predict(X_test_tfidf) for label, clf in classifiers.items()},
    index=X_test.index,
)


In [None]:
# Place the test-set ids next to the predicted label columns (aligned on index).
predictions_df = pd.concat(
    objs=[test_data['id'], y_pred_test],
    axis='columns',
)


In [None]:
# Write the predictions to disk without the DataFrame index column.
predictions_df.to_csv(
    path_or_buf='predictions.csv',
    index=False,
)