In [1]:
from pathlib import Path

import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, LeaveOneOut
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction import text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.pipeline import make_pipeline, Pipeline
import nltk
import re  

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer


from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the train and test splits; the first CSV column becomes the index.
train_df, test_df = (
    pd.read_csv(csv_name, index_col=[0])
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Module-level WordNet lemmatizer instance; the `lemmatize` function reads this name.
lemmatizer = WordNetLemmatizer()

In [4]:
def preprocess_text(text):
    """Lowercase ``text`` and delete every character that is not a letter a-z or whitespace.

    Removed characters are deleted, not replaced by a space, so words separated
    only by punctuation or digits are merged (``"good,bad"`` -> ``"goodbad"``).

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string containing only letters and whitespace.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())

In [5]:
# Cache for the English stopword set. It is filled on the first call so the
# stopword list is loaded and turned into a set once, not once per review.
_ENGLISH_STOP_WORDS = None


def remove_stopwords_nltk(text):
    """Remove English stopwords from ``text``.

    The text is tokenized with ``word_tokenize``; tokens found in NLTK's English
    stopword list are dropped and the remaining tokens are joined with single
    spaces. The comparison is case-sensitive, exactly as written below.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Lazy initialisation: defining the function never touches the corpus.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return ' '.join(
        token for token in word_tokenize(text) if token not in _ENGLISH_STOP_WORDS
    )

In [6]:
def lemmatize(text):
    """Lemmatize every token of ``text``.

    The text is split with ``word_tokenize`` and each token is passed to the
    module-level ``lemmatizer`` (no part-of-speech argument is given).

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))


In [7]:
# Normalize the training reviews (lowercase, letters and whitespace only) and then
# remove English stopwords with NLTK; the result overwrites the 'review' column.
train_df['review'] = (
    train_df['review']
    .apply(preprocess_text)
    .apply(remove_stopwords_nltk)
)

In [8]:
# Lemmatize every training review, one string at a time.
train_df['review'] = train_df['review'].map(lemmatize)

In [9]:
# Training inputs (cleaned review text) and their target labels.
reviews, labels = train_df['review'], train_df['label']

In [10]:
# Pipeline parameter grid; each key has the form '<step name>__<parameter name>'.
# NOTE(review): this grid is not referenced by any other cell visible here.
parameters = dict(
    vectorizer__max_features=[5000, 10000, 12000, 15000, None],  # maximum number of features in the vectorizer
    vectorizer__stop_words=[None, 'english'],
    vectorizer__max_df=[0.5, 0.75, 1.0],
    classifier__alpha=[0.1, 0.5, 1.0, 1.2],  # alpha parameter of the MultinomialNB classifier
)


In [11]:
# Count vectorizer over unigrams and bigrams with ASCII accent stripping.
# max_df=0.5 and min_df=1 are the document-frequency bounds passed to scikit-learn;
# max_features and stop_words are left at their defaults.
vectorizer = CountVectorizer(
    min_df=1,
    max_df=0.5,
    ngram_range=(1, 2),
    strip_accents="ascii",
)


In [12]:
# TF-IDF transformer with IDF weighting switched off, and a multinomial
# Naive Bayes classifier with alpha=2.
transformer = TfidfTransformer(use_idf=False)
nb = MultinomialNB(alpha=2)

# Build the pipeline. The step names are the prefixes used by
# '<step>__<parameter>' keys such as those in `parameters`.
pipeline = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('transformer', transformer),
    ('classifier', nb),
])

In [13]:
# Fit the full pipeline (vectorizer -> transformer -> classifier) on the training reviews and labels.
pipeline.fit(reviews, labels)

In [14]:
# Vectorize the training reviews with the `vectorizer` object that is also the first
# step of `pipeline`; this relies on `pipeline.fit` having already fitted that instance.
X_train = vectorizer.transform(reviews)

In [15]:
# Second pipeline that takes already-vectorized input: it has no vectorizer step.
# A feature-selection step ('selector') was planned between the two steps but is
# currently disabled.
# NOTE: `transformer` and `nb` are the same objects used inside `pipeline`, so
# fitting this pipeline refits those shared instances.
pipeline_selected = Pipeline(steps=[
    ('transformer', transformer),
    ('classifier', nb),
])

In [16]:
# Fit the vectorizer-less pipeline on the pre-vectorized training matrix.
pipeline_selected.fit(X_train, labels)

In [17]:
# Clean the test reviews with the same steps that were applied to the training
# reviews (normalization, stopword removal, lemmatization), so the model is
# scored on text in the form it was trained on. `test_df` itself is not modified.
test_reviews = (
    test_df['review']
    .apply(preprocess_text)
    .apply(remove_stopwords_nltk)
    .apply(lemmatize)
)

# Make predictions with the trained pipeline.
predictions = pipeline.predict(test_reviews)

In [18]:
# Build the submission table: the test index values as 'id' and the predicted 'label'.
submission_df = pd.DataFrame(dict(id=test_df.index, label=predictions))

In [19]:
# Save the submission DataFrame to a CSV file; index=False leaves the DataFrame index out of the file.
submission_df.to_csv('submission.csv', index=False)