# Gradient Boosting & Extreme Gradient Boosting (XGBoost)

### Importar bibliotecas

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Limpar o texto usando NLTK
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


### Baixar recursos necessários do NLTK

In [4]:
# Resources needed by the preprocessing step: tokenizer models, the stop-word
# lists and the WordNet database used by the lemmatizer.
# `punkt_tab` is the tokenizer resource looked up by `nltk.word_tokenize` in
# recent NLTK releases; `punkt` is kept so older releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vinic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True


### Carregar o dataset

In [5]:
# Names of the input columns referenced by the later cells.
text_column, sentiment_column = 'Sentence', 'Sentiment'

# Read the labelled sentences from the CSV file into a DataFrame.
dataset = "data.csv"
df = pd.read_csv(filepath_or_buffer=dataset)

### Função para limpar o texto

In [6]:
# Cache for the NLTK helpers, filled on first use. Building the stop-word set
# and the lemmatizer once avoids repeating that work for every row when the
# function is applied to a whole column.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean one sentence before vectorisation.

    Steps: replace every character that is not an ASCII letter with a space,
    lowercase, tokenize with NLTK, drop English stop words, lemmatize each
    remaining token and join the tokens back with single spaces.

    Parameters
    ----------
    text : str or None or float
        The raw sentence. ``None`` and float NaN (how pandas represents an
        empty cell) are treated as empty text; any other non-string value is
        converted with ``str()``.

    Returns
    -------
    str
        The cleaned sentence; ``''`` when nothing is left after cleaning.
    """
    # Missing cells would otherwise make `re.sub` raise a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Strip special characters and digits, then normalise the case.
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    # Nothing but whitespace left: there are no tokens to process.
    if not text.strip():
        return ''

    stop_words, lemmatizer = _get_text_resources()
    tokens = nltk.word_tokenize(text)

    # Drop stop words first, then lemmatize what remains.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


### Aplicar a função de pré-processamento ao texto

In [7]:

# Clean every raw sentence and store the result in a new column, leaving the
# original text column untouched.
raw_sentences = df[text_column]
df['cleaned_text'] = raw_sentences.apply(preprocess_text)

### Converter as classes para valores numéricos

In [8]:
# Integer class id assigned to each textual sentiment label.
class_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

sentiment_labels = df[sentiment_column]

# Only encode when the column still holds the textual labels. Mapping a column
# that is already encoded would turn every value into NaN, so this guard keeps
# the cell safe to re-run.
if not sentiment_labels.isin(list(class_mapping.values())).all():
    encoded_labels = sentiment_labels.map(class_mapping)

    # Labels missing from the mapping come back as NaN; fail here with a clear
    # message instead of passing NaN targets on to the models.
    unknown_labels = sentiment_labels[encoded_labels.isna()].unique()
    if len(unknown_labels) > 0:
        raise ValueError(
            "Unmapped sentiment labels: {}".format(list(unknown_labels))
        )

    df[sentiment_column] = encoded_labels

### Dividir o conjunto de dados em treino e teste

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
features = df['cleaned_text']
targets = df[sentiment_column]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

### Vetorização do texto usando TF-IDF

In [10]:
# Vocabulary capped at 5000 terms. The vectorizer is fitted on the training
# split only and then reused, already fitted, to transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

### Treinar o modelo Gradient Boosting

In [11]:
# Seed the estimator so repeated runs build the same trees; the value matches
# the seed used for the train/test split.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_tfidf, y_train)

### Fazer previsões no conjunto de teste (Gradient Boosting)

In [12]:
# Predict a class for every TF-IDF vector of the held-out test split.
gb_predictions = gb_model.predict(X_test_tfidf)

### Treinar o modelo XGBoost

In [13]:
# Pin the seed explicitly (same value as the train/test split) so the run stays
# reproducible even if sampling options are enabled on the model later.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_tfidf, y_train)


### Fazer previsões no conjunto de teste (XGBoost)


In [14]:
# Predict a class for every TF-IDF vector of the held-out test split.
xgb_predictions = xgb_model.predict(X_test_tfidf)

### Avaliar o desempenho do modelo Gradient Boosting (acurácia, matriz de confusão e relatório de classificação)

In [15]:
# Compute the three evaluation results first, then print them in order:
# accuracy, confusion matrix, per-class report.
gb_accuracy = accuracy_score(y_test, gb_predictions)
gb_confusion = confusion_matrix(y_test, gb_predictions)
gb_report = classification_report(y_test, gb_predictions)

print("Acurácia do Gradient Boosting:", gb_accuracy)
for heading, content in (
    ("\nMatriz de Confusão:", gb_confusion),
    ("\nRelatório de Classificação:", gb_report),
):
    print(heading)
    print(content)

Acurácia do Gradient Boosting: 0.6706586826347305

Matriz de Confusão:
[[ 26 125  24]
 [ 23 572  27]
 [  7 179 186]]

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.46      0.15      0.23       175
           1       0.65      0.92      0.76       622
           2       0.78      0.50      0.61       372

    accuracy                           0.67      1169
   macro avg       0.63      0.52      0.53      1169
weighted avg       0.67      0.67      0.63      1169



### Avaliar o desempenho do modelo XGBoost (acurácia, matriz de confusão e relatório de classificação)

In [16]:
# Compute the three evaluation results first, then print them in order:
# accuracy, confusion matrix, per-class report.
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_confusion = confusion_matrix(y_test, xgb_predictions)
xgb_report = classification_report(y_test, xgb_predictions)

print("\nAcurácia do XGBoost:", xgb_accuracy)
for heading, content in (
    ("\nMatriz de Confusão:", xgb_confusion),
    ("\nRelatório de Classificação:", xgb_report),
):
    print(heading)
    print(content)


Acurácia do XGBoost: 0.679213002566296

Matriz de Confusão:
[[ 35 110  30]
 [ 53 530  39]
 [ 13 130 229]]

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.35      0.20      0.25       175
           1       0.69      0.85      0.76       622
           2       0.77      0.62      0.68       372

    accuracy                           0.68      1169
   macro avg       0.60      0.56      0.57      1169
weighted avg       0.66      0.68      0.66      1169

