Graziele Fagundes Martins e João Vitor Farias

In [None]:
import re
import nltk
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Make sure the NLTK stopword corpus is available locally before reading it.
nltk.download('stopwords')

# English stopwords are kept in a set because preprocess_text performs one
# membership test per token.
stop_words = {*stopwords.words("english")}

def preprocess_text(text):
    """Normalize a raw text string before vectorization.

    Steps, in order: lowercase the text, delete every run of digits, delete
    every character that is neither a word character nor whitespace, then
    discard the tokens found in the module-level ``stop_words`` set.

    Punctuation is removed before the stopword filter runs, so tokens are
    compared against ``stop_words`` in their de-punctuated form.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces ("" when none remain).
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_symbols = re.sub(r'[^\w\s]', '', without_digits)
    kept_tokens = (
        token for token in without_symbols.split() if token not in stop_words
    )
    return ' '.join(kept_tokens)

def load_data(text_file, label_file, sample_size=1000):
    """Load texts and labels from two line-aligned UTF-8 files.

    Each line of ``text_file`` is one example and the line at the same
    position in ``label_file`` is its label. Only the first ``sample_size``
    examples are kept, and only those are run through ``preprocess_text``,
    so no cleaning work is spent on lines that are discarded anyway.

    No check is made that both files contain the same number of lines.

    Args:
        text_file: Path of the file with one text per line.
        label_file: Path of the file with one label per line.
        sample_size: Maximum number of examples to return; capped at the
            number of lines in ``text_file``.

    Returns:
        A ``(texts, labels)`` tuple of lists: the preprocessed texts and the
        whitespace-stripped label strings.
    """
    with open(text_file, "r", encoding="utf-8") as f:
        raw_texts = [line.strip() for line in f]
    with open(label_file, "r", encoding="utf-8") as f:
        labels = [line.strip() for line in f]

    sample_size = min(sample_size, len(raw_texts))
    # Slice first, preprocess second: the result is the same as cleaning the
    # whole file and slicing afterwards, without the wasted work.
    texts = [preprocess_text(raw) for raw in raw_texts[:sample_size]]
    return texts, labels[:sample_size]

In [None]:
# Sentiment dataset.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# the backslash-separated form only resolves on Windows.
X_train, y_train = load_data("datasets/sentiment/train_text.txt", "datasets/sentiment/train_labels.txt", sample_size=2500)
X_test, y_test = load_data("datasets/sentiment/test_text.txt", "datasets/sentiment/test_labels.txt", sample_size=500)

# TF-IDF vectorization: the vocabulary is fitted on the training texts only
# and then reused, unchanged, for the test texts.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Densify each matrix once and share the result between both models instead
# of converting it again for every fit/predict call.
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

# Model training
rfc_model = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_model.fit(X_train_dense, y_train)
y_pred_rfc = rfc_model.predict(X_test_dense)

ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_train_dense, y_train)
y_pred_ada = ada_model.predict(X_test_dense)

# The dense copies are no longer needed once both models have predicted.
del X_train_dense, X_test_dense

# Model evaluation on the test split
print("Resultados Random Forest:")
print(classification_report(y_test, y_pred_rfc))
print("Acurácia Random Forest:", accuracy_score(y_test, y_pred_rfc))

print("\nResultados Ada Boost:")
print(classification_report(y_test, y_pred_ada))
print("Acurácia Ada Boost:", accuracy_score(y_test, y_pred_ada))

def predict_sentiment(text, model, vectorizer):
    """Predict the label of a single raw text.

    The text is cleaned with ``preprocess_text``, turned into a one-row
    feature matrix by ``vectorizer.transform`` and passed to
    ``model.predict``.

    Args:
        text: Raw input string.
        model: Estimator exposing ``predict`` (expected to be already fitted).
        vectorizer: Vectorizer exposing ``transform`` (expected to be the one
            the model's training features came from).

    Returns:
        The first element of the prediction returned by the model.
    """
    features = vectorizer.transform([preprocess_text(text)])
    return model.predict(features)[0]

# Checking both sentiment models against the validation split.
# The mapping is deliberately not called `dict`, which would shadow the
# builtin for the rest of the session.
sentiment_names = {'0': "Negative", '1': "Neutral", '2': "Positive"}
# Forward slashes keep the relative paths valid on every operating system.
X_val, y_val = load_data("datasets/sentiment/val_text.txt", "datasets/sentiment/val_labels.txt", sample_size=400)

for model_name, model in (("Random Forest", rfc_model), ("Ada Boost", ada_model)):
    y_val_pred = [predict_sentiment(text, model, vectorizer) for text in X_val]
    print(f"Acurácia {model_name} (validação):", accuracy_score(y_val, y_val_pred))

    print(f"Alguns exemplos de classificação {model_name}:")
    # Slicing plus zip shows at most 10 examples and cannot raise IndexError
    # when the validation split holds fewer than 10.
    for text, expected, predicted in zip(X_val[:10], y_val, y_val_pred):
        print(f"Texto: {text}")
        print(f"Sentimento: {sentiment_names[expected]}")
        print(f"Predição: {sentiment_names[predicted]}")
        print()

In [None]:
# Emoji prediction dataset.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# the backslash-separated form only resolves on Windows.
X_train, y_train = load_data("datasets/emoji/train_text.txt", "datasets/emoji/train_labels.txt", sample_size=3000)
X_test, y_test = load_data("datasets/emoji/test_text.txt", "datasets/emoji/test_labels.txt", sample_size=1000)

# TF-IDF vectorization: the vocabulary is fitted on the training texts only
# and then reused, unchanged, for the test texts.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Densify each matrix once and share the result between both models instead
# of converting it again for every fit/predict call.
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

rfc_model_emoji = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_model_emoji.fit(X_train_dense, y_train)
y_pred_rfc_emoji = rfc_model_emoji.predict(X_test_dense)

ada_model_emoji = AdaBoostClassifier(random_state=42)
ada_model_emoji.fit(X_train_dense, y_train)
y_pred_ada_emoji = ada_model_emoji.predict(X_test_dense)

# The dense copies are no longer needed once both models have predicted.
del X_train_dense, X_test_dense

print("\nResultados Random Forest (Emoji):")
print(classification_report(y_test, y_pred_rfc_emoji))
print("Acurácia Random Forest (Emoji):", accuracy_score(y_test, y_pred_rfc_emoji))

print("\nResultados Ada Boost (Emoji):")
print(classification_report(y_test, y_pred_ada_emoji))
print("Acurácia Ada Boost (Emoji):", accuracy_score(y_test, y_pred_ada_emoji))

# Checking both emoji models against the validation split.
# Label id -> emoji. The literals were stored as mis-decoded UTF-8 bytes and
# are restored to the real characters here; the leading space carried by the
# first ten entries is kept as it was.
emoji_dict = {
    '0': ' ❤',
    '1': ' 😍',
    '2': ' 😂',
    '3': ' 💕',
    '4': ' 🔥',
    '5': ' 😊',
    '6': ' 😎',
    '7': ' ✨',
    '8': ' 💙',
    '9': ' 😘',
    '10': '📷',
    '11': '🇺🇸',
    '12': '☀',
    '13': '💜',
    '14': '😉',
    '15': '💯',
    '16': '😁',
    '17': '🎄',
    '18': '📸',
    '19': '😜',
}

# Forward slashes keep the relative paths valid on every operating system.
X_val, y_val = load_data("datasets/emoji/val_text.txt", "datasets/emoji/val_labels.txt", sample_size=400)

for model_name, model in (("Random Forest", rfc_model_emoji), ("Ada Boost", ada_model_emoji)):
    y_val_pred = [predict_sentiment(text, model, vectorizer) for text in X_val]
    print(f"Acurácia {model_name} (Emoji) (validação):", accuracy_score(y_val, y_val_pred))
    print(f"Alguns exemplos de classificação {model_name}:")
    # Slicing plus zip shows at most 10 examples and cannot raise IndexError
    # when the validation split holds fewer than 10.
    for text, expected, predicted in zip(X_val[:10], y_val, y_val_pred):
        print(f"Texto: {text}")
        print(f"Correto: {emoji_dict[expected]}")
        print(f"Predição: {emoji_dict[predicted]}")
        print()