In [17]:
"""
@created_at: 03/06
Working version of the ELM (Extreme Learning Machine) classifier for the Sundanese Twitter dataset.
Uses both the provided stop-word list and the provided CSV file.
"""
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [18]:
# Load the dataset.
df = pd.read_csv('sundanese_tweets.csv')

# Pull the tweet texts and their labels out of the frame.
corpus = df['tweet'].tolist()
targets = df['label'].tolist()

# Load the stop words, one per line.
# The encoding is stated explicitly so the result does not depend on the
# platform default (assumes the file is UTF-8, like the CSV read above —
# TODO confirm). Blank lines and surrounding whitespace are dropped so they
# never end up as stop-word entries.
with open('stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file if line.strip()]

In [19]:
# Hold out 10% of the tweets for evaluation; the fixed seed keeps the split
# identical between runs.
(
    train_features,
    test_features,
    train_targets,
    test_targets,
) = train_test_split(corpus, targets, test_size=0.1, random_state=123)

# TF-IDF representation: lower-cased text, custom stop words, L1-normalised rows.
vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    lowercase=True,
    norm='l1',
)

# Learn the vocabulary on the training texts only, then reuse it for the
# held-out texts.
train_features = vectorizer.fit_transform(train_features)
test_features = vectorizer.transform(test_features)

In [20]:
class ELM:
    """Extreme Learning Machine with a single tanh hidden layer.

    The input weights and hidden biases are drawn once from U(-1, 1) and are
    never updated. Only the output weights are fitted, in closed form, from
    the pseudo-inverse of the hidden-layer activations.

    Parameters
    ----------
    num_inputs : int
        Number of input features (columns of ``X``).
    num_hidden : int
        Number of hidden neurons.
    random_state : int or None, optional
        Seed for the random weights. With ``None`` (the default) the global
        ``np.random`` state is used, so ``np.random.seed`` still controls the
        draw exactly as it did before this parameter existed.
    """

    def __init__(self, num_inputs, num_hidden, random_state=None):
        self.num_inputs = num_inputs
        self.num_hidden = num_hidden
        # np.random (module) and RandomState both expose ``uniform``.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.input_weights = rng.uniform(-1, 1, (num_hidden, num_inputs))
        self.hidden_biases = rng.uniform(-1, 1, num_hidden)
        self.output_weights = None  # set by train()

    def train(self, X, y):
        """Fit the output weights.

        Parameters
        ----------
        X : array-like or scipy sparse matrix, shape (n_samples, num_inputs)
        y : array-like, shape (n_samples, n_outputs)
            Targets, e.g. one-hot encoded class labels.
        """
        y = np.asarray(y)
        hidden_activations = self._calculate_hidden_activations(X)
        self.output_weights = np.linalg.pinv(hidden_activations) @ y

    def predict(self, X):
        """Return the raw network outputs, shape (n_samples, n_outputs).

        Raises
        ------
        RuntimeError
            If called before ``train``.
        """
        if self.output_weights is None:
            raise RuntimeError("ELM.predict() called before ELM.train()")
        return self._calculate_hidden_activations(X) @ self.output_weights

    def _calculate_hidden_activations(self, X):
        """Compute tanh(X @ W.T + b) for every sample in one matrix product."""
        # Objects exposing ``toarray`` (scipy sparse matrices, which is what
        # TfidfVectorizer returns) are kept as they are: wrapping them with
        # np.array() would produce a 0-d object array and break ``X.shape[0]``.
        if not hasattr(X, "toarray"):
            X = np.asarray(X)
        if X.ndim != 2 or X.shape[1] != self.num_inputs:
            raise ValueError(
                f"expected X with shape (n_samples, {self.num_inputs}), got {X.shape}"
            )
        # sparse @ dense and dense @ dense both give a dense (n_samples, num_hidden) result.
        projected = np.asarray(X @ self.input_weights.T)
        return np.tanh(projected + self.hidden_biases)

In [21]:

def buildELM(train_features, test_features, train_targets, test_targets, label_encoder, num_neurons=200):
    """Train an ELM classifier (tanh activation), print test metrics and pickle it.

    Parameters
    ----------
    train_features, test_features : array or scipy sparse matrix
        Feature matrices with one row per sample. Sparse matrices are
        converted to dense arrays before they reach the ELM.
    train_targets, test_targets : sequence
        Class labels for the corresponding rows.
    label_encoder : object with ``fit_transform`` / ``inverse_transform`` / ``classes_``
        Fitted here on ``train_targets`` (e.g. a scikit-learn ``LabelEncoder``);
        the caller keeps the fitted instance for decoding later predictions.
    num_neurons : int, optional
        Number of hidden neurons of the ELM.

    Returns
    -------
    ELM
        The trained model, which is also written to ``modelo_elm.pkl``.
    """
    # Densify here so the ELM never receives a scipy sparse matrix.
    if hasattr(train_features, "toarray"):
        train_features = train_features.toarray()
    if hasattr(test_features, "toarray"):
        test_features = test_features.toarray()

    # Encode the labels as integer codes 0..k-1.
    train_targets_encoded = label_encoder.fit_transform(train_targets)

    # One-hot encode by picking rows of the identity matrix. Every class is
    # present in the training codes, so this matches a one-hot encoder fitted
    # on them, without depending on a particular scikit-learn version.
    train_targets_onehot = np.eye(len(label_encoder.classes_))[train_targets_encoded]

    elm = ELM(train_features.shape[1], num_neurons)
    elm.train(train_features, train_targets_onehot)

    predictions = elm.predict(test_features)
    # Map the highest-scoring output column back to the original class label.
    predicted_classes = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

    # Compute the metrics on the raw labels. Local names carry a suffix so they
    # do not shadow the sklearn functions of the same name.
    accuracy = metrics.accuracy_score(test_targets, predicted_classes)
    precision = metrics.precision_score(test_targets, predicted_classes, average='weighted')
    recall = metrics.recall_score(test_targets, predicted_classes, average='weighted')
    f1_value = metrics.f1_score(test_targets, predicted_classes, average='weighted')
    confusion = metrics.confusion_matrix(test_targets, predicted_classes)
    report = metrics.classification_report(test_targets, predicted_classes)

    # Print the metrics.
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1_value)
    print("Confusion Matrix:")
    print(confusion)
    print("Classification Report:")
    print(report)

    # Reuse the accuracy computed above instead of scoring a second time.
    score = np.round(accuracy, 2)
    print("Mean accuracy of predictions: " + str(score))
    print("Salvando o modelo...")

    # Only the ELM is persisted; the vectorizer and the label encoder are not
    # part of the pickle and must be kept or rebuilt by the caller.
    with open('modelo_elm.pkl', 'wb') as file:
        pickle.dump(elm, file)

    return elm

### Testa o modelo treinado

O próximo bloco realiza o carregamento do modelo que foi gerado para que este possa ser utilizado com os dados de teste

In [22]:
# Classify new tweets with the persisted model.
# NOTE: pickle.load can execute arbitrary code; only load a modelo_elm.pkl
# that was produced by this notebook.
with open('modelo_elm.pkl', 'rb') as file:
    elm = pickle.load(file)

# Rebuild the label mapping in this cell instead of relying on a
# `label_encoder` variable left over in the kernel. buildELM fits its encoder
# on train_targets, so fitting a fresh LabelEncoder on the same list gives the
# same class order (assumes the pickled model was trained on this split —
# TODO confirm).
label_encoder = LabelEncoder().fit(train_targets)

# Load the new tweets and extract the text column.
data = pd.read_csv('test.csv')
novas_frases = data['tweet'].tolist()

# Vectorise with the vocabulary learned on the training texts.
new_features = vectorizer.transform(novas_frases).toarray()

predictions = elm.predict(new_features)
predicted_classes = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

for phrase, predicted_class in zip(novas_frases, predicted_classes):
    print(f"Previsão: {predicted_class} | Frase: {phrase}")

Previsão: fear | Frase: meuni asa rame nya crime dimamana... kemarin juga rampok di Domino Pizza Margahayu, siang-siang deuih eta mah bawa senjata api. Sarieun kieu nya Bandung
Previsão: fear | Frase: Merinding ningali mic check rrq. Pernah ngasaan sih, juara te emang atoh jeung bangga pisan
Previsão: sadness | Frase: mentang2 mamah ges dulur sabelah, meuni te dianggap. Heeh da mentang2 bapak ges maot" sedih dengernya.
Previsão: anger | Frase: meni serem atuh pas buka grup keluarga isinya gini :( takut banget ... dalem kondisi kaya gini org2 jd pada gila demi dpt duit.
Previsão: sadness | Frase: Meni sedih udah mah ga boleh keluar, di dalem main game diamuk, serba salah :(
Previsão: sadness | Frase: Meni sedih tapi pami ningal Kopo- Cigondewah mah angger hilir mudik macet
Previsão: sadness | Frase: Meni sedih nempo pimpinan urang
Previsão: sadness | Frase: Meni sedih laguna bang Bry... Inget jaman ngora
Previsão: sadness | Frase: Meni saredih nya, teu paruguh jadina hirup teh:(
Previsã