# Import library

In [3]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.layers import Embedding, LSTM, Dense

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings

ModuleNotFoundError: No module named 'tensorflow'

# Fungsi Membaca file

In [None]:
def read_file(file_name):
    data_list  = []
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            label = ' '.join(line[1:line.find("]")].strip().split())
            text = line[line.find("]")+1:].strip()
            data_list.append([label, text])
    return data_list

# Mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Read file dari GDrive

In [None]:
# file_name = "olympic.txt"
file_name = "/content/drive/MyDrive/Colab Notebooks/text2emoji-master/data/psychExp.txt"
psychExp_txt = read_file(file_name)

# Print jumlah data

In [None]:
print("The number of instances: {}".format(len(psychExp_txt)))

The number of instances: 7480


# Membaca contoh data

In [None]:
print("Data example: ")
print(psychExp_txt[0])

Data example: 
['1. 0. 0. 0. 0. 0. 0.', 'During the period of falling in love, each time that we met and especially when we had not met for a long time.']


# Membuat Fitur

In [None]:
import re
from collections import Counter

In [None]:
def ngram(token, n):
    output = []
    for i in range(n-1, len(token)):
        ngram = ' '.join(token[i-n+1:i+1])
        output.append(ngram)
    return output

In [None]:
def create_feature(text, nrange=(1, 1)):
    text_features = []
    text = text.lower()

    # 1. treat alphanumeric characters as word tokens
    # Since tweets contain #, we keep it as a feature
    # Then, extract all ngram lengths
    text_alphanum = re.sub('[^a-z0-9#]', ' ', text)
    for n in range(nrange[0], nrange[1]+1):
        text_features += ngram(text_alphanum.split(), n)

    # 2. treat punctuations as word token
    text_punc = re.sub('[a-z0-9]', ' ', text)
    text_features += ngram(text_punc.split(), 1)

    # 3. Return a dictinaory whose keys are the list of elements
    # and their values are the number of times appearede in the list.
    return Counter(text_features)

# Print Fitur

In [None]:
print(create_feature("I love you!"))
print(create_feature(" aly wins the gold!!!!!!  #olympics"))
print(create_feature(" aly wins the gold!!!!!!  #olympics", (1, 2)))

Counter({'i': 1, 'love': 1, 'you': 1, '!': 1})
Counter({'aly': 1, 'wins': 1, 'the': 1, 'gold': 1, '#olympics': 1, '!!!!!!': 1, '#': 1})
Counter({'aly': 1, 'wins': 1, 'the': 1, 'gold': 1, '#olympics': 1, 'aly wins': 1, 'wins the': 1, 'the gold': 1, 'gold #olympics': 1, '!!!!!!': 1, '#': 1})


# Membuat fungsi konversi label

In [None]:
def convert_label(item, name):
    items = list(map(float, item.split()))
    label = ""
    for idx in range(len(items)):
        if items[idx] == 1:
            label += name[idx] + " "

    return label.strip()

In [None]:
emotions = ["joy", 'fear', "anger", "sadness", "disgust", "shame", "guilt"]

X_all = []
y_all = []
for label, text in psychExp_txt:
    y_all.append(convert_label(label, emotions))
    X_all.append(create_feature(text, nrange=(1, 4)))

# Print hasil Konversi Label dan contoh fitur

In [None]:
print("features example: ")
print(X_all[0])

features example: 
Counter({'time': 2, 'we': 2, 'met': 2, 'during': 1, 'the': 1, 'period': 1, 'of': 1, 'falling': 1, 'in': 1, 'love': 1, 'each': 1, 'that': 1, 'and': 1, 'especially': 1, 'when': 1, 'had': 1, 'not': 1, 'for': 1, 'a': 1, 'long': 1, 'during the': 1, 'the period': 1, 'period of': 1, 'of falling': 1, 'falling in': 1, 'in love': 1, 'love each': 1, 'each time': 1, 'time that': 1, 'that we': 1, 'we met': 1, 'met and': 1, 'and especially': 1, 'especially when': 1, 'when we': 1, 'we had': 1, 'had not': 1, 'not met': 1, 'met for': 1, 'for a': 1, 'a long': 1, 'long time': 1, 'during the period': 1, 'the period of': 1, 'period of falling': 1, 'of falling in': 1, 'falling in love': 1, 'in love each': 1, 'love each time': 1, 'each time that': 1, 'time that we': 1, 'that we met': 1, 'we met and': 1, 'met and especially': 1, 'and especially when': 1, 'especially when we': 1, 'when we had': 1, 'we had not': 1, 'had not met': 1, 'not met for': 1, 'met for a': 1, 'for a long': 1, 'a long t

In [None]:
print("Label example:")
print(y_all[0])

Label example:
joy


# Label Emoji

In [None]:
emoji_dict = {"joy":"😂", "fear":"😱", "anger":"😠", "sadness":"😢", "disgust":"😒", "shame":"😳", "guilt":"😳"}

# Training dan testing dengan CNN

In [None]:
# ...

# Tokenize the text and convert it to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text for _, text in psychExp_txt])
X_sequences = tokenizer.texts_to_sequences([text for _, text in psychExp_txt])

# Pad sequences to have the same length
X_padded = pad_sequences(X_sequences)

# Convert labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_all)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=123)

# Build the CNN model
embedding_dim = 50  # adjust as needed
max_sequence_len = X_padded.shape[1]  # length of padded sequences
num_classes = len(set(y_encoded))

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_len))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
train_acc = accuracy_score(y_train, np.argmax(model.predict(X_train), axis=1))
test_acc = accuracy_score(y_test, np.argmax(model.predict(X_test), axis=1))
print("Training Accuracy: {:.4f}".format(train_acc))
print("Test Accuracy: {:.4f}".format(test_acc))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Accuracy: 0.9612
Test Accuracy: 0.5441


In [None]:
t1 = "I love swimming, really enjoyed it!"
t2 = "I hate you, you fucking disgusting"
t3 = "I am afraid of thunder"
t4 = "i am so angry, am very fed up with you"
t5 = "My uncle died"
t6 = "I enjoy being alive"
t7 = "I am so sorry, i didnt realise this could be like this"
t8 =" i am so happy"
texts = [t1, t2, t3, t4, t5, t6, t7,t8]

# Pengujian dengan CNN

In [None]:
# Tokenize and pad the input texts
text_sequences = tokenizer.texts_to_sequences(texts)
text_padded_cnn = pad_sequences(text_sequences, maxlen=max_sequence_len)

# Make predictions using the trained CNN model
predictions_cnn = model.predict(text_padded_cnn)

# Decode the predicted labels back to emotions
predicted_labels_cnn = label_encoder.inverse_transform(np.argmax(predictions_cnn, axis=1))

# Print the results
for label, text in zip(predicted_labels_cnn, texts):
    print("{} {}".format(emoji_dict[label], text))


😂 I love pork, really enjoyed it!
😒 I hate you, you fucking disgusting
😱 I am afraid of thunder
😠 i am so angry, am very fed up with you
😢 My uncle died
😳 I enjoy being alive
😳 I am so sorry, i didnt realise this could be like this


# Training dengan LSTM

In [None]:
# ...

# Tokenize the text and convert it to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text for _, text in psychExp_txt])
X_sequences = tokenizer.texts_to_sequences([text for _, text in psychExp_txt])

# Pad sequences to have the same length
X_padded = pad_sequences(X_sequences)

# Convert labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_all)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=123)

# Build the LSTM model
embedding_dim = 50  # adjust as needed
max_sequence_len = X_padded.shape[1]  # length of padded sequences
num_classes = len(set(y_encoded))

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_len))
model.add(LSTM(100))  # You can adjust the number of LSTM units as needed
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
train_acc = accuracy_score(y_train, np.argmax(model.predict(X_train), axis=1))
test_acc = accuracy_score(y_test, np.argmax(model.predict(X_test), axis=1))
print("Training Accuracy: {:.4f}".format(train_acc))
print("Test Accuracy: {:.4f}".format(test_acc))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Accuracy: 0.9166
Test Accuracy: 0.5000


# Hasil pengujian dengan LSTM

In [None]:
# Tokenize and pad the input texts
text_sequences_lstm = tokenizer.texts_to_sequences(texts)
text_padded_lstm = pad_sequences(text_sequences_lstm, maxlen=max_sequence_len)

# Make predictions using the trained LSTM model
predictions_lstm = model.predict(text_padded_lstm)

# Decode the predicted labels back to emotions
predicted_labels_lstm = label_encoder.inverse_transform(np.argmax(predictions_lstm, axis=1))

# Print the results with emojis
for label, text in zip(predicted_labels_lstm, texts):
    print("{} {}".format(emoji_dict[label], text))


😂 I love pork, really enjoyed it!
😒 I hate you, you fucking disgusting
😱 I am afraid of thunder
😠 i am so angry, am very fed up with you
😢 My uncle died
😂 I enjoy being alive
😢 I am so sorry, i didnt realise this could be like this


# Training dengan Char-Level CNN

In [None]:
# ...

# Tokenize the characters
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text for _, text in psychExp_txt])
X_sequences = tokenizer.texts_to_sequences([text for _, text in psychExp_txt])

# Pad sequences to have the same length
X_padded = pad_sequences(X_sequences)

# Convert labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_all)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=123)

# Build the character-level CNN model
embedding_dim = 50  # adjust as needed
max_sequence_len = X_padded.shape[1]  # length of padded sequences
num_classes = len(set(y_encoded))

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_len))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
train_acc = accuracy_score(y_train, np.argmax(model.predict(X_train), axis=1))
test_acc = accuracy_score(y_test, np.argmax(model.predict(X_test), axis=1))
print("Training Accuracy: {:.4f}".format(train_acc))
print("Test Accuracy: {:.4f}".format(test_acc))

# ...

# Make predictions using the trained model
predictions_char_cnn = model.predict(X_test)

# Decode the predicted labels back to emotions
predicted_labels_char_cnn = label_encoder.inverse_transform(np.argmax(predictions_char_cnn, axis=1))

# # Print the results
for label, text in zip(predicted_labels_char_cnn, texts):
    print("{} {}".format(emoji_dict[label], text))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Accuracy: 0.5640
Test Accuracy: 0.4820
😂 I love pork, really enjoyed it!
😠 I hate you, you fucking disgusting
😳 I am afraid of thunder
😢 i am so angry, am very fed up with you
😂 My uncle died
😱 I enjoy being alive
😢 I am so sorry, i didnt realise this could be like this


# Hasil pengujian dengan Character level CNN, CNN, LSTM

In [None]:
# Print the results with emojis and model labels
print("{:<50} | {:<20} | {:<20} | {:<20}".format("Text", "CNN Output", "Char CNN Output", "LSTM Output"))
print("-" * 120)

for text, label_cnn, label_char_cnn, label_lstm in zip(texts, predicted_labels_cnn, predicted_labels_char_cnn, predicted_labels_lstm):
    print("{:<50} | {:<20} | {:<20} | {:<20}".format(text, f"{emoji_dict[label_cnn]} ({label_cnn})", f"{emoji_dict[label_char_cnn]} ({label_char_cnn})", f"{emoji_dict[label_lstm]} ({label_lstm})"))

Text                                               | CNN Output           | Char CNN Output      | LSTM Output         
------------------------------------------------------------------------------------------------------------------------
I love swimming, really enjoyed it!                | 😂 (joy)              | 😂 (joy)              | 😂 (joy)             
I hate you, you fucking disgusting                 | 😒 (disgust)          | 😠 (anger)            | 😒 (disgust)         
I am afraid of thunder                             | 😱 (fear)             | 😳 (guilt)            | 😱 (fear)            
i am so angry, am very fed up with you             | 😠 (anger)            | 😢 (sadness)          | 😠 (anger)           
My uncle died                                      | 😢 (sadness)          | 😂 (joy)              | 😢 (sadness)         
I enjoy being alive                                | 😳 (guilt)            | 😱 (fear)             | 😂 (joy)             
I am so sorry, i didnt realise this cou