In [2]:
import pandas as pd
import numpy as np

In [3]:
# read emoticon dataset
train_emoticon_df = pd.read_csv("datasets/train/train_emoticon.csv")
train_emoticon_X = train_emoticon_df['input_emoticon'].tolist()
train_emoticon_Y = train_emoticon_df['label'].tolist()

test_emoticon_X = pd.read_csv("datasets/test/test_emoticon.csv")['input_emoticon'].tolist()

In [4]:
val_emoticon = pd.read_csv("datasets/valid/valid_emoticon.csv")
val_emoticon_X = val_emoticon['input_emoticon'].tolist()
val_emoticon_Y = val_emoticon['label'].tolist()


In [5]:
from sklearn.preprocessing import OneHotEncoder

# Create a list of all emojis across the dataset
emojis = list(set([emoji for sample in train_emoticon_X for emoji in sample]))

# Initialize OneHotEncoder with 'handle_unknown' set to 'ignore' to avoid issues with unseen emojis
encoder = OneHotEncoder(categories=[emojis]*13, sparse=False, handle_unknown='ignore')

# Convert the dataset (each sample is 13 emojis) into a list of lists (2D array)
emoji_sequences = [list(sample) for sample in train_emoticon_X]
emoji_sequences_val = [list(sample) for sample in val_emoticon_X]

# Fit the encoder and transform the data
encoded_X = encoder.fit_transform(emoji_sequences)
encoded_X_val = encoder.transform(emoji_sequences_val)
print(f"Shape of one-hot encoded data: {encoded_X.shape}")


Shape of one-hot encoded data: (7080, 2782)




In [15]:
# apply xgboost model
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Initialize the model
model = xgb.XGBClassifier(objective='multi:softmax', num_class=2, n_estimators=1000, max_depth=7, learning_rate=0.1, n_jobs=-1)

# Train the model
model.fit(encoded_X, train_emoticon_Y)

# Predict the labels
train_predictions = model.predict(encoded_X)
val_predictions = model.predict(encoded_X_val)

# Calculate the accuracy
train_accuracy = accuracy_score(train_emoticon_Y, train_predictions)
val_accuracy = accuracy_score(val_emoticon_Y, val_predictions)


In [6]:
# split the data into 5 forms one consisting of 20% of the data 40% of the data 60% of the data 80% of the data and 100% of the data
from sklearn.model_selection import train_test_split
X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(encoded_X, train_emoticon_Y, test_size=0.8, random_state=42)
X_train_40, X_test_40, y_train_40, y_test_40 = train_test_split(encoded_X, train_emoticon_Y, test_size=0.6, random_state=42)
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(encoded_X, train_emoticon_Y, test_size=0.4, random_state=42)
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(encoded_X, train_emoticon_Y, test_size=0.2, random_state=42)
X_train_100 = encoded_X
y_train_100 = train_emoticon_Y



























In [7]:
# train a neural network model using the one-hot encoded data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

# Initialize the MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42)

In [8]:
# Train the model
clf.fit(X_train_20, y_train_20)

# Predict the labels
y_pred = clf.predict(encoded_X_val)

# Calculate the accuracy
accuracy = accuracy_score(val_emoticon_Y, y_pred)
print(f"Accuracy for 20%: {accuracy}")


Accuracy for 20%: 0.7586912065439673


In [9]:
# Train the model
clf.fit(X_train_40, y_train_40)

# Predict the labels
y_pred = clf.predict(encoded_X_val)

# Calculate the accuracy
accuracy = accuracy_score(val_emoticon_Y, y_pred)
print(f"Accuracy for 40%: {accuracy}")


Accuracy for 40%: 0.8098159509202454


In [10]:
clf.fit(X_train_60, y_train_60)
y_pred = clf.predict(encoded_X_val)
accuracy = accuracy_score(val_emoticon_Y, y_pred)
print(f"Accuracy for 60%: {accuracy}")

Accuracy for 60%: 0.8404907975460123


In [11]:
clf.fit(X_train_80, y_train_80)
y_pred = clf.predict(encoded_X_val)
accuracy = accuracy_score(val_emoticon_Y, y_pred)
print(f"Accuracy for 80%: {accuracy}")


Accuracy for 80%: 0.8834355828220859


In [12]:
clf.fit(X_train_100, y_train_100)
y_pred = clf.predict(encoded_X_val)
accuracy = accuracy_score(val_emoticon_Y, y_pred)
print(f"Accuracy for 100%: {accuracy}")

Accuracy for 100%: 0.901840490797546


In [13]:
# try tpe hyperparameter optimization
from hyperopt import hp, fmin, tpe, Trials
from sklearn.model_selection import cross_val_score

# Define the search space
space = {
    'hidden_layer_sizes': hp.choice('hidden_layer_sizes', [(100,), (100, 100), (100, 100, 100)]),
    'alpha': hp.uniform('alpha', 0.0001, 0.1)
}

# Define the objective function
def objective(params):
    # Initialize the MLPClassifier
    clf = MLPClassifier(max_iter=1000, random_state=42, **params)
    
    # Calculate the cross-validation score
    cv_score = cross_val_score(clf, X_train_100, y_train_100, cv=3).mean()
    
    return -cv_score

# Initialize the Trials object
trials = Trials()

# Run the hyperparameter search
best = fmin(objective, space, algo=tpe.suggest, max_evals=10, trials=trials)

print(best)



100%|██████████| 10/10 [23:54<00:00, 143.45s/trial, best loss: -0.8456214689265537]
{'alpha': 0.017995112423157395, 'hidden_layer_sizes': 1}


In [14]:
# Train the model with the best hyperparameters
clf = MLPClassifier(max_iter=1000, random_state=42, **best)
clf.fit(X_train_100, y_train_100)

# Predict the labels
y_pred = clf.predict(encoded_X_val)

# Calculate the accuracy
accuracy = accuracy_score(val_emoticon_Y, y_pred)
print(f"Accuracy with hyperparameter optimization: {accuracy}")


Accuracy with hyperparameter optimization: 0.9059304703476483
