In [5]:
import pandas as pd
import numpy as np

In [6]:
# read emoticon dataset
train_emoticon_df = pd.read_csv("datasets/train/train_emoticon.csv")
train_emoticon_X = train_emoticon_df['input_emoticon'].tolist()
train_emoticon_Y = train_emoticon_df['label'].tolist()


In [7]:
val_emoticon = pd.read_csv("datasets/valid/valid_emoticon.csv")
val_emoticon_X = val_emoticon['input_emoticon'].tolist()
val_emoticon_Y = val_emoticon['label'].tolist()


In [8]:
from sklearn.preprocessing import OneHotEncoder

# Create a list of all emojis across the dataset
emojis = list(set([emoji for sample in train_emoticon_X for emoji in sample]))

# Initialize OneHotEncoder with 'handle_unknown' set to 'ignore' to avoid issues with unseen emojis
encoder = OneHotEncoder(categories=[emojis]*13, sparse=False, handle_unknown='ignore')

# Convert the dataset (each sample is 13 emojis) into a list of lists (2D array)
emoji_sequences = [list(sample) for sample in train_emoticon_X]
emoji_sequences_val = [list(sample) for sample in val_emoticon_X]

# Fit the encoder and transform the data
encoded_X = encoder.fit_transform(emoji_sequences)
encoded_X_val = encoder.transform(emoji_sequences_val)
print(f"Shape of one-hot encoded data: {encoded_X.shape}")


Shape of one-hot encoded data: (7080, 2782)




In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(encoded_X, train_emoticon_Y, test_size=0.8, random_state=42)
X_train_40, X_test_40, y_train_40, y_test_40 = train_test_split(encoded_X, train_emoticon_Y, test_size=0.6, random_state=42)
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(encoded_X, train_emoticon_Y, test_size=0.4, random_state=42)
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(encoded_X, train_emoticon_Y, test_size=0.2, random_state=42)
X_train_100 = encoded_X
y_train_100 = train_emoticon_Y

In [10]:
# train a svm model for binary classification
from sklearn.svm import SVC
from skopt import BayesSearchCV

In [11]:
svm = SVC(kernel='linear', random_state=42)

search_space = {
    # 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Different kernels to try
    'C': (1e-6, 1e+3, 'log-uniform'),                # Regularization parameter
    'gamma': (1e-6, 1e+1, 'log-uniform'),            # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'degree': (1, 5),                                # Degree of the polynomial kernel (if 'poly' is chosen)
    'coef0': (0.0, 10.0)                             # Independent term in 'poly' and 'sigmoid' kernels
}


In [23]:
# Set up the Bayesian optimization with BayesSearchCV
bayes_cv_tuner = BayesSearchCV(
    estimator=svm,
    search_spaces=search_space,
    n_iter=32,   # Number of parameter settings that are sampled
    cv=5,        # 5-fold cross-validation
    random_state=42,
    n_jobs=-1    # Use all available cores
)

bayes_cv_tuner.fit(X_train_100, y_train_100)

print("Best hyperparameters:", bayes_cv_tuner.best_params_)
print("Best cross-validation score:", bayes_cv_tuner.best_score_)

test_accuracy = bayes_cv_tuner.score(encoded_X_val, val_emoticon_Y)
print("Test set accuracy:", test_accuracy)

Best hyperparameters: OrderedDict([('C', 8.56969192382994), ('coef0', 10.0), ('degree', 5), ('gamma', 10.0)])
Best cross-validation score: 0.8600282485875705
Test set accuracy: 0.8957055214723927


In [12]:
# train 5 SVM's on split dataset 
# Best hyperparameters: OrderedDict([('C', 8.56969192382994), ('coef0', 10.0), ('degree', 5), ('gamma', 10.0)])
svm20 = SVC(kernel='linear', C=8.56969192382994, coef0=10.0, degree=5,gamma=10.0,random_state=42)
svm40 = SVC(kernel='linear', C=8.56969192382994, coef0=10.0, degree=5,gamma=10.0,random_state=42)
svm60 = SVC(kernel='linear', C=8.56969192382994, coef0=10.0, degree=5,gamma=10.0,random_state=42)
svm80 = SVC(kernel='linear', C=8.56969192382994, coef0=10.0, degree=5,gamma=10.0,random_state=42)
svm100 = SVC(kernel='linear', C=8.56969192382994, coef0=10.0, degree=5,gamma=10.0,random_state=42)

In [13]:
# fit the models
svm20.fit(X_train_20, y_train_20)
svm40.fit(X_train_40, y_train_40)
svm60.fit(X_train_60, y_train_60)
svm80.fit(X_train_80, y_train_80)
svm100.fit(X_train_100, y_train_100)

In [14]:
y20 = svm20.predict(encoded_X_val)
y40 = svm20.predict(encoded_X_val)
y60 = svm20.predict(encoded_X_val)
y80 = svm20.predict(encoded_X_val)
y100 = svm100.predict(encoded_X_val)

In [16]:
# Calculate and print accuracy for each model
accuracy_20 = accuracy_score(val_emoticon_Y, y20)
accuracy_40 = accuracy_score(val_emoticon_Y, y40)
accuracy_60 = accuracy_score(val_emoticon_Y, y60)
accuracy_80 = accuracy_score(val_emoticon_Y, y80)
accuracy_100 = accuracy_score(val_emoticon_Y, y100)

print(f"Accuracy of SVM model trained on 20% of data: {accuracy_20}")
print(f"Accuracy of SVM model trained on 40% of data: {accuracy_40}")
print(f"Accuracy of SVM model trained on 60% of data: {accuracy_60}")
print(f"Accuracy of SVM model trained on 80% of data: {accuracy_80}")
print(f"Accuracy of SVM model trained on 100% of data: {accuracy_100}")

Accuracy of SVM model trained on 20% of data: 0.7361963190184049
Accuracy of SVM model trained on 40% of data: 0.7361963190184049
Accuracy of SVM model trained on 60% of data: 0.7361963190184049
Accuracy of SVM model trained on 80% of data: 0.7361963190184049
Accuracy of SVM model trained on 100% of data: 0.8957055214723927
