In [20]:
import numpy as np
import pandas as pd

# Data Reading and Processing

In [21]:
# Text-sequence dataset: read the train and validation CSVs, then pull out
# the raw input strings and the labels from each frame.
train_seq_df = pd.read_csv("datasets/train/train_text_seq.csv")
valid_seq_df = pd.read_csv("datasets/valid/valid_text_seq.csv")

train_seq_X, train_seq_Y = train_seq_df['input_str'], train_seq_df['label']
valid_seq_X, valid_seq_Y = valid_seq_df['input_str'], valid_seq_df['label']

In [22]:
# Emoticon dataset: read the train and validation CSVs, then separate the
# emoticon strings from the labels.
train_emoticon_df = pd.read_csv("datasets/train/train_emoticon.csv")
valid_emoticon_df = pd.read_csv("datasets/valid/valid_emoticon.csv")

train_emoticon_X, train_emoticon_Y = (
    train_emoticon_df['input_emoticon'],
    train_emoticon_df['label'],
)
valid_emoticon_X, valid_emoticon_Y = (
    valid_emoticon_df['input_emoticon'],
    valid_emoticon_df['label'],
)

In [23]:
# Feature dataset: each .npz archive holds a 'features' array and a 'label' array.
#
# The archives are opened with a context manager so the underlying file handle
# is closed as soon as the arrays have been read into memory.
#
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code. Only load archives from a trusted source;
# if the arrays are plain numeric arrays this flag can be dropped.
with np.load("datasets/train/train_feature.npz", allow_pickle=True) as train_feat:
    train_feat_X = train_feat['features']
    train_feat_Y = train_feat['label']

with np.load("datasets/valid/valid_feature.npz", allow_pickle=True) as valid_feat:
    valid_feat_X = valid_feat['features']
    valid_feat_Y = valid_feat['label']

In [24]:
# Collapse every per-sample feature array into a single 1-D vector so each
# dataset becomes a 2-D (samples x features) matrix.
train_feat_X = np.array([np.ravel(sample) for sample in train_feat_X])
valid_feat_X = np.array([np.ravel(sample) for sample in valid_feat_X])

In [25]:
def _digits_to_frame(strings):
    """Expand a Series of equal-length digit strings into an integer DataFrame.

    Every character of each string becomes its own column, named
    ``col_1`` ... ``col_N`` where N is the length of the first string. The
    index of ``strings`` is preserved.

    The frame is built in one go from a list of character lists; creating a
    ``pd.Series`` per row via ``.apply`` gives the same result but is far
    slower.

    Raises:
        ValueError: if a string contains a non-digit character (from
            ``astype(int)``). Strings longer than the first one are rejected
            by the DataFrame constructor; shorter ones leave missing values
            that fail the integer cast.
    """
    rows = [list(s) for s in strings]
    width = len(rows[0]) if rows else 0
    frame = pd.DataFrame(
        rows,
        index=strings.index,
        columns=[f'col_{i+1}' for i in range(width)],
    )
    return frame.astype(int)


# Always start from the raw DataFrames so this cell can be re-run safely.
train_seq_X = _digits_to_frame(train_seq_df['input_str'])
valid_seq_X = _digits_to_frame(valid_seq_df['input_str'])

In [26]:
from sklearn.preprocessing import OneHotEncoder

# Split every emoticon string into its individual characters, one column per
# position. Read from the raw DataFrames rather than train_emoticon_X /
# valid_emoticon_X: those names are overwritten at the end of this cell, so
# reading from them would make the cell fail when re-run.
emoji_sequences = [list(sample) for sample in train_emoticon_df['input_emoticon']]
emoji_sequences_val = [list(sample) for sample in valid_emoticon_df['input_emoticon']]

# Vocabulary of every emoji seen in the training data. It is sorted because
# set iteration order for strings changes between interpreter runs, which
# would otherwise shuffle the one-hot column order from run to run.
emojis = sorted({emoji for sample in emoji_sequences for emoji in sample})

# Number of emoji positions per sample, taken from the data instead of being
# hard-coded (assumes every sample has the same length -- TODO confirm).
n_positions = len(emoji_sequences[0])

# handle_unknown='ignore' encodes emojis that were not seen during training
# as an all-zero group instead of raising.
encoder_kwargs = dict(categories=[emojis] * n_positions, handle_unknown='ignore')
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(sparse_output=False, **encoder_kwargs)
except TypeError:
    # Older scikit-learn releases only know the original `sparse` keyword.
    encoder = OneHotEncoder(sparse=False, **encoder_kwargs)

# Fit on the training sequences, then apply the same encoding to validation.
train_emoticon_X = encoder.fit_transform(emoji_sequences)
valid_emoticon_X = encoder.transform(emoji_sequences_val)




In [27]:
# Sanity check: shapes of the three training feature blocks (emoticon, sequence, feature).
tuple(block.shape for block in (train_emoticon_X, train_seq_X, train_feat_X))

((7080, 2782), (7080, 50), (7080, 9984))

In [28]:
# Build the combined design matrices by placing the three feature blocks
# side by side, in the order: digit sequence, one-hot emoticons, flattened features.
# NOTE(review): assumes the three datasets are row-aligned (row i describes the
# same example in each) -- TODO confirm.
train_parts = (train_seq_X, train_emoticon_X, train_feat_X)
valid_parts = (valid_seq_X, valid_emoticon_X, valid_feat_X)

train_X = np.concatenate(train_parts, axis=1)
valid_X = np.concatenate(valid_parts, axis=1)

# Model Training and Performance Analysis

In [29]:
# Fit an XGBoost binary classifier on the full combined training matrix and
# report its accuracy on the validation set.
import xgboost as xgb
from sklearn.metrics import accuracy_score

xgb_params = dict(objective='binary:logistic', n_estimators=1000, seed=42)

model = xgb.XGBClassifier(**xgb_params)
model.fit(train_X, train_seq_Y)

preds = model.predict(valid_X)
accuracy = accuracy_score(y_true=valid_seq_Y, y_pred=preds)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9897750511247444


In [14]:
# Learning-curve point: fit on a random 80% subset of the training rows and
# score on the validation set. The held-out 20% returned by the split is not
# used in this cell.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train_X_80, test_X_80, train_Y_80, test_Y_80 = train_test_split(
    train_X,
    train_seq_Y,
    test_size=0.2,
    random_state=2,
)
model80 = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    seed=42,
).fit(train_X_80, train_Y_80)

preds80 = model80.predict(valid_X)
accuracy80 = accuracy_score(y_true=valid_seq_Y, y_pred=preds80)
print(f"Accuracy: {accuracy80}")

Accuracy: 0.9815950920245399


In [15]:
# Learning-curve point: fit on a random 60% subset of the training rows and
# score on the validation set.
train_X_60, test_X_60, train_Y_60, test_Y_60 = train_test_split(
    train_X,
    train_seq_Y,
    test_size=0.4,
    random_state=2,
)
model60 = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    seed=42,
).fit(train_X_60, train_Y_60)

preds60 = model60.predict(valid_X)
accuracy60 = accuracy_score(y_true=valid_seq_Y, y_pred=preds60)
print(f"Accuracy: {accuracy60}")

Accuracy: 0.9713701431492843


In [16]:
# Learning-curve point: fit on a random 40% subset of the training rows and
# score on the validation set.
train_X_40, test_X_40, train_Y_40, test_Y_40 = train_test_split(
    train_X,
    train_seq_Y,
    test_size=0.6,
    random_state=2,
)
model40 = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    seed=42,
).fit(train_X_40, train_Y_40)

preds40 = model40.predict(valid_X)
accuracy40 = accuracy_score(y_true=valid_seq_Y, y_pred=preds40)
print(f"Accuracy: {accuracy40}")

Accuracy: 0.9754601226993865


In [17]:
# Learning-curve point: fit on a random 20% subset of the training rows and
# score on the validation set.
train_X_20, test_X_20, train_Y_20, test_Y_20 = train_test_split(
    train_X,
    train_seq_Y,
    test_size=0.8,
    random_state=2,
)
model20 = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    seed=42,
).fit(train_X_20, train_Y_20)

preds20 = model20.predict(valid_X)
accuracy20 = accuracy_score(y_true=valid_seq_Y, y_pred=preds20)
print(f"Accuracy: {accuracy20}")

Accuracy: 0.9468302658486708


# Predicting on test data

In [18]:
# Read the three raw test inputs: two CSV frames and one .npz archive.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on archives from a trusted source.
test_seq_X = pd.read_csv(filepath_or_buffer="datasets/test/test_text_seq.csv")
test_emoticon_X = pd.read_csv(filepath_or_buffer="datasets/test/test_emoticon.csv")
test_feat_X = np.load(file="datasets/test/test_feature.npz", allow_pickle=True)

In [19]:
# Flatten each test sample's feature array into one row vector.
# This rebinds test_feat_X from the loaded archive to a plain array, so the
# cell that loads the archive must be run again before re-running this one.
test_feat_X = np.array([np.ravel(sample) for sample in test_feat_X['features']])

In [30]:
# Break each test emoticon string into a list of characters and one-hot encode
# it with the encoder that was fitted on the training data.
# This rebinds test_emoticon_X from the raw DataFrame to the encoded matrix.
emoji_sequences_test = list(map(list, test_emoticon_X['input_emoticon']))
test_emoticon_X = encoder.transform(emoji_sequences_test)

In [31]:
# Expand every test digit string into one integer column per character.
# The frame is built in a single DataFrame construction; creating a pd.Series
# per row via .apply gives the same result but is far slower.
# The width is fixed at 50 characters per string: a longer string is rejected
# by the DataFrame constructor, and a shorter one leaves missing values that
# fail the integer cast.
# This rebinds test_seq_X from the raw DataFrame to the digit frame, so the
# load cell must be run again before re-running this one.
test_seq_strings = test_seq_X['input_str']
test_seq_X = pd.DataFrame(
    [list(s) for s in test_seq_strings],
    index=test_seq_strings.index,
    columns=[f'col_{i+1}' for i in range(50)],
).astype(int)

In [35]:
# Assemble the test design matrix from the three feature blocks, in the order:
# digit sequence, one-hot emoticons, flattened features.
test_parts = (test_seq_X, test_emoticon_X, test_feat_X)
test_X = np.concatenate(test_parts, axis=1)

In [39]:
# Predict test labels with `model` and write them to a text file,
# one integer per line.
preds_test = model.predict(test_X)
np.savetxt(fname="pred_combined.txt", X=preds_test, fmt="%d")