In [13]:
import numpy as np
import pandas as pd

In [14]:
# Load the text-sequence dataset (train + validation splits).
# `input_str` is split character-by-character and cast to int in a later cell,
# so it must stay textual: pin the dtype instead of relying on pandas' type
# inference (numeric parsing would drop leading zeros and break the split).
train_seq_df = pd.read_csv("datasets/train/train_text_seq.csv", dtype={"input_str": str})
train_seq_X = train_seq_df['input_str']
train_seq_Y = train_seq_df['label']
valid_seq_df = pd.read_csv("datasets/valid/valid_text_seq.csv", dtype={"input_str": str})
valid_seq_X = valid_seq_df['input_str']
valid_seq_Y = valid_seq_df['label']

In [15]:
# Emoticon dataset: read both splits first, then pull out the input column
# ('input_emoticon') and the target column ('label') of each.
train_emoticon_df = pd.read_csv("datasets/train/train_emoticon.csv")
valid_emoticon_df = pd.read_csv("datasets/valid/valid_emoticon.csv")

train_emoticon_X, train_emoticon_Y = (
    train_emoticon_df['input_emoticon'],
    train_emoticon_df['label'],
)
valid_emoticon_X, valid_emoticon_Y = (
    valid_emoticon_df['input_emoticon'],
    valid_emoticon_df['label'],
)

In [16]:
# Feature dataset stored as .npz archives holding 'features' and 'label' arrays.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only load archives from a trusted source, and drop
# the flag if the stored arrays turn out not to need it.
train_feat = np.load("datasets/train/train_feature.npz", allow_pickle=True)
valid_feat = np.load("datasets/valid/valid_feature.npz", allow_pickle=True)

train_feat_X, train_feat_Y = train_feat['features'], train_feat['label']
valid_feat_X, valid_feat_Y = valid_feat['features'], valid_feat['label']

In [17]:
# Collapse every sample's feature array into a single 1-D row so that each
# split becomes a plain 2-D (samples x values) matrix.
train_feat_X = np.array([sample.reshape(-1) for sample in train_feat_X])
valid_feat_X = np.array([sample.reshape(-1) for sample in valid_feat_X])

In [18]:
# Reduce the dimensionality of the flattened features with PCA.
# The projection is fitted on the training split only and then applied
# unchanged to the validation split, so no validation data leaks into the fit.
from sklearn.decomposition import PCA

# 96*13 = 1248 components are kept.
# NOTE(review): the rationale for 96 per block of 13 is not visible here -- confirm.
# random_state is pinned because PCA may pick a randomized SVD solver for
# inputs of this size, which would make the projection differ between runs.
pca = PCA(n_components=96*13, random_state=2)
train_feat_X = pca.fit_transform(train_feat_X)
valid_feat_X = pca.transform(valid_feat_X)

In [19]:
# Number of characters expected in every `input_str`; also the number of
# generated columns (col_1 ... col_50).
SEQ_LEN = 50


def _split_digits(strings, width=SEQ_LEN):
    """Expand a Series of digit strings into one integer column per character.

    Parameters
    ----------
    strings : pd.Series
        Strings made of digit characters, all of length `width`.
    width : int
        Expected string length / number of output columns.

    Returns
    -------
    pd.DataFrame
        Integer frame indexed like `strings`, with columns col_1 ... col_<width>.

    Raises
    ------
    ValueError
        If the strings do not yield exactly `width` columns, or a character
        cannot be converted to int.
    """
    # Build the frame in one shot; constructing a pd.Series per row via
    # Series.apply is far slower for the same result.
    frame = pd.DataFrame([list(s) for s in strings], index=strings.index)
    frame.columns = [f'col_{i+1}' for i in range(width)]
    return frame.astype(int)


train_seq_X = _split_digits(train_seq_df['input_str'])
valid_seq_X = _split_digits(valid_seq_df['input_str'])

In [20]:
from sklearn.preprocessing import OneHotEncoder

# Convert each sample string into a list of its characters (one emoji per
# position) so the splits become 2-D inputs for the encoder.
# Read from the DataFrames rather than train_emoticon_X / valid_emoticon_X:
# those names are overwritten below, and reading them would make this cell
# fail when it is re-run.
emoji_sequences = [list(sample) for sample in train_emoticon_df['input_emoticon']]
emoji_sequences_val = [list(sample) for sample in valid_emoticon_df['input_emoticon']]

# Vocabulary of every emoji seen in the training split. Sorted so that the
# one-hot column order is identical on every run (set iteration order is not).
emojis = sorted({emoji for sample in emoji_sequences for emoji in sample})

# One (shared) category list per sequence position; the position count is
# taken from the data (13 in the original notebook).
n_positions = len(emoji_sequences[0])
# Emojis that only occur in validation are encoded as all-zero columns
# (handle_unknown='ignore').
# The encoder's default sparse output is densified with .toarray() instead of
# passing `sparse=False`: that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, while this form works on all versions.
encoder = OneHotEncoder(categories=[emojis] * n_positions, handle_unknown='ignore')

train_emoticon_X = encoder.fit_transform(emoji_sequences).toarray()
valid_emoticon_X = encoder.transform(emoji_sequences_val).toarray()




In [21]:
# Apply PCA to the one-hot emoticon features, keeping 107 components.
# Fitted on the training split only, then applied to the validation split.
# random_state is pinned because PCA may pick a randomized SVD solver, which
# would otherwise make the projection differ between runs.
pca_emoji = PCA(n_components=107, random_state=2)
train_emoticon_X = pca_emoji.fit_transform(train_emoticon_X)
valid_emoticon_X = pca_emoji.transform(valid_emoticon_X)

In [22]:
# Sanity check: shapes of the three training blocks (emoticon, sequence, feature).
tuple(block.shape for block in (train_emoticon_X, train_seq_X, train_feat_X))

((7080, 107), (7080, 50), (7080, 1248))

In [23]:
# Build the combined design matrices by placing the three feature blocks side
# by side (column-wise), in the order: digit sequence, emoticon, feature.
# Assumes row i refers to the same sample in all three datasets -- TODO confirm.
train_X = np.concatenate(
    (train_seq_X, train_emoticon_X, train_feat_X),
    axis=1,
)
valid_X = np.concatenate(
    (valid_seq_X, valid_emoticon_X, valid_feat_X),
    axis=1,
)

In [26]:
# Train a random forest classifier on the combined features and report
# validation accuracy.
from sklearn.ensemble import RandomForestClassifier
# accuracy_score is imported here because this cell calls it; without the
# import the cell raises NameError on a fresh kernel (Restart & Run All).
from sklearn.metrics import accuracy_score, classification_report

# Labels come from the text-sequence dataset.
# NOTE(review): this presumes all three datasets share row order and labels -- confirm.
clf = RandomForestClassifier(n_estimators=150, max_depth=8, random_state=2)
clf.fit(train_X, train_seq_Y)
pred_Y = clf.predict(valid_X)
print(accuracy_score(valid_seq_Y, pred_Y))


0.9100204498977505


In [27]:
# Logistic regression on the same combined features; prints validation accuracy.
# Note that `clf` and `pred_Y` are rebound here, replacing the random forest
# model and its predictions from the previous cell.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(max_iter=1000).fit(train_X, train_seq_Y)
pred_Y = clf.predict(valid_X)
print(accuracy_score(y_true=valid_seq_Y, y_pred=pred_Y))

0.9856850715746421
