# SVM Emoji Predictor

In [7]:
import pandas as pd
import re
import emoji
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [8]:
# Read the CSV lazily in 1000-row pieces (the file is large), ...
chunks = pd.read_csv(
    filepath_or_buffer="merged_emoji_data.csv",
    chunksize=1000,
)
# ... then stitch the pieces into a single frame with a fresh 0..n-1 index.
final_merged_df = pd.concat(objs=chunks, ignore_index=True)

In [9]:
# Cast the three free-text columns to pandas' nullable string dtype in one pass,
# then discard every row that has a missing value in any column.
final_merged_df = (
    final_merged_df
    .astype({'text': 'string', 'title': 'string', 'ucode_short': 'string'})
    .dropna()
)

In [10]:
# Display the dtype of every column (a cell's last expression is rendered as its output).
final_merged_df.dtypes

text            string[python]
title           string[python]
ucode_short     string[python]
ucode                   object
cleaned_text            object
dtype: object

In [11]:
# Encode every distinct value of the `ucode` column as an integer class id.
label_encoder = LabelEncoder()
final_merged_df['emoji_label'] = label_encoder.fit_transform(final_merged_df['ucode'])

# Number of rows kept for modelling.
sample_size = 75000

# Draw a class-stratified subset of `sample_size` rows; the remainder is discarded.
data_sampled, _ = train_test_split(
    final_merged_df,
    train_size=sample_size,
    random_state=42,
    stratify=final_merged_df['emoji_label'],
)

# Shuffle the subset and renumber its index from zero.
data_sampled = data_sampled.sample(frac=1, random_state=42)
data_sampled = data_sampled.reset_index(drop=True)

# Features are the pre-cleaned text; targets are the encoded emoji ids.
X, y = data_sampled['cleaned_text'], data_sampled['emoji_label']

# 80/20 train/test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [12]:
# Turn the text into TF-IDF features, keeping at most 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000)

# TF-IDF -> per-feature scaling -> linear-kernel SVM, bundled in one pipeline so
# the identical preprocessing runs at both fit and predict time.
pipeline = make_pipeline(
    vectorizer,
    StandardScaler(with_mean=False),  # with_mean=False because the TF-IDF matrix is sparse
    SVC(kernel='linear', C=1.0, random_state=42)  # linear SVM; `C` sets the regularization strength
)

# Train on the training split.
pipeline.fit(X_train, y_train)

# Predict the held-out split.
y_pred = pipeline.predict(X_test)

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class report. `labels` is passed explicitly so that `target_names` always
# lines up with the encoder's classes: the encoder was fitted on the full frame,
# and without `labels` the report raises a ValueError whenever one of those
# classes is missing from both `y_test` and `y_pred`. `zero_division=0` keeps
# the (default) 0.0 score for such classes without emitting a warning per class.
all_labels = label_encoder.transform(label_encoder.classes_)
print(classification_report(
    y_test,
    y_pred,
    labels=all_labels,
    target_names=label_encoder.classes_,
    zero_division=0,
))

Accuracy: 0.1289
              precision    recall  f1-score   support

           ☀       0.34      0.38      0.36       361
           ✅       0.24      0.31      0.27       362
           ✔       0.21      0.26      0.23       354
           ✨       0.06      0.06      0.06       359
           🍳       0.17      0.20      0.18       356
           🎉       0.25      0.20      0.22       362
           🎊       0.16      0.18      0.17       358
           🐇       0.24      0.19      0.21       362
           🐣       0.25      0.32      0.28       362
           🐰       0.14      0.11      0.13       362
           👀       0.05      0.06      0.06       355
           👉       0.21      0.28      0.24       363
           👍       0.13      0.13      0.13       344
           👻       0.10      0.09      0.10       356
           💀       0.09      0.09      0.09       362
           💞       0.05      0.04      0.04       357
           💟       0.10      0.09      0.10       353
          

In [13]:
# Persist the fitted LabelEncoder alongside the model: the pipeline predicts
# integer class ids, and only this encoder can map them back to emojis when the
# model is reloaded in a fresh session.
joblib.dump(label_encoder, 'SVM_Label_Encoder.pkl')
# Persist the fitted pipeline (TF-IDF + scaler + SVM). Kept as the cell's last
# expression so the displayed output is still the pipeline's file list.
joblib.dump(pipeline, 'SVM_Predictor.pkl')

['SVM_Predictor.pkl']

In [14]:
# Reload the pipeline that was written to 'SVM_Predictor.pkl'.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
loaded_pipeline = joblib.load('SVM_Predictor.pkl')
print("Model loaded successfully")

Model loaded successfully


In [15]:
def clean_text(text):
    """Normalise a raw string for the emoji classifier.

    Steps, applied in order:
      1. lowercase the text;
      2. drop ``@mention`` tokens (``@`` followed by word characters);
      3. drop emojis via ``emoji.replace_emoji``;
      4. drop every character that is not an ASCII letter, a digit or whitespace;
      5. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    without_mentions = re.sub(r'@[\w]+', '', lowered)
    without_emoji = emoji.replace_emoji(without_mentions, replace='')
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_emoji)
    return re.sub(r'\s+', ' ', alphanumeric_only).strip()

In [16]:
# (No `import emoji` here: the module is already imported in the notebook's
# import cell, and this cell does not use it directly.)

# Example sentences to run through the trained model.
new_texts = [
    "I'm anxious about class. Everything is stressing me out.",
    "Easter egg hunt this Sunday!",
    "Good job on the promotion! I'm very proud of you."
]

# Normalise the inputs with clean_text before predicting.
# NOTE(review): assumes the `cleaned_text` training column was produced by this
# same function — confirm, otherwise train and inference features will differ.
new_texts_cleaned = [clean_text(text) for text in new_texts]

# Predict integer class ids with the pipeline reloaded from disk.
predicted_labels = loaded_pipeline.predict(new_texts_cleaned)

# Map the integer ids back to the original `ucode` values (the emojis).
predicted_emoji = label_encoder.inverse_transform(predicted_labels)

# Show each input next to its predicted emoji.
for text, emoji_code in zip(new_texts, predicted_emoji):
    print(f"Text: {text} -> Predicted Emoji: {emoji_code}")

Text: I'm anxious about class. Everything is stressing me out. -> Predicted Emoji: 🤣
Text: Easter egg hunt this Sunday! -> Predicted Emoji: 🥚
Text: Good job on the promotion! I'm very proud of you. -> Predicted Emoji: ✔
