In [162]:
import pandas as pd
file_path = 'tweet_data/train.txt'

def read_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def process_data(data):
    blocks = data.strip().split("<START>")[1:]
    processed_blocks = []

    for block in blocks:
        lines = block.strip().split("\n")[1:-1]  
        words = []
        special_word = None
        special_word_count = 0  
        for line in lines:
            parts = line.strip().split()
            if len(parts) == 2 and parts[1] == "O":
                words.append(parts[0])
            elif len(parts) == 2 and parts[1].startswith(":") and parts[1].endswith(":"):
                if not special_word:
                    special_word = parts[1]  
                    special_word_count += 1
                elif special_word == parts[1]:  
                    special_word_count += 1

        if special_word_count <= 1:  
            sentence = " ".join(words)
            processed_blocks.append((sentence, special_word))

    return processed_blocks

def create_dataframe(processed_data):
    return pd.DataFrame(processed_data, columns=["Sentence", "First Special Word"])

data = read_data(file_path)
processed_data = process_data(data)
df = create_dataframe(processed_data)

In [163]:
df

Unnamed: 0,Sentence,First Special Word
0,CeeC is going to be another Tboss What is 45 m...,:face_with_tears_of_joy:
1,This gif kills me Death is literally gushing t...,:weary_face:
2,LOVE TEST Raw Real,:purple_heart:
3,i swear we dont gotta look it finds,:face_with_tears_of_joy:
4,We would like to wish everyone a very Happy Ne...,:party_popper:
...,...,...
6159366,Follow everyone who likes this,:white_heavy_check_mark:
6159367,we LOVE a simlish,:rolling_on_the_floor_laughing:
6159368,No fan base is more dedicated than the 1D fan ...,:yellow_heart:
6159369,Im surprised yall havent purposely hit me yet,


In [164]:
df['First Special Word'].value_counts()

First Special Word
:face_with_tears_of_joy:            1140886
:loudly_crying_face:                 455284
:red_heart:                          406399
:smiling_face_with_heart-eyes:       321974
:fire:                               279014
:weary_face:                         160562
:folded_hands:                       152782
:person_shrugging:                   140072
:two_hearts:                         132720
:thinking_face:                      122735
:smiling_face_with_smiling_eyes:     122696
:sparkles:                           119607
:person_facepalming:                 106278
:hundred_points:                     101273
:rolling_on_the_floor_laughing:      100979
:raising_hands:                      100211
:eyes:                                99903
:face_with_rolling_eyes:              98064
:party_popper:                        84268
:skull:                               84019
:backhand_index_pointing_right:       76931
:clapping_hands:                      76298
:purple_heart

In [167]:
df.iloc[223424]

Sentence              So True A ️ True BidyMind amp Soul Blessings 2...
First Special Word                                         :heart_suit:
Name: 223424, dtype: object

In [168]:
emoji = pd.read_csv('emojipedia/full_emoji.csv')
emoji.head()

Unnamed: 0,#,emoji,unicode,name,Apple,Google,Facebook,Windows,Twitter,JoyPixels,Samsung,Gmail,SoftBank,DoCoMo,KDDI
0,1,😀,U+1F600,grinning face,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhDAAPAKIFAJh3AP/z...",,,
1,2,😃,U+1F603,grinning face with big eyes,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhDAAMAKIFAF5LAP/z...","data:image/png;base64,R0lGODlhDwAPAKIAAP///wAA...","data:image/png;base64,R0lGODlhDAAMAIABAMxm////...","data:image/png;base64,R0lGODlhDgAPALMJAP//mf/M..."
2,3,😄,U+1F604,grinning face with smiling eyes,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhDAAMAKIGAF5LAJh3...","data:image/png;base64,R0lGODlhDwAPAHcAMSH+GlNv...",,
3,4,😁,U+1F601,beaming face with smiling eyes,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhDAAMAKIGAIoAAf/v...","data:image/png;base64,R0lGODlhDwAPAHcAMSH+GlNv...","data:image/png;base64,R0lGODlhDAAMAIABAP+ZAP//...","data:image/png;base64,R0lGODlhDgAPALMIAJmZmf//..."
4,5,😆,U+1F606,grinning squinting face,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhEAAMAKIFAF5LAP/z...",,"data:image/png;base64,R0lGODlhDAAMAIABAMxm////...",


In [22]:
with open("tweet_data/testing.txt") as myfile:
    mydata = (line for line in myfile)
    testing = pd.DataFrame(mydata, columns=['line'])
    print(testing)

testing

                                                 line
0   I fall asleep every movie night me and my boyf...
1                        i’m coming for it all... 💰\n
2       I’ll take a mixer. K thanks 😘😂 just kidding\n
3   Hearing this new and thinking to myself: " 🤔 h...
4   LeBron starts play with the short pull-up and ...
5       I’ll take a mixer. K thanks 😘😂 just kidding\n
6   Magnus wasn't sure if that french toast was go...
7   Brand New Look of StylishStar from  😎😎😎👌👌👌 Aud...
8   Now you can celebrate tiny victories in the ki...
9        Have a happy Wednesday my dear friends🌺🍃🌺🌱\n
10                    Tiffany liked &amp; comment😆💗💗💗


Unnamed: 0,line
0,I fall asleep every movie night me and my boyf...
1,i’m coming for it all... 💰\n
2,I’ll take a mixer. K thanks 😘😂 just kidding\n
3,"Hearing this new and thinking to myself: "" 🤔 h..."
4,LeBron starts play with the short pull-up and ...
5,I’ll take a mixer. K thanks 😘😂 just kidding\n
6,Magnus wasn't sure if that french toast was go...
7,Brand New Look of StylishStar from 😎😎😎👌👌👌 Aud...
8,Now you can celebrate tiny victories in the ki...
9,Have a happy Wednesday my dear friends🌺🍃🌺🌱\n


In [169]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Tokenize text
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['Sentence'])
sequences = tokenizer.texts_to_sequences(df['Sentence'])

# Pad sequences
max_length = max(len(x) for x in sequences)
X = pad_sequences(sequences, maxlen=max_length, padding='post')


In [170]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Encode emojis
label_encoder = LabelEncoder()
emoji_labels = label_encoder.fit_transform(df['First Special Word'])
Y = to_categorical(emoji_labels)

In [171]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [172]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Build the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_length))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(Y.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 119, 64)           320000    
                                                                 
 lstm_2 (LSTM)               (None, 119, 64)           33024     
                                                                 
 dropout_1 (Dropout)         (None, 119, 64)           0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 50)                1650      
                                                                 
Total params: 389,778
Trainable params: 389,778
Non-tr

2024-04-17 22:23:57.558782: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-17 22:23:57.560226: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-17 22:23:57.561141: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [173]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)


Epoch 1/10


2024-04-17 22:24:43.186712: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-17 22:24:43.187504: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-17 22:24:43.188190: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

  3269/153985 [..............................] - ETA: 2:47:45 - loss: 3.4389 - accuracy: 0.1833

KeyboardInterrupt: 

In [125]:
def predict_emoji(text):
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    pred = model.predict(padded)
    return label_encoder.inverse_transform(pred.argsort(axis=1)[:,-4:][0])[::-1]  # Top 3 predictions

# Example
print(predict_emoji("I'm so happy!"))


[':smiling_face_with_heart-eyes:' ':loudly_crying_face:' ':two_hearts:'
 ':red_heart:']


In [127]:
def convert_emoji_format(emoji_str):
    # Remove the colons at the start and end
    cleaned_str = emoji_str.strip(':')
    # Replace underscores with spaces
    result_str = cleaned_str.replace('_', ' ')
    return result_str

In [161]:
test_sentence = "Ill be lying if I said I wasnt missing you,Love you Brah"

preds = predict_emoji(test_sentence)
if len(preds) >= 0:
    for i in range(len(preds)):
        print(test_sentence, emoji[emoji['name'] == convert_emoji_format(preds[i])]['emoji'].item())
else:
    print('No predictions. :(')

Ill be lying if I said I wasnt missing you,Love you Brah 😂
Ill be lying if I said I wasnt missing you,Love you Brah 😭
Ill be lying if I said I wasnt missing you,Love you Brah 😩
Ill be lying if I said I wasnt missing you,Love you Brah 💯


In [110]:
emoji[emoji['name'] == 'rolling on the floor laughing']['emoji']

'6    🤣\nName: emoji, dtype: object'

In [58]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.3490217626094818
