## Libraries

In [26]:
import pandas as pd
import re
import emoji
import nltk
from langdetect import detect, DetectorFactory
from nltk.tokenize import TweetTokenizer


Processing Data

In [None]:
# Read the .txt file line by line into a DataFrame
with open('Clean Dataset/emojify_rawdata', 'r', encoding='utf-8') as file:
    data = file.readlines()


In [28]:
# Create a DataFrame where each line is a row
df2 = pd.DataFrame(data, columns=['tweet'])

# Display the first few rows
df2.head()

Unnamed: 0,tweet
0,Squad arriving for Game 2 🚀\n
1,Dude is like 5’8 140 pounds his dick was long and strong(always the little dudes carrying the 🍆) 🤪🙃\n
2,FOLLOWERS👇\n
3,I CANT BREATIUHW 💀💀💀\n
4,2️⃣4️⃣ hours 'til our schedule drops!\n


## Dropping Duplicates

In [30]:
#Trying to drop duplicates 

# Compute a hash for each row
df2['hash'] = pd.util.hash_pandas_object(df2, index=False)

# Drop duplicates based on the hash column
df2 = df2.drop_duplicates(subset=['hash'])

# Drop the hash column after filtering
df2 = df2.drop(columns=['hash'])


In [29]:
#Check number of tweets
print(len(df2))

18883592


## Keeping only tweets with 1 emoji

In [6]:
#Keeping only tweets with 1 emoji in them for simplicity

def filter_tweets_with_one_emoji(df, tweets_column):
    """
    Filters a DataFrame to include only rows where the specified column contains exactly one emoji.

    Args:
    - df (pd.DataFrame): The input DataFrame.
    - tweets_column (str): The name of the column containing the tweets.

    Returns:
    - pd.DataFrame: A filtered DataFrame with rows containing exactly one emoji.
    """
    # Function to count emojis in a string
    def count_emojis(tweet):
        return sum(1 for char in tweet if char in emoji.EMOJI_DATA)

    # Filter rows with exactly one emoji
    filtered_df = df[df[tweets_column].apply(count_emojis) == 1]

    return filtered_df

df2 = filter_tweets_with_one_emoji(df2, tweets_column="tweet")


In [7]:
#Check number of tweets
print(len(df2))

5187482


## Cleaning Tweets

In [None]:
#Making text lowercase, removing any URLs, Hashtags, or Mentions
def clean_text(text):
    # Lowercase the text
    text = text.lower().strip()
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)

    return text

df2['clean_tweet'] = df2['tweet'].apply(clean_text)


In [None]:
df2.head()

In [9]:
df2.head(6)

Unnamed: 0,tweet,clean_tweet
0,Squad arriving for Game 2 🚀\n,squad arriving for game 2 🚀
2,FOLLOWERS👇\n,followers👇
5,NEW || Zach &amp; Jack at Limelight tonight! (...,new || zach &amp; jack at limelight tonight! (...
7,I am SO scared of birds🤧\n,i am so scared of birds🤧
10,This is one of my favorite songs to sing in th...,this is one of my favorite songs to sing in th...
11,Took my goat to get groomed for the first time...,took my goat to get groomed for the first time 😂


In [10]:
#Tokenizing the data
tokenizer=TweetTokenizer()

def tokenize_and_replace_emojis(text):
    tokens=tokenizer.tokenize(text)
    return tokens

df2['tokens'] = df2['clean_tweet'].apply(tokenize_and_replace_emojis)



In [11]:
df2 = df2[['tokens']]

In [12]:

pd.set_option('display.max_colwidth', None)
df2

Unnamed: 0,tokens
0,"[squad, arriving, for, game, 2, 🚀]"
2,"[followers, 👇]"
5,"[new, |, |, zach, &, jack, at, limelight, tonight, !, (, april, 17, ), ©, ️nvmbesson]"
7,"[i, am, so, scared, of, birds, 🤧]"
10,"[this, is, one, of, my, favorite, songs, to, sing, in, this, episode, ❤, ️]"
...,...
18883571,"[this, dude, is, so, bay, area, 😂]"
18883572,"[take, advantage, of, me, &, i, ’, ll, ice, you, out, 🤫]"
18883575,"[i, stay, getting, ignored, 😔]"
18883583,"[i, want, some, crawfish, but, i, only, want, it, from, this, one, place, 😩]"


In [13]:
import os

In [14]:
folder_path = "C:/Users/joyse/Desktop/6.8610/Research/Emojiville/Data Processing/Clean Dataset"
filename1 = 'emojify_cleaned.csv'

file_path1 = os.path.join(folder_path, filename1)

df2.to_csv(file_path1, index=False)


## Getting a Smaller Sample

In [15]:
random_sample = df2.sample(n=10000, random_state=42)


In [16]:
folder_path = "C:/Users/joyse/Desktop/6.8610/Research/Emojiville/Data Processing/Clean Dataset"
filename1 = 'emojify_cleaned_10k.csv'

file_path1 = os.path.join(folder_path, filename1)

random_sample.to_csv(file_path1, index=False)

In [17]:
print(len(random_sample))

10000


In [18]:
random_sample.head()

Unnamed: 0,tokens
12733535,"[if, not, later, ,, when, ?, 🍑]"
1382379,"[🔗, |, the, izombie, 4x08, page, is, now, up, !, containing, trivia, ,, quotes, ,, reviews, ,, a, look, at, liv's, style, ,, comic, slides, gifs, ,, the, b, …]"
17448398,"[hearing, bts, at, work, still, amazes, me, 😍]"
1359951,"[ayyy, this, is, lit, 🔥]"
11483372,"[well, richard, i, jumped, in, the, shower, and, saved, her, so, back, off, 😡]"


In [23]:
random_sample2=random_sample.sample(n=500, random_state=42)

In [24]:
folder_path = "C:/Users/joyse/Desktop/6.8610/Research/Emojiville/Data Processing/Clean Dataset"
filename1 = 'emojify_cleaned_500.csv'

file_path1 = os.path.join(folder_path, filename1)

random_sample2.to_csv(file_path1, index=False)