In [89]:
# Enable autoreload so edits to imported modules are picked up without
# restarting the kernel (mode 2: reload all modules before running each cell).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data preparation

Load the JSONL file containing all of my text messages and convert the timestamps to Pacific time.

In [90]:
import pandas as pd

# Load the message export: one JSON object per line.
all_texts = pd.read_json("imessages.jsonl", lines=True)

# Parse timestamps as timezone-aware UTC. `utc=True` treats naive values as
# UTC and converts offset-bearing values to UTC, so both kinds of input are
# handled (a plain `tz_localize('UTC')` raises TypeError on tz-aware data).
all_texts['timestamp'] = pd.to_datetime(all_texts['timestamp'], utc=True)

# Convert from UTC to Pacific time for analysis
all_texts['timestamp'] = all_texts['timestamp'].dt.tz_convert('America/Los_Angeles')

all_texts.dtypes

message_id                                       int64
text                                            object
timestamp          datetime64[ns, America/Los_Angeles]
is_from_me                                        bool
display_name                                    object
is_group_chat                                     bool
chat_identifier                                 object
dtype: object

First, I'll filter out any messages without text or without a chat ID. These fields can be missing for a number of reasons, for example if the chat was deleted.

In [91]:
# Keep only rows that have both a message body and a chat identifier.
# The trailing .copy() yields an independent frame for the column edits that follow.
all_texts = all_texts.dropna(subset=['text', 'chat_identifier']).copy()

I'll hash the chat identifiers (phone numbers) and drop the display names to anonymize the data.

In [92]:
from typing import List
import hashlib

def hash_and_truncate(value, length=8, salt=""):
    """Return a short, deterministic pseudonym for ``value``.

    ``salt`` is prepended to ``value``, the result is hashed with SHA-256,
    and the first ``length`` hexadecimal characters of the digest are returned.

    Args:
        value: String to pseudonymize (e.g. a chat identifier).
        length: Number of leading hex characters to keep. Shorter prefixes
            make collisions between distinct inputs more likely.
        salt: Optional secret string mixed into the hash. The default ``""``
            gives the plain, unsalted digest.

    Returns:
        The first ``length`` characters of the hex digest.

    Note:
        Without a salt, identifiers drawn from a small space (such as phone
        numbers) can be recovered by hashing every candidate, so pass a
        private ``salt`` whenever the output will be shared.
    """
    digest = hashlib.sha256((salt + value).encode("utf-8")).hexdigest()
    return digest[:length]

# Replace each chat identifier with its truncated hash and discard the
# contact display names.
all_texts = (
    all_texts
    .assign(chat_identifier=all_texts['chat_identifier'].apply(hash_and_truncate))
    .drop(columns=['display_name'])
)

all_texts.dtypes

message_id                                       int64
text                                            object
timestamp          datetime64[ns, America/Los_Angeles]
is_from_me                                        bool
is_group_chat                                     bool
chat_identifier                                 object
dtype: object

## Emojis

Extracting emojis and cleaning data.

In [93]:
import re

# Load the emoji regular-expression source. The encoding is given explicitly
# so the pattern is decoded the same way on every platform instead of relying
# on the locale's default encoding.
# NOTE(review): assumes EMOJI_RAW.txt is saved as UTF-8 — confirm.
with open('EMOJI_RAW.txt', 'r', encoding='utf-8') as file:
    EMOJI_RAW = file.read()

# Compile once; the compiled pattern is reused for every message.
EMOJI = re.compile(EMOJI_RAW)

# Presumably an emoji lookup table; it is not referenced in the cells visible here.
emoji_df = pd.read_csv("emoji_df.csv")

def extract_emojis(text):
    """Find every match of the ``EMOJI`` pattern in a message.

    The message is lower-cased before matching. The return value is whatever
    ``EMOJI.findall`` yields for it: a list of matches in order of appearance,
    empty when nothing matches.
    """
    lowered = text.lower()
    matches = EMOJI.findall(lowered)
    return matches

In [94]:
!pip install "swifter[notebook]" tqdm

import swifter


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [95]:
# Attach, for every message, the list of emojis found in its text;
# swifter drives the element-wise apply and reports progress.
all_texts = all_texts.assign(
    emojis=all_texts['text'].swifter.apply(extract_emojis)
)
all_texts.dtypes

Pandas Apply:   0%|          | 0/345076 [00:00<?, ?it/s]

message_id                                       int64
text                                            object
timestamp          datetime64[ns, America/Los_Angeles]
is_from_me                                        bool
is_group_chat                                     bool
chat_identifier                                 object
emojis                                          object
dtype: object

In [96]:
from collections import Counter
import numpy as np

# Flatten the per-message emoji lists into one Series with one emoji per row.
# explode() yields NaN for messages with no emojis, hence the dropna(); unlike
# np.concatenate, this also works when the frame has no rows at all.
all_emojis = all_texts['emojis'].explode().dropna().reset_index(drop=True)

# Tally occurrences and order them from most to least frequent
emoji_counts = Counter(all_emojis)
most_common_emojis = emoji_counts.most_common()

most_common_emojis[:10]

[('😭', 8250),
 ('💀', 2745),
 ('😁', 925),
 ('😀', 731),
 ('🙏', 723),
 ('😃', 721),
 ('💪', 641),
 ('😍', 493),
 ('🥲', 484),
 ('❤️', 427)]

Next, I want to remove all of the emojis from the messages. The resulting `clean_text` column is what we'll use as the training input for the predictor.

`clean_text` + `emojis` = `text`

In [97]:
# Build one alternation that matches every emoji seen in the data. Longer
# sequences come first: regex alternation takes the first branch that matches,
# so a short emoji that is a prefix of a longer one (variation selectors,
# skin-tone or ZWJ sequences) would otherwise match first and leave stray code
# points behind. Sorting also makes the pattern deterministic, which iterating
# a bare set is not.
unique_emojis = sorted(set(all_emojis), key=lambda e: (-len(e), e))
emoji_pattern = '|'.join(map(re.escape, unique_emojis))

# Remove emojis from the message text, then trim the trailing whitespace left behind
all_texts['clean_text'] = all_texts['text'].str.replace(emoji_pattern, '', regex=True)
all_texts['clean_text'] = all_texts['clean_text'].str.rstrip()

In [114]:
# Keep only the final emoji of each message, as a list of zero or one items:
# x[-1:] is [] for messages without emojis and [last] otherwise.
# (x[0:1] would select the first emoji, contradicting the column name.)
all_texts['last_emoji'] = all_texts['emojis'].apply(lambda x: x[-1:])

In [115]:
# Check the column types after adding `clean_text` and `last_emoji`.
all_texts.dtypes

message_id                                       int64
text                                            object
timestamp          datetime64[ns, America/Los_Angeles]
is_from_me                                        bool
is_group_chat                                     bool
chat_identifier                                 object
emojis                                          object
clean_text                                      object
last_emoji                                      object
dtype: object

In [118]:
# Save the prepared frame to the filesystem as a pickle.
# NOTE: only unpickle this file from a trusted location — loading a pickle
# can execute arbitrary code.
all_texts.to_pickle('all_texts.pkl')