In [39]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Training ChatBot off your iMessage texts

Grabbing the training data doesn't seem to be fully automatable. So I'll guide you through how to parse iMessages.

# Gather Training Data
If you haven't yet...
1. Go into `System Preferences` -> `Security` -> `Full Disk Access` and give `Terminal` permissions.
2. Go to `/Users/{your username}/Library/Messages/chat.db` and copy that file into the `imessage_bot` folder. I have .gitignore set to ignore all .db files so you don't have to worry about your personal info getting uploaded to GitHub.

## Get messages for a given phone number

In [3]:
import sqlite3
import pandas as pd

In [4]:
phone_number = '+16266895189'

### 
conn = sqlite3.connect('chat.db')
cursor = conn.cursor()

cursor.execute(f'SELECT * FROM handle WHERE id="{phone_number}"')
tables = cursor.fetchall()

print(f"For phone number: {phone_number}, the associated id is {[idx[0] for idx in tables]}")


For phone number: +16266895189, the associated id is [140, 8]


In [141]:
idx = 8 # Get from previous slide

### Find text messages from a specific conversation
def get_texts(idx: int):
    cursor.execute(f"SELECT text, is_from_me FROM message WHERE handle_id = {idx};")
    tables = cursor.fetchall()

    df = pd.DataFrame(tables, columns=['text', 'is_from_me'])
    df.fillna('', inplace=True)
    
    return df

df = get_texts(idx)
df.head()

Unnamed: 0,text,is_from_me
0,https://usc.zoom.us/j/8072267340?pwd=b0RVNjFaS...,1
1,let me innn,0
2,O LOL,1
3,IM NOT SHOWERING,0
4,o what y is it blue,0


## Parse it into call and reponse pairs for spaCy training

In [165]:
def group_messages(df):
        
    output = []
    current_group = {'text': '', 'response': ''}
    
    if len(df) == 0:
        return output
    
    previous = df['is_from_me'].iloc[0]
    while previous == 1:
        df = df.drop(index=0)
        df = df.reset_index(drop=True)
        if len(df) == 0:
            return output
        previous = df['is_from_me'].iloc[0]
    
    for _, rows in df.iterrows():
        i = rows['is_from_me']
        text = rows['text']
        if (i == 0) and (previous == 0):
            current_group['text'] += ' ' + text
        elif (i == 1) and (previous == 0):
            current_group['response'] += text
            previous = 1
        elif (i == 1) and (previous == 1):
            current_group['response'] += ' ' + text
        elif (i == 0) and (previous == 1):
            output.append(current_group)
            current_group = {'text': text, 'response': ''}
            previous = 0
        else:
            raise ValueError("Column 'is_from_me' in chat.db has changed.")
    if len(output) == 0:
        return output
    output.pop()
    return output


training_data = group_messages(df)

In [166]:
output_df = pd.DataFrame(training_data)
output_df

Unnamed: 0,text,response
0,let me innn,O LOL
1,IM NOT SHOWERING o what y is it blue,Idfk BRUH Bruh moment Ah sucks to suck
2,https://play.typeracer.com?rt=mslf1n3h7,￼ I fee like the original set was too editor...
3,ahhhh THESE R CRAZY UR INSANE,Leg ones go HUH
4,GO CRAZY ur insane i looked closer ur crazyy h...,My phone gone die *gonna So text me here when ...
...,...,...
1662,bruhh i was waitinh for u,I’m sorry my ally pie
1663,clarky?,Yes
1664,answer my ft ￼ good morning love :) sorry i ha...,Still positive I’m gonna curl up and die
1665,ur joking do u not get to test everyday,He’s getting the nurse


In [157]:
train_data = []
for data in training_data:
    train_data.append(data['text'])
    train_data.append(data['response'])

# Creating Chatbot using `Chatterbot`

In [144]:
from chatterbot import ChatBot
from chatterbot.trainers import ListTrainer

# Create a new chat bot named Clark
chatbot = ChatBot('Clark')
trainer = ListTrainer(chatbot)
trainer.train(train_data)

List Trainer: [                    ] 2%

[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:997)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


List Trainer: [##################  ] 90%

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [187]:
prompt = """Do you wanna cuddle?"""
response = chatbot.get_response(prompt)

print(prompt)
print(response)

Do you wanna cuddle?
O just wanted to ask if u wanted to hang sun to do hw!


# Training the Bot on ALL of my Text Messages

In [188]:
from tqdm import tqdm

conn = sqlite3.connect('chat.db')
cursor = conn.cursor()

all_texts = [get_texts(idx) for idx in range(1,207 + 1)]
all_grouped_texts = [group_messages(df) for df in all_texts if (len(group_messages(df)) != 0)]


train_data = []
for conversation in tqdm(all_grouped_texts):
    for data in conversation:
        # train_data.append(data['text'])
        train_data.append(data['response'])
        


100%|█████████████████████████████████████████████████| 87/87 [00:00<00:00, 131213.39it/s]


In [189]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Load the preexisting GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Tokenize the training data
train_tokenized = [tokenizer.encode(text) for text in train_data]

# Fine-tune the GPT-2 model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized
)

trainer.train()

# Generate text with the fine-tuned model
generated_text = model.generate(
    input_ids=tokenizer.encode("This is an example"),
    max_length=50,
    do_sample=True,
    top_k=50
)

# Decode the generated text
decoded_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print(decoded_text)


ModuleNotFoundError: No module named 'transformers'

In [183]:
prompt = """Meow"""

response = chatbot.get_response(prompt)

print(prompt)
print(response)

Meow
￼i designed this poster in 2 hours im so drained ￼ sunset!! im at cabin now 🦃hello friends!! this is a groupchat for my friendsgiving get together for WEDNESDAY NOV 23RD at 7PM! My address is 2809 Ellendale Place. ✨the vibes are: i invited a lot of ppl who dont know each other but i vibe w u and want u to meet each other and itll be a good chill time this will b potluck style so i made a spreadsheet to see who is bringing what! feel free to invite other chill ppl if u want to as well im v excited for this guys yayay https://docs.google.com/spreadsheets/d/15H0pe2EBCmoN_WQmwJNVmPK9W9HTfxu3MN6XZ8rHS-Q/edit?usp=sharing Loved “I’ll bring some poke !” how is ur day clarky clark wait did u ft me haha
