In [1]:
import random
import pandas as pd
import json
import numpy as np
import sys
from transformers import AutoTokenizer, PreTrainedTokenizerFast, BertTokenizerFast

## Checking path

In [2]:
# Display the module search path of the running kernel.
sys.path

['/Users/samuelchu/PROG/cse151b/AI_text_generator',
 '/Users/samuelchu/miniconda3/lib/python310.zip',
 '/Users/samuelchu/miniconda3/lib/python3.10',
 '/Users/samuelchu/miniconda3/lib/python3.10/lib-dynload',
 '',
 '/Users/samuelchu/miniconda3/lib/python3.10/site-packages',
 '/Users/samuelchu/miniconda3/lib/python3.10/site-packages/PyQt5_sip-12.11.0-py3.10-macosx-11.1-arm64.egg']

## Reading data

In [3]:
# Parse the group-chat export. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open('data/CSE151B_groupchat_3mo.json') as data_file:
    json_data = json.load(data_file)

# Peek at one message body to confirm the export parsed as expected.
json_data['messages'][1]['content']

"Can one of you guys download your facebook messenger data and upload the json file for this groupchat to the github repo? I can't download the 3 month one for some reason"

In [4]:
# Milliseconds in one minute; compared against differences of the messages'
# 'timestamp_ms' values when pairing prompts with responses.
one_minute = 60000

In [5]:
# Number of entries under the export's 'messages' key.
len(json_data['messages'])

1664

In [6]:
# Keep only the message list, in reversed order (presumably the export is
# newest-first, so this yields chronological order — TODO confirm).
# NOTE: this rebinds json_data from the parsed dict to a list, so the cell
# cannot be re-run without reloading the file first.
json_data = json_data['messages'][::-1] # reversing data

In [7]:
# Substrings that mark a message as noise; the corpus-building cell drops any
# message that contains one of them.
junk_messages = ['reacted', 'X:1759 T:F\"ur Elise T:Bagatelle', 'Counter({50:', '--']

## Saving all lines (lines only, without names)

In [8]:
# Build the tokenizer corpus: one cleaned, lower-cased line per text message.

# A message is dropped when it contains any of these substrings (reaction
# notices, one emoji sequence, links) or any entry of junk_messages.
excluded_substrings = ['reacted', 'to your message', '\u200d♂️', 'https', '.com', *junk_messages]

all_lines = []
for message in json_data:
    if 'content' not in message:
        # Entries without a 'content' key carry no text to keep.
        continue
    # 'ð\x9f\x98\x82' is the latin1 mis-decoding of the UTF-8 bytes of U+1F602;
    # it is swapped for ' hahaha' first, then the latin1 -> utf-8 round trip
    # repairs the remaining mis-decoded text. Newlines are flattened so each
    # message stays on a single line.
    msg = message['content'].replace('ð\x9f\x98\x82', ' hahaha').encode('latin1').decode('utf-8').replace('\n', ' ')
    # The substring checks run on the text before it is lower-cased.
    if not any(fragment in msg for fragment in excluded_substrings):
        all_lines.append(msg.strip().lower())

In [9]:
# Preview the first few cleaned lines.
all_lines[:5]

['pa2!',
 'thanks ester!',
 'ester named the group cse 151b pa2.',
 'what should our group name be 😎 ester and i used transformers for pa1',
 'lol anything works']

In [10]:
# Persist the cleaned corpus, one message per line, for tokenizer training.
# Plain write mode suffices (nothing is read back through this handle), and the
# explicit UTF-8 encoding keeps emoji writable regardless of the platform default.
with open('data/all_lines.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in all_lines)

## Creating tokenizer

In [11]:
def create_tokenizer(text_fp, tokenizer_save_fp='tokenizer/', tokenizer_name='bert-base-uncased', max_vocab_size=32000):
    """Train a new tokenizer on a text file and save it to disk.

    Args:
        text_fp: Path to a text file; every line (stripped) is one training example.
        tokenizer_save_fp: Directory handed to ``save_pretrained``.
        tokenizer_name: Name of the pretrained tokenizer loaded as the starting
            point for ``train_new_from_iterator``.
        max_vocab_size: Vocabulary size requested from ``train_new_from_iterator``.

    Returns:
        None. The trained tokenizer is written to ``tokenizer_save_fp`` and its
        vocabulary size is printed.
    """
    # Read as UTF-8 explicitly: the corpus contains emoji.
    with open(text_fp, encoding='utf-8') as file:
        lines = [line.strip() for line in file]

    # Load the tokenizer the caller asked for rather than a fixed model name.
    base_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenizer = base_tokenizer.train_new_from_iterator(iter(lines), max_vocab_size)
    print("Vocab size is: ", tokenizer.vocab_size)
    tokenizer.save_pretrained(tokenizer_save_fp)

In [12]:
# Train a tokenizer on the cleaned corpus and save it under LSTM/tokenizer/.
tokenizer_save_path = 'LSTM/tokenizer/'
create_tokenizer('data/all_lines.txt', tokenizer_save_fp=tokenizer_save_path)

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]




Vocab size is:  3804


## Creating prompt-response pairs

In [13]:
def _normalize_message(raw_text):
    """Apply the notebook's text clean-up to one raw message body."""
    patched = raw_text.replace('ð\x9f\x98\x82', ' hahaha')
    repaired = patched.encode('latin1').decode('utf-8')
    return repaired.replace('\n', ' ').strip().lower()


prompt_response = []
# Walk over consecutive message pairs: the first is the prompt, the second the response.
for earlier, later in zip(json_data, json_data[1:]):
    gap_ms = later['timestamp_ms'] - earlier['timestamp_ms']

    # A pair qualifies only when a different sender answers within two minutes
    # and both messages carry a 'content' entry.
    different_sender = earlier['sender_name'] != later['sender_name']
    if not (different_sender and gap_ms <= one_minute*2
            and 'content' in earlier and 'content' in later):
        continue

    pair = {
        'prompt': _normalize_message(earlier['content']),
        'response': _normalize_message(later['content']),
        'user': later['sender_name'],
    }
    texts = (pair['prompt'], pair['response'])

    # Skip reaction notices (both marker phrases must be present in a text).
    if any('reacted' in text and 'to your message' in text for text in texts):
        continue
    # Skip pairs where either side contains the emoji sequence or a link marker.
    if any(blocked in text for text in texts for blocked in ('\u200d♂️', 'https', '.com')):
        continue

    prompt_response.append(pair)

In [20]:
# Print every pair together with its position in prompt_response.
for position, pair in enumerate(prompt_response):
    print(f'this is {position}: {pair}\n')

this is 0: {'prompt': 'pa2!', 'response': 'thanks ester!', 'user': 'Kong Xian Ying'}

this is 1: {'prompt': "are you guys able to access a gpu of any sort? datahub is still down and i can't get google colab to work either", 'response': 'thinking if we can use our data science capstone’s 🤔', 'user': 'Kong Xian Ying'}

this is 2: {'prompt': 'thinking if we can use our data science capstone’s 🤔', 'response': 'i guess so!', 'user': 'Ester Tsai'}

this is 3: {'prompt': 'i guess so!', 'response': 'yeahh', 'user': 'Kong Xian Ying'}

this is 4: {'prompt': 'so i can only work on one project at a time', 'response': 'ohh i see i seeeee', 'user': 'Kong Xian Ying'}

this is 5: {'prompt': 'it says "request exceeds limit of 1 gpus" hahaa', 'response': 'ohhh okay i was wondering if u run it on the same session', 'user': 'Kong Xian Ying'}

this is 6: {'prompt': 'in case it gets killed and u gotta restart the kernel 😅', 'response': 'i can just clear some space in my private folder and git clone the proj

In [21]:
# Hand-picked indices — presumably positions in the prompt_response listing
# printed above that were judged usable (TODO confirm). Not referenced by any
# other visible cell.
valid_items = [4,5,7,8,13,15,16,17,18,22,23,24,25,31,34,38,41,43,48,50,53,57,58]

In [53]:
# Save the pairs as JSON Lines (one object per line). The file goes under data/
# because that is the path the train/validation/test section loads it from.
with open('data/prompt_response.jsonl', 'w') as json_file:
    json_file.writelines(json.dumps(entry) + '\n' for entry in prompt_response)

## Creating train, validation and test set

In [76]:
def generate_train_val_test(data, cutoff_percentage=0.8, test_count=10):
    """Shuffle ``data`` and split it into train, validation and test lists.

    Args:
        data: Sequence of examples. A copy is shuffled, so the caller's
            object keeps its original order.
        cutoff_percentage: Fraction (0 to 1) of the examples used for training.
        test_count: Number of examples taken from the end of the shuffled
            data for the test set. It is capped so the test set never reaches
            into the training portion.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of disjoint lists that
        together contain every element of ``data`` exactly once.

    Raises:
        ValueError: If ``cutoff_percentage`` is outside [0, 1] or
            ``test_count`` is negative.
    """
    if not 0 <= cutoff_percentage <= 1:
        raise ValueError("cutoff_percentage must be between 0 and 1")
    if test_count < 0:
        raise ValueError("test_count must be non-negative")

    shuffled = list(data)
    random.shuffle(shuffled)

    n = len(shuffled)
    train_cutoff = int(n*cutoff_percentage)
    # Using an explicit start index (rather than negative slicing) makes
    # test_count=0 produce an empty test set, and the max() keeps the test
    # slice from overlapping the training slice.
    test_start = max(train_cutoff, n - test_count)

    train_data = shuffled[:train_cutoff]
    val_data = shuffled[train_cutoff:test_start]
    test_data = shuffled[test_start:]
    return train_data, val_data, test_data

In [99]:
# Load the saved pairs back: each line of the file is parsed as one JSON object.
sentence_lst = []
with open('data/prompt_response.jsonl', 'r') as json_file:
    sentence_lst.extend(map(json.loads, json_file))

In [100]:
# Seed the RNG so the shuffle inside generate_train_val_test (and therefore the
# split) is the same on every fresh run of the notebook.
random.seed(42)
train, val, test = generate_train_val_test(sentence_lst)

In [102]:
# Sizes of the train, validation and test splits.
len(train), len(val), len(test)

(400, 91, 10)

In [91]:
from torch.utils.data import DataLoader

def create_data_loader(data, batch_size=10, loop=True, shuffle=True):
    """Wrap ``data`` in a ``DataLoader`` and return an iterator over its batches.

    Args:
        data: Dataset handed to ``DataLoader``.
        batch_size: Batch size forwarded to ``DataLoader``.
        loop: When true, the loader is wrapped with ``infinite_loader``;
            otherwise a plain ``iter`` over the loader is returned.
        shuffle: Forwarded to ``DataLoader``.

    Returns:
        ``infinite_loader(loader)`` if ``loop`` is true, else ``iter(loader)``.
    """
    loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle)
    return infinite_loader(loader) if loop else iter(loader)

def infinite_loader(data_loader):
    """Yield batches from ``data_loader`` forever, restarting it after every pass.

    Args:
        data_loader: Iterable that is iterated from the start on each pass.

    Yields:
        The items produced by ``data_loader``, pass after pass.

    Raises:
        ValueError: If a complete pass over ``data_loader`` produces nothing.
            Without this check the generator would spin forever without
            ever yielding.
    """
    while True:
        yielded_any = False
        for batch in data_loader:
            yielded_any = True
            yield batch
        if not yielded_any:
            raise ValueError("data_loader produced no batches; cannot loop over it")

In [92]:
# Wrap the training split in an endlessly repeating batch iterator
# (create_data_loader defaults: batch_size=10, loop=True, shuffle=True).
# NOTE: this rebinds `train` from the list of examples to a generator, so
# re-running this cell would pass the generator itself to create_data_loader.
train = create_data_loader(train)

In [95]:
# Pull one batch from the looping training iterator.
train_batch = next(train)

In [98]:
# Number of prompts in the batch just drawn.
len(train_batch['prompt'])

10