# Data Cleaning & Splitting for LLaMA-LoRA Model
By Chris Bonner

In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
import re

In [14]:
filename = "daniel"

df = pd.read_csv(f"../data/{filename}.csv")
df.head()

Unnamed: 0,ID,Timestamp,Contents,Attachments
0,1064351465474359327,2023-01-16 01:12:44.063000+00:00,i assume that means they know the inner workin...,
1,1063595066863648778,2023-01-13 23:07:04.581000+00:00,a gardener nurturing new technologies,
2,1062955373658652722,2023-01-12 04:45:09.836000+00:00,,https://cdn.discordapp.com/attachments/1000455...
3,1056350862945030284,2022-12-24 23:21:11.776000+00:00,more or less that yeah,
4,1056089409696575549,2022-12-24 06:02:16.464000+00:00,class traitor got what he had coming smh,


### Initial Data Cleaning

In [15]:
# Removes attachments/ID column, unneeded for training
df = df.drop(columns = ["ID", "Attachments"])
# Converts Timestamp to datetime
df.Timestamp = pd.to_datetime(df.Timestamp).dt.tz_localize(None)

# removes messages with no contents (such as links/gifs)
df = df.dropna()
# removes users tagged in messages (users are linked under "<@1234...>")
df.Contents = df.Contents.apply(lambda l: re.sub(r"<(@|#)[!]?[0-9]+>", "", l))
# removes links from messages
df.Contents = df.Contents.apply(lambda l: re.sub(r"(www|http[s]*)\S+\w?", "", l))

# remove Wordle messages (spam)
df = df[df.Contents.map(lambda l: re.match(r"Wordle [0-9]+.+", l) == None)]

# We are only training on relevant data from 2022 and later
time = datetime.fromisocalendar(2022, 1, 1)
df = df[df.Timestamp > time]

# removes any rows with no message after previous cleaning methods
df = df[df.Contents.str.len() > 0]
df

Unnamed: 0,Timestamp,Contents
0,2023-01-16 01:12:44.063000+00:00,i assume that means they know the inner workin...
1,2023-01-13 23:07:04.581000+00:00,a gardener nurturing new technologies
3,2022-12-24 23:21:11.776000+00:00,more or less that yeah
4,2022-12-24 06:02:16.464000+00:00,class traitor got what he had coming smh
5,2022-12-21 15:01:44.336000+00:00,now i get chat gtp to be code monke for me
...,...,...
262346,2022-12-02 14:47:08.123000+00:00,if you enjoy the subject i don't see why it'd ...
262347,2022-11-26 01:30:57.154000+00:00,force add go brrrr
262348,2022-11-26 01:09:03.026000+00:00,hokies spa isn’t having the best time
262349,2022-11-16 23:38:19.309000+00:00,got into the ones i was able to course request


Daniel added instruction, input and output splitting for model parsing

In [23]:
a = df.Contents
a = a.str.rsplit(n = 1)
out = pd.DataFrame()
out["instruction"] = "Daniel Autocomplete"
out["input"] = a[0]
out["output"] = a[1]
out

Unnamed: 0,instruction,input,output
0,,i assume that means they know the inner workin...,a gardener nurturing new
1,,it,technologies


In [32]:
df[["input", "output"]] = df["Contents"].str.rsplit(n = 1, expand = True)
df["instruction"] = "Daniel Autocomplete"
df.head()

Unnamed: 0,Timestamp,Contents,input,output,instruction
0,2023-01-16 01:12:44.063000+00:00,i assume that means they know the inner workin...,i assume that means they know the inner workin...,it,Daniel Autocomplete
1,2023-01-13 23:07:04.581000+00:00,a gardener nurturing new technologies,a gardener nurturing new,technologies,Daniel Autocomplete
3,2022-12-24 23:21:11.776000+00:00,more or less that yeah,more or less that,yeah,Daniel Autocomplete
4,2022-12-24 06:02:16.464000+00:00,class traitor got what he had coming smh,class traitor got what he had coming,smh,Daniel Autocomplete
5,2022-12-21 15:01:44.336000+00:00,now i get chat gtp to be code monke for me,now i get chat gtp to be code monke for,me,Daniel Autocomplete


In [35]:
out = df.drop(columns = ["Timestamp", "Contents"])

### Split data into test and train sets

In [38]:
# 80/20 split for training and test datasets
train, test = train_test_split(out, test_size=0.2, shuffle=True)
train, test = train.to_json(orient = "records"), test.to_json(orient= "records")

with open(f'../data/{filename}_test.json', 'w') as f:
    f.write(test)
with open(f'../data/{filename}_train.json', 'w') as f:
    f.write(train)