In [145]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split

In [146]:
# Base name (without extension) of the exported message dump under ../data
filename = "output"

# Read the raw export into a DataFrame and preview its first rows
csv_path = f"../data/{filename}.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,ID,Timestamp,Contents,Attachments
0,1030731513786015814,2022-10-15 06:39:02.952000+00:00,<@855712826723532810> this reminded me of ur dogs,https://cdn.discordapp.com/attachments/1000192...
1,1030167053585367111,2022-10-13 17:16:05.153000+00:00,so we’re slightly coping preseason,
2,1030151290019381390,2022-10-13 16:13:26.826000+00:00,<@855712826723532810> how bad are the wizards ...,
3,1030147482132426782,2022-10-13 15:58:18.955000+00:00,noooo 😭 😭 😭 😭,
4,1029721464645111808,2022-10-12 11:45:28.471000+00:00,💜💜💜💜🇵🇱🇵🇱🇵🇱🇵🇱🇵🇱,


### Initial Data Cleaning

In [147]:
# Cleans the raw message export. This cell rebinds `df` and drops columns, so
# re-run the load cell first if this cell ever needs to be run again.

# Removes attachments/ID column, unneeded for training
df = df.drop(columns=["ID", "Attachments"])
# Converts Timestamp to datetime and drops the timezone offset (wall-clock time is kept)
df = df.assign(Timestamp=pd.to_datetime(df["Timestamp"]).dt.tz_localize(None))

# removes messages with no contents (such as links/gifs)
df = df.dropna()

# removes users/channels tagged in messages ("<@1234...>", "<@!1234...>", "<#1234...>")
contents = df["Contents"].str.replace(r"<(@|#)[!]?[0-9]+>", "", regex=True)
# removes links from messages; an explicit "http://", "https://" or "www." prefix is
# required so that words merely containing "www"/"http" (e.g. "awwww") stay intact
contents = contents.str.replace(r"(?:https?://|www\.)\S+", "", regex=True)
# trims the whitespace left behind by the removals above, so a message that was
# only a mention or a link becomes empty and is filtered out at the end of the cell
df = df.assign(Contents=contents.str.strip())

# remove Wordle messages (spam); the pattern is anchored at the start of the message
df = df[~df["Contents"].str.match(r"Wordle [0-9]+.+", na=False)]

# We are only training on relevant data from 2022 and later.
# datetime.fromisocalendar(2022, 1, 1) is the Monday of ISO week 1 (2022-01-03),
# not New Year's Day, so the calendar date is built directly and compared inclusively.
time = datetime(2022, 1, 1)
df = df[df["Timestamp"] >= time]

# removes any rows with no message after previous cleaning methods
df = df[df["Contents"].str.len() > 0]
df

Unnamed: 0,Timestamp,Contents
0,2022-10-15 06:39:02.952,this reminded me of ur dogs
1,2022-10-13 17:16:05.153,so we’re slightly coping preseason
2,2022-10-13 16:13:26.826,how bad are the wizards this szn
3,2022-10-13 15:58:18.955,noooo 😭 😭 😭 😭
4,2022-10-12 11:45:28.471,💜💜💜💜🇵🇱🇵🇱🇵🇱🇵🇱🇵🇱
...,...,...
125492,2022-07-08 00:59:43.660,im going to the beach from the 9th-15th
125494,2022-06-24 01:26:44.005,It takes two
125497,2022-06-18 01:46:36.253,not a virus 😎
125498,2022-06-18 01:46:30.837,click that


### Split data into test and train sets

In [148]:
# Shuffled 80/20 split. The seed is fixed so that the split, and therefore the
# CSV files written below, is identical on every Restart & Run All.
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
# Sanity check: the two sets together hold as many rows as the cleaned frame
assert len(train) + len(test) == len(df)
train.to_csv("../data/train.csv", index=False)
test.to_csv("../data/test.csv", index=False)