# Search

Search through every message you've sent on discord. This uses a search method much more powerful than `CTRL+F`.
Run this notebook, and after everything is finished, you can use the last cell in this notebook to make searches.

In [None]:
%pip install -q torch transformers sentence_transformers
%conda install -qy pytorch faiss-gpu

In [5]:
# @title If you get an error here, change your runtime type to "GPU"
import torch;
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
assert(device.type == 'cuda')

In [None]:
# @title Timezone
import pandas as pd

pd.set_option('display.max_colwidth', None)

TimeZone = "US/Eastern" #@param {type:"string"}
date = pd.Timestamp.now(tz=TimeZone)
print("See if your timezone is correct: ", date)

In [7]:
#@title (Optionally) Get Data From Google Drive

GetPackageFromDrive = "no" #@param ["yes", "no"]
GoogleDriveLocation = "/content/drive/MyDrive/package.zip" #@param {type:"string"}

if GetPackageFromDrive == "yes" and GoogleDriveLocation is not None:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
  import shutil
  shutil.copyfile(GoogleDriveLocation, "package.zip")

In [8]:
# @title Unzip Package
import os
import zipfile

if not os.path.isdir("package"):
  assert os.path.isfile("package.zip")
  with zipfile.ZipFile("package.zip", 'r') as zip_ref:
      zip_ref.extractall("package")
assert os.path.isdir("package")

In [16]:
# @title Load Data

import json

channel_name_map = None
with open("package/messages/index.json") as index:
  channel_name_map = json.load(index)

import os
import pandas as pd

df_acc = []
subdirs = next(os.walk('package/messages'))[1]
for subdir in subdirs:
  df = pd.read_csv(os.path.join("package/messages", subdir, "messages.csv"))
  df["Channel"] = channel_name_map.get(f'c{subdir}', None)
  df_acc.append(df)
df = pd.concat(df_acc)
df = df[['Timestamp', 'Contents', 'Channel']]
assert 'Timestamp' in df, 'Contents' in df
print(f'{len(df.index)} Messages loaded')

287189 Messages loaded


In [18]:
# @title Clean Data

df.reset_index(inplace=True)

df.dropna(subset=["Contents"], inplace=True)

contents = df['Contents']
contents = contents.str.strip()

# emotes eg <:moon2S:901721403550728193> | <a:moon2S:901721403550728193> (a means animated) | :moon2S: -> moon2S
contents = contents.str.replace(r'(<a?)?:([\w\d]+):(\d{18}>)?', r'\2', regex=True)
# user tag
contents = contents.str.replace('<@[!&]?[0-9]+>', '[Name]', regex=True)

df['Contents'] = contents

In [19]:
# @title Filter Data

def drop(df, mask) -> int:
  original_df_size = len(df.index)
  df.drop(df[mask].index, inplace=True)
  dropped_rows = original_df_size - len(df.index)
  return dropped_rows

def drop_by_regex(df, regex) -> int:
  if isinstance(regex, list) or isinstance(regex, tuple) or isinstance(regex, set):
    regex = '|'.join(regex)
  match_mask = df["Contents"].str.match(regex)
  return drop(df, match_mask)

patterns_to_remove = dict(
  URL = r"^(https?:\/\/(?:www\.)?)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$",
  EMAIL = r"^[\w\-\.]+@([\w\-]+\.)+[\w\-]{2,4}$",

  DISCORD_COMMAND = r"^[!$%].+",
  CODE_BLOCK = r"```(.|\n)*```",

  YES = r"^(?i)yes|ya|yup|ye$",
  NO = r"^(?i)no|nope|nah$",

  OK = r"^(?i)(ok(ay|ie|ie dokie)?)$"
)

df.dropna(subset=["Contents"], inplace=True)

for name, pattern in patterns_to_remove.items():
  dropped = drop_by_regex(df, pattern)
  print(dropped, name, "removed")

print(drop(df, df['Contents'].str.len() <= 2), "troglodyte grunts removed")

7739 URL removed
0 EMAIL removed
27695 DISCORD_COMMAND removed
70 CODE_BLOCK removed
3805 YES removed
10342 NO removed
1638 OK removed
13740 troglodyte grunts removed


In [None]:
df = df['Contents'].value_counts()
df = pd.DataFrame({
    'Message': df.index,
    'Number of times you said it': df.values     
})

In [55]:
# @title Turn messages into embeddings

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('bert-base-nli-mean-tokens')

sentence_embeddings = model.encode(df['Message'].to_list())

In [56]:
import faiss

d = sentence_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(sentence_embeddings)

In [63]:
# @title Search

SearchQuery = "You smell like dog water" #@param {type:"string"}
NumberOfResults = 30

xq = model.encode([SearchQuery])
D, I = index.search(xq, NumberOfResults)
res = df.iloc[I[0]]
res

Unnamed: 0,Message,Number of times you said it
116987,I'd smell like a dog's mouth,1
85661,you do be smelling your dog,1
157434,Yeah but it’s dogshit,1
84980,but it was dogshit,1
80333,"""I smell spray on tan""",1
77054,Photos make me look like dogshit,1
56702,I wash when it's smelly,1
44443,ik it's probably dogshit,1
101992,It's making a puddle I can lap it up like a dog,1
76348,pho place is actually dogshit compared to suspect noodle house,1
