## Imports ##

In [1]:
import nltk
from nltk.corpus import words
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

## Data Preparation ##

In [2]:
nltk.download("words")

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
data = pd.read_csv("kaggle_RC_2019-05.csv")
data.head()

Unnamed: 0,subreddit,body,controversiality,score
0,gameofthrones,Your submission has been automatically removed...,0,1
1,aww,"Dont squeeze her with you massive hand, you me...",0,19
2,gaming,It's pretty well known and it was a paid produ...,0,3
3,news,You know we have laws against that currently c...,0,10
4,politics,"Yes, there is a difference between gentle supp...",0,1


In [4]:
# remove controversial statements
data = data[data["controversiality"] != 1]
#data = data[["body"]]
print(data.head())

       subreddit                                               body  controversiality  score
0  gameofthrones  Your submission has been automatically removed...                 0      1
1            aww  Dont squeeze her with you massive hand, you me...                 0     19
2         gaming  It's pretty well known and it was a paid produ...                 0      3
3           news  You know we have laws against that currently c...                 0     10
4       politics  Yes, there is a difference between gentle supp...                 0      1


In [5]:
corpus = set(words.words())

def clean_text(string):
  list_ = string.split(" ")
  # convert all to lower case except special case: "I"
  lower_cases = [word.lower() for word in list_ if word != "I"]
  # only include words that appear in the english language
  valid_words = [word for word in lower_cases if word in corpus]
  return valid_words

In [6]:
data['body'] = data['body'].apply(lambda x: clean_text(x))
print(data.head())

       subreddit                                               body  controversiality  score
0  gameofthrones  [your, submission, been, automatically, remove...                 0      1
1            aww  [dont, squeeze, her, with, you, massive, you, ...                 0     19
2         gaming  [pretty, well, known, and, it, was, a, product...                 0      3
3           news  [you, know, we, have, against, that, currently...                 0     10
4       politics  [there, is, a, difference, between, gentle, su...                 0      1


In [7]:
all_dataset_words = []
def compile_all_words(word_list):
  all_dataset_words.extend(word_list)
  return word_list

data['body'] = data['body'].apply(compile_all_words)

all_unique_words = set(all_dataset_words)

In [8]:
num_unique_words = len(all_unique_words)
num_unique_words

30857

In [9]:
# create mapping of words to their numbers and vice versa
string_to_int = {s: i for i, s in enumerate(all_unique_words)}
int_to_string = {i: s for s, i in string_to_int.items()}

In [10]:
# build training data and labels

block_size = 3  # trigram model
X, y = [], []

for index, row in data.iterrows():
  word_list = row['body']
  for i in range(len(word_list) - block_size + 1):
    context = word_list[i:i + block_size - 1]  # context (2 words for trigram)
    target = word_list[i + block_size - 1]  # target (3rd word in trigram)

    # Convert context and target to integer representation
    context_indices = [string_to_int[word] for word in context]
    target_index = string_to_int[target]

    X.append(context_indices)
    y.append(target_index)

In [11]:
X = torch.tensor(X)
y = torch.tensor(y)

In [12]:
X.shape, y.shape

(torch.Size([17779504, 2]), torch.Size([17779504]))

In [13]:
# create embedding tensor -> 90 unique words will be embedded each into 10 values
C = torch.randn((num_unique_words, 10))

In [14]:
C.shape

torch.Size([30857, 10])

In [15]:
# lookup of embeddings for every context tensor in X
emb = C[X]
emb.shape

torch.Size([17779504, 2, 10])

## Split into Training and Testing ##

In [16]:
n_samples = len(X)
reduced_size = n_samples // 3  # Reduce to 1/3
indices = torch.randperm(n_samples)[:reduced_size]
X_reduced = X[indices]
y_reduced = y[indices]


X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y_reduced, test_size=0.2, random_state=42
)

print(f"Training data: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing data: X_test={X_test.shape}, y_test={y_test.shape}")

Training data: X_train=torch.Size([4741200, 2]), y_train=torch.Size([4741200])
Testing data: X_test=torch.Size([1185301, 2]), y_test=torch.Size([1185301])
