In [None]:
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from collections import Counter

In [None]:
# Load the first 500 rows of the Reddit comments CSV into a DataFrame.
data = pd.read_csv(
    'reddit-comments-2015-08.csv',
    nrows=500,
)

In [None]:
# Raw comment texts, one entry per row of `data`.
body = data['body'].values

# Split every comment into sentences. Entries that are not strings (pandas
# stores missing values as float NaN) would make sent_tokenize raise, so they
# become an empty sentence list instead; this keeps exactly one entry per row.
sentences = [
    sent_tokenize(text) if isinstance(text, str) else []
    for text in body
]

# Flatten each comment's sentences into one list of word tokens, skipping
# sentences that contain nothing but whitespace.
wrds = [
    [
        token
        for sentence in comment_sentences
        if sentence.strip()
        for token in word_tokenize(sentence)
    ]
    for comment_sentences in sentences
]

In [None]:
# English stop words, held in a set for constant-time membership tests.
# The corpus reader method is `words`; this needs the NLTK "stopwords" corpus
# to be available locally (nltk.download('stopwords')).
stopwrd = set(stopwords.words('english'))

# Remove stop words from every tokenized comment. Tokens are lower-cased only
# for the comparison; the surviving tokens keep their original casing.
cleanwrd = [
    [word for word in sentence if word.lower() not in stopwrd]
    for sentence in wrds
]

# Count how often each remaining token occurs across all comments.
all_wrds = [word for sentence in cleanwrd for word in sentence]
word_counts = Counter(all_wrds)

In [None]:
# Tokens occurring fewer than this many times across all comments are removed.
I_threshhold = 5
infrequent_wrds = [word for word, count in word_counts.items() if count < I_threshhold]

# Use a set for the membership test: checking every token against the list
# above would cost O(len(infrequent_wrds)) per token.
infrequent_lookup = set(infrequent_wrds)
cleanwrd = [
    [word for word in sentence if word not in infrequent_lookup]
    for sentence in cleanwrd
]

# Attach the cleaned token lists to their source rows (one list per row).
data['cleanwrd'] = cleanwrd

# 80/20 split with a fixed seed so the partition is reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Network dimensions.
word_dim = 100      # number of columns of U (one per input index)
hidden_dim = 200    # length of the hidden state vector
output_dim = 2      # number of rows of V (length of each output vector)

# NOTE(review): not used by any cell shown here; presumably the truncation
# depth for back-propagation through time -- confirm against the training code.
bptt_truncate = 8

# Half-widths of the symmetric uniform ranges the weights are drawn from.
hidden_scale = np.sqrt(1.0 / hidden_dim)
word_scale = np.sqrt(1.0 / word_dim)

# Weights are drawn in the order W, U, V from the global NumPy RNG.
W = np.random.uniform(-hidden_scale, hidden_scale, (hidden_dim, hidden_dim))  # hidden -> hidden
U = np.random.uniform(-word_scale, word_scale, (hidden_dim, word_dim))        # input  -> hidden
V = np.random.uniform(-hidden_scale, hidden_scale, (output_dim, hidden_dim))  # hidden -> output


In [None]:
# Activation function
def sigmoid(x):
    """Return the element-wise logistic function 1 / (1 + exp(-x)).

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in [0, 1], with the same shape as ``x``.

    Notes
    -----
    Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so that
    large negative inputs do not overflow ``exp`` (the naive form emits an
    overflow RuntimeWarning there).
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), computed stably.
    return np.exp(-np.logaddexp(0, -x))

In [None]:
def fwd_propogation(x):
    """Run the recurrent network forward over one index sequence.

    Parameters
    ----------
    x : sequence of int
        Column indices into ``U``, one per time step.

    Returns
    -------
    a : numpy.ndarray, shape (len(x), output_dim)
        Output activations; row ``t`` is ``sigmoid(V @ s[t])``.
    s : numpy.ndarray, shape (len(x) + 1, hidden_dim)
        Hidden states; row ``t`` is the state after step ``t``. The extra last
        row stays all zeros and acts as the initial state.

    Notes
    -----
    Reads the module-level ``U``, ``W``, ``V``, ``hidden_dim``, ``output_dim``
    and ``sigmoid``.
    """
    # Do not name this `len`: that would shadow the builtin inside the function
    # and make the call on the right-hand side fail.
    steps = len(x)

    # One spare row at the end: at t == 0 the lookup s[t - 1] is s[-1], i.e.
    # this zero row, which serves as the initial hidden state.
    s = np.zeros((steps + 1, hidden_dim))
    a = np.zeros((steps, output_dim))

    for t in range(steps):
        # U[:, x[t]] selects the input-weight column for the current index.
        s[t] = sigmoid(U[:, x[t]] + W.dot(s[t - 1]))
        a[t] = sigmoid(V.dot(s[t]))

    return a, s

In [None]:
def calculate_loss(y, a):
    """Return the mean binary cross-entropy between targets and predictions.

    Parameters
    ----------
    y : array_like
        Target values; must broadcast against ``a``.
    a : array_like
        Predicted values in [0, 1].

    Returns
    -------
    numpy.float64
        ``-mean(y * log(a) + (1 - y) * log(1 - a))`` over all elements.

    Notes
    -----
    Predictions are clipped to ``[eps, 1 - eps]`` (machine epsilon) before the
    logarithms are taken, so a prediction of exactly 0 or 1 gives a finite
    loss instead of ``inf``/``nan``.
    """
    y = np.asarray(y)
    eps = np.finfo(float).eps
    # Keep both log(a) and log(1 - a) finite when a prediction saturates.
    a = np.clip(np.asarray(a, dtype=float), eps, 1 - eps)
    x = -np.mean(y * np.log(a) + (1 - y) * np.log(1 - a))
    return x

In [None]:
# Give every remaining token an integer id. The vocabulary is sorted first:
# iterating a set of strings has no stable order between interpreter sessions,
# so enumerating the raw set made the ids (and the loss below) vary per run.
vocabulary = sorted({word for sentence in cleanwrd for word in sentence})
word_index = {word: idx for idx, word in enumerate(vocabulary)}

vocabsize = len(word_index)

# `word_dim` is the value set alongside U/W/V; it is deliberately not
# reassigned here so it cannot drift from the shape U was created with.
# NOTE(review): U has only `word_dim` columns, so every id >= word_dim is
# clamped onto the last column (word_dim - 1) and those tokens become
# indistinguishable. Sizing U by `vocabsize` would avoid that -- confirm intent.
x_train = [[min(word_index[word], word_dim - 1) for word in sentence] for sentence in cleanwrd]

# Random 0/1 targets, one row of `output_dim` values per sequence. A fixed-seed
# local generator keeps them reproducible without touching the global RNG.
label_rng = np.random.RandomState(42)
y_train = label_rng.randint(2, size=(len(x_train), output_dim))

# Forward pass and loss for the first sequence only.
o_train, _ = fwd_propogation(x_train[0])

loss = calculate_loss(y_train[0], o_train)

print("Loss:")
loss

Loss: 0.5667208316106862
