# Libraries

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm



In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Data

In [24]:
# Read data
data = pd.read_csv("data/data.csv")

# Process the dataset by removing punctuation for TF-IDF
def preprocess(text):
    text = re.sub(r'[^A-Za-z0-9]+', " ", text)
    text = text.lower()
    return text
data["text"] = data.get("text").apply(preprocess)
data.head()

Unnamed: 0,book_id,text,birth_yr
0,84,and now with the world before me whither shou...,1797
1,84,you have been ill very ill and even the const...,1797
2,84,i intended to reason this passion is detrimen...,1797
3,84,how is this i must not be trifled with and i ...,1797
4,84,a few days after the turk entered his daughte...,1797


In [25]:
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='word',
    max_features=50000,
    tokenizer=word_tokenize,
    stop_words=stopwords.words("english")
)
tfidf

TfidfVectorizer(max_features=50000,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                sublinear_tf=True,
                tokenizer=<function word_tokenize at 0x7fe525414c10>)

In [26]:
X = tfidf.fit_transform(data["text"]).toarray()
y = np.array(data["birth_yr"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=1)



In [6]:
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((5579, 50000), (5579,), (1395, 50000), (1395,), (1744, 50000), (1744,))

In [7]:
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_valid = torch.tensor(X_valid, dtype=torch.float32)
y_valid = torch.tensor(y_valid, dtype=torch.float32).reshape(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

# Model

In [9]:
class LSTM_REGR(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(LSTM_REGR, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size, 
            hidden_size=hidden_size,
            num_layers=n_layers, 
            batch_first=True
        )
        self.linear = nn.Linear(
            hidden_size, 
            1
        )

    def forward(self, x):
        x, _ = self.lstm(x.float())
        x = self.linear(x)
        return x


# Training

In [23]:
def train_model(model, optimizer, loss_fn, loader, n_epochs = 10, display_prog = True, n_display = 10):
    valid_rmse_list = []
    for epoch in range(n_epochs):
        model.train()
        progress_bar = tqdm(loader)
        for X_batch, y_batch in progress_bar:
            y_pred = model(X_batch.long())
            loss = loss_fn(y_pred, y_batch)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_rmse = RMSE(y_pred, y_batch)
            progress_bar.set_description(f"Epoch {epoch+1}/{n_epochs}: Train RMSE = {train_rmse:.4f}")
        
        # Validation
        if ((epoch+1) % n_display != 0):
            continue
        _, valid_rmse = evaluate(model, X_valid, y_valid)
        if display_prog:
            print(f"Epoch {epoch+1}: Valid RMSE = {valid_rmse:.4f}")
        
        # Save model
        torch.save(model.state_dict(), f"models/sleep_train/LSTM_epoch_{epoch+1}.model")
        valid_rmse_list.append(valid_rmse)
    
    return np.argmin(valid_rmse_list), valid_rmse_list

        
def evaluate(model, X, y):
    model.eval()
    with torch.no_grad():
        y_pred = model(X.long())
        rmse = RMSE(y_pred, y)
    return y_pred, rmse

def predict(model, X, y):
    X = torch.tensor(X, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

    model.eval()
    with torch.no_grad():
        y_pred = model(X.long())
    return y_pred

def RMSE(y_pred, y):
    return np.sqrt(np.mean(
        np.square(y_pred.detach().numpy() - y.detach().numpy())
    ))

In [12]:
input_size = tfidf.max_features
hidden_size = 200

model = LSTM_REGR(input_size, hidden_size, n_layers=6)
optimizer = optim.Adam(model.parameters())
loss_fn = nn.MSELoss()
train_loader = DataLoader(TensorDataset(X_train, y_train), shuffle=True, batch_size=128)

In [17]:
# Run if you want to train the model

# n_epochs = 4
# best_epoch, valid_rmse_list = train_model(
#     model, 
#     optimizer, 
#     loss_fn, 
#     train_loader, 
#     n_epochs = n_epochs,
#     n_display = 2
# )

# with open("models/sleep_train/log.txt", "w") as log_file:
#     log_file.write(f"Best Epoch: {best_epoch}\n{valid_rmse_list}")

Epoch 1/4: Train RMSE = 1831.7705: 100%|██████████| 44/44 [00:31<00:00,  1.40it/s]


Epoch 1: Valid RMSE = 1840.1041


Epoch 2/4: Train RMSE = 1836.5269: 100%|██████████| 44/44 [00:34<00:00,  1.29it/s]
Epoch 3/4: Train RMSE = 1822.9502: 100%|██████████| 44/44 [00:30<00:00,  1.45it/s]


Epoch 3: Valid RMSE = 1821.5438


Epoch 4/4: Train RMSE = 1807.0775: 100%|██████████| 44/44 [00:33<00:00,  1.32it/s]


# Evaluate Best Model

In [1]:
with open("models/sleep_train/log.txt", "r") as log_file:
    text = list(log_file)
    best_epoch = int(text[0].split(": ")[1])
    valid_rmse_list = text[1][1:-1].split(", ")

FileNotFoundError: [Errno 2] No such file or directory: 'Project_Files/models/sleep_train/log.txt'

In [20]:
best_model = LSTM_REGR(input_size, hidden_size, n_layers=6)
print(f"loading model with epoch {best_epoch}")
best_model.load_state_dict(torch.load(f"models/sleep_train/LSTM_epoch_{best_epoch}.model"))
best_model.eval()

loading model with epoch 30


LSTM_REGR(
  (lstm): LSTM(50000, 200, num_layers=6, batch_first=True)
  (linear): Linear(in_features=200, out_features=1, bias=True)
)

In [22]:
y_pred, test_rmse = evaluate(best_model, X_test, y_test)
print(f"Test RMSE: {test_rmse}")

Test RMSE: 122.70301055908203
