In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
# Default figure geometry and output format; the values are fed to matplotlib
# below and `fig_format` also gates the pandas LaTeX option.
fig_width = 7
fig_height = 5
fig_format = 'retina'
fig_dpi = 96

# matplotlib defaults / format
# Best-effort: any failure in this block (for example matplotlib not being
# importable) is swallowed on purpose so the rest of the setup still runs.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  # NOTE(review): newer IPython releases are reported to deprecate this import
  # location in favour of matplotlib_inline.backend_inline — confirm against
  # the IPython version in use.
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
# Best-effort as well: skipped silently when plotly cannot be imported.
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# enable pandas latex repr when targeting pdfs
# With fig_format set to 'retina' above, the option is left untouched; it is
# only switched on when the format is 'pdf'.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass



# output kernel dependencies
# Map the source file of every loaded module to its last-modified time and
# print the mapping as a single JSON object.
kernel_deps = {}
for loaded in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Only ordinary modules are inspected.
  if isinstance(loaded, types.ModuleType):
    source_file = getattr(loaded, "__file__", None)
    if source_file:
      # Compiled artefacts (.pyc/.pyo) are mapped back to the .py file name.
      if source_file.endswith((".pyc", ".pyo")):
        source_file = source_file[:-1]
      # Entries whose file is not on disk are left out.
      if os.path.exists(source_file):
        kernel_deps[source_file] = os.stat(source_file).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# NOTE(review): the condition is a non-empty string literal, so it is always
# true and the chdir always runs. The target is an absolute, machine-specific
# directory, so this cell only works where that directory exists.
if r'/Users/davoodwadi/MLCourse/davoodwadi.github.io/code':
  os.chdir(r'/Users/davoodwadi/MLCourse/davoodwadi.github.io/code')

# reset state
# IPython magic that clears the interactive namespace: the imports, fig_*
# settings and kernel_deps created above are not available to later cells.
# Names defined after this line (such as ojs_define) are kept.
%reset

def ojs_define(**kwargs):
  """Emit the keyword arguments as JSON inside an `ojs-define` script tag.

  Each keyword becomes one `{"name": ..., "value": ...}` entry under the
  top-level `contents` list. The tag is sent through IPython's `display`
  with `ojs_define=True` in the output metadata.

  pandas Series and DataFrames are converted to a plain dict mapping each
  column label to the list of that column's values; every other value is
  passed to `json.dumps` unchanged, so it must be JSON-serialisable.
  """
  # Imported here because the surrounding setup clears the namespace first.
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # Only a failed import should trigger the legacy location; a bare
    # `except` would also swallow KeyboardInterrupt / SystemExit.
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return v
    # Exact type checks are kept on purpose: subclasses pass through as-is.
    if type(v) is pd.Series:
      v = pd.DataFrame(v)
    if type(v) is pd.DataFrame:
      # After transposing, 'split' orientation lists the column labels under
      # "index" and the per-column values under "data".
      j = json.loads(v.T.to_json(orient='split'))
      return dict(zip(j["index"], j["data"]))
    return v

  payload = dict(contents=[dict(name=key, value=convert(value)) for key, value in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import random

# Set the random seeds for reproducibility.
# Seeding Python's `random` alone is not enough: word sampling uses
# np.random.choice and the model weights are initialised by PyTorch, so all
# three generators are seeded. `random.seed` goes last because it returns
# None, which keeps the cell from displaying a stray output value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [3]:
# Directory holding the token files (relative to the working directory) and
# the full path of each split.
path = '../wikitext-2/'
train_path, valid_path, test_path = (
    f'{path}wiki.train.tokens',
    f'{path}wiki.valid.tokens',
    f'{path}wiki.test.tokens',
)

def read_file(path):
    """Return the whole contents of the text file at `path` as one string.

    Args:
        path: Location of the file to read.

    Returns:
        The file's text; an empty string for an empty file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The original body opened the undefined name `file_path`; the parameter
    # is `path`. The encoding is pinned so the result does not depend on the
    # platform's default locale.
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()

# Read the train, validation and test files (in that order) into memory and
# preview the first 100 characters of the training text.
train_string, valid_string, test_string = map(
    read_file, (train_path, valid_path, test_path)
)
print(train_string[:100])

In [4]:
# Break each split into a list of its lines, then show the first five
# training lines.
train_list, valid_list, test_list = (
    text.split("\n") for text in (train_string, valid_string, test_string)
)
train_list[:5]

In [5]:
import spacy

# Tokenise a short sample sentence with the small English pipeline and print
# one token per line.
sample_sentence = '''"Let's go to N.Y.!"'''
nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_sentence)

for tok in doc:
    print(tok.text)

# Look up 'Let' in the pipeline's string store (displayed as the cell output).
nlp.vocab.strings['Let']

In [6]:
class RNNLanguageModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits per position.
        embedding_dim: Size of each token embedding (LSTM input size).
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return logits flattened over batch and time.

        `x` holds token indices laid out batch-first. The LSTM output is
        collapsed to two dimensions before the projection, so the result has
        shape (batch * seq_len, vocab_size); the final LSTM state is dropped.
        """
        hidden_states, _ = self.lstm(self.embedding(x))
        flat_states = hidden_states.reshape(-1, hidden_states.size(2))
        return self.fc(flat_states)

In [7]:
def _run_epoch(model, batches, criterion, model_device, optimizer=None):
    """Make one pass over `batches` and return the mean per-batch loss.

    When `optimizer` is given the model is put in training mode and updated
    after every batch; otherwise it is put in eval mode and gradients are
    disabled.

    Raises:
        ValueError: If `batches` yields nothing (the mean would be undefined).
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    num_batches = 0
    with torch.set_grad_enabled(training):
        for batch in batches:
            # Inputs must live on the same device as the model; `.to` is a
            # no-op when they already do.
            # Assumes batch.text is (batch, seq_len) token ids — TODO confirm.
            tokens = batch.text.to(model_device)

            # Next-token prediction: inputs drop the last token, targets drop
            # the first and are flattened to match the model's flat logits.
            x = tokens[:, :-1]
            y = tokens[:, 1:].flatten()

            loss = criterion(model(x), y)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("cannot compute a mean loss: the iterator produced no batches")
    return total_loss / num_batches


def train_model(model, train_iter, val_iter, num_epochs, lr, checkpoint_path='language_model.pt'):
    """Train `model` with Adam and cross-entropy, keeping the best checkpoint.

    After every epoch the mean validation loss is compared with the best one
    seen so far; on improvement the model's state_dict is saved to
    `checkpoint_path`. A one-line summary is printed per epoch.

    Args:
        model: Module returning logits of shape (batch * seq_len, vocab) for
            batch-first token-id input.
        train_iter: Re-iterable of batches exposing a `.text` tensor.
        val_iter: Same as `train_iter`, used for validation only.
        num_epochs: Number of passes over `train_iter`.
        lr: Adam learning rate.
        checkpoint_path: Where the best state_dict is written. Defaults to
            the previously hard-coded 'language_model.pt'.

    Raises:
        ValueError: If either iterator yields no batches.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Follow the model's own placement instead of assuming batches already
    # match it (the notebook moves the model to CUDA when available).
    model_device = next(model.parameters()).device

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        train_loss = _run_epoch(model, train_iter, criterion, model_device, optimizer)
        val_loss = _run_epoch(model, val_iter, criterion, model_device)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)

        print(f'Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

In [8]:
def generate_text(model, seed_text, max_length, vocab=None):
    """Extend `seed_text` by sampling one word at a time from `model`.

    The whole token sequence is fed to the model at every step and the next
    word is drawn with np.random.choice from the softmax over the logits of
    the last position.

    Args:
        model: Module mapping a (1, seq_len) tensor of token ids to logits,
            either flattened to (seq_len, vocab) or shaped
            (1, seq_len, vocab).
        seed_text: Whitespace-separated starting words.
        max_length: Total number of tokens (seed included) in the result.
            If the seed is already that long it is returned unchanged.
        vocab: Object exposing `stoi` (token -> index) and `itos`
            (index -> token). Defaults to the notebook-level `TEXT.vocab`.

    Returns:
        The seed and generated tokens joined by single spaces.

    Raises:
        ValueError: If text must be generated but `seed_text` has no tokens.
    """
    if vocab is None:
        # Same global the original read; looked up only when needed.
        vocab = TEXT.vocab

    tokens = seed_text.split()
    if not tokens and max_length > 0:
        raise ValueError("seed_text must contain at least one token")

    # Build inputs on the model's own device rather than a global `device`.
    model_device = next(model.parameters()).device

    model.eval()
    with torch.no_grad():
        while len(tokens) < max_length:
            x = torch.tensor(
                [[vocab.stoi[token] for token in tokens]], device=model_device
            )

            output = model(x)
            # The batch holds one sequence, so after collapsing to
            # (seq_len, vocab) the last row is the next-word distribution.
            # (`output[0, -1]` on flattened logits selected a single scalar.)
            last_word_logits = output.reshape(-1, output.size(-1))[-1]

            # Move to the CPU before converting; .numpy() fails on CUDA tensors.
            probabilities = F.softmax(last_word_logits, dim=0).cpu().numpy()
            predicted_index = np.random.choice(len(probabilities), p=probabilities)

            tokens.append(vocab.itos[predicted_index])

    return ' '.join(tokens)

In [9]:
# Set the device
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Preprocess the dataset
# NOTE(review): `preprocess_dataset` is not defined in any cell shown here,
# and neither is the `TEXT` object that `generate_text` reads. Confirm both
# come from a cell that still exists; otherwise this fails on a fresh kernel.
train_iter, val_iter, test_iter = preprocess_dataset()

# Define the model
# The vocabulary size is read from the 'text' field of the training dataset
# (presumably a torchtext-style dataset — TODO confirm).
vocab_size = len(train_iter.dataset.fields['text'].vocab)
embedding_dim = 100
hidden_dim = 128
num_layers = 2
model = RNNLanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)

# Train the model
# `test_iter` is not used anywhere in this cell.
num_epochs = 10
learning_rate = 0.001
train_model(model, train_iter, val_iter, num_epochs, learning_rate)

# Generate text using the trained model
# NOTE(review): train_model saves the best-validation weights to
# 'language_model.pt', but generation here uses the in-memory model as it is
# after the final epoch — confirm that is intended.
seed_text = "The weather is"
max_length = 20
generated_text = generate_text(model, seed_text, max_length)
print(generated_text)