# Exercise 2
## Neural machine translation with attention
### By: Daniel Mehta

---

## Imports and Config

In [6]:
import os
import re
import random
from pathlib import Path

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Seed every random number generator used in this notebook with one shared value
SEED = 5501
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    # Same call order as before: Python stdlib, NumPy, PyTorch CPU, PyTorch CUDA
    _seed_fn(SEED)
del _seed_fn  # keep the helper name out of the notebook namespace

In [5]:
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


---

## Dataset Path Setup

In [7]:
# Relative location of the Spanish-English corpus: <data_dir>/spa.txt
data_dir = Path("spa-eng")
data_path = data_dir.joinpath("spa.txt")

In [8]:
# Stop here with a clear message if the corpus file is missing; otherwise confirm its path
if data_path.exists():
    print(f"Dataset located at: {data_path}")
else:
    raise FileNotFoundError(f"Dataset not found at {data_path}")

Dataset located at: spa-eng\spa.txt


---

## Data Exploration and Cleaning

In [10]:
# Load the whole corpus as UTF-8 text, trim surrounding whitespace, and keep one entry per line
lines = data_path.read_text(encoding="utf-8").strip().split("\n")

In [11]:
# Report how many raw lines were read and preview the first few of them
print(f"Total sentence pairs in file: {len(lines)}")
print("Sample lines:")
# Iterating over a slice never raises, so a file with fewer than five lines
# is previewed in full instead of failing with IndexError.
for sample_line in lines[:5]:
    print(sample_line)

Total sentence pairs in file: 142511
Sample lines:
Go.	Ve.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)
Go.	Vete.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986656 (cueyayotl)
Go.	Vaya.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986657 (cueyayotl)
Go.	Váyase.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #6586271 (arh)
Hi.	Hola.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #431975 (Leono)


In [12]:
#Separating into English and Spanish
# Each raw line is split on tabs; field 0 is the English sentence and field 1 the
# Spanish one. Any further fields (e.g. the attribution seen in the sample lines)
# are kept in `pairs` but not used here.
pairs = [
    fields
    for fields in (line.split("\t") for line in lines)
    if len(fields) >= 2  # rows without a second field would make pair[1] raise IndexError
]

# Surface dropped rows instead of discarding them silently; prints nothing on clean data
skipped = len(lines) - len(pairs)
if skipped:
    print(f"Skipped {skipped} malformed line(s) without a tab-separated translation")

# Fail with a descriptive error rather than an IndexError on the example lookup below
if not pairs:
    raise ValueError("No tab-separated sentence pairs found in the dataset")

english_sentences = [pair[0] for pair in pairs] #English (target)
spanish_sentences = [pair[1] for pair in pairs] #Spanish (source)

print("\nExample pair:")
print("EN:", english_sentences[0])
print("ES:", spanish_sentences[0])


Example pair:
EN: Go.
ES: Ve.


---

## Tokenization & vocab building