<a href="https://colab.research.google.com/github/gc2321/3546-Deep-Learning/blob/main/pytorch/4_1_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt

## Load Data

In [2]:
file = "/content/gdrive/MyDrive/neural_data/shakespeare.txt"

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [6]:
with open(file,'r',encoding='utf8') as f:
    text = f.read()

In [8]:
text[:1000]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl mak'st waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the world's due, by the grave and thee.\n\n\n                     2\n  When forty winters shall besiege thy brow,\n  And dig deep trenches in thy beauty's field,\n  Thy youth's proud livery so gazed on now,\n  Will be a tattered weed of small worth held:  \n  Then being asked, where all thy beauty lies,\n  Where all the treasure of thy lusty days;\n  To sa

In [10]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [11]:
len(text)

5445609

## Encode Entire Text

In [12]:
all_characters = set(text)

In [13]:
all_characters

{'\n',
 ' ',
 '!',
 '"',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '}'}

In [14]:
decoder = dict(enumerate(all_characters))

In [15]:
encoder = {char: ind for ind,char in decoder.items()}

In [16]:
encoded_text = np.array([encoder[char] for char in text])

In [17]:
encoded_text[:500]

array([49, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
       64, 64, 64, 64, 64, 46, 49, 64, 64, 34, 69, 21, 50, 64,  5, 31, 71,
       69, 40, 78, 51, 64, 61, 69, 40, 31, 51, 19, 69, 40, 78, 64, 13, 40,
       64, 25, 40, 78, 71, 69, 40, 64, 71, 74, 61, 69, 40, 31, 78, 40, 30,
       49, 64, 64, 42, 60, 31, 51, 64, 51, 60, 40, 69, 40,  7, 58, 64,  7,
       40, 31, 19, 51, 58,  6, 78, 64, 69, 21, 78, 40, 64, 50, 71, 62, 60,
       51, 64, 74, 40, 41, 40, 69, 64, 25, 71, 40, 30, 49, 64, 64, 73, 19,
       51, 64, 31, 78, 64, 51, 60, 40, 64, 69, 71, 43, 40, 69, 64, 78, 60,
       21, 19, 10, 25, 64,  7, 58, 64, 51, 71, 50, 40, 64, 25, 40, 61, 40,
       31, 78, 40, 30, 49, 64, 64, 80, 71, 78, 64, 51, 40, 74, 25, 40, 69,
       64, 60, 40, 71, 69, 64, 50, 71, 62, 60, 51, 64,  7, 40, 31, 69, 64,
       60, 71, 78, 64, 50, 40, 50, 21, 69, 58,  0, 49, 64, 64, 73, 19, 51,
       64, 51, 60, 21, 19, 64, 61, 21, 74, 51, 69, 31, 61, 51, 40, 25, 64,
       51, 21, 64, 51, 60

## One Hot Encoding

In [18]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    encoded_text : batch of encoded text

    num_uni_chars = number of unique characters (len(set(text)))
    '''

    # METHOD FROM:
    # https://stackoverflow.com/questions/29831489/convert-encoded_textay-of-indices-to-1-hot-encoded-numpy-encoded_textay

    # Create a placeholder for zeros.
    one_hot = np.zeros((encoded_text.size, num_uni_chars))

    # Convert data type for later use with pytorch (errors if we dont!)
    one_hot = one_hot.astype(np.float32)

    # Using fancy indexing fill in the 1s at the correct index locations
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0


    # Reshape it so it matches the batch sahe
    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))

    return one_hot

In [19]:
one_hot_encoder(np.array([1,2,0]),3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)