<a href="https://colab.research.google.com/github/efekaanefe/Rapper-AI/blob/main/rapper_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torch.autograd import Variable
import torch.nn.functional as F


In [2]:
# Load the raw lyrics corpus into a single string.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open('eminem-lyrics.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Whitespace-separated tokens; in this cell they are only counted.
words = text.split()

len(words)

102390

Vocabulary size: 14734


In [31]:
# Character-level vocabulary: every distinct character in the corpus, sorted
# so the index assignment is deterministic from run to run.
chars = sorted(set(text))
char_size = len(chars)

# Word-level statistics, reported for reference; the sequences built below
# are encoded per character, not per word.
# NOTE(review): `vocab_size` is the number of unique WORDS. The encoded
# sequences index into `chars`, so an embedding over them needs `char_size`
# entries -- confirm which one is passed where the model is constructed.
words = text.split()
unique_words = sorted(set(words))
vocab_size = len(unique_words)

print(f'Unique characters size: {char_size}')
print(f'Unique words/vocabulary size: {vocab_size}')


# Mappings from characters to indices and vice versa
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = dict(enumerate(chars))

# The whole corpus as one array of character indices.
encoded_text = np.array([char_to_idx[char] for char in text])

seq_length = 100  # Sequence length

# Each target row is its input row shifted one character to the right, so the
# last window needs one extra character after it. Reserving that character
# (the `- 1`) keeps every target exactly `seq_length` long even when the
# corpus length is an exact multiple of `seq_length`; without it the final
# target would be one element short and the rows could not be stacked.
num_samples = max(len(encoded_text) - 1, 0) // seq_length
usable_length = num_samples * seq_length

# Non-overlapping windows: row i covers [i * seq_length, (i + 1) * seq_length).
input_sequences = torch.tensor(
    encoded_text[:usable_length].reshape(num_samples, seq_length),
    dtype=torch.long,
)
target_sequences = torch.tensor(
    encoded_text[1:usable_length + 1].reshape(num_samples, seq_length),
    dtype=torch.long,
)

print(input_sequences.shape, target_sequences.shape, encoded_text.shape)


Vocabulary size (unique char size actuall!!!): 95
torch.Size([5225, 100]) torch.Size([5225, 100]) (522527,)


In [32]:
# Sanity check on the first training pair: show the raw index tensor, then
# decode both the input and its one-character-shifted target back to text.
first_input = input_sequences[0]
first_target = target_sequences[0]

print(f"1st input sequence: \n{first_input}", end="\n\n")

print("Input sequence as words:")
decoded_input = ''.join(idx_to_char[int(idx)] for idx in first_input)
print(decoded_input, end="\n\n")

print("Target sequence as words:")
decoded_target = ''.join(idx_to_char[int(idx)] for idx in first_target)
print(decoded_target)


1st input sequence: 
tensor([44, 66,  1, 83, 63, 59, 66, 13,  1, 78, 66, 67, 77,  1, 67, 77,  1, 34,
        71, 67, 72, 63, 71,  1, 60, 59, 60, 83, 13,  1, 60, 59, 61, 69,  1, 79,
        74,  1, 67, 72,  1, 78, 66, 59, 78,  1, 71, 73, 78, 66, 63, 76, 64, 79,
        61, 69, 67, 72, 65,  1, 59, 77, 77,  0, 44, 72, 63,  1, 78, 67, 71, 63,
         1, 64, 73, 76,  1, 83, 73, 79, 76,  1, 71, 73, 78, 66, 63, 76,  1, 64,
        79, 61, 69, 67, 72, 65,  1, 71, 67, 72])

Input sequence as words:
Oh yeah, this is Eminem baby, back up in that motherfucking ass
One time for your mother fucking min

Target sequence as words:
h yeah, this is Eminem baby, back up in that motherfucking ass
One time for your mother fucking mind


In [33]:
class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear projection back to the vocabulary.

    Args:
        vocab_size: Number of distinct token indices; sets both the embedding
            table size and the width of the output layer.
        embed_size: Dimension of each token embedding.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        # Kept on the instance so init_hidden() builds states that match this
        # model, instead of depending on same-named notebook globals.
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """Run a batch of index sequences through the network.

        Args:
            x: Integer tensor of token indices, laid out (batch, seq) because
                the LSTM is built with ``batch_first=True``.
            hidden: ``(h, c)`` state tuple as returned by ``init_hidden`` or a
                previous call. ``None`` lets ``nn.LSTM`` start from zeros.

        Returns:
            ``(out, hidden)`` where ``out`` has shape
            ``(batch * seq, vocab_size)`` (batch and time flattened together)
            and ``hidden`` is the updated ``(h, c)`` tuple.
        """
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        # Merge the batch and time axes so the linear layer scores every
        # position at once.
        out = self.fc(out.reshape(out.size(0) * out.size(1), out.size(2)))
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` state for ``batch_size`` sequences.

        Each tensor has shape ``(num_layers, batch_size, hidden_size)`` and
        takes its dtype and device from the model's first parameter.
        """
        weight = next(self.parameters()).data
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        return (weight.new_zeros(state_shape),
                weight.new_zeros(state_shape))
