<a href="https://colab.research.google.com/github/dhk92/dv-comment-generator/blob/main/DV_comment_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data import**

In [None]:
from google.colab import drive
from os.path import join
import os
import torch
import torch.nn as nn
import torch.nn.functional as functional
import numpy as np
import matplotlib.pyplot as plt
import json
import pandas as pd
from collections import Counter


# Mount point for Google Drive inside the Colab VM.
ROOT="/content/drive"
drive.mount(ROOT)
# Drive-relative folder that holds the project files.
PROJ="My Drive/Deep learning/DV_comment_generator"
PROJ_PATH=join(ROOT,PROJ)

# Copy everything in the project folder into the current working directory
# (-a: archive mode, -P: keep partial files and show progress); read_comments
# below scans the working directory for *.json files.
!rsync -aP "{PROJ_PATH}"/* ./

def read_comments(comments):
  """Append every non-empty comment found in the working directory's JSON files.

  Each ``*.json`` file in the current directory is parsed as a list of
  articles; every article is indexed with ``'comments'`` and the resulting
  strings are collected, skipping empty ones. The name of each file is
  printed as it is processed.

  Args:
    comments: list that is extended in place with the comment strings.

  Returns:
    The same ``comments`` list, so the call can also be used as an expression.
  """
  # os.listdir returns entries in arbitrary order; sort so that every run
  # builds the corpus in the same order.
  files = sorted(f for f in os.listdir('.') if f.endswith('.json'))
  for file in files:
    print(file)
    # Be explicit about the encoding: the comments contain non-ASCII text and
    # the default used by open() depends on the platform locale.
    with open(file, encoding='utf-8') as fil:
      data = json.load(fil)
    # The file is closed as soon as it is parsed; iterate the in-memory data.
    for article in data:
      comments.extend(comment for comment in article['comments'] if comment != '')
  return comments
          

# Module-level corpus: filled once here and read by later cells.
comments = []
read_comments(comments)
# Show the corpus size and a small sample instead of dumping every scraped
# comment into the notebook output.
print(len(comments), 'comments loaded')
print(comments[:5])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
sending incremental file list
dvcom04.json
dvcom01.json
dvcom03.json
dvcom02.json
['I see what you did there 😉', 'Flott hjá henni', 'Æi greyið, hún hlýtur að vera með sinaskeiðabólgu fyrst að hún getur bara 3 sinnum á dag.', 'Já ætli hún fái ekkert annað grei stulkan...en nb ... hverslags fréttir eru þetta ... jahérnahét', 'Þak á verðtrygginguna. Miðað við allt sem stjórnvöld geta gert í tengslum við kórónuveiruna ættu þeir að setja þak þannig að fólk þurfi ekki að standa í endurfjármögnun.', 'helt að þu værir látinn ekki orð eftir að Drifan neitaði Bjarna að afnema VERÐTRYGGINUNA', 'Finnst blaðamanninum þessi snúður vera bardagakappi? Er þetta leiðtogi lífs þíns herra blaðamaður? Því kallar þú hann ekki því nafni sem hann er, ofbeldismaður, kókfíkill eða fáviti?', 'Hann Gulli er á verulega slæmum stað í lífinu.', 'Gunnar Nelson er bardagakappinn en þessi Júl

**Data Setup**

In [None]:
class Dataset():
    """Sliding-window, word-level dataset for next-word prediction.

    The input texts are split on single spaces into one long token stream
    (consecutive spaces therefore yield empty-string tokens, which are kept).
    Every distinct token gets an integer id, with more frequent tokens
    receiving smaller ids. Item ``i`` is the pair
    ``(tokens[i : i + L], tokens[i + 1 : i + L + 1])`` as integer tensors,
    where ``L`` is ``sequence_length``.

    Args:
        texts: iterable of strings to tokenize. When omitted, the notebook's
            module-level ``comments`` list is used, so ``Dataset()`` keeps
            working as before.
        sequence_length: number of tokens in each input/target window.
    """

    def __init__(self, texts=None, sequence_length=50):
        # Resolve the global lazily so the class can also be built from an
        # explicit list without `comments` being defined.
        self.texts = comments if texts is None else texts
        self.sequence_length = sequence_length

        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        # The whole corpus encoded as vocabulary ids, in reading order.
        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        """Return all tokens of all texts as one flat list, in order."""
        return [word for text in self.texts for word in text.split(' ')]

    def get_uniq_words(self):
        """Return the distinct tokens sorted by descending frequency."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        # A window needs sequence_length inputs plus one extra token for the
        # shifted target; clamp at 0 so a tiny corpus gives an empty dataset
        # instead of making len() raise on a negative value.
        return max(0, len(self.words_indexes) - self.sequence_length)

    def __getitem__(self, index):
        """Return ``(inputs, targets)``; targets are the inputs shifted by one token."""
        end = index + self.sequence_length
        return (
            torch.tensor(self.words_indexes[index:end]),
            torch.tensor(self.words_indexes[index + 1:end + 1]),
        )

SyntaxError: ignored

In [None]:
dataset = Dataset()
# The vocabulary is already computed in __init__ and stored on `uniq_words`;
# preview only the most frequent entries rather than printing all of it.
print(len(dataset.uniq_words), 'unique words')
print(dataset.uniq_words[:50])

['að', 'er', 'og', 'í', 'á', 'ekki', 'sem', 'það', 'til', 'um', 'með', 'en', 'við', 'fyrir', 'eru', 'hann', 'af', 'þetta', 'Það', 'þá', 'bara', 'því', 'ég', 'svo', 'eða', 'vera', 'hefur', 'hafa', 'var', 'verið', 'hjá', 'þú', 'eins', 'þeir', 'sér', 'Þetta', 'ekkert', 'eftir', 'sé', 'ef', 'hún', 'frá', 'Ég', 'fólk', 'upp', 'svona', 'hvað', 'þessi', 'nú', 'þegar', 'þess', 'sig', 'væri', 'hafi', 'allt', 'Er', 'þau', 'koma', 'þeim', 'þessu', '?', 'þessum', 'þér', 'fá', 'fara', 'þar', 'úr', 'verður', 'segja', 'Hvað', 'gera', 'út', 'þarf', 'vegna', 'En', 'alveg', 'yfir', 'þessa', 'maður', 'mér', 'ad', 'i', 'aldrei', 'rétt', 'sama', 'meira', 'eitthvað', 'vel', 'hefði', 'voru', 'of', 'Hann', 'ætti', 'hægt', 'heldur', 'fram', 'eiga', 'taka', 'hér', 'geta', 'fer', 'allir', 'Ekki', 'mikið', 'Og', '-', 'má', 'menn', 'hvort', 'Ef', 'hverju', 'sjá', 'gert', 'kemur', 'líka', ':)', 'alltaf', 'inn', 'annað', 'einhver', 'þó', 'Trump', 'ert', 'enn', 'margir', '<3', 'getur', 'a', 'honum', 'mun', 'hef', '!'

**The Model**

In [None]:
class Model(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear layer over the vocabulary.

    The LSTM is created without ``batch_first``, so it reads dimension 0 of
    its input as the time axis and dimension 1 as the batch axis. The state
    returned by ``init_state`` must therefore match dimension 1 of the index
    tensor passed to ``forward``.

    NOTE(review): a caller that feeds ``(batch, window)`` index tensors will
    have the batch axis unrolled as time. Switching to ``batch_first=True``
    would also require changing how callers size the initial state, so the
    layout is left as is here.
    """

    def __init__(self, dataset):
        """Create the layers, sized for the vocabulary of ``dataset``.

        Args:
            dataset: object exposing ``uniq_words``; its length is the
                vocabulary size used for the embedding and the output layer.
        """
        super().__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        # The LSTM consumes embedding vectors, so its input width is the
        # embedding size (previously tied to lstm_size, which only worked
        # because both happen to be 128).
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        """Run one step of the model.

        Args:
            x: integer tensor of vocabulary ids.
            prev_state: ``(h, c)`` tuple as produced by ``init_state`` or
                returned by a previous call.

        Returns:
            ``(logits, state)`` where ``logits`` has the shape of ``x`` plus a
            trailing vocabulary dimension and ``state`` is the new ``(h, c)``.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return zero-filled ``(h, c)`` tensors of shape ``(num_layers, sequence_length, lstm_size)``.

        ``sequence_length`` lands in the dimension the LSTM treats as its
        batch axis, so it must equal dimension 1 of the tensor given to
        ``forward``.
        """
        shape = (self.num_layers, sequence_length, self.lstm_size)
        return (torch.zeros(*shape), torch.zeros(*shape))

**Training**

In [None]:
import argparse
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader

# Build the network for the dataset's vocabulary and switch it to training
# mode (this enables the dropout configured on the LSTM).
model = Model(dataset)
model.train()

# Batches of (input window, target window) pairs; windows are served in
# corpus order because no shuffling is requested.
dataloader = DataLoader(dataset, batch_size=128)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    # Fresh zero state each epoch. 50 matches the window length produced by
    # Dataset.__getitem__, which is the dimension the LSTM treats as its batch
    # axis (the model is not batch_first).
    state_h, state_c = model.init_state(50)

    for batch, (x, y) in enumerate(dataloader):
        optimizer.zero_grad()

        y_pred, (state_h, state_c) = model(x, (state_h, state_c))
        # CrossEntropyLoss expects the class dimension at index 1, so move the
        # vocabulary axis from last to second position to line up with y.
        loss = criterion(y_pred.transpose(1, 2), y)

        # Rebind the carried state to detached copies so the next batch's
        # backward pass does not reach into this batch's graph; the graph
        # behind `loss` is unaffected by the rebinding.
        state_h = state_h.detach()
        state_c = state_c.detach()

        loss.backward()
        optimizer.step()

        # NOTE(review): prints one line per batch, which produces a very long
        # output on a large corpus — consider logging every N batches.
        print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })

AttributeError: ignored