In [25]:
'''
author: Damiano Pasquini
date: 10/13/2023
'''

import torch
import subprocess
import json
import pandas as pd
import numpy
import transformers
import datasets
import tiktoken
import wandb
import tqdm

# Select the GPU when PyTorch reports CUDA support, otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Allocate a small random tensor and move it to the selected device
# (it is not referenced again in the visible cells).
tensor_on_device = torch.randn(3, 3).to(device)
print(f"Torch version: {torch.__version__}")
print(f"Is CUDA enabled? {torch.cuda.is_available()}")

# Corpus locations: the raw JSONL input and the plain-text file derived from it.
corpus = "./nanoGPT/data/utterances.jsonl"
corpus_preprocessed = "./nanoGPT/data/movies_text_preprocessed.txt"
# Command lines executed later in the notebook through the shell.
prepare = "python data/shakespeare_char/prepare.py"
sample = "python sample.py --out_dir=out-shakespeare-char --device=cpu"

Torch version: 2.1.0+cu121
Is CUDA enabled? True


In [43]:
def count_words(input=corpus):
    """Return the number of whitespace-separated words in a text file.

    The count is computed in pure Python instead of shelling out to
    ``wsl wc -w``, so it works without WSL and with paths that contain spaces
    or shell metacharacters.

    Args:
        input: Path of the file to count. The name shadows the ``input``
            builtin; it is kept so existing keyword callers keep working.

    Returns:
        int: Total number of words, where a word is a maximal run of
        non-whitespace characters (``str.split()`` semantics).

    Raises:
        OSError: If the file cannot be opened.
    """
    total = 0
    # errors="replace" keeps the count going on undecodable bytes; the
    # replacement character is not whitespace, so word boundaries are unchanged.
    # NOTE(review): str.split() also splits on Unicode whitespace, so totals can
    # differ marginally from `wc -w` on text containing such characters.
    with open(input, "r", encoding="utf-8", errors="replace") as handle:
        # Stream line by line so large corpora are never held in memory at once.
        for line in handle:
            total += len(line.split())
    return total

# Print the word count of the preprocessed corpus file.
# NOTE(review): this cell sits above the cell that calls preprocess_corpus, so on
# a fresh top-to-bottom run the preprocessed file must already exist on disk.
print(count_words(corpus_preprocessed))

999982


In [14]:
def run_shell_cmd(cmd):
    """Run ``cmd`` through the shell and print its captured standard output.

    Args:
        cmd: Command line, passed to the shell as a single string.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    raw_output = subprocess.check_output(cmd, shell=True)
    # Decode leniently: a stray non-UTF-8 byte in the tool's output must not
    # raise after the command has already run; it is shown as U+FFFD instead.
    print(raw_output.decode("utf-8", errors="replace"))

In [45]:
def preprocess_corpus(input=corpus, output=corpus_preprocessed, max_words=1000000):
    """Convert a JSONL utterance file into plain ``speaker: text`` lines.

    Every non-blank line of ``input`` is parsed as one JSON object, and its
    ``"speaker"`` and ``"text"`` fields are written to ``output`` as a single
    ``speaker: text`` line. Newlines embedded in either field are replaced by
    ". " so each utterance stays on one output line.

    Conversion stops once the running word count exceeds ``max_words``. The
    limit is checked before each record is processed, so the record that
    crosses the limit is still written and the output may slightly overshoot.

    Args:
        input: Path of the JSONL file to read (one JSON object per line).
        output: Path of the text file to create or overwrite.
        max_words: Word budget after which no further records are written.

    Returns:
        The ``output`` path, unchanged.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"text"`` or ``"speaker"`` field.
        OSError: If either file cannot be opened.
    """
    word_counter = 0
    # Explicit UTF-8 keeps reading and writing independent of the platform's
    # locale encoding; the input is streamed instead of loaded in one piece.
    with open(input, "r", encoding="utf-8") as src, \
            open(output, "w", encoding="utf-8") as dst:
        for raw_line in src:
            if word_counter > max_words:
                break
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            record = json.loads(raw_line)
            # Flatten newlines after parsing: editing the raw JSON text instead
            # would corrupt an escaped backslash that is followed by "n".
            text = record["text"].replace("\n", ". ")
            speaker = record["speaker"].replace("\n", ". ")
            word_counter += len(text.split()) + len(speaker.split())
            dst.write(f"{speaker}: {text}\n")
    return output

# Build the plain-text corpus from the raw JSONL file; the returned output path
# is the value displayed by this cell.
preprocess_corpus(corpus)

'./nanoGPT/data/movies_text_preprocessed.txt'

In [15]:
# default training
def train(eval_iters=20, log_interval=1, block_size=64, batch_size=12, n_layer=4, n_head=4, n_embd=128, max_iters=2000, lr_decay_iters=2000, dropout=0.0):
    """Run ``train.py`` with the Shakespeare character config on the CPU.

    The script and config paths are relative, so they are resolved against the
    current working directory. Every keyword argument is forwarded to the
    script as a ``--name=value`` override; ``--device=cpu`` and
    ``--compile=False`` are always passed.

    Args:
        eval_iters: Value for ``--eval_iters``.
        log_interval: Value for ``--log_interval``.
        block_size: Value for ``--block_size``.
        batch_size: Value for ``--batch_size``.
        n_layer: Value for ``--n_layer``.
        n_head: Value for ``--n_head``.
        n_embd: Value for ``--n_embd``.
        max_iters: Value for ``--max_iters``.
        lr_decay_iters: Value for ``--lr_decay_iters``.
        dropout: Value for ``--dropout``.

    Raises:
        subprocess.CalledProcessError: If the training script exits with a
            non-zero status.
    """
    # Insertion order is preserved, so the flags appear in signature order.
    overrides = {
        "eval_iters": eval_iters,
        "log_interval": log_interval,
        "block_size": block_size,
        "batch_size": batch_size,
        "n_layer": n_layer,
        "n_head": n_head,
        "n_embd": n_embd,
        "max_iters": max_iters,
        "lr_decay_iters": lr_decay_iters,
        "dropout": dropout,
    }
    # An argument list works on every platform; a single command string without
    # shell=True is treated as the program name on POSIX and cannot be launched.
    argv = [
        "python",
        "train.py",
        "config/train_shakespeare_char.py",
        "--device=cpu",
        "--compile=False",
    ]
    argv.extend(f"--{name}={value}" for name, value in overrides.items())
    # check=True surfaces a failed training run instead of silently continuing.
    subprocess.run(argv, check=True)

# Prepare the data, train and sample from the model with the default parameters

In [None]:
# Default pipeline: run the prepare command, train with the default
# hyperparameters, then run the sample command. The steps are order-dependent.
print("Running prepare...")
# Runs the `prepare` command line through the shell and prints its output.
run_shell_cmd(prepare)
print("Training...")
# Called with no arguments, so every hyperparameter keeps its default value.
train()
print("Sampling...")
# Runs the `sample` command line through the shell and prints its output.
run_shell_cmd(sample)

# Prepare the data, train and sample from the model with different configurations of hyperparameters

In [None]:
# train 1


In [None]:
# train 2


In [None]:
# train 3


In [23]:
# from convokit import Corpus, download
# corpus_2 = Corpus(filename=download("supreme-corpus"))
# 
# print(corpus_2.get_utterance("2015_14_13.txt").text)

Downloading supreme-corpus to C:\Users\pasqu\.convokit\downloads\supreme-corpus
Downloading supreme-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-corpus.zip (1255.8MB)... 

KeyboardInterrupt: 