# Creating Huggingface Dataset

1. Read in text files for `training` and `validation`

In [1]:
training_file = "/content/Stoic_Training_Sentences.txt"
validation_file = "/content/Stoic_Validation_Sentences.txt"

with open(training_file) as train:
  training_lines = train.readlines()

with open(validation_file) as validate:
  validation_lines = validate.readlines()


2. Clean and parse up lines into tokens of three sentences

In [2]:
# Cleaning up lines
training_lines = [line.strip() for line in training_lines if line]
validation_lines = [line.strip() for line in validation_lines if line]


In [6]:
%%capture
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Turning into sentences 
training_string = " ".join(training_lines)
training_sentences = sent_tokenize(training_string)
validation_string = " ".join(validation_lines)
validation_sentences = sent_tokenize(validation_string)

In [10]:
# Turning into sentences of 3
training_tokens = []
validation_tokens = []

for i in range(int(len(training_sentences)/3) - 1):
  training_tokens.append(training_sentences[3*i] + " " +  training_sentences[3*i + 1] + " " + training_sentences[3*i + 2])

for i in range(int(len(validation_sentences)/3) - 1):
  validation_tokens.append(validation_sentences[3*i]  + " "  + validation_sentences[3*i + 1] + " "  + validation_sentences[3*i + 2])

3. Create dataframe from tokens and push to Huggingface

In [12]:
# Creating dataframes of sentence_tokens
import pandas as pd

training_df = pd.DataFrame(training_tokens, columns=['text'])
validation_df = pd.DataFrame(validation_tokens, columns=['text'])

# Showing example
training_df.head()

Unnamed: 0,text
0,"OF BENEFITS IN GENERAL. It is, perhaps, one of..."
1,To begin with the latter: “A benefit is a good...
2,He that does good to another man does good als...
3,In the first rank are those which deliver us f...
4,"Of things profitable there is a large field, a..."


In [None]:
%%capture
!pip install datasets "transformers[sentencepiece]"

In [None]:
# Creating two Huggingface datasets
from datasets import Dataset
training_ds = Dataset.from_pandas(training_df)
validation_ds = Dataset.from_pandas(validation_df)

In [None]:
from datasets import DatasetDict
ddict = DatasetDict({
    "train": training_ds,   
    "validation": validation_ds
})

In [None]:
!huggingface-cli login 
# Access token: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


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
        
Token: Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.7/dist-packages/huggingface_hub/commands/huggingface_cli.py", line 41, in main
    service.run()
  File "/usr/local/lib/python3.7/



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
%%capture
ddict.push_to_hub("eliwill/Stoic")