In [1]:
from fastai.text.all import *
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

In [2]:
# Validation examples, written inline as a list of dicts (one dict per row).
# Any source that pandas can read into a DataFrame with the same two columns
# can be substituted here.
valid_jsons = [
    {"from_txt": "Hello how are you?", "to_txt": "I am doing fine."},
    {"from_txt": "Is it going to rain today?", "to_txt": "Let me pull up the weather."},
    {"from_txt": "How do fastai DataBlocks work?", "to_txt": "Not sure, I'm still learning."},
]

# Build the frame and flag every row as validation data in one step.
# The `is_valid` column is what ColSplitter() reads further down.
df_valid = pd.DataFrame(valid_jsons).assign(is_valid=True)

# Inspect the result.
df_valid.head()

Unnamed: 0,from_txt,to_txt,is_valid
0,Hello how are you?,I am doing fine.,True
1,Is it going to rain today?,Let me pull up the weather.,True
2,How do fastai DataBlocks work?,"Not sure, I'm still learning.",True


In [3]:
# Example of a training dataset. In this case it is a list of python dictionaries.
# You should be able to adapt this to your use case if you can read in a file using Pandas.
# (These rows are marked is_valid=False below, i.e. they form the training split,
# so the list is named `train_jsons` to match `df_train`.)
train_jsons = [
    {"from_txt": "Hello, where is the closest McDonald's?", "to_txt": "Let me find you that on Google Maps."},
    {"from_txt": "Is it going to snow today?", "to_txt": "Let me pull up the weather."},
    {"from_txt": "How much coffee is safe to drink?", "to_txt": "As much as you need to learn the Fastai Library."},
]

# Backward-compatible alias: this list was previously called `test_jsons`.
test_jsons = train_jsons

# Read the list of dicts into a DataFrame and add the is_valid column.
# False marks every row as training data for the ColSplitter() used later.
df_train = pd.DataFrame(train_jsons).assign(is_valid=False)

# View our work
df_train.head()

Unnamed: 0,from_txt,to_txt,is_valid
0,"Hello, where is the closest McDonald's?",Let me find you that on Google Maps.,False
1,Is it going to snow today?,Let me pull up the weather.,False
2,How much coffee is safe to drink?,As much as you need to learn the Fastai Library.,False


In [4]:
# Combine training rows (first) and validation rows (second) into a single
# frame, renumbering the index so it runs 0..n-1 without duplicates.
df = pd.concat(objs=(df_train, df_valid), axis=0, ignore_index=True)

# Preview the first five rows of the combined frame.
df.head()

Unnamed: 0,from_txt,to_txt,is_valid
0,"Hello, where is the closest McDonald's?",Let me find you that on Google Maps.,False
1,Is it going to snow today?,Let me pull up the weather.,False
2,How much coffee is safe to drink?,As much as you need to learn the Fastai Library.,False
3,Hello how are you?,I am doing fine.,True
4,Is it going to rain today?,Let me pull up the weather.,True


In [5]:
# DataBlock describing a text -> text task: the input comes from the
# `from_txt` column and the target from the `to_txt` column of the same
# dataframe.
logs = DataBlock(

    # One TextBlock per side (input, target); both are text columns of the same df.
    blocks=(
        TextBlock.from_df('from_txt', is_lm=False),
        TextBlock.from_df('to_txt', is_lm=False),
    ),

    # A specific tokenizer can be supplied through the `tok` argument.
    # To try it, comment out the `blocks=` argument above and uncomment the one below.
    #blocks=(
    #    TextBlock.from_df('from_txt', is_lm=False, tok=SubwordTokenizer(vocab_sz=200)),
    #    TextBlock.from_df('to_txt'  , is_lm=False, tok=SubwordTokenizer(vocab_sz=200))),

    # Rows whose `is_valid` column is True go to the validation set, so the
    # dataframe must contain that column.
    splitter=ColSplitter(),

    # The TextBlock tokenization step writes its tokenized output to a column
    # named `text`. Both getters therefore read `text`, regardless of what the
    # original source columns in the dataframe were called.
    get_x=ColReader('text'),
    get_y=ColReader('text'),

)

In [6]:
# Build the DataLoaders from the combined dataframe using the DataBlock
# defined above. The batch size is tiny because the toy dataset only has a
# handful of rows.
dls = logs.dataloaders(
    df,
    bs=2,
    seq_len=150,
)

# Display at most two decoded (input, target) pairs as a sanity check.
dls.show_batch(max_n=2)

Unnamed: 0,text,text_
0,xxbos xxmaj xxunk xxunk xxunk is xxunk xxunk xxunk xxunk ?,xxbos xxmaj let me xxunk xxunk xxunk xxunk xxmaj xxunk xxmaj xxunk .
1,xxbos xxmaj how xxunk xxunk is xxunk to xxunk ?,xxbos xxmaj xxunk xxunk xxunk xxunk xxunk xxunk xxunk the xxmaj xxunk xxmaj xxunk .
