In [1]:
from fastai import *        # Quick access to most common functionality
from fastai.text import *   # Quick access to NLP functionality
from fastai.docs import *   # Access to example data provided with fastai

# Text example

## Using .csv files

An example of creating a language model and then transferring it to a classifier.

In [2]:
# Make the IMDB sample available locally (presumably downloads/extracts it on
# first use — confirm in fastai.docs).
untar_data(IMDB_PATH)
# Bare expression so the notebook displays where the dataset lives.
IMDB_PATH

PosixPath('../data/imdb_sample')

Open and view the independent and dependent variables:

In [None]:
# header=None: treat the first row as data rather than column names, so the
# columns receive integer labels.
df = pd.read_csv(
    IMDB_PATH/'train.csv',
    header=None,
)
# Preview the first rows of the training set.
df.head()

In [None]:
# Read the class labels stored alongside the data in classes.txt.
classes = read_classes(IMDB_PATH/'classes.txt')
# Display the first two labels as a tuple.
classes[0], classes[1]

Create a `DataBunch` for each of the language model and the classifier:

In [None]:
# Language-model data is built first because its vocabulary is reused below.
data_lm = text_data_from_csv(
    Path(IMDB_PATH),
    data_func=lm_data,
)
# The classifier is given the language model's vocab so both datasets use the
# same token-to-id mapping (presumably required for the encoder saved later to
# be loadable by the classifier — confirm against fastai.text).
data_clas = text_data_from_csv(
    Path(IMDB_PATH),
    data_func=classifier_data,
    vocab=data_lm.train_ds.vocab,
)

[fast.ai](http://www.fast.ai/) has a pre-trained English model available that we can download.

In [None]:
# Download the pre-trained English model; the fine-tuning cell refers to it
# through the file names 'lstm_wt103' and 'itos_wt103'.
download_wt103_model()

We'll fine-tune the language model:

In [None]:
# Build the language-model learner, initialised from the pre-trained files.
learn = RNNLearner.language_model(
    data_lm,
    pretrained_fnames=['lstm_wt103', 'itos_wt103'],
)
# Unfreeze before fitting so the whole model is fine-tuned.
learn.unfreeze()
# 2 epochs; the slice supplies a learning-rate range rather than a single value
# (presumably spread across layer groups — confirm in the fastai docs).
learn.fit(2, slice(1e-4, 1e-2))

Save our language model's encoder:

In [None]:
# Store the fine-tuned encoder under the name 'enc'; the classifier cell loads
# it back with load_encoder('enc').
learn.save_encoder('enc')

Fine-tune it to create a classifier:

In [None]:
# Note: `learn` is rebound here — from this point it is the classifier learner,
# not the language-model learner.
learn = RNNLearner.classifier(data_clas)
# Start from the encoder saved as 'enc' by the language-model step.
learn.load_encoder('enc')
# Train for 3 epochs with a single learning rate of 1e-3.
learn.fit(3, 1e-3)

## Using DataFrames

In [46]:
# Make the IMDB sample available locally (presumably downloads/extracts it on
# first use — confirm in fastai.docs). Repeated here so this section can be
# read on its own.
untar_data(IMDB_PATH)
# Bare expression so the notebook displays where the dataset lives.
IMDB_PATH

PosixPath('../data/imdb_sample')

In [47]:
# Read the class labels stored alongside the data in classes.txt.
classes = read_classes(IMDB_PATH/'classes.txt')
# Display the first two labels as a tuple.
classes[0], classes[1]

('negative', 'positive')

In [48]:
# Load both splits as DataFrames, naming the two columns explicitly.
# (A chunksize argument can optionally be passed to read_csv as well.)
train_df = pd.read_csv(
    IMDB_PATH/'train.csv',
    names=['rating', 'review'],
)
valid_df = pd.read_csv(
    IMDB_PATH/'valid.csv',
    names=['rating', 'review'],
)

# Build the language-model data directly from the frames, telling fastai which
# column holds the text and which holds the label.
data_lm = text_data_from_df(
    Path(IMDB_PATH),
    train_df,
    valid_df,
    txt_cols=['review'],
    label_cols=['rating'],
    data_func=lm_data,
)

Tokenizing train.


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='0.00% [0/1 00:00<00:00]')))

Numericalizing train.
Tokenizing valid.


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='0.00% [0/1 00:00<00:00]')))

Numericalizing valid.


In [49]:
# Build the language-model learner, initialised from the pre-trained files.
learn = RNNLearner.language_model(
    data_lm,
    pretrained_fnames=['lstm_wt103', 'itos_wt103'],
)
# Unfreeze before fitting so the whole model is fine-tuned.
learn.unfreeze()
# 2 epochs; the slice supplies a learning-rate range rather than a single value
# (presumably spread across layer groups — confirm in the fastai docs).
learn.fit(2, slice(1e-4, 1e-2))

VBox(children=(HBox(children=(IntProgress(value=0, max=2), HTML(value='0.00% [0/2 00:00<00:00]'))), HTML(value…

Total time: 00:10
epoch  train loss  valid loss  accuracy
0      4.922027    4.151546    0.246380  (00:05)
1      4.642405    4.090209    0.251819  (00:05)



In [50]:
# Store the fine-tuned encoder under the name 'enc'; the classifier cell loads
# it back with load_encoder('enc').
learn.save_encoder('enc')

In [51]:
# With chunksize set, read_csv returns an iterator over 100-row DataFrame
# chunks instead of a single DataFrame. Despite their names, train_df and
# valid_df are therefore chunk readers here, and DataFrame methods such as
# .head() are not available on them.
train_df = pd.read_csv(
    IMDB_PATH/'train.csv',
    names=['rating', 'review'],
    chunksize=100,
)
valid_df = pd.read_csv(
    IMDB_PATH/'valid.csv',
    names=['rating', 'review'],
    chunksize=100,
)

# Build the classifier data, reusing the language model's vocabulary so both
# datasets share the same token-to-id mapping.
data_clas = text_data_from_df(
    Path(IMDB_PATH),
    train_df,
    valid_df,
    txt_cols=['review'],
    label_cols=['rating'],
    data_func=classifier_data,
    vocab=data_lm.train_ds.vocab,
)

In [52]:
# Note: `learn` is rebound here — from this point it is the classifier learner,
# not the language-model learner.
learn = RNNLearner.classifier(data_clas)
# Start from the encoder saved as 'enc' by the language-model step.
learn.load_encoder('enc')
# Train for 3 epochs with a single learning rate of 1e-3.
learn.fit(3, 1e-3)

VBox(children=(HBox(children=(IntProgress(value=0, max=3), HTML(value='0.00% [0/3 00:00<00:00]'))), HTML(value…

Total time: 00:14
epoch  train loss  valid loss  accuracy
0      0.698653    0.669762    0.590000  (00:04)
1      0.669983    0.620942    0.685000  (00:04)
2      0.658963    0.588105    0.690000  (00:04)

