In [None]:
import os

# Recursively list every file available under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

### Prepare Data

In [None]:
import numpy as np
import pandas as pd
import json
import os
import re
from tqdm import tqdm

pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

from IPython.core.debugger import set_trace
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from fastai.text import * 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

%load_ext autoreload
%autoreload 2

tqdm.pandas()

In [None]:
# Load the IMDB reviews CSV, reading only the first 10,000 rows (nrows) to keep the run small.
df_data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv', nrows=10000)

In [None]:
# Show the first row as a quick look at the columns.
df_data.head(1)

In [None]:
# Number of rows per distinct value of the 'sentiment' column (class balance check).
df_data['sentiment'].value_counts()

In [None]:
# 60/20/20 train/valid/test split: hold out 40% of the rows first, then cut
# that hold-out in half. Both splits use the same fixed seed.
df_data_train, df_data_holdout = train_test_split(df_data, test_size=0.4, random_state=123)
df_data_valid, df_data_test = train_test_split(df_data_holdout, test_size=0.5, random_state=123)

In [None]:
# Report the (rows, columns) shape of the train, valid and test frames, in that order.
split_shapes = (df_data_train.shape, df_data_valid.shape, df_data_test.shape)
print('Shapes:')
print(f'Launch: {split_shapes}')

### Define Language Model and Classifier

In [None]:
# Language model data: built from the train/valid frames, reading text from the
# 'review' column. path='' is the location passed to fastai for this data object.
# NOTE(review): label_cols is passed here as well; presumably the language-model
# variant does not use the sentiment labels — confirm against the fastai docs.
data_lm = TextLMDataBunch.from_df(path='', train_df=df_data_train, valid_df=df_data_valid, 
                                  text_cols='review', label_cols='sentiment')

In [None]:
# Classifier data: same train/valid frames, 'review' as text and 'sentiment' as label,
# batch size 32. The vocab is taken from the language-model data so both share one
# token vocabulary (needed for the encoder saved later to be reusable by the classifier
# — presumably; confirm against the fastai docs).
data_clas = TextClasDataBunch.from_df(path='', train_df=df_data_train, valid_df=df_data_valid, 
                                     text_cols='review', label_cols='sentiment', 
                                     vocab=data_lm.train_ds.vocab, bs=32)

In [None]:
# Save the two data objects (language-model data and classifier data) as pickle files
# so they can be reloaded later without rebuilding them from the DataFrames.
data_lm.save('data_lm_export.pkl')
data_clas.save('data_clas_export.pkl')

In [None]:
# Reload the saved data objects instead of rebuilding them (uncomment to use).
# data_lm = load_data('', 'data_lm_export.pkl')
# data_clas = load_data('', 'data_clas_export.pkl', bs=32)

### Fine Tune Language Model

In [None]:
# Create a language-model learner on data_lm with the AWD_LSTM architecture
# (drop_mult=0.5), then train for 3 epochs with a one-cycle schedule at lr 1e-3.
# NOTE(review): presumably only the newly added layers train at this point, since
# the next cell explicitly unfreezes — confirm.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)
learn.fit_one_cycle(3, 1e-3)

In [None]:
# Unfreeze the language-model learner and fine-tune for 1 more epoch
# at a lower learning rate (1e-4).
learn.unfreeze()
learn.fit_one_cycle(1, 1e-4)

In [None]:
# Save the fine-tuned language-model encoder under the name 'fastai_ft_enc';
# the classifier learner loads it by the same name further down.
learn.save_encoder('fastai_ft_enc')

### Fit Classifiers

In [None]:
# Rebind `learn` to a text classifier learner on data_clas (AWD_LSTM, drop_mult=0.5)
# and load the encoder saved as 'fastai_ft_enc'. From here on `learn` is the
# classifier, not the language model.
# NOTE(review): the trailing ';' is presumably there to suppress the echoed return value.
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn.load_encoder('fastai_ft_enc');

In [None]:
# Preview samples from the classifier data (2 is passed to show_batch —
# presumably the number of rows to display; confirm).
data_clas.show_batch(2)

In [None]:
# Train the classifier for 5 epochs with a one-cycle schedule at lr 1e-2.
learn.fit_one_cycle(5, 1e-2)

In [None]:
# Call freeze_to(-2) and fine-tune for 1 epoch, with learning rates given as a
# slice from 5e-3/2 to 5e-3.
# NOTE(review): presumably this leaves only the last two layer groups trainable
# and spreads the slice across them — confirm against the fastai docs.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

In [None]:
# Unfreeze the whole classifier and fine-tune for 1 epoch, with learning rates
# given as a slice from 2e-3/100 to 2e-3.
learn.unfreeze()
learn.fit_one_cycle(1, slice(2e-3/100, 2e-3))

### Test Set Predictions

In [None]:
# text = df_data_test.reset_index(drop=True).review[10]
# true_label = df_data_test.reset_index(drop=True).sentiment[10]
# text, true_label

In [None]:
# learn.predict(text)
# learn.predict(text)[1].item()

In [None]:
def make_prediction(learner, text):
    """Return the predicted class index for a single piece of text.

    Args:
        learner: Object exposing ``predict(text)``. The second element of the
            value it returns must support ``.item()``.
            NOTE(review): presumably a fastai learner, whose ``predict``
            returns (category, class-index tensor, probabilities) — confirm.
        text: The text to classify.

    Returns:
        ``learner.predict(text)[1].item()``, i.e. the second element of the
        prediction converted to a plain Python value.
    """
    # Run inference exactly once and reuse the result; predict() is the
    # expensive step when this is applied row by row over a DataFrame.
    pred = learner.predict(text)
    return pred[1].item()

def make_prediction_data(text):
    """Classify ``text`` using the notebook-level ``learn`` object.

    Single-argument adapter around ``make_prediction`` so it can be passed
    directly to ``Series.apply`` / ``progress_apply``.
    """
    predicted_index = make_prediction(learner=learn, text=text)
    return predicted_index

In [None]:
def get_results(true_label, pred_label, pos_label='positive'):
    """Compute binary classification metrics for a set of predictions.

    Args:
        true_label: Ground-truth labels (array-like).
        pred_label: Predicted labels, using the same label values as
            ``true_label``.
        pos_label: Label treated as the positive class when computing
            precision, recall and F1. Defaults to ``'positive'``.

    Returns:
        dict with keys ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and
        ``'F1'``, in that order.
    """
    # The fourth returned value (support) is not reported, so it is discarded.
    precision, recall, fscore, _ = precision_recall_fscore_support(
        true_label, pred_label, average='binary', pos_label=pos_label)
    return {
        'Accuracy': accuracy_score(true_label, pred_label),
        'Precision': precision,
        'Recall': recall,
        'F1': fscore,
    }

In [None]:
# Predict a class index for every test review (one make_prediction_data call per
# row, with a tqdm progress bar). `assign` returns a new frame instead of writing
# a column into df_data_test in place: the frame comes out of train_test_split,
# where in-place column assignment commonly raises SettingWithCopyWarning.
df_data_test = df_data_test.assign(
    prediction=df_data_test['review'].progress_apply(make_prediction_data)
)

In [None]:
# Translate predicted class indices into sentiment labels.
# NOTE(review): assumes index 0 is 'negative' and 1 is 'positive' — confirm
# against the class order of the classifier data.
# `replace` leaves values that are not 0/1 untouched, so re-running this cell on
# already-translated labels is a no-op (a dict `map` would turn them into NaN).
# `assign` returns a new frame rather than writing into df_data_test in place.
df_data_test = df_data_test.assign(
    prediction=df_data_test['prediction'].replace({0: 'negative', 1: 'positive'})
)

In [None]:
# Score the test-set predictions against the true 'sentiment' labels.
results = get_results(df_data_test['sentiment'], df_data_test['prediction'])

In [None]:
# Display the metrics dict returned by get_results.
results