# Dataset preparation

This notebook prepares the data and splits it into train/valid/test datasets.

Download and unpack the dataset from https://ai.stanford.edu/~amaas/data/sentiment/ into the `../data` directory, so that the reviews end up under `../data/aclImdb`.

In [1]:
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm

def read_dir(path: Path) -> pd.DataFrame:
    """Load every ``*.txt`` review found directly inside ``path``.

    Each file name is expected to have the form ``<id>_<rating>.txt``; the
    part after the underscore is parsed as the integer score of the review.

    Args:
        path: Directory containing the review text files.

    Returns:
        A DataFrame with one row per file and two columns: ``review`` (the
        file content) and ``score`` (the rating taken from the file name).
        Rows are ordered by sorted file path. An empty directory yields an
        empty frame that still has both columns.

    Raises:
        ValueError: If a file stem does not consist of exactly two
            ``_``-separated parts, or the rating part is not an integer.
    """
    # Sort the paths: glob() yields files in filesystem order, which differs
    # between machines and would make any seeded split downstream
    # irreproducible. Materialising the list also gives tqdm a total.
    review_paths = sorted(path.glob('*.txt'))

    reviews = []
    for review_path in tqdm(review_paths):
        # Only the rating is needed; the id part of the name is discarded.
        _, rate = review_path.stem.split('_')
        reviews.append({
            # Decode explicitly as UTF-8 instead of relying on the platform's
            # default encoding, which is locale-dependent.
            'review': review_path.read_text(encoding='utf-8'),
            'score': int(rate),
        })

    return pd.DataFrame(reviews, columns=['review', 'score'])

In [2]:
# Root of the unpacked dataset archive.
path = Path('../data/aclImdb')

# The sub-directory a review is stored in determines its label:
# `pos` -> 'positive', `neg` -> 'negative'.
train_pos = read_dir(path / 'train' / 'pos').assign(sentiment='positive')
train_neg = read_dir(path / 'train' / 'neg').assign(sentiment='negative')
test_pos = read_dir(path / 'test' / 'pos').assign(sentiment='positive')
test_neg = read_dir(path / 'test' / 'neg').assign(sentiment='negative')

# ignore_index=True: every part frame is indexed from 0, so a plain concat
# would leave duplicate index labels, and label-based lookups (`.loc`) on the
# combined frame would return several rows per label.
train = pd.concat([train_pos, train_neg], ignore_index=True)
train['sentiment'] = train['sentiment'].astype('category')

test = pd.concat([test_pos, test_neg], ignore_index=True)
test['sentiment'] = test['sentiment'].astype('category')

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

Define train/valid split

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation. The split is stratified on
# the sentiment label so both parts keep the same class ratio, and the fixed
# random_state makes it repeatable.
# NOTE: this rebinds `train` to the reduced 80% part, so the cell is not
# idempotent -- re-run the loading cell before re-running this one.
train, valid = train_test_split(
    train,
    train_size=0.8,
    random_state=88,
    stratify=train['sentiment'],
)

Serialize datasets to pickle

In [6]:
# Create the output directory first: `to_pickle` does not create missing
# parent directories, so on a fresh checkout the writes below would fail.
interim_dir = Path('../data/interim')
interim_dir.mkdir(parents=True, exist_ok=True)

train.to_pickle(interim_dir / 'train.pkl')
valid.to_pickle(interim_dir / 'valid.pkl')
test.to_pickle(interim_dir / 'test.pkl')