In [11]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [12]:
args = Namespace(
    raw_dataset_csv="data\surnames\surnames.csv",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="data\surnames\surnames_with_splits.csv",
    seed=1337
)

In [16]:
import os
os.getcwd()

'E:\\git\\PyTorchNLPBook\\chapters\\chapter_4\\4_4_cnn_surnames'

In [19]:
# Read raw data
surnames = pd.read_csv(args.raw_dataset_csv, header=0)

In [20]:
surnames.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [21]:
# Unique classes
set(surnames.nationality)

{'Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese'}

In [23]:
# Splitting train by nationality
# Create dict
by_nationality = collections.defaultdict(list)
for _, row in surnames.iterrows():
    print(row.to_dict())
    by_nationality[row.nationality].append(row.to_dict())
    print(by_nationality[row.nationality])

{'surname': 'Woodford', 'nationality': 'English'}
[{'surname': 'Woodford', 'nationality': 'English'}]
{'surname': 'Coté', 'nationality': 'French'}
[{'surname': 'Coté', 'nationality': 'French'}]
{'surname': 'Kore', 'nationality': 'English'}
[{'surname': 'Woodford', 'nationality': 'English'}, {'surname': 'Kore', 'nationality': 'English'}]
{'surname': 'Koury', 'nationality': 'Arabic'}
[{'surname': 'Koury', 'nationality': 'Arabic'}]
{'surname': 'Lebzak', 'nationality': 'Russian'}
[{'surname': 'Lebzak', 'nationality': 'Russian'}]
{'surname': 'Obinata', 'nationality': 'Japanese'}
[{'surname': 'Obinata', 'nationality': 'Japanese'}]
{'surname': 'Rahal', 'nationality': 'Arabic'}
[{'surname': 'Koury', 'nationality': 'Arabic'}, {'surname': 'Rahal', 'nationality': 'Arabic'}]
{'surname': 'Zhuan', 'nationality': 'Chinese'}
[{'surname': 'Zhuan', 'nationality': 'Chinese'}]
{'surname': 'Acconci', 'nationality': 'Italian'}
[{'surname': 'Acconci', 'nationality': 'Italian'}]
{'surname': 'Mifsud', 'nationa

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




{'surname': 'Gim', 'nationality': 'Chinese'}
[{'surname': 'Zhuan', 'nationality': 'Chinese'}, {'surname': 'Yau', 'nationality': 'Chinese'}, {'surname': 'Lau', 'nationality': 'Chinese'}, {'surname': 'Zhi', 'nationality': 'Chinese'}, {'surname': 'Shaw', 'nationality': 'Chinese'}, {'surname': 'Pang', 'nationality': 'Chinese'}, {'surname': 'Rang', 'nationality': 'Chinese'}, {'surname': 'Qiao', 'nationality': 'Chinese'}, {'surname': 'Pan', 'nationality': 'Chinese'}, {'surname': 'Rao', 'nationality': 'Chinese'}, {'surname': 'Yue', 'nationality': 'Chinese'}, {'surname': 'Chu', 'nationality': 'Chinese'}, {'surname': 'Tong', 'nationality': 'Chinese'}, {'surname': 'Tso', 'nationality': 'Chinese'}, {'surname': 'Zang', 'nationality': 'Chinese'}, {'surname': 'Ow-Yang', 'nationality': 'Chinese'}, {'surname': 'Song', 'nationality': 'Chinese'}, {'surname': 'Eng', 'nationality': 'Chinese'}, {'surname': 'Chi', 'nationality': 'Chinese'}, {'surname': 'Cong', 'nationality': 'Chinese'}, {'surname': 'Tang',

In [24]:
# Create split data
final_list = []
np.random.seed(args.seed)
for _, item_list in sorted(by_nationality.items()):
    np.random.shuffle(item_list)
    n = len(item_list)
    n_train = int(args.train_proportion*n)
    n_val = int(args.val_proportion*n)
    n_test = int(args.test_proportion*n)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'  
    
    # Add to final list
    final_list.extend(item_list)

In [25]:
# Write split data to file
final_surnames = pd.DataFrame(final_list)

In [26]:
final_surnames.split.value_counts()

train    7680
test     1660
val      1640
Name: split, dtype: int64

In [27]:
final_surnames.head()

Unnamed: 0,nationality,split,surname
0,Arabic,train,Totah
1,Arabic,train,Abboud
2,Arabic,train,Fakhoury
3,Arabic,train,Srour
4,Arabic,train,Sayegh


In [28]:
# Write munged data to CSV
final_surnames.to_csv(args.output_munged_csv, index=False)