In [6]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace


args = Namespace(
    raw_dataset_csv="data/surnames/surnames.csv",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="data/surnames/surnames_with_splits.csv",
    seed=1337
)
# Read raw data
surnames = pd.read_csv(args.raw_dataset_csv, header=0)

In [7]:
surnames.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [9]:
# Splitting train by nationality#将相同国家的样本汇总到一起
# Create dict
by_nationality = collections.defaultdict(list)
for _, row in surnames.iterrows():
    by_nationality[row.nationality].append(row.to_dict())

#划分train val test
final_list =[]
np.random.seed(args.seed)
for _,item_list in sorted(by_nationality.items()):
    #对每个国家：0.7train,0.15val,0.15test
    np.random.shuffle(item_list)
    length = len(item_list)
    n_train = int(args.train_proportion*length)
    n_val = int(args.val_proportion*length)
    n_test = int(args.test_proportion*length)

    for i in range(n_train):
        item_list[i]['split'] = "train"
    for i in range(n_train,n_train+n_val):
        item_list[i]['split'] = 'val'
    for i in range(n_train+n_val, length):
        item_list[i]['split'] = 'test'

    final_list.extend(item_list)

In [10]:
# Write split data to file
final_surnames = pd.DataFrame(final_list)
final_surnames.to_csv(args.output_munged_csv, index=False)


In [11]:
final_surnames.split.value_counts()


train    7680
test     1660
val      1640
Name: split, dtype: int64

In [12]:
final_surnames.head()


Unnamed: 0,nationality,split,surname
0,Arabic,train,Totah
1,Arabic,train,Abboud
2,Arabic,train,Fakhoury
3,Arabic,train,Srour
4,Arabic,train,Sayegh


# 构建模型对姓氏分类

In [13]:
from argparse import Namespace
from collections import Counter
import json
import os
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

## The Vocabulary

In [33]:

class Vocabulary(object):
    """字符 <-> index 类比yelp情感分类是将每个单词<->index，而此例的姓名是字符构成的"""
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
       Args:
           token_to_idx (dict): a pre-existing map of tokens to indices
           add_unk (bool): a flag that indicates whether to add the UNK token
           unk_token (str): the UNK token to add into the Vocabulary
       """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx:token for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token

        self.unk_index = -1
        if add_unk:#是否添加UNK token,添加后 unkindex会变为0
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        if token in self._token_to_idx:#若此token已添加，返回index
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index


    def lookup_token(self, token):
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def to_serializable(self): # 用于保存Vocabulary到文件(硬盘)中
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """用于从文件(硬盘)中加载
        instantiates the Vocabulary from a serialized dictionary
        :param contents:
        :return:
        """
        return cls(**contents)
        #**表示传入的参数为dict形式
    def __len__(self):
        return len(self._token_to_idx)
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)


## The Vectorizer

In [43]:

class SurnameVectorizer(object):
    """ 将字符转为向量"""
    def __init__(self, surname_vocab, nationality_vocab):
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab

    def vectorize(self, surname):
        # return a collapsed one-hot vector representation for this surname
        vocab = self.surname_vocab
        one_hot = np.zeros(len(vocab),dtype=np.float32)
        for token in surname:#对姓氏的每个字符
            one_hot[vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, surname_df):
        #实例化Vectorizer
        """
        :param surname_df: (pandas.DataFrame格式) the review dataset
        :return:SurnameVectorizer的对象
        """
        surname_vocab = Vocabulary(unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)

        for index, row in surname_df.iterrows():
            nationality_vocab.add_token(row.nationality)
            for ch in row.surname:
                surname_vocab.add_token(ch)
        return cls (surname_vocab, nationality_vocab)

    def to_serializable(self):
        """
        用于保存到文件中
        :return: contents (dict): the serializable dictionary
        """
        return {'surname_vocab': self.surname_vocab.to_serializable(),
                'nationality_vocab': self.nationality_vocab.to_serializable()}

    @classmethod
    def from_serializable(cls,contents):
        """从一个序列化字典中实例化Vectorizer
        Instantiate a ReviewVectorizer from a serializable dictionary
        :param contents(dict):
        :return:  an instance of the SurnameVectorizer
        """
        surname_vocab = Vocabulary.from_serializable(contents['surname_vocab'])
        nationality_vocab = Vocabulary.from_serializable(contents['nationality_vocab'])
        # contents是sict，用于实例化Vocabulary
        return cls(review_vocab=surname_vocab,rating_vocab=nationality_vocab)


## The Dataset

In [27]:

class SurnameDataset(Dataset):
    def __init__(self, surname_df, vectorizer):
        """
        Args:
            surname_df (pandas.DataFrame): the dataset
            vectorizer (SurnameVectorizer): vectorizer instatiated from dataset
        """
        self.surname_df =  surname_df
        self._vectorizer = vectorizer

        self.train_df = self.surname_df[self.surname_df.split == 'train']
        self.train_size = len(self.train_df)
        self.val_df = self.surname_df[self.surname_df.split == 'val']
        self.validation_size = len(self.val_df)
        self.test_df = self.surname_df[self.surname_df.split == 'test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        #_lookup_dict用于set split

        self.set_split('train')

        # class weights Ccoss Entropy Loss的权重,size为类别数
        class_counts = surname_df.nationality.value_counts().to_dict()
        #获得每个国家的数量 ｛‘Italish’: 222, 'Chinese':33 ...｝
        def sort_key(item): #返回国家对应的index
            return self._vectorizer.nationality_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key = sort_key)
        frequencies = [counts for _, counts in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        #计算出classCounts，按照index进行排名（为了根据index一一对应），

    def set_split(self,split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        """Load dataset and make a new vectorizer from scratch
        Args:
            surname_csv (str): location of the dataset
        Returns:
            an instance of SurnameDataset
        """
        surname_df = pd.read_csv(surname_csv)
        train_surname_df = surname_df[surname_df.split=='train']
        # TODO: why只需要做train的Vectorizer
        #答： train的vectorizer数据样本已经足够大，包含了val的
        return cls(surname_df, vectorizer=SurnameVectorizer.from_dataframe(train_surname_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, surname_csv, vectorizer_filepath):
        """Load dataset and the corresponding vectorizer.
        Used in the case in the vectorizer has been cached for re-use
        Args:
            surname_csv (str): location of the dataset
            vectorizer_filepath (str): location of the saved vectorizer
        Returns:
            an instance of SurnameDataset
        """
        surname_df = pd.read_csv(surname_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(surname_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """a static method for loading the vectorizer from file
        Args:
            vectorizer_filepath (str): the location of the serialized vectorizer
        Returns:
            an instance of SurnameVectorizer
        """
        with open(vectorizer_filepath) as fp:
            return SurnameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """saves the vectorizer to disk using json
        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row =self._target_df.iloc[index]
        surname_vector = self._vectorizer.vectorize(row.surname)
        nationality_index = self._vectorizer.nationality_vocab.lookup_token(row.nationality)
        return {'x_surname_vector':surname_vector, 'y_nationality_index':nationality_index}


## The Model: SurnameClassifier

In [23]:
class SurnameClassifier(nn.Module):
    """ A 2-layer MLP for classifying surnames """
    def __init__(self,input_dim, hidden_dim, output_dim):
        super(SurnameClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features= hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features= output_dim)

    def forward(self, x_in, apply_softmax = False):
        intermediate = F.relu(self.fc1(x_in))
        output = self.fc2(intermediate)
        if apply_softmax:
            output  = F.softmax(self.fc2(intermediate))
        return output


## Training Routine

### Helper functions

In [24]:

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,shuffle=shuffle,drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    #检验loss是否下降，是否适合保存此模型，earlystop
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False
    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]
        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            torch.save(model.state_dict(), train_state['model_filename'])
            # Reset early stopping step
            train_state['early_stopping_step'] = 0
        # Stop early ?
        train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices =y_pred.max(1)
    #返回max的value，所对应的index
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct/len(y_pred_indices)*100

def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)


### Settings and some prep work

In [25]:

args = Namespace(
    # Data and path information
    surname_csv="data/surnames/surnames_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch4/surname_mlp",
    # Model hyper parameters
    hidden_dim=300,
    # Training  hyper parameters
    seed=1337,
    num_epochs=100,
    early_stopping_criteria=5,
    learning_rate=0.001,
    batch_size=64,
    # Runtime options
    cuda=True,
    reload_from_files=False, #是否从文件中加载dataset和Vectorizer
    expand_filepaths_to_save_dir=True,
)
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)

    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/ch4/surname_mlp\vectorizer.json
	model_storage/ch4/surname_mlp\model.pth
Using CUDA: True


### Initializations

In [44]:
# dataset and  vectorizer
if args.reload_from_files:
    print("Loading dataset and vectorizer")
    dataset = SurnameDataset.load_dataset_and_load_vectorizer(args.surname_csv, args.vectorizer_file)
else:
    print("Loading dataset and creating vectorizer")
    dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.surname_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

#model
classifier = SurnameClassifier(input_dim = len(vectorizer.surname_vocab),
                               hidden_dim =args.hidden_dim,
                               output_dim = len(vectorizer.nationality_vocab))


Loading dataset and creating vectorizer


### Training Loop

In [45]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)

# loss and optimizer
loss_func = nn.CrossEntropyLoss(weight=dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,mode='min', factor=0.5, patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='training routine', total=args.num_epochs, position=0)
dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train',total=dataset.get_num_batches(args.batch_size), position=1, leave=True)
dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val', total=dataset.get_num_batches(args.batch_size), position=1, leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index
        # 在train集迭代
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.0  # 每个batch的平均loss
        running_acc = 0.0  # 每个batch的平均acc
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. zero the gradients
            optimizer.zero_grad()
            # step 2. compute the output
            y_pred = classifier(batch_dict['x_surname_vector'])
            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_nationality_index'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)
            # step 4. use loss to produce gradients
            loss.backward()
            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_nationality_index'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()
        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # 在val集迭代
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.0  # 每个batch的平均loss
        running_acc = 0.0  # 每个batch的平均acc
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = classifier(x_in = batch_dict['x_surname_vector'])
            loss = loss_func(y_pred, batch_dict['y_nationality_index'])
            loss = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_nationality_index'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            val_bar.update()
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)
        train_state = update_train_state(args=args, model=classifier,train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])
        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

except KeyboardInterrupt:
    print("Exiting loop")

HBox(children=(IntProgress(value=0, description='training routine', style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='split=train', max=120, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='split=val', max=25, style=ProgressStyle(description_width='in…

## Testing

In [47]:
# compute the loss & accuracy on the test set using the best available model
classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(batch_dict['x_surname_vector'])

    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_nationality_index'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_nationality_index'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 1.8017724514007571;
Test Accuracy: 44.93750000000001


### Inference

In [50]:
def predict_nationality(surname, classifier, vectorizer):
    vectorized_surname = vectorizer.vectorize(surname)
    vectorized_surname = torch.tensor(vectorized_surname).view(1, -1)
    result = classifier(vectorized_surname, apply_softmax = True)

    probability_values, indices = result.max(dim=1)
    predict_index = indices.item()
    probability_value = probability_values.item()

    predicted_nationality = vectorizer.nationality_vocab.lookup_index(predict_index)

    return {'nationality': predicted_nationality, 'probability': probability_value}

In [51]:
new_surname = input("Enter a surname to classify: ")
classifier = classifier.to("cpu")
prediction = predict_nationality(new_surname, classifier, vectorizer)
print("{} -> {} (p={:0.2f})".format(new_surname, prediction['nationality'], prediction['probability']))

Enter a surname to classify: Xu


  if sys.path[0] == '':


Xu -> Chinese (p=0.94)


#### Top K 

In [58]:
def predict_topk_nationality(surname, classifier, vectorizer, k=5):
    vectorized_surname = vectorizer.vectorize(surname)
    vectorized_surname = torch.tensor(vectorized_surname).view(1, -1)
    result = classifier(vectorized_surname, apply_softmax = True)

    probability_values, indices = torch.topk(result, k=k)
    #返回topk的最大值及其index
    probability_values = probability_values.detach().numpy()[0]
    predict_indices = indices.detach().numpy()[0]

    output = []
    for prob_value, index in zip(probability_values, predict_indices):
        nationality = vectorizer.nationality_vocab.lookup_index(index)
        output.append({'nationality':nationality, 'probability':prob_value})
    return output


In [60]:

new_surname = input("Enter a surname to classify: ")
classifier = classifier.to("cpu")

k = int(input("How many of the top predictions to see? "))
if k > len(vectorizer.nationality_vocab):
    print("Sorry! That's more than the # of nationalities we have.. defaulting you to max size :)")
    k = len(vectorizer.nationality_vocab)

predictions = predict_topk_nationality(new_surname, classifier, vectorizer, k=k)

print("Top {} predictions:".format(k))
for prediction in predictions:
    print("{} -> {} (p={:0.2f})".format(new_surname,
                                        prediction['nationality'],
                                        prediction['probability']))


Enter a surname to classify: Huang
How many of the top predictions to see? 5


  if sys.path[0] == '':


Top 5 predictions:
Huang -> Vietnamese (p=0.42)
Huang -> Chinese (p=0.37)
Huang -> Korean (p=0.12)
Huang -> Japanese (p=0.02)
Huang -> Arabic (p=0.02)
