# 数据集预处理

In [1]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [2]:
args = Namespace(
    raw_dataset_csv="data/surnames/surnames.csv",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="data/surnames/surnames_with_splits.csv",#最终保存位置
    seed=1337
)

In [4]:

surnames = pd.read_csv(args.raw_dataset_csv, header=0)
#按国家划分数据
by_nationality = collections.defaultdict(list)
for index, row in surnames.iterrows():
    by_nationality[row.nationality].append(row.to_dict())

final_list = []
np.random.seed(args.seed)
# 对每个：0.7train,0.15val,0.15test
for nationality, item_list in sorted(by_nationality.items()):
    np.random.shuffle(item_list)
    length = len(item_list)
    n_train = int(args.train_proportion*length)
    n_val = int(args.val_proportion*length)
    n_test = int(args.test_proportion*length)
    for i in range(n_train):
        item_list[i]['split'] = 'train'
    for i in range(n_train, n_train + n_val):
        item_list[i]['split'] = 'val'
    for i in range(n_train + n_val, length):
        item_list[i]['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [5]:
# Write split data to file
final_surnames = pd.DataFrame(final_list)

In [7]:
final_surnames.split.value_counts()


train    7680
test     1660
val      1640
Name: split, dtype: int64

In [8]:
final_surnames.head()

Unnamed: 0,nationality,split,surname
0,Arabic,train,Totah
1,Arabic,train,Abboud
2,Arabic,train,Fakhoury
3,Arabic,train,Srour
4,Arabic,train,Sayegh


In [9]:
# Write munged data to CSV
final_surnames.to_csv(args.output_munged_csv, index=False)

# Classifying_Surnames_with_a_CNN

In [16]:
#数据用one-hot矩阵表示，而不是collapsed one-hot。
# 问：统计最长姓氏作为one-hot矩阵的行数？列的数量是V的大小
# 答：minibatch of surname matrices 是一个三维张量，需要相同大小。
# 使用数据集中最长的姓氏意味着可以以相同的方式处理每个minibatch。
# 问：姓氏短的不够的长度在one-hot矩阵中是0吗
# 答：yes，参见vectorize()函数

In [14]:
from argparse import Namespace
from collections import Counter
import json
import os
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

## The Vocabulary

In [61]:

class Vocabulary(object):
    """字符<->indexs"""
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx =token_to_idx
        self ._idx_to_token = {idx : token for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token

        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        if token in self._token_to_idx:#若此token已添加，返回index
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self,index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)
        # **表示传入的参数为dict形式

    def __len__(self):
        return len(self._token_to_idx)

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)



## The Vectorizer

In [64]:
class SurnameVectorizer(object):
    """ 将字符转为向量"""
    def __init__(self, surname_vocab, nationality_vocab, max_surname_length):
        """
       Args:
           surname_vocab (Vocabulary): maps characters to integers
           nationality_vocab (Vocabulary): maps nationalities to integers
           max_surname_length (int): the length of the longest surname
       """
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab
        self._max_surname_length = max_surname_length

    def vectorize(self, surname):
        """
        Args:
            surname (str): the surname
        Returns:
            one_hot_matrix (np.ndarray): a matrix of one-hot vectors
        """
        one_hot_matrix_size = (len(self.surname_vocab), self._max_surname_length)
        # [embedding_size, seq_length]
        one_hot_matrix = np.zeros(one_hot_matrix_size, dtype=np.float32)
        for index, character in enumerate(surname):
            character_index = self.surname_vocab.lookup_token(character)
            one_hot_matrix[character_index][index] = 1
        return one_hot_matrix

    @classmethod
    def from_dataframe(cls,surname_df):
        surname_vocab = Vocabulary(unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)
        max_surname_length = 0

        for index, row in surname_df.iterrows():
            nationality_vocab.add_token(row.nationality)
            max_surname_length = max(max_surname_length, len(row.surname))
            for ch in row.surname:
                surname_vocab.add_token(ch)

        return cls(surname_vocab,nationality_vocab,max_surname_length)

    def to_serializable(self):
        return {'surname_vocab': self.surname_vocab.to_serializable(),
                'nationality_vocab': self.nationality_vocab.to_serializable(),
                'max_surname_length': self._max_surname_length}

    @classmethod
    def from_serializable(cls, contents):
        surname_vocab = Vocabulary.from_serializable(contents['surname_vocab'])
        nationality_vocab = Vocabulary.from_serializable(contents['nationality_vocab'])
        return  cls(surname_vocab, nationality_vocab, contents['max_surname_length'])


## The Dataset Class

In [65]:

class SurnameDataset(Dataset):
    def __init__(self, data_df, vectorizer):
        # data_df: dataframe格式的数据集
        self.data_df = data_df
        self._vectorizer = vectorizer

        self.train_df = self.data_df[self.data_df.split == 'train']
        self.train_size = len(self.train_df)
        self.val_df = self.data_df[self.data_df.split == 'val']
        self.validation_size = len(self.val_df)
        self.test_df = self.data_df[self.data_df.split == 'test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')# 选择train_df,val_df还是test_df

        # calss weights
        class_counts = data_df.nationality.value_counts().to_dict()
        def sort_key(item): #返回国家对应的index
            return self._vectorizer.nationality_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key = sort_key)
        frequencies = [counts for _, counts in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        #计算出classCounts，按照index进行排名（为了根据index一一对应），


    def set_split(self, split='train'):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    @classmethod
    def load_dataset_and_make_vectorizer(cls, data_csv):
        data_df = pd.read_csv(data_csv)
        train_df = data_df[data_df.split == 'train']
        return cls(data_df, SurnameVectorizer.from_dataframe(train_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, data_csv, vectorizer_filepath):
        data_df = pd.read_csv(data_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(data_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return SurnameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        surname_matrix = self._vectorizer.vectorize(row.surname)
        nationality_index = self._vectorizer.nationality_vocab.lookup_token(row.nationality)
        return {'x_surname_matrix': surname_matrix,
                'y_nationality_index': nationality_index}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size



## The Model: SurnameClassifier

In [66]:

class SurnameClassifier(nn.Module):
    def __init__(self, initial_num_channels, num_classes, num_channels):
        """
        :param initial_num_channels: size of the incoming feature vector # embedding_dim
        :param num_classes:
        :param num_channels:constant channel size to use throughout network
        """
        super(SurnameClassifier, self).__init__()
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=initial_num_channels, out_channels=num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels,out_channels=num_channels, kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, kernel_size=3),
            nn.ELU()
        )
        self.fc = nn.Linear(in_features=num_channels, out_features=num_classes)

    def forward(self, x_in, apply_softmax=False):
        """
        :param x_in（torch.Tensor）: 大小为[batch_size，initial_num_channels,max_surname_length]
        :param apply_softmax:
        :return:the resulting tensor. tensor.shape should be (batch, num_classes)
        """
        features = self.convnet(x_in).squeeze(dim =2)
        # TODO： 全连接层输入要保证是[batch_size，num_channels]？
        # TODO: 怎么保证dim2的维度是1的？
        # seqlength太大，卷积不到1，太小又不够卷积的，除非提前知道maxSeqlength，这里代码不够完善啊

        prediction_vector = self.fc(F.dropout(features, p=args.dropout_p))
        # prediction_vector = self.fc(features)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector


# Training

## Helper functions

In [68]:

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    #save model，检验early stop
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        # 两个参数分别是模型的参数，和模型要存放的绝对路径
        train_state['stop_early'] = False
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]
        # if 模型比目前最好模型的效果差，则ealry_stop+1：
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step'] += 1
        else:
            torch.save(model.state_dict(), train_state['model_filename'])
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state


def compute_accuracy(y_pred,y_target):
    # y_pred的大小为[batch_size， num_classes]
    #最大概率索引
    _, y_pred_indices = y_pred.max(dim=1)
    #返回max的value，所对应的index
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct/args.batch_size*100

def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)


## Settings and some prep work

In [69]:

args = Namespace(
    # Data and Path information
    data_csv="data/surnames/surnames_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch4/cnn",
    # Model hyper parameters
    hidden_dim=100,
    num_channels=256,
    # Training hyper parameters
    seed=1337,
    learning_rate=0.001,
    batch_size=128,
    num_epochs=100,
    early_stopping_criteria=5,
    dropout_p=0.1,
    # Runtime options
    cuda=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
    catch_keyboard_interrupt=True
)
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)

    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))


# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)


Expanded filepaths: 
	model_storage/ch4/cnn\vectorizer.json
	model_storage/ch4/cnn\model.pth
Using CUDA: True


## Initializations

In [70]:
if args.reload_from_files:
    # training from a checkpoint
    dataset = SurnameDataset.load_dataset_and_load_vectorizer(args.data_csv,
                                                              args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.data_csv)
    dataset.save_vectorizer(args.vectorizer_file)

vectorizer = dataset.get_vectorizer()
#model
classifier = SurnameClassifier(initial_num_channels=len(vectorizer.surname_vocab),
                               num_classes=len(vectorizer.nationality_vocab),
                               num_channels=args.num_channels)


## Training Loop

训练程序包括以下操作序列:  
实例化数据集,  
实例化模型,  
实例化损失函数,  
实例化优化器,  
遍历数据集的训练分区和更新模型参数,  
遍历数据集的验证分区和测量性能,  
重复数据集迭代一定次数

In [71]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)

loss_func = nn.CrossEntropyLoss(weight=dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr = args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5, patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='training routine', total=args.num_epochs, position=0)
dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size),
                          position=1, leave=True)
dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size),
                        position=1, leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index
        print(epoch_index)
        # Iterate over training dataset
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. zero the gradients
            optimizer.zero_grad()
            # step2 output
            y_pred = classifier(batch_dict['x_surname_matrix'])
            # step3 loss
            loss = loss_func(y_pred, batch_dict['y_nationality_index'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)
            # step4 backward
            loss.backward()
            # step5 step
            optimizer.step()

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_nationality_index'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc,
                                  epoch=epoch_index)
            train_bar.update()
        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over training dataset
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = classifier(batch_dict['x_surname_matrix'])
            loss = loss_func(y_pred, batch_dict['y_nationality_index'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_nationality_index'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier, train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

except KeyboardInterrupt:
    print("Exiting loop")

HBox(children=(IntProgress(value=0, description='training routine', style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='split=train', max=60, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='split=val', max=12, style=ProgressStyle(description_width='in…

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


# Testing


In [72]:

# 加载模型
classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(weight=dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    y_pred = classifier(batch_dict['x_surname_matrix'])
    loss = loss_func(y_pred, batch_dict['y_nationality_index'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_nationality_index'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 1.8568522632122038;
Test Accuracy: 57.356770833333336


# Inference

In [73]:
def predict_nationality(surname, classifier, vectorizer):
    vectorized_surname = vectorizer.vectorize(surname) #[len(surname_vocab), max_surname_length) ]
    vectorized_surname = torch.tensor(vectorized_surname).unsqueeze(dim=0)
    # 因为classifier的输入是【batchsize，embedding_dim，seqlength】，所以需在dim=0增加1维
#     print(vectorized_surname.shape)
    result = classifier(vectorized_surname, apply_softmax=True)
    #找到最大值的index
    probability_value, index = result.max(dim=1)
    
    predicted_nationality = vectorizer.nationality_vocab.lookup_index(index.item())
    probability_value = probability_value.item() 
    
    return {'nationality': predicted_nationality, 'probability': probability_value}



In [74]:
new_surname = input("Enter a surname to classify: ")
classifier = classifier.to("cpu")
prediction = predict_nationality(new_surname, classifier, vectorizer)
print("{} -> {} (p={:0.2f})".format(new_surname, prediction['nationality'], prediction['probability']))

Enter a surname to classify: huang
huang -> Chinese (p=0.67)


In [80]:
def predict_topk_nationality(surname, classifier, vectorizer, k=5):
    vectorized_surname = vectorizer.vectorize(surname)
    vectorized_surname = torch.tensor(vectorized_surname).unsqueeze(dim=0)
    prediction_vector = classifier(vectorized_surname, apply_softmax=True)
    probability_values, indices = torch.topk(prediction_vector, k=k)
    
#     print(indices.shape)
    probability_values = probability_values[0].detach().numpy()
    indices = indices[0].detach().numpy()
    results = []
    for kth_index in range(k):
        nationality = vectorizer.nationality_vocab.lookup_index(indices[kth_index].item())
        probability_value = probability_values[kth_index].item()
        results.append({'nationality': nationality, 
                        'probability': probability_value})
    
    return results


In [81]:

new_surname = input("Enter a surname to classify: ")

k = int(input("How many of the top predictions to see? "))
if k > len(vectorizer.nationality_vocab):
    print("Sorry! That's more than the # of nationalities we have.. defaulting you to max size :)")
    k = len(vectorizer.nationality_vocab)
    
predictions = predict_topk_nationality(new_surname, classifier, vectorizer, k=k)

print("Top {} predictions:".format(k))
print("===================")
for prediction in predictions:
    print("{} -> {} (p={:0.2f})".format(new_surname,prediction['nationality'],prediction['probability']))

Enter a surname to classify: huang
How many of the top predictions to see? 5
Top 5 predictions:
huang -> Chinese (p=0.67)
huang -> Vietnamese (p=0.24)
huang -> Korean (p=0.08)
huang -> Irish (p=0.01)
huang -> English (p=0.00)
