# Setup

In [12]:
import glob
import importlib
import os
import string
import sys
import torch
import typing
import unicodedata

nn = torch.nn

from __future__ import unicode_literals, print_function, division
from io import open
import pandas as pd
import pathlib
from torch.autograd import Variable


# Root directory of the labs bundle; the LABS_BUNDLE_ROOT environment variable
# overrides the default location '/labs'.
bundle_root = pathlib.Path(os.getenv('LABS_BUNDLE_ROOT', '/labs'))
# Put the bundle's 'functions' and 'common' directories on the import path
# (in that order) so that `utils` can be imported below.
sys.path.extend(str(bundle_root / subdir) for subdir in ('functions', 'common'))
import utils
# Reload in case the module was already imported earlier in this kernel session.
importlib.reload(utils);

In [13]:
# Load the base class
RNN = utils.RNN
# Initialise the model with the dimensions declared in utils
rnn = RNN(utils.n_letters, utils.n_hidden, utils.n_categories)
# Fill in the trained weights.
# map_location='cpu' makes the checkpoint loadable on hosts without a GPU even if
# it was saved from a CUDA device; load_state_dict then copies the values into the
# freshly constructed model's own parameters.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
rnn.load_state_dict(
    torch.load(str(bundle_root / 'common/char-rnn-classification.pt'), map_location='cpu')
)

def predict(line: str, n_predictions: int=3) -> typing.List[typing.Tuple]:
    """Return the ``n_predictions`` highest-scoring categories for ``line``.

    The line is converted with ``utils.lineToTensor`` and scored by
    ``utils.evaluate`` using the module-level ``rnn`` model.

    Args:
        line: Text to classify.
        n_predictions: How many of the top-scoring categories to return.

    Returns:
        A list of ``(score_text, category)`` tuples ordered from highest to
        lowest score. ``score_text`` is whatever follows the word ``tensor``
        in the printed form of the scalar score (e.g. ``'(-0.5000)'``), and
        ``category`` is the matching entry of ``utils.all_categories``.
    """
    output = utils.evaluate(Variable(utils.lineToTensor(line)), rnn)

    # Get top N categories: k=n_predictions, along dim 1, largest values first.
    # NOTE(review): the [0][i] indexing below assumes a single-row 2-D output -- confirm.
    topv, topi = output.data.topk(n_predictions, 1, True)
    predictions: typing.List[typing.Tuple[str, typing.Any]] = []

    for i in range(n_predictions):
        value = topv[0][i]
        category_index = topi[0][i]
        # Callers parse the score back out of this text, so the textual format is kept as is.
        predictions.append((str(value).split('tensor')[1], utils.all_categories[category_index]))

    return predictions

In [14]:
def get_name_nationality(name: str) -> str:
    """Return the best-scoring category label for a (possibly multi-word) name.

    The name is split on whitespace, every token is classified with
    ``predict``, the scores are summed per category over all tokens, and the
    category with the largest total is returned.

    Args:
        name: Full name to classify. Missing values (anything that is not a
            ``str``, e.g. ``None`` or NaN) are accepted.

    Returns:
        The winning category label, or ``'Unknown'`` when the input is
        missing, contains no tokens, or no token could be classified.
    """
    unknown = 'Unknown'

    # A missing cell in the source column cannot be tokenised.
    if not isinstance(name, str):
        return unknown

    # split() without arguments trims both ends and collapses runs of
    # whitespace, so no empty tokens are ever handed to predict().
    tokens = name.split()
    if not tokens:
        return unknown

    scored = []
    for token in tokens:
        try:
            # predict() yields score text such as '(-0.5000)'; drop the
            # surrounding parentheses before converting to float.
            token_scores = [(float(score[1:-1]), category) for score, category in predict(token)]
        except Exception:
            # Best effort: a token that cannot be classified contributes a
            # zero-score 'Unknown' entry instead of aborting the whole run.
            # NOTE(review): if predict() scores are negative (e.g. log-probabilities),
            # this zero entry outranks every real category -- confirm that is intended.
            token_scores = [(0.0, unknown)]
        scored.extend(token_scores)

    # Column 0 holds the score and column 1 the category label.
    totals = pd.DataFrame(scored).groupby(1)[0].sum()
    return totals.sort_values(ascending=False).index[0]

# Add nationality column to crew list 

In [17]:
# Load the processed crew table from the bundle's data directory.
crew = pd.read_parquet(bundle_root.joinpath('data', 'processed', 'crew.parquet'))

In [None]:
%%time
# Classify each value of the 'name_' column and store the result in a new
# 'nationality' column of the crew table.
crew = crew.assign(nationality=crew['name_'].apply(get_name_nationality))

In [None]:
# Write the crew table to the bundle's 'common' directory.
crew.to_parquet(bundle_root.joinpath('common', 'crew_with_nationality.parquet'))