In [1]:
from collections import Counter
import string
from itertools import chain

import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv('../data/census.csv')
df['name'] = df['name'].map(str.title)


# make it as a percentage
races = ['pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace', 'pcthispanic']
df[races] = df[races] / 100

df['other'] = df['pctaian'] + df['pct2prace']
df = df.drop(['pctaian', 'pct2prace', 'count'], 1)

df.shape

(167408, 6)

In [8]:
df[['pctwhite', 'pctblack', 'pctapi', 'pcthispanic', 'other']].sum() / len(df)

pctwhite       0.774066
pctblack       0.061102
pctapi         0.055638
pcthispanic    0.082398
other          0.026673
dtype: float64

In [31]:
df.to_json('census.json')

In [20]:
CENSUS = {}
for _, row in df.iterrows():
    CENSUS[row['name']] = {
        'pctwhite': row['pctwhite'],
        'pctblack': row['pctblack'],
        'pctapi': row['pctapi'],
        'pcthispanic': row['pcthispanic'],
        'other': row['other']
    }

In [27]:
pd.DataFrame([CENSUS['Kamper'], CENSUS['Kamper']])

Unnamed: 0,other,pctapi,pctblack,pcthispanic,pctwhite
0,0.020725,0.006575,0.028,0.0213,0.92345
1,0.020725,0.006575,0.028,0.0213,0.92345


In [362]:
df['name_length'] = df['name'].map(len)
df.groupby('name_length')[['pctwhite', 'pctblack', 'pctapi', 'pcthispanic', 'other']].mean().round(2)

Unnamed: 0_level_0,pctwhite,pctblack,pctapi,pcthispanic,other
name_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0.21,0.05,0.64,0.07,0.04
3,0.47,0.07,0.34,0.08,0.04
4,0.7,0.07,0.11,0.08,0.03
5,0.76,0.07,0.07,0.08,0.03
6,0.77,0.07,0.05,0.08,0.03
7,0.79,0.06,0.04,0.08,0.03
8,0.8,0.05,0.04,0.08,0.03
9,0.82,0.04,0.04,0.07,0.02
10,0.83,0.04,0.03,0.08,0.02
11,0.79,0.03,0.05,0.1,0.02


In [363]:
def get_ngrams(text, n=2):
    return [text[i:i+n] for i in range(len(text)-1)]

In [8]:
def table(input_data, step=25, display_table=True, return_table=False):
    """Note: one should only use this function in Notebooks
    stats summary table, similary to R function"""
    if type(input_data) == pd.core.series.Series:
        input_data = input_data.values
    if input_data is None:
        raise TypeError('input data cannot be null')
    step = find_nearest([1, 2, 4, 5, 10, 20, 25], step)
    y = np.arange(0, 100 + step, step)

    x = np.percentile(input_data, y)
    x = np.append(x, [np.mean(input_data), np.std(input_data), len(input_data)])
    x = np.array([round(item, 3) for item in x])
    y = np.append(y, ['Mean', 'Std', 'Count'])
    tmp = pd.DataFrame([x], columns=y)
    tmp.Count = tmp.Count.astype(int)

    # display the table and set back the max_columns back to 20
    if display_table:
        if pd.get_option('max_columns') < 100:
            pd.set_option('display.max_columns', 104)
            display(tmp)
            pd.set_option('display.max_columns', 20)
        else:
            display(tmp)

    if return_table:
        return tmp

    
def find_nearest(array, value):
    """helper function to return the closest value in an array
    source: https://stackoverflow.com/questions/2566412/find-nearest-value-in-numpy-array
    """
    if not isinstance(array, np.ndarray):
        array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx]

In [365]:
df['bi_gram'] = df['name'].map(get_ngrams)

In [366]:
kv = Counter(list(chain.from_iterable(df['bi_gram']))).most_common()
len(kv)

644

In [367]:
table([i[1] for i in kv], 2)

Unnamed: 0,0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,Mean,Std,Count
0,1.0,1.0,2.0,3.0,5.0,7.0,11.0,13.02,18.0,23.0,28.6,36.0,45.32,53.0,59.08,74.9,89.0,103.0,120.0,141.02,160.6,188.06,215.52,251.12,292.64,352.0,434.52,527.76,600.16,670.4,749.0,868.32,970.8,1168.38,1317.24,1436.0,1521.72,1672.74,1932.88,2090.0,2497.4,2700.56,3038.72,3683.64,4072.92,4420.1,5245.8,5944.02,7407.64,10596.02,30430.0,1546.25,2828.788,644


In [263]:
MAX_NAME_LENGTH = 15
CHAR_MAP = {k: (v + 1) for (k, v) in zip(string.ascii_lowercase[:26], np.arange(26))}

In [186]:
bi_gram_features = set([i[0] for i in kv if i[1] > 30 and i[1] < 2000])
BI_GRAM_MAP = {k: v for (k, v) in zip(bi_gram_features, np.arange(len(bi_gram_features)))}
df['bi_gram'] = df['bi_gram'].apply(lambda x: set([i for i in x if i in bi_gram_features]))
len(bi_gram_features)

359

In [368]:
def token_count(name, n):
    token_count_dict = {k: 0 for k in string.ascii_lowercase[:26]}
    
    if n == 1:
        for char in name:
            if char in token_count_dict:
                token_count_dict[char] += 1
                
    if n == 2:
        char_list = get_ngrams(name, 2)
        char_counter = Counter([char[0] for char in char_list if char[0] in token_count_dict])
        token_count_dict = {**token_count_dict, **dict(char_counter)}
            
    return token_count_dict        
    

In [369]:
def name_featurization(name):
    fi = np.zeros(len(CHAR_MAP) * 2 + 1) #MAX_NAME_LENGTH
    
    # unigram char count
    fi[:len(CHAR_MAP)] = np.array(list(token_count(name, 1).values()))
    
    # bigram char count
    fi[len(CHAR_MAP):len(CHAR_MAP)*2] = np.array(list(token_count(name, 2).values()))
    
    # length
    name = name[:MAX_NAME_LENGTH]
    name_length = len(name)
    fi[len(CHAR_MAP) * 2] = name_length
    
#     # char map
#     for i, char in enumerate(name):
#         fi[len(CHAR_MAP) * 2 + 1 + i] = CHAR_MAP.get(char, 0)
    
    return fi

In [371]:
%%time
features = {}
for n in df['name']:
    features[n] = name_featurization(n)

CPU times: user 3.9 s, sys: 188 ms, total: 4.09 s
Wall time: 3.98 s


In [372]:
len(features)

167408

In [374]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

In [202]:
from sklearn.neural_network import MLPClassifier

In [196]:
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

In [375]:
train_idx, test_idx = train_test_split(df.index, test_size=.2, random_state=42)
race_to_predict = ['pctwhite', 'pctblack', 'pctapi', 'pcthispanic', 'other']

In [376]:
%%time
X_train = np.stack([features[n] for n in df.loc[train_idx, 'name']])
X_test = np.stack([features[n] for n in df.loc[test_idx, 'name']])

df_truncate = (df[race_to_predict] > .3) * 1
df_truncate = df_truncate[race_to_predict]
y_train = df_truncate.loc[train_idx].to_numpy()
y_test = df_truncate.loc[test_idx].to_numpy()

CPU times: user 396 ms, sys: 29.9 ms, total: 426 ms
Wall time: 425 ms


In [378]:
X_train.shape, X_test.shape

((133926, 53), (33482, 53))

In [379]:
y_train.shape, y_test.shape

((133926, 5), (33482, 5))

In [380]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline(
    [('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),])

In [283]:
%%time
clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)

CPU times: user 10min 31s, sys: 1min 25s, total: 11min 57s
Wall time: 2min 1s


In [284]:
y_pred = clf.predict(X_test)
y_true = df.loc[test_idx]['pctapi'].round().to_numpy()

print(classification_report(y_test, y_pred, target_names=race_to_predict))

              precision    recall  f1-score   support

    pctwhite       0.91      0.93      0.92     28272
    pctblack       0.41      0.01      0.03      2215
      pctapi       0.56      0.36      0.44      2049
 pcthispanic       0.62      0.44      0.52      2421
       other       0.33      0.01      0.02       246

   micro avg       0.88      0.80      0.84     35203
   macro avg       0.57      0.35      0.38     35203
weighted avg       0.83      0.80      0.80     35203
 samples avg       0.84      0.82      0.82     35203



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [285]:
!pip install tensorflow -q

In [289]:
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

In [302]:
model = Sequential()
model.add(Embedding(128, 32, input_length=68))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(race_to_predict), activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 68, 32)            4096      
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645       
Total params: 87,173
Trainable params: 87,173
Non-trainable params: 0
_________________________________________________________________
None


In [207]:
print(classification_report(y_test, y_pred, target_names=race_to_predict))

              precision    recall  f1-score   support

    pctwhite       0.89      0.95      0.92     28512
    pctblack       0.30      0.08      0.12      2525
      pctapi       0.53      0.25      0.34      2127
 pcthispanic       0.63      0.23      0.33      2523
       other       0.24      0.05      0.08       326

   micro avg       0.86      0.79      0.82     36013
   macro avg       0.52      0.31      0.36     36013
weighted avg       0.80      0.79      0.78     36013
 samples avg       0.84      0.82      0.82     36013



  _warn_prf(average, modifier, msg_start, len(result))
