In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install tensorflow==2.2.0 -q

In [3]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

tf.__version__

'2.2.0'

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE 



In [5]:
MAX_NAME_LENGTH = 16
BATCH_SIZE = 128
RACES = ['pctwhite', 'pctblack', 'pctapi', 'pcthispanic', 'other']

In [6]:
df = pd.read_csv('drive/MyDrive/census.csv')
df['name'] = df['name'].map(str.title)
print(df.shape)

(167408, 8)


In [7]:
df['other'] = df['pctaian'] + df['pct2prace']
df.drop(['pctaian', 'pct2prace'], 1, inplace=True)

In [8]:
df.sample(3)

Unnamed: 0,name,pctwhite,pctblack,pctapi,pcthispanic,count,other
116571,Pinyan,98.89,0.0,0.0,0.246667,406,0.863333
60370,Hafiz,20.335,10.52,53.425,1.31,580,14.41
153136,Twinn,41.485,8.5,1.715,1.715,164,46.585


In [9]:
def get_ngrams(text, n=2):
    return [text[i:i+n] for i in range(len(text)-1)]

df['name_list'] = df['name'].map(list)
df['bi_gram_name'] = df['name'].map(get_ngrams)
df['name_cat'] = (df['name_list'] + df['bi_gram_name']).apply(lambda x: " ".join(x))

In [10]:
def pad_to_sequences(x, encoder):
    x = encoder.texts_to_sequences(x)
    return keras.preprocessing.sequence.pad_sequences(x, maxlen=MAX_NAME_LENGTH)


In [11]:
train_idx, test_idx = train_test_split(df.index, test_size=.2, random_state=42)

In [15]:
# very imbalanced dataset
df_truncate = (df[RACES] > 30) * 1
df_truncate = df_truncate[RACES]
y_train = df_truncate.loc[train_idx].to_numpy()
y_test = df_truncate.loc[test_idx].to_numpy()

y_train.sum(0) / max(y_train.sum(0))

array([1.        , 0.07646737, 0.07162029, 0.08650586, 0.00783126])

In [16]:
%%time
X_train = df.loc[train_idx, 'name']
X_test = df.loc[test_idx, 'name']

encoder = keras.preprocessing.text.Tokenizer(char_level=True, lower=False)
encoder.fit_on_texts(X_train)

x_train = pad_to_sequences(X_train, encoder)
x_test = pad_to_sequences(X_test, encoder)

CPU times: user 1.5 s, sys: 43.5 ms, total: 1.55 s
Wall time: 1.51 s


In [17]:
%%time
sm = SMOTE(random_state=42)

X_sm, y_sm = sm.fit_resample(x_train, y_train)

print(f"""Shape of X before SMOTE: {x_train.shape}
Shape of X after SMOTE: {X_sm.shape}""")




Shape of X before SMOTE: (133926, 16)
Shape of X after SMOTE: (566385, 16)
CPU times: user 3.59 s, sys: 41.4 ms, total: 3.63 s
Wall time: 3.64 s


In [18]:
X_sm.shape, y_sm.shape, x_test.shape, y_test.shape

((566385, 16), (566385, 5), (33482, 16), (33482, 5))

In [20]:
def build_simple_lstm_model(encoder, embedding_dim=32, num_classes=5, lr=0.001):
    model = keras.Sequential()

    model.add(layers.Embedding(len(encoder.index_word) + 1, 
                               embedding_dim, 
                               input_length=MAX_NAME_LENGTH))

    # model.add(layers.LSTM(2 * embedding_dim, activation="tanh", return_sequences=True, dropout=.2, recurrent_dropout=0.2))
    # model.add(layers.BatchNormalization())

    model.add(layers.LSTM(64, activation="tanh", dropout=.2))
    model.add(layers.Dense(128, activation="relu"))
    model.add(layers.Dense(32, activation="relu"))
    model.add(layers.Dense(num_classes, activation="softmax"))

    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, 
                  loss="binary_crossentropy", # prev categorical_crossentropy
                  metrics=["accuracy"])

    model.summary()

    return model

model = build_simple_lstm_model(encoder, 32, len(RACES))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 32)            1728      
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 128)               8320      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 165       
Total params: 39,173
Trainable params: 39,173
Non-trainable params: 0
_________________________________________________________________


In [21]:
%%time
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
model.fit(x=X_sm, 
          y=y_sm, 
          epochs=200, 
          shuffle=True,
          batch_size=512, 
          validation_split=0.1, 
          callbacks=[callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
CPU times: user 4min 25s, sys: 28.1 s, total: 4min 53s
Wall time: 4min 5s


In [23]:
y_pred = model.predict(x_test, 1024, verbose=1)
print(classification_report(y_test, y_pred.round(), target_names=RACES))

              precision    recall  f1-score   support

    pctwhite       0.93      0.79      0.86     28272
    pctblack       0.21      0.15      0.17      2215
      pctapi       0.41      0.47      0.44      2049
 pcthispanic       0.51      0.52      0.51      2421
       other       0.08      0.09      0.08       246

   micro avg       0.81      0.71      0.76     35203
   macro avg       0.43      0.40      0.41     35203
weighted avg       0.82      0.71      0.76     35203
 samples avg       0.74      0.72      0.73     35203



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
model.predict(pad_to_sequences(["Ouyang", "Wong", "Ximen", "Seibel"], encoder)).round(2)

array([[0.  , 0.09, 0.9 , 0.  , 0.  ],
       [0.03, 0.  , 0.96, 0.  , 0.  ],
       [0.31, 0.27, 0.35, 0.07, 0.  ],
       [0.82, 0.13, 0.01, 0.04, 0.  ]], dtype=float32)

In [26]:
df[df.name=='Seibel']

Unnamed: 0,name,pctwhite,pctblack,pctapi,pcthispanic,count,other,name_list,bi_gram_name,name_cat
134615,Seibel,95.325,0.255,0.725,1.99,4331,1.7,"[S, e, i, b, e, l]","[Se, ei, ib, be, el]",S e i b e l Se ei ib be el


In [28]:
model.save('drive/MyDrive/echo-meter/race_predictor_mvp')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: drive/MyDrive/echo-meter/race_predictor_mvp/assets


In [29]:
import pickle
pickle.dump( encoder, open( "drive/MyDrive/echo-meter/encoder.pkl", "wb" ) )