In [20]:
import re
import pandas as pd

MAX_NAME_LENGTH = 16
BATCH_SIZE = 128
INFLUENCER_MIN_FOLLOWERS = 1000
NAME_DENY_LIST = {'Journal', 'Institute', 'News', 'The', 'AI', 'Capital'}
RACES = ['pctwhite', 'pctblack', 'pctapi', 'pcthispanic', 'other']

df = pd.read_json('friends.json')

df.shape

(200, 6)

In [9]:
df.head()

Unnamed: 0,next_cursor,next_cursor_str,previous_cursor,previous_cursor_str,total_count,users
0,1545601571879068086,1545601571879068160,0,0,,"{'blocked_by': False, 'blocking': False, 'cont..."
1,1545601571879068086,1545601571879068160,0,0,,"{'blocked_by': False, 'blocking': False, 'cont..."
2,1545601571879068086,1545601571879068160,0,0,,"{'blocked_by': False, 'blocking': False, 'cont..."
3,1545601571879068086,1545601571879068160,0,0,,"{'blocked_by': False, 'blocking': False, 'cont..."
4,1545601571879068086,1545601571879068160,0,0,,"{'blocked_by': False, 'blocking': False, 'cont..."


In [4]:
df['users'][120]['followers_count']

1425511

In [5]:
df['users'][120]['name']

'Guy Kawasaki'

In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [8]:
%%time
model = keras.models.load_model("../model/race_predictor_mvp")

CPU times: user 1.84 s, sys: 61 ms, total: 1.9 s
Wall time: 1.92 s


In [9]:
import pickle
encoder = pickle.load(open('../model/encoder.pkl', 'rb'))

In [22]:
model = build_simple_lstm_model(encoder, 32, 5)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 32)            1728      
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 128)               8320      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 165       
Total params: 39,173
Trainable params: 39,173
Non-trainable params: 0
_________________________________________________________________


In [11]:
def pad_to_sequences(x, encoder):
    x = encoder.texts_to_sequences(x)
    return keras.preprocessing.sequence.pad_sequences(x, maxlen=MAX_NAME_LENGTH)

In [14]:
%%time
import json

with open('../data/census.jsonl', 'rb') as json_file:
    CENSUS = {}
    for l in json_file:
        l = json.loads(l)
        CENSUS[l['name']] = {
            'pctwhite': l['pctwhite'],
            'pctblack': l['pctblack'],
            'pctapi': l['pctapi'],
            'pcthispanic': l['pcthispanic'],
            'other': l['other']
    }
  

CPU times: user 1.02 s, sys: 38.2 ms, total: 1.06 s
Wall time: 1.06 s


In [16]:
def get_name_and_info(data):
    """prase name from twitter raw data, filter out news publishers/official branded accounts"""
    parsed_data =  [
        (get_english_only(i['name']), 
         1 if i['followers_count'] > INFLUENCER_MIN_FOLLOWERS else 0) for i in data['users'] 
        if 'news' not in i['description'].lower() and 'official' not in i['description'].lower()]
        
    # hardcode blacklist to remove obvious business accounts
    parsed_data = [i for i in parsed_data if not any(w in i[0] for w in NAME_DENY_LIST)]
    
    # personal accounts tends to have 1 or 2 spaces only in the name
    parsed_data = [i for i in parsed_data if i[0].count(' ') > 0 and i[0].count(' ') < 3]
    
    return parsed_data


def get_english_only(text):
    return re.sub('[^A-Za-z ]+', '', text).strip()

In [25]:
with open('friends.json') as json_file:
    data = json.load(json_file)
    
df = pd.DataFrame(get_name_and_info(data), columns=['name', 'is_influencer'])
df['last_name'] = df['name'].apply(lambda x: x.split()[-1].title())
df = df[df['last_name'].map(len) > 1].reset_index(drop=True)

In [44]:
df

Unnamed: 0,name,is_influencer,last_name
0,Jane Wang,1,Wang
1,Cristina Cordova,1,Cordova
2,Henry Golding,1,Golding
3,Karen Chee,1,Chee
4,Tianqi Chen,1,Chen
...,...,...,...
111,Julia Hartz,1,Hartz
112,Robin Chan,1,Chan
113,Steven Sinofsky,1,Sinofsky
114,Scott Kupor,1,Kupor


In [35]:
%timeit _ = df[df['last_name'].isin(CENSUS)]

20.1 ms ± 525 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [51]:
z = set(df['last_name']) & set(CENSUS.keys())
tmp = df[df.last_name.isin(z)]

In [66]:
a, b, c, d = tmp['last_name'].map(CENSUS)

ValueError: too many values to unpack (expected 4)

In [63]:
pd.DataFrame.from_dict(tmp['last_name'].map(CENSUS), orient='index', columns=RACES)

TypeError: 'numpy.ndarray' object is not callable

In [46]:
tmp

Unnamed: 0,name,is_influencer,last_name,pctwhite,pctblack,pctapi,pcthispanic,other


In [None]:
# predicted results
results = pd.DataFrame(model.predict(pad_to_sequences(df['last_name'], encoder)).round(3), columns=RACES)

# concat
df = pd.concat([df, results], 1)

In [352]:
df.loc[50:55]

Unnamed: 0,name,screen_name,is_influencer,last_name,pctwhite,pctblack,pctapi,pcthispanic,other
50,Greg Brockman,gdb,1,Brockman,0.99,0.009,0.001,0.0,0.0
51,New China,XinhuaChinese,1,China,0.48,0.004,0.116,0.092,0.308
52,Josh Hartung,joshuahartung,0,Hartung,0.795,0.067,0.137,0.001,0.0
53,Baidu Inc,Baidu_Inc,1,Inc,0.263,0.09,0.629,0.018,0.0
54,MacCallister Higgins,macjshiggins,1,Higgins,0.421,0.578,0.0,0.0,0.001
55,Yuanqing Lin,YuanqingLin,1,Lin,0.025,0.0,0.973,0.002,0.0


In [378]:
def diversity_calculation(df, prefix=''):
    """round up small nubmers and round down big numbers"""
    if len(df) == 0:
        d = {
            'pctwhite': 0,
            'pctblack': 0,
            'pctapi': 0,
            'pcthispanic': 0,
            'other': 0,
            'total_count': 0
        }
    else:
        pct = (df[RACES].mean() * 100).map(math.ceil)
        d = dict(pct)
        for k, v in d.items():
            if v == max(pct):
                d[k] = max(pct) - sum(pct) + 100
                break
        d['total_count'] = len(df)
        
    if prefix:
        d = {prefix + '_' + k: v for k, v in d.items()}
    return d

In [381]:
def get_diversity(data):
    df = pd.DataFrame(get_name_and_info(data), columns=['name', 'is_influencer'])
    df['last_name'] = df['name'].apply(lambda x: x.split()[-1].title())
    df = df[df['last_name'].map(len) > 1].reset_index(drop=True)

    # predicted results
    results = pd.DataFrame(model.predict(pad_to_sequences(df['last_name'], encoder)).round(3), columns=RACES)

    # concat
    df = pd.concat([df, results], 1)
    
    # update distribution through census data if possible
    for idx, row in df.iterrows():
        if row['last_name'] in CENSUS:
            df.loc[idx, RACES] = CENSUS[row['last_name']].values()
        
    # ignore the distributions that are less prominent
    df = df[df[RACES].max(1) > .5]
    
    return {**diversity_calculation(df), **diversity_calculation(df[df['is_influencer'] == 1], 'influencer')}

In [387]:
empty_df = pd.DataFrame()
{**diversity_calculation(empty_df), **diversity_calculation(empty_df, 'influencer')}

{'pctwhite': 0,
 'pctblack': 0,
 'pctapi': 0,
 'pcthispanic': 0,
 'other': 0,
 'total_count': 0,
 'influencer_pctwhite': 0,
 'influencer_pctblack': 0,
 'influencer_pctapi': 0,
 'influencer_pcthispanic': 0,
 'influencer_other': 0,
 'influencer_total_count': 0}