**Example pipeline using some smaller models**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from tqdm import tqdm
import pickle
import json
from os import listdir
from os.path import join, isfile
from sklearn.model_selection import train_test_split
from scipy.sparse import load_npz
from scipy import sparse
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.svm import OneClassSVM
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier, BaggingClassifier, IsolationForest, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.multiclass import OneVsRestClassifier
from collections import defaultdict
from imblearn.under_sampling import NearMiss
from sklearn.metrics import plot_roc_curve, auc
import time
import warnings
# Silence every warning category for the rest of the session. Note that this also
# hides deprecation notices and any pandas chained-assignment warnings that later
# cells may trigger.
warnings.filterwarnings('ignore')

In [2]:
# load data
path = "data_clean/"
# os.listdir returns entries in arbitrary order; sorting makes every later slice of
# `files` (e.g. "the first 20 feature files") select the same files on every run.
files = sorted(path + f for f in listdir(path) if isfile(join(path, f)))
# NOTE(review): unpickling can execute arbitrary code -- only load a tree file that
# was produced by a trusted source.
# The context manager closes the file handle as soon as the tree has been read.
with open("place_hierarchy_tree.pcl", "rb") as tree_file:
    ch = pickle.load(tree_file)

In [3]:
def get_subtree_position(depth):
    """Enumerate the root-to-subtree paths for a binary tree cut at ``depth``.

    Each path is a string over ``"l"`` (go left) and ``"r"`` (go right), read from
    the root downwards. The paths are ordered so that the first step varies
    fastest, e.g. depth 2 gives ``["ll", "rl", "lr", "rr"]``.

    ``depth`` is only compared and decremented numerically, so a float such as the
    result of ``np.log2`` is accepted. Any value of 1 or less yields the two
    depth-1 paths ``["l", "r"]``.
    """
    paths = ["l", "r"]
    levels_left = depth
    # Every pass adds one more level: each existing path is used as a tail and is
    # prefixed with both directions.
    while levels_left > 1:
        paths = [step + tail for tail in paths for step in ("l", "r")]
        levels_left = levels_left - 1
    return paths

In [4]:
def get_N_splits(root, N, balanced=True):
    """Partition the leaves of a binary hierarchy tree into ``N`` clusters.

    Parameters
    ----------
    root : tree node
        Root of the hierarchy. Every node must provide ``get_left()``,
        ``get_right()`` and ``pre_order()``; ``pre_order()`` is used as "all leaf
        ids below this node" and the child getters are expected to return ``None``
        for a leaf (presumably a ``scipy.cluster.hierarchy.ClusterNode`` --
        confirm against the pickled tree).
    N : int
        Number of clusters to produce. Must be a positive integer; in balanced
        mode it must additionally be a power of two.
    balanced : bool, default True
        ``True``: cut the tree at depth ``log2(N)`` and return one cluster per
        subtree at that depth, in the order produced by ``get_subtree_position``.
        ``False``: start with the root and repeatedly split the subtree with the
        most leaves (the first one on ties) until there are ``N`` subtrees.

    Returns
    -------
    list of list
        ``N`` lists of leaf ids, one per cluster.

    Raises
    ------
    ValueError
        If ``N`` is not a positive integer, if ``balanced`` is set and ``N`` is
        not a power of two, or if the tree is too shallow / has too few leaves
        to be split into ``N`` clusters.
    """
    n_splits = int(N)
    if n_splits != N or n_splits < 1:
        raise ValueError("N must be a positive integer, got {!r}".format(N))

    if balanced:
        # A non-power-of-two N has no integral depth; previously this silently
        # produced a different number of clusters than requested.
        if n_splits & (n_splits - 1):
            raise ValueError(
                "balanced splits require N to be a power of two, got {}".format(n_splits))
        if n_splits == 1:
            return [root.pre_order()]

        subtree_clusters = []

        # note that the hierarchical tree is not balanced, hence a too large N will result in an error eventually
        required_depth = n_splits.bit_length() - 1  # exact integer log2
        subtrees = get_subtree_position(required_depth)

        for subtree in subtrees:
            current_position = root  # start at root
            for char in subtree:  # navigate through the tree to find root of subtree
                if char == "l":
                    current_position = current_position.get_left()
                else:
                    current_position = current_position.get_right()
                if current_position is None:
                    raise ValueError(
                        "tree is too shallow for {} balanced splits "
                        "(path '{}' runs past a leaf)".format(n_splits, subtree))
            subtree_clusters.append(current_position.pre_order())  # get all leaf nodes of current subtree

        return subtree_clusters

    # not balanced
    subtrees = [root]

    while len(subtrees) < n_splits:
        # find largest subtree (list.index returns the first maximum, which keeps
        # the original tie-breaking)
        sizes = [len(subtree.pre_order()) for subtree in subtrees]
        max_pos = sizes.index(max(sizes))
        largest = subtrees[max_pos]

        left, right = largest.get_left(), largest.get_right()
        if left is None or right is None:
            # The largest remaining subtree is a single leaf: nothing left to split.
            raise ValueError(
                "cannot create {} splits: the tree only has {} leaves".format(
                    n_splits, len(subtrees)))

        # keep all smaller subtrees and split the largest one
        subtrees = subtrees[:max_pos] + subtrees[max_pos + 1:] + [left, right]

    return [subtree.pre_order() for subtree in subtrees]

In [5]:
# Pre-compute the balanced partitions for 2, 4, 8 and 16 clusters, keyed by cluster count.
C = {n_clusters: get_N_splits(ch, n_clusters) for n_clusters in (2, 4, 8, 16)}

In [6]:
# Show the leaf ids of each of the 16 clusters, one cluster per line.
for cluster_members in C[16]:
    print(cluster_members)

[13, 134, 18, 87, 103, 76, 139]
[61, 136, 11, 90]
[44, 92]
[142, 28, 94]
[55, 50, 84]
[48, 111, 17, 30, 29, 6, 93]
[47, 120, 119, 39, 41, 124, 2, 97, 25, 106, 52, 80]
[64, 4, 105, 37, 141, 69, 36, 131, 19, 82, 95, 125, 16, 137, 60, 65, 102, 112, 148, 20, 33, 78]
[123, 35, 23, 57, 107, 40, 121, 0, 62, 115, 98, 118, 43, 146, 128, 31, 109]
[51, 133]
[12, 53, 58, 143, 45, 26, 110]
[10, 96, 145, 42, 114]
[22, 79, 24, 101, 5, 66, 122, 49, 77, 104, 116]
[74, 91, 15, 130, 59, 99, 54, 3, 117, 126, 27, 81]
[8, 108, 14, 129, 63, 100, 32, 86, 113, 67, 73]
[46, 135, 70, 127, 147, 9, 89, 7, 34, 72, 140, 83, 149, 56, 75, 21, 68, 144, 88, 1, 71, 138, 132, 38, 85]


In [7]:
# List the cluster counts for which a partition is stored in C.
C.keys()

dict_keys([2, 4, 8, 16])

In [8]:
# Persist the cluster-count -> partition lookup. The context manager guarantees the
# file is flushed and closed even if pickling raises.
with open("layer_mappings.pcl", "wb") as mapping_file:
    pickle.dump(C, mapping_file)

**load dataset**

In [9]:
use_datacenter = True

# 1. Feature Matrix
feature_files = [f for f in files if f.endswith("features")]

# 2. Geocoding Features
geocoding_files = [f for f in files if f.endswith("loc")]

# 3. Text Prediction
text_files = [f for f in files if f.endswith("pred.npz")]

# 4. Merge Features
tmp_feat = []
tmp_text = []

for feature_file in tqdm(feature_files[:20]):  # limit files
    file_prefix = feature_file.split(".json")[0]

    try:
        geo_file = [f for f in geocoding_files if file_prefix in f][0]
        text_file = [f for f in text_files if file_prefix in f][0]
    except IndexError:
        # a companion file (geocoding or npz) has not been created yet
        continue

    feat_part = pd.read_csv(feature_file, delimiter=";")
    df_geo = pd.read_csv(geo_file)
    # toarray() yields a plain ndarray; todense() would return the legacy np.matrix type
    text_part = pd.DataFrame(sparse.load_npz(text_file).toarray())

    # The three sources are combined purely by row position, so a length mismatch
    # would silently misalign features, locations and text predictions.
    if not len(feat_part) == len(df_geo) == len(text_part):
        raise ValueError(
            "Row count mismatch for '{}': {} feature rows, {} geocoding rows, {} text rows".format(
                file_prefix, len(feat_part), len(df_geo), len(text_part)))

    feat_part["profile_location"] = df_geo["profile_location"]
    tmp_feat.append(feat_part)
    tmp_text.append(text_part)


df_feat = pd.concat(tmp_feat, axis=0, ignore_index=True)

if not use_datacenter:
    df_feat = df_feat.drop(columns=["dc", "wt"])

df_text = pd.concat(tmp_text, axis=0, ignore_index=True)

# Move the target column into its own frame; "150" names the finest label granularity.
df_labels = pd.DataFrame()
df_labels["150"] = df_feat["label"]
df_feat = df_feat.drop(columns="label")

100%|██████████| 20/20 [00:15<00:00,  1.26it/s]


In [10]:
# Upper bound on the number of rows kept for the experiments below.
N = 10**7

# Cut all three frames to the same leading N rows.
df_feat, df_labels, df_text = (frame[:N] for frame in (df_feat, df_labels, df_text))

In [11]:
# Show the (rows, columns) of the feature and text frames side by side.
df_feat.shape, df_text.shape

((6029688, 12), (6029688, 151))

In [12]:
# Preview the merged feature table before the categorical columns are encoded.
df_feat

Unnamed: 0,post_created,dc,wt,language,friends,followers,statuses,lists,account_created,profile_language,source,profile_location
0,-0.505586,11,30,pt,0.316192,0.261891,0.603712,-1.000000,0.557267,pt,Twitter for Android,14
1,-0.505586,10,5,zh,-0.180313,-0.194865,0.269856,-1.000000,-0.583389,ne,Twitter for iPhone,3
2,-0.505588,11,10,en,0.143174,-0.006851,0.508282,-0.809151,0.013887,en,Twitter for iPhone,28
3,-0.505577,11,25,en,0.431639,0.228458,0.628973,-0.809151,-0.457390,en,Twitter for iPhone,125
4,-0.505581,11,29,en,0.288043,0.381266,0.713266,-0.447916,0.055484,en,Twitter for iPhone,53
5,-0.505585,10,6,en,-0.017454,-0.077859,0.371578,-1.000000,0.472921,nl,Twitter for Android,27
6,-0.505581,11,10,en,0.325715,0.344313,0.781566,-0.759176,-0.308151,en,Twitter for iPhone,-1
7,-0.505573,11,11,en,-0.507820,-0.541549,0.064346,-0.809151,0.112753,en,Instagram,21
8,-0.505570,11,25,en,-0.055309,0.124362,0.829552,-0.759176,-0.628773,en,Twitter for iPhone,101
9,-0.505573,11,11,en,0.335701,0.237088,0.775309,-0.661961,0.748066,en,Twitter for Android,57


**hierarchical label mapping**

In [13]:
# Invert the partitions: map every fine-grained label to the index of the cluster
# that contains it, once for the 4-way and once for the 16-way split.
C4_T = {
    member: cluster_id
    for cluster_id, members in enumerate(C[4])
    for member in members
}
C16_T = {
    member: cluster_id
    for cluster_id, members in enumerate(C[16])
    for member in members
}

# Derive the coarser label columns from the original "150" column.
df_labels["4"] = df_labels["150"].apply(lambda label: C4_T[label])
df_labels["16"] = df_labels["150"].apply(lambda label: C16_T[label])

In [14]:
# Preview the label frame: the original label ("150") next to its 4-way and 16-way cluster ids.
df_labels

Unnamed: 0,150,4,16
0,14,2,14
1,3,1,13
2,28,3,3
3,16,3,7
4,103,0,0
5,27,1,13
6,87,0,0
7,75,3,15
8,107,0,8
9,57,0,8


In [15]:
# 4 -> 16 Mapping: for every 4-way cluster, collect the ids of the 16-way clusters
# that its members fall into.
C4_to_C16 = defaultdict(set)
for coarse_id, leaf_ids in enumerate(C[4]):
    for fine_id in map(C16_T.__getitem__, leaf_ids):
        C4_to_C16[coarse_id].add(fine_id)

In [16]:
# Inspect which 16-way cluster ids were collected for each 4-way cluster.
C4_to_C16

defaultdict(set,
            {0: {0, 4, 8, 12},
             1: {1, 5, 9, 13},
             2: {2, 6, 10, 14},
             3: {3, 7, 11, 15}})

**encode categorical features**

In [17]:
"""
1. Label Encodings
"""

# encode language
selected_languages = df_feat["language"].unique()
language_encoder = LabelEncoder().fit(selected_languages)

# Profile languages that never occur as a post language fall back to "en".
# isin/where is vectorised; the former per-row `in` test against a NumPy array ran
# a Python-level scan for every single row.
profile_language = df_feat["profile_language"].where(
    df_feat["profile_language"].isin(selected_languages), "en"
)

# encode source: keep the 9 most frequent sources and pool everything else as "other"
selected_sources = list(df_feat["source"].value_counts().index[:9])  # value counts are sorted by default
source = df_feat["source"].where(df_feat["source"].isin(selected_sources), "other")
selected_sources.append("other")

source_encoder = LabelEncoder().fit(selected_sources)

# assign() returns a new frame with the three columns replaced in their original
# positions, instead of writing into a frame that may be a slice of another one.
df_feat = df_feat.assign(
    language=language_encoder.transform(df_feat["language"]),
    profile_language=language_encoder.transform(profile_language),
    source=source_encoder.transform(source),
)

In [18]:
# Preview the feature table after label encoding: language, profile_language and
# source now hold integer codes.
df_feat

Unnamed: 0,post_created,dc,wt,language,friends,followers,statuses,lists,account_created,profile_language,source,profile_location
0,-0.505586,11,30,6,0.316192,0.261891,0.603712,-1.000000,0.557267,6,6,14
1,-0.505586,10,5,8,-0.180313,-0.194865,0.269856,-1.000000,-0.583389,2,8,3
2,-0.505588,11,10,2,0.143174,-0.006851,0.508282,-0.809151,0.013887,2,8,28
3,-0.505577,11,25,2,0.431639,0.228458,0.628973,-0.809151,-0.457390,2,8,125
4,-0.505581,11,29,2,0.288043,0.381266,0.713266,-0.447916,0.055484,2,8,53
5,-0.505585,10,6,2,-0.017454,-0.077859,0.371578,-1.000000,0.472921,2,6,27
6,-0.505581,11,10,2,0.325715,0.344313,0.781566,-0.759176,-0.308151,2,8,-1
7,-0.505573,11,11,2,-0.507820,-0.541549,0.064346,-0.809151,0.112753,2,2,21
8,-0.505570,11,25,2,-0.055309,0.124362,0.829552,-0.759176,-0.628773,2,8,101
9,-0.505573,11,11,2,0.335701,0.237088,0.775309,-0.661961,0.748066,2,6,57


**backups**

In [19]:
# Create backups
# .copy() takes independent snapshots. A plain assignment would only bind a second
# name to the same object, so any later in-place change to a working frame would
# show up in its "backup" as well. (This temporarily doubles the memory used by
# the three frames.)
backup_df_feat = df_feat.copy()
backup_df_labels = df_labels.copy()
backup_df_text = df_text.copy()

**training example with simpler model**

**train classifiers for layers 1,2,3**

In [20]:
from sklearn.ensemble import RandomForestClassifier
training_phase = True

# layer 1 (continental)
labels_l1 = sorted(df_labels["4"].unique())
print("Target classes on layer 1: {}".format(labels_l1))

# Stratified 80/20 split on the 4-way label. The fixed random_state makes the split
# (and the forest below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(df_feat,
                                                    df_labels["4"],
                                                    train_size=.8,
                                                    test_size=.2,
                                                    stratify=df_labels["4"],
                                                    random_state=42)
if training_phase:
    model_l1 = RandomForestClassifier(n_estimators=200, n_jobs=100, random_state=42)
    t0 = time.time()
    model_l1.fit(X_train, y_train)
    print("Trained model l1 after {} seconds".format(time.time() - t0))
    print("Model Score: {}".format(model_l1.score(X_test, y_test)))

    #pickle.dump(model_l1, open("model_l1.pcl", "wb"))
else:
    # Read from the same file name the (commented-out) dump above writes; the former
    # "l1.pcl" matched neither that dump nor the naming used for layers 2 and 3.
    with open("model_l1.pcl", "rb") as model_file:
        model_l1 = pickle.load(model_file)
    print("Model model_l1 successfully loaded.")

Target classes on layer 1: [0, 1, 2, 3]
Trained model l1 after 94.87700080871582 seconds
Model Score: 0.8863183679426305


In [21]:
# feature importances
# Pair every feature column with its importance in the layer-1 forest, highest first.
sorted(
    zip(df_feat.columns, model_l1.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

[('language', 0.25907505507152684),
 ('profile_language', 0.11782900394946721),
 ('profile_location', 0.10869934559325178),
 ('post_created', 0.09919930222439458),
 ('followers', 0.06836640985014182),
 ('friends', 0.06489756438148182),
 ('statuses', 0.06417338312248116),
 ('account_created', 0.06391557401782418),
 ('lists', 0.043992802740310005),
 ('source', 0.04209506066382745),
 ('dc', 0.04058630859971954),
 ('wt', 0.027170189785573368)]

In [22]:
# layer 2 (countries)
labels_l2 = {}
training_phase = True
models_l2 = {}

for area_id in labels_l1:  # train one classifier per area in the previous layer
    # grab the correct data (build the row mask once and reuse it for both frames)
    in_area = df_labels["4"] == area_id
    df_labels_tmp = df_labels[in_area]
    labels_l2[area_id] = sorted(df_labels_tmp["16"].unique())
    print("Target classes on layer 2, cluster {}: {}".format(area_id, labels_l2[area_id]))
    df_feat_tmp = df_feat[in_area]

    # Stratified 80/20 split inside this area; fixed seed for reproducibility.
    X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = train_test_split(df_feat_tmp,
                                                    df_labels_tmp["16"],
                                                    train_size=.8,
                                                    test_size=.2,
                                                    stratify=df_labels_tmp["16"],
                                                    random_state=42)
    if training_phase:
        models_l2[area_id] = RandomForestClassifier(n_estimators=200, n_jobs=100, random_state=42)
        t0 = time.time()
        models_l2[area_id].fit(X_train_tmp, y_train_tmp)
        print("Trained model model_l2_c{} after {} seconds".format(area_id, time.time() - t0))
        print("Model Score: {}".format(models_l2[area_id].score(X_test_tmp, y_test_tmp)))

        #pickle.dump(models_l2[area_id], open("model_l2_c{}.pcl".format(area_id), "wb"))
    else:
        # The context manager closes the model file as soon as it has been read.
        with open("model_l2_c{}.pcl".format(area_id), "rb") as model_file:
            models_l2[area_id] = pickle.load(model_file)
        print("Model model_l2_c{} successfully loaded.".format(area_id))
    print("\n")

Target classes on layer 2, cluster 0: [0, 4, 8, 12]
Trained model model_l2_c0 after 38.18390893936157 seconds
Model Score: 0.8726658340979373


Target classes on layer 2, cluster 1: [1, 5, 9, 13]
Trained model model_l2_c1 after 3.4680023193359375 seconds
Model Score: 0.8508174083478106


Target classes on layer 2, cluster 2: [2, 6, 10, 14]
Trained model model_l2_c2 after 23.618075609207153 seconds
Model Score: 0.9194397847668063


Target classes on layer 2, cluster 3: [3, 7, 11, 15]
Trained model model_l2_c3 after 21.974197149276733 seconds
Model Score: 0.9122286365991471




In [23]:
# Layer-2 target classes found for each layer-1 area.
labels_l2

{0: [0, 4, 8, 12], 1: [1, 5, 9, 13], 2: [2, 6, 10, 14], 3: [3, 7, 11, 15]}

In [24]:
# layer 3 (states)
labels_l3 = {}
training_phase = True
models_l3 = {}

for area_id in sorted([x for l in labels_l2.values() for x in l]):  # train one classifier per area in the previous layer
    # grab the correct data (build the row mask once and reuse it for both frames)
    in_area = df_labels["16"] == area_id
    df_labels_tmp = df_labels[in_area]
    labels_l3[area_id] = sorted(df_labels_tmp["150"].unique())
    print("Target classes on layer 3, cluster {}: {}".format(area_id, labels_l3[area_id]))
    df_feat_tmp = df_feat[in_area]

    # Stratified 80/20 split inside this area; fixed seed for reproducibility.
    X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = train_test_split(df_feat_tmp,
                                                    df_labels_tmp["150"],
                                                    train_size=.8,
                                                    test_size=.2,
                                                    stratify=df_labels_tmp["150"],
                                                    random_state=42)
    if training_phase:
        models_l3[area_id] = RandomForestClassifier(n_estimators=200, n_jobs=100, random_state=42)
        t0 = time.time()
        models_l3[area_id].fit(X_train_tmp, y_train_tmp)
        print("Trained model model_l3_c{} after {} seconds".format(area_id, time.time() - t0))
        print("Model Score: {}".format(models_l3[area_id].score(X_test_tmp, y_test_tmp)))

        #pickle.dump(models_l3[area_id], open("model_l3_c{}.pcl".format(area_id), "wb"))
    else:
        # The context manager closes the model file as soon as it has been read.
        with open("model_l3_c{}.pcl".format(area_id), "rb") as model_file:
            models_l3[area_id] = pickle.load(model_file)
        print("Model model_l3_c{} successfully loaded.".format(area_id))
    print("\n")

Target classes on layer 3, cluster 0: [13, 18, 76, 87, 103, 134, 139]
Trained model model_l3_c0 after 5.112483739852905 seconds
Model Score: 0.7890930350898087


Target classes on layer 3, cluster 1: [11, 61, 90, 136]
Trained model model_l3_c1 after 1.3706717491149902 seconds
Model Score: 0.8755520504731861


Target classes on layer 3, cluster 2: [44, 92]
Trained model model_l3_c2 after 1.1342241764068604 seconds
Model Score: 0.9185725334217214


Target classes on layer 3, cluster 3: [28, 94, 142]
Trained model model_l3_c3 after 1.9161734580993652 seconds
Model Score: 0.8880451713395638


Target classes on layer 3, cluster 4: [50, 55, 84]
Trained model model_l3_c4 after 0.979464054107666 seconds
Model Score: 0.9619422572178478


Target classes on layer 3, cluster 5: [6, 17, 29, 30, 48, 93, 111]
Trained model model_l3_c5 after 1.5500195026397705 seconds
Model Score: 0.7205480837293963


Target classes on layer 3, cluster 6: [2, 25, 39, 41, 47, 52, 80, 97, 106, 119, 120, 124]
Trained mod

**predict some datapoints using hierarchical forwarding**

In [26]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Hierarchical forwarding: each row is classified by the layer-1 model, then by the
# layer-2 model of the predicted area, then by the layer-3 model of that prediction.
# NOTE(review): every row of df_feat is scored here, including the rows the models
# were fitted on, so these accuracies are optimistic compared with the held-out
# scores printed during training.

# .copy() gives an independent frame; a plain assignment would add the prediction
# columns to df_labels itself.
predicted_labels = df_labels.copy()  # will include all intermediate predictions

predicted_labels["l1_prediction"] = -1  # -1 marks "no prediction made"
predicted_labels["l2_prediction"] = -1
predicted_labels["l3_prediction"] = -1
l2_col = predicted_labels.columns.get_loc("l2_prediction")
l3_col = predicted_labels.columns.get_loc("l3_prediction")

# predict layer 1
predicted_labels["l1_prediction"] = model_l1.predict(df_feat)

print("Accuracy l1: {}".format(accuracy_score(predicted_labels["4"], predicted_labels["l1_prediction"])))
print("Predicted areas w/ counts: {}".format(np.unique(predicted_labels["l1_prediction"], return_counts=True)))
print("\n")

# predict layer 2
for area_id in models_l2:
    ids = np.where(predicted_labels["l1_prediction"] == area_id)[0]  # identify datapoints that have been classified into this area by previous layer
    if ids.size == 0:
        continue  # nothing was routed to this area; predict() rejects empty input
    # Single .iloc[rows, col] write instead of chained `frame[col].iloc[rows] = ...`,
    # which may assign into a temporary copy.
    predicted_labels.iloc[ids, l2_col] = models_l2[area_id].predict(df_feat.iloc[ids])

print("Accuracy l2: {}".format(accuracy_score(predicted_labels["16"], predicted_labels["l2_prediction"])))
print("Predicted areas w/ counts: {}".format(np.unique(predicted_labels["l2_prediction"], return_counts=True)))
print("\n")

# predict layer 3
for area_id in models_l3:
    ids = np.where(predicted_labels["l2_prediction"] == area_id)[0]  # identify datapoints that have been classified into this area by previous layer
    if ids.size == 0:
        continue  # nothing was routed to this area; predict() rejects empty input
    predicted_labels.iloc[ids, l3_col] = models_l3[area_id].predict(df_feat.iloc[ids])

print("Accuracy l3: {}".format(accuracy_score(predicted_labels["150"], predicted_labels["l3_prediction"])))
print("Predicted areas w/ counts: {}".format(np.unique(predicted_labels["l3_prediction"], return_counts=True)))
print("\n")

Accuracy l1: 0.9772925232615685
Predicted areas w/ counts: (array([0, 1, 2, 3]), array([2386117,  322070, 1720991, 1600510]))


Accuracy l2: 0.9573074096039463
Predicted areas w/ counts: (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]), array([ 430037,   59848,   42404,  148645,    3458,   91540,  416530,
        800062, 1247844,    9848,  229220,  142790,  704778,  160834,
       1032837,  509013]))


Accuracy l3: 0.9208080086399164
Predicted areas w/ counts: (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
      

In [27]:
# Preview the true labels next to the predictions of all three layers.
predicted_labels

Unnamed: 0,150,4,16,l1_prediction,l2_prediction,l3_prediction
0,14,2,14,2,14,14
1,3,1,13,1,13,3
2,28,3,3,3,3,28
3,16,3,7,3,7,16
4,103,0,0,0,8,107
5,27,1,13,1,13,27
6,87,0,0,0,0,87
7,75,3,15,3,15,75
8,107,0,8,0,8,107
9,57,0,8,0,8,57


**generate and persist confusion matrix**

In [28]:
# Confusion matrix of the finest layer: rows are the true "150" labels, columns the
# layer-3 predictions.
c = confusion_matrix(predicted_labels["150"], predicted_labels["l3_prediction"])
# Context managers make sure both files are flushed and closed after writing.
with open("confusion_matrix.pcl", "wb") as matrix_file:
    pickle.dump(c, matrix_file)
with open("predicted_labels.pcl", "wb") as labels_file:
    pickle.dump(predicted_labels, labels_file)
print(c)

[[81005     0     4 ...     0     2     0]
 [    4  3542     0 ...     1     0     1]
 [   17     0 40230 ...     0     1     0]
 ...
 [    6     0     1 ... 12205     0     2]
 [    0     0     0 ...     4  2936     0]
 [    2     0     1 ...     7     5  4435]]
