# TalkingData Mobile User Demographics - 3rd place solution

In this competition, Kagglers are challenged to build a model predicting users’ demographic characteristics based on their app usage, geolocation, and mobile device properties. Doing so will help millions of developers and brand advertisers around the world pursue data-driven marketing efforts which are relevant to their users and catered to their preferences.

https://www.kaggle.com/c/talkingdata-mobile-user-demographics

## Load the data

### gender_age_train.csv, gender_age_test.csv -- the training and test set

Variable to predict: Group


In [1]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = [13, 7]

datadir = '../input'
gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'), index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'), index_col = 'device_id')
print('train shape: {}, test shape: {}'.format(gatrain.shape, gatest.shape))

phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
print('phone shape', phone.shape)

events = pd.read_csv(os.path.join(datadir,'events.csv'),  parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'], dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))
labelcategories = pd.read_csv(os.path.join(datadir, 'label_categories.csv'))
print('events shape: {}, appevents shape: {}, applabels shape: {}, labels shape: {}'.format(events.shape, appevents.shape, applabels.shape, applabels.shape))


train shape: (74645, 3), test shape: (112071, 0)
phone shape (186716, 2)


  mask |= (ar1 == a)


events shape: (3252950, 4), appevents shape: (32473067, 3), applabels shape: (459943, 2), labels shape: (459943, 2)


**KEY: We used different models for these 2 groups**

## 2.- Feature Engineering

### One Hot Encoding with sparse matrices

In [20]:
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelEncoder
import numpy as np

def get_sparse_data(train, test, col):
    """ A sparse matrix of features can be constructed using the csr_matrix constructor:
        csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
        where "data", "row_ind" and "col_ind" satisfy the
        relationship "a[row_ind[k], col_ind[k]] = data[k]""
    """
    full = pd.concat([train[col], test[col]], axis=0)
    full = full.fillna(-9999)
    appencoder = LabelEncoder().fit(full)
    train[col] = appencoder.transform(train[col])
    test[col] = appencoder.transform(test[col])
    nvalues = len(appencoder.classes_)

    xtr = csr_matrix(
        (np.ones(len(train)), (train.trainrow, train[col])),
        shape=(train.shape[0], nvalues))

    xte = csr_matrix(
        (np.ones(len(test)), (test.testrow, test[col])),
        shape=(test.shape[0], nvalues))
    assert np.sum(xtr) == len(train)
    assert np.sum(xte) == len(test)
    return xtr, xte

def get_sparse_from_grouped(train, test, trans, col):
    """ 
    Example:
    Remember that there are many events for each device.
    for each device I want to mark which event id was purchased with
    I'll have as many event values columns as events are linked to devices    
    """
    trans[col] = trans[col].fillna(-9999)
    appencoder = LabelEncoder().fit(trans[col])
    trans[col] = appencoder.transform(trans[col])
    nvalues = len(appencoder.classes_)

    aggtable = (
        trans.groupby(['device_id', col])['device_id']
        .agg(['count'])
        .merge(train[['trainrow']], how='left', left_index=True, right_index=True)
        .merge(test[['testrow']], how='left', left_index=True, right_index=True)
        .reset_index()
    )
    temp = aggtable.dropna(subset=['trainrow'])
    xtr = csr_matrix(
        (np.ones(len(temp)).astype(float), (temp.trainrow, temp[col])),
        shape=(train.shape[0], nvalues))

    temp = aggtable.dropna(subset=['testrow'])
    xte = csr_matrix(
        (np.ones(len(temp)).astype(float), (temp.testrow, temp[col])),
        shape=(test.shape[0], nvalues))
    assert xtr.shape[0] == len(train)
    assert xte.shape[0] == len(test)
    return xtr, xte

# Simple:
gatrain["trainrow"] = np.arange(gatrain.shape[0])
gatest["testrow"] = np.arange(gatest.shape[0])

gatrain['model'] = phone["device_model"]
gatest['model'] = phone["device_model"]
xtrain, xtest = get_sparse_data(gatrain, gatest, 'model')

# Grouped: 
transactions = appevents.merge(
    events[["device_id"]], how="left", left_on="event_id", right_index=True
)
print('shape of apps in devices: {}'.format(transactions.shape))
xtrain_grouped, xtest_grouped = get_sparse_from_grouped(gatrain, gatest, transactions, 'app_id')

xtrain = hstack([xtrain, xtrain_grouped]).tocsr()
xtest = hstack([xtest, xtest_grouped]).tocsr()
xtrain

shape of apps in devices: (32473067, 4)


<74645x20836 sparse matrix of type '<class 'numpy.float64'>'
	with 990277 stored elements in Compressed Sparse Row format>

### Bag of brands and labels

#### For each APP, we list all the labels

In [24]:
# app_lab = pd.read_csv(os.path.join(datadir,'app_labels.csv'))
# app_lab = app_lab.groupby("app_id")["label_id"].apply(
#     lambda x: " ".join(str(s) for s in x)
# )
# appevents["app_lab"] = appevents["app_id"].map(app_lab)
# appevents = appevents.groupby("event_id")["app_lab"].apply(
#     lambda x: " ".join(str(s) for s in x)
# )
# events["app_lab"] = events.index.map(appevents)
# events = events.groupby("device_id")["app_lab"].apply(
#     lambda x: " ".join(str(s) for s in x)
# )
# gatrain["app_lab"] = gatrain.index.map(events)
# gatest["app_lab"] = gatest.index.map(events)

# gatrain['device_model'] = phone['device_model']
# gatrain['phone_brand'] = phone['phone_brand']

# gatest['device_model'] = phone['device_model']
# gatest['phone_brand'] = phone['phone_brand']

# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# def get_hash_data(train, test):
#     df = pd.concat((train, test), axis=0, ignore_index=True, sort=True)
#     split_len = len(train)
#     tfv = TfidfVectorizer(min_df=1)
#     df = (
#         df[["phone_brand", "device_model", "app_lab"]]
#         .astype(np.str)
#         .apply(lambda x: " ".join(s for s in x), axis=1)
#         .fillna("Missing")
#     )
#     df_tfv = tfv.fit_transform(df)
#     train = df_tfv[:split_len, :]
#     test = df_tfv[split_len:, :]
#     return train, test

# xtrain_bag, xtest_bag = get_hash_data(gatrain, gatest)
# xtrain_bag

<74645x2045 sparse matrix of type '<class 'numpy.float64'>'
	with 1878613 stored elements in Compressed Sparse Row format>

### Fitting NNET

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.optimizers import SGD
from keras.layers.advanced_activations import PReLU


def baseline_model2(num_columns):
    model = Sequential()
    model.add(Dropout(0.4, input_shape=(num_columns,)))
    model.add(Dense(60))
    model.add(PReLU())
    model.add(Dropout(0.30))
    model.add(Dense(40, init="normal", activation="tanh"))
    model.add(PReLU())
    model.add(Dropout(0.20))

    model.add(Dense(12, init="normal", activation="softmax"))
    model.compile(
        loss="categorical_crossentropy", optimizer="adadelta", metrics=["accuracy"]
    )
    return model


folds = pd.read_csv(os.path.join(datadir, "../folds/folds_5.csv"), index_col="device_id")

targetencoder = LabelEncoder().fit(gatrain.group)
y = targetencoder.transform(gatrain.group)
nclasses = len(targetencoder.classes_)
dummy_y = np_utils.to_categorical(y)


pred = np.zeros((y.shape[0], nclasses * 2))
pred_test = np.zeros((gatest.shape[0], nclasses * 2))
n_folds = len(folds["fold"].unique())
nbags = 1
nepoch = 20
print(
    "Starting training Logistic and Keras. Using {} folds, bagged {} times".format(
        n_folds, nbags
    )
)


for fold_id in range(1, n_folds + 1):
    train_id = folds["fold"].values != fold_id
    valid_id = folds["fold"].values == fold_id

    Xtr, Ytr = xtrain[train_id, :], y[train_id]
    Xva, Yva = xtrain[valid_id, :], y[valid_id]

    # Logistic regression
    clf1 = LogisticRegression(
        C=0.06, multi_class="multinomial", solver="lbfgs"
    ) 
    clf1.fit(Xtr, Ytr)
    pred[valid_id, 0:12] = clf1.predict_proba(Xva)

    score_val = log_loss(Yva, pred[valid_id, 0:12])
    print("Logistic logloss for fold {} is {}".format(fold_id, score_val))

    ## Fitting Keras ------------------------>
    # First, train on all data, but only no-events feature. Validate with no events:
    Xtr, Ytr_dum = xtrain[train_id, :], dummy_y[train_id]
    Xva, Yva_dum = xtrain[valid_id, :], dummy_y[valid_id]

    for j in range(nbags):
        model = baseline_model2(Xtr.shape[1])
        fit = model.fit(
            xtrain, dummy_y,
            epochs=nepoch,
            batch_size=512,
            verbose=1,
        )

        #pred[valid_id_ne, 12:25] += model.predict_generator(
        #    generator=batch_generatorp(Xva, 400, False), val_samples=Xva.shape[0]
        #)
        pred[valid_id, 12:25] += model.predict(Xva)

        # pred += model.predict_generator(generator=batch_generatorp(xval, 800, False), val_samples=xval.shape[0])
        ## average predictions

    pred[valid_id, 12:25] /= nbags

    score_val = log_loss(Yva, pred[valid_id, 12:25])
    print("Total: Keras-n/e logloss for fold {} is {}".format(fold_id, score_val))

score_val = log_loss(y, pred[:, 12:25])
print("Keras: logloss for {} folds is {}".format(n_folds, score_val))


Starting training Logistic and Keras. Using 5 folds, bagged 1 times




Logistic logloss for fold 1 is 2.286249574007255
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Total: Keras-n/e logloss for fold 1 is 2.1436058350843448




Logistic logloss for fold 2 is 2.292489002525951
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Total: Keras-n/e logloss for fold 2 is 2.1594185256838085




Logistic logloss for fold 3 is 2.2864477966004864
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Total: Keras-n/e logloss for fold 3 is 2.1561983220452694




Logistic logloss for fold 4 is 2.2937916211011347
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Total: Keras-n/e logloss for fold 4 is 2.157345314810835




Logistic logloss for fold 5 is 2.2819986488647284
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Total: Keras-n/e logloss for fold 5 is 2.1510965525920676
Keras: logloss for 5 folds is 2.153533206867103


### Predicting for the test set:

In [None]:
##Predicting the test set
np.random.seed(seed)
for j in range(nbags):
    model = baseline_model2(xtrain.shape[1])  
    fit = model.fit(
        xtrain, dummy_y,
        epochs=nepoch,
        batch_size=381,
        verbose=1,
    )

    pred_test[test_id, 12:25] += model.predict(
        generator=batch_generatorp(xtest[test_id_ne, :], 400, False),
        val_samples=xtest[test_id, :].shape[0],
    )
    print("bagg test no events:", j)

pred_test[test_id, 12:25] /= nbags


##Predicting the test set Regression- With Events
clf2 = LogisticRegression(
    C=0.016, multi_class="multinomial", solver="lbfgs"
)
clf2.fit(Xtrain, y)
pred_test[test_id, 0:12] = clf2.predict_proba(Xtest[test_id, :])

col_names = np.concatenate((targetencoder.classes_, targetencoder.classes_), axis=0)
pred_train_df = pd.DataFrame(pred, index=gatrain.index, columns=col_names)
pred_test_df = pd.DataFrame(pred_test, index=gatest.index, columns=col_names)

## Generating submissions for kaggle:
# pred_train_df.to_csv(
#     "preds/keras_cv5_2_bagging_split_train.csv", index=True, index_label="device_id"
# )
# pred_test_df.to_csv(
#     "preds/keras_cv5_2_bagging_split_test.csv", index=True, index_label="device_id"
# )