In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
import zipfile
import time
import shutil
from sklearn.metrics import log_loss

In [2]:
# Load the raw competition tables plus the pre-computed event / app-text files.
# app_id is forced to a string dtype in both app tables so the ids are kept as text
# rather than being parsed as numbers. The builtin `str` is used instead of the
# `np.str` alias, which was only ever an alias for `str` and is removed in NumPy >= 1.24.
gender_train = pd.read_csv('gender_age_train.csv')
gender_test = pd.read_csv('gender_age_test.csv')
phone_brand = pd.read_csv('phone_brand_device_model.csv')
app_events = pd.read_csv('app_events.csv', dtype={'app_id': str})
app_labels = pd.read_csv('app_labels.csv', dtype={'app_id': str})
labels = pd.read_csv('label_categories.csv')
# events_new.csv stores its index in the first column
events = pd.read_csv('events_new.csv', index_col=0)
app_text = pd.read_csv('app_text.csv')

In [3]:
# data: every event row joined with its app text (inner join on event_id).
data = events.merge(app_text, on='event_id')

In [4]:
# app_ev: a copy of app_events without the 'is_installed' and 'is_active' columns.
# app_events itself is not modified (drop returns a new frame).
app_ev = app_events.drop(['is_installed', 'is_active'],axis=1)

In [5]:
# app_cnt: one row per app_id with the number of (non-null) event_id values recorded for it.
# NOTE(review): app_cnt is not referenced again in the visible part of the notebook.
app_cnt = app_ev.groupby('app_id', as_index=False)['event_id'].count()

In [6]:
# data: inner-join the event/app-text frame with app_events on event_id.
# Columns other than event_id that exist on both sides get pandas' default
# '_x' (left) / '_y' (right) suffixes.
data = data.merge(app_events, on='event_id')

In [7]:
#data.head(2)

In [8]:
# Replace the '_' inside long_lat with a space, using the vectorised string accessor
# instead of a per-row Python lambda. Unlike the lambda, .str.replace leaves missing
# values as NaN instead of raising on them.
# Presumably this makes the two parts separate tokens for the later text vectoriser - confirm.
data['long_lat'] = data['long_lat'].str.replace('_', ' ')

In [9]:
# One row per (event_id, timestamp). The explicit .copy() makes data_1 an independent
# frame, so adding columns to it later does not raise pandas' SettingWithCopyWarning.
data_1 = data.drop_duplicates(subset=['event_id', 'timestamp']).copy()

In [10]:
# date: the first 10 characters of the timestamp string
# (presumably the YYYY-MM-DD part - confirm against the timestamp format).
# The vectorised .str slice replaces the Python-level list comprehension and keeps NaN as NaN.
data_1['date'] = data_1['timestamp'].str[:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [11]:
#data_1.head(2)

In [12]:
# data_2: per device, the distinct dates and the distinct long_lat strings, each joined
# into one space-separated string.
# - Columns are selected with a list ([['date', 'long_lat']]); the bare-tuple form
#   ['date', 'long_lat'] on a GroupBy is deprecated and rejected by newer pandas.
# - sorted() makes the joined string deterministic; iteration order of a set of strings is not.
data_2 = (
    data_1.groupby('device_id', as_index=False)[['date', 'long_lat']]
    .agg(lambda values: ' '.join(sorted(set(values))))
)

In [13]:
# Keep a single row per device: drop_duplicates retains the first row encountered for each
# device_id, so only that row's event-level columns remain in `data`.
# NOTE: this rebinds `data`; data_1 / data_2 were built from the frame before this step.
data = data.drop_duplicates(subset=['device_id'])

In [14]:
# Remove repeated device_id rows from phone_brand, keeping the first occurrence of each.
phone_brand = phone_brand.drop_duplicates(subset='device_id', keep='first')

In [15]:
# Attach the per-device event row from `data` to the train and test device lists.
# how='left' keeps every device from gender_train / gender_test, with NaN in the
# event columns for devices that have no row in `data`.
train_data = pd.merge(gender_train, data, on='device_id', how='left')
test_data = pd.merge(gender_test, data, on='device_id', how='left')

In [16]:
# Add the phone brand / device model columns.
# NOTE: no `how` is given, so this is pandas' default inner join - a device with no
# row in phone_brand would be dropped from train_data / test_data here.
train_data = pd.merge(train_data, phone_brand, on='device_id')
test_data = pd.merge(test_data, phone_brand, on='device_id')

In [17]:
# Add the per-device aggregated 'long_lat' and 'date' strings from data_2 (left join, so
# devices without events keep NaN). A column name present on both sides gets the default
# '_x' / '_y' suffixes; the '_y' version is the one coming from data_2.
train_data = pd.merge(train_data, data_2[['device_id','long_lat','date']], on='device_id', how='left')
test_data = pd.merge(test_data, data_2[['device_id','long_lat','date']], on='device_id', how='left')

In [18]:
# Replace every missing value, in every column, with the placeholder string 'no_data'.
train_data = train_data.fillna('no_data')
test_data = test_data.fillna('no_data')

In [19]:
# group: the target column, pulled out before train_data is narrowed to feature columns.
group = train_data['group']
# Keep only device_id plus the columns that are later concatenated into the 'text' feature.
# 'app_id_x' and 'long_lat_y' are suffixed names produced by the earlier merges.
train_data = train_data[['device_id', 'category', 'app_id_x', 'phone_brand', 'device_model', 'long_lat_y','date']]
test_data = test_data[['device_id', 'category', 'app_id_x', 'phone_brand', 'device_model','long_lat_y','date']]

In [20]:
# Cast each text-feature column of train_data to str so the columns can be concatenated.
for _col in ('category', 'app_id_x', 'phone_brand', 'device_model', 'long_lat_y', 'date'):
    train_data[_col] = train_data[_col].astype(str)

In [21]:
# Cast each text-feature column of test_data to str so the columns can be concatenated.
for _col in ('category', 'app_id_x', 'phone_brand', 'device_model', 'long_lat_y', 'date'):
    test_data[_col] = test_data[_col].astype(str)

In [22]:
# text: the six feature columns joined into one space-separated string per device.
train_data['text'] = train_data['category'].str.cat(
    [
        train_data['app_id_x'],
        train_data['phone_brand'],
        train_data['device_model'],
        train_data['long_lat_y'],
        train_data['date'],
    ],
    sep=' '
)

In [23]:
# text: the six feature columns joined into one space-separated string per device.
test_data['text'] = test_data['category'].str.cat(
    [
        test_data['app_id_x'],
        test_data['phone_brand'],
        test_data['device_model'],
        test_data['long_lat_y'],
        test_data['date'],
    ],
    sep=' '
)

In [24]:
#train_data.head(20)

In [25]:
#test_data.head(20)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# Sanity check: (rows, columns) of the train and test feature frames.
for _frame in (train_data, test_data):
    print(_frame.shape)

(74645, 8)
(112071, 8)


In [28]:
# Stack train rows on top of test rows so one vocabulary can be fitted over both.
# The original row labels are kept (no ignore_index), so index values repeat; row
# order is train first, then test.
total = pd.concat([train_data, test_data])

In [29]:
total.shape

(186716, 8)

In [30]:
# Bag-of-words token counts over the combined train+test text.
# The TfidfVectorizer(min_df=5, max_df=0.85, stop_words='english') instance that used to
# be created on the first line was overwritten by CountVectorizer() before it was ever
# fitted, so that dead assignment has been removed; the result is unchanged.
# vectorizer = TfidfVectorizer(stop_words='english')
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(total.text.values)

In [31]:
# The first train_data.shape[0] rows of X are the train devices, the rest are test devices.
_n_train = train_data.shape[0]
print(X[:_n_train].shape)
print(X[_n_train:].shape)

(74645, 17844)
(112071, 17844)


In [32]:
def map_column(table):
    """Encode a Series as integer codes.

    The distinct values of `table` are sorted and numbered 0, 1, 2, ... in that
    order; a new Series with every value replaced by its number is returned.
    The input Series is not modified.
    """
    codes = {value: position for position, value in enumerate(sorted(table.unique()))}
    return table.map(codes)

In [33]:
# Seeds only Python's `random` module; NumPy's global RNG is not seeded by this call.
random.seed(2016)

def run_xgb(train, test, target, eta=0.1, random_state=0):
    """Train a 12-class XGBoost model on a hold-out split and predict `test`.

    `train` / `target` are split 70/30 (test_size=0.3) with `random_state`; the
    booster is trained for at most 20000 rounds with early stopping after 100
    rounds without improvement on the hold-out part.

    Parameters
    ----------
    train : feature matrix accepted by both train_test_split and xgb.DMatrix
    test : feature matrix accepted by xgb.DMatrix
    target : class ids aligned with the rows of `train`; must expose .tolist()
        after splitting (e.g. a NumPy array)
    eta : learning rate passed to XGBoost
    random_state : seed used for both the split and the booster

    Returns
    -------
    (test_prediction, score) : list of per-class probability rows for `test`,
        and the multi-class log loss on the hold-out part.
    """
    max_depth = 6
    subsample = 0.7
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster" : "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "alpha": 3,
        "min_child_weight": 2,
        "seed": random_state,
    }
    num_boost_round = 20000
    early_stopping_rounds = 100
    test_size = 0.3

    X_train, X_valid, y_train, y_valid = train_test_split(train, target, test_size=test_size, random_state=random_state)
    # TODO change split
    print('Length train:', X_train.shape[0])
    print('Length valid:', X_valid.shape[0])
    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)

    # The last entry of the watch list ('eval') is the one early stopping monitors.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    # best_iteration is the zero-based index of the best round, whereas ntree_limit is a
    # COUNT of rounds to use. Passing best_iteration directly would therefore drop the best
    # round itself. Prefer the booster's own best_ntree_limit when it exists.
    best_ntree_limit = getattr(gbm, 'best_ntree_limit', gbm.best_iteration + 1)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid), ntree_limit=best_ntree_limit)
    score = log_loss(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test), ntree_limit=best_ntree_limit)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score

In [34]:
def create_submission(score, test, prediction):
    """Write a submission CSV for the 12 demographic groups.

    The file is created in the current directory and named
    'submission_<score>_<YYYY-MM-DD-HH-MM>.csv'.

    Parameters
    ----------
    score : value embedded in the file name (converted with str())
    test : object whose ['device_id'].values gives the device ids, one per output row
    prediction : indexable sequence; prediction[i][0..11] are the class
        probabilities for the i-th device, in the column order of the header

    Raises
    ------
    IndexError if `prediction` has fewer rows than `test` or a row has fewer
    than 12 entries.
    """
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    device_ids = test['device_id'].values
    # `with` guarantees the file is closed even if a row raises part-way through.
    with open(sub_file, 'w') as f:
        f.write('device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+\n')
        for i, device in enumerate(device_ids):
            fields = [str(device)]
            fields.extend(str(prediction[i][j]) for j in range(12))
            f.write(','.join(fields) + '\n')

In [35]:
#map_group = map_column(group)
from sklearn.preprocessing import LabelEncoder

In [36]:
#map_group = np.array(list(map_group))
# Encode the target group labels as integer class ids. The fitted encoder is kept in
# `lable_group` (sic) so that its classes_ attribute can name the probability columns later.
lable_group = LabelEncoder()
map_group = lable_group.fit_transform(group)

In [37]:
# Split the train rows of X (the first train_data.shape[0] rows) into a fit part and a
# hold-out part. train_size=0.999 leaves only about 0.1% of the rows for validation
# (75 rows in the recorded run), so validation metrics computed on it are very noisy.
X_train, X_val, y_train, y_val = train_test_split(X[:train_data.shape[0]], map_group, train_size=0.999, random_state=10)

print("# Num of Features: ", X_train.shape[1])

('# Num of Features: ', 17844)


In [38]:
type(X_train)

scipy.sparse.csr.csr_matrix

In [39]:
# Shapes of the fit / hold-out feature matrices and label vectors.
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

((74570, 17844), (75, 17844), (74570,), (75,))


In [40]:
#------------------------------------------------- Write functions ----------------------------------------

def rstr(df):
    """Return a quick structural summary of a DataFrame.

    The result is a 5-tuple:
    (dtypes, first three rows, per-column unique values each wrapped in a
    one-element list, per-column distinct counts each wrapped in a one-element
    list, shape).
    """
    column_types = df.dtypes
    preview = df.head(3)
    distinct_values = df.apply(lambda col: [col.unique()])
    distinct_counts = df.apply(lambda col: [len(col.unique())])
    return column_types, preview, distinct_values, distinct_counts, df.shape

def batch_generator(X, y, batch_size, shuffle):
    """Endlessly yield dense (X_batch, y_batch) mini-batches from a sparse matrix.

    Based on chenglong's code for fitting from a generator
    (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)

    Parameters
    ----------
    X : matrix supporting X.shape, row indexing with an index array and .toarray()
        (e.g. a scipy CSR matrix)
    y : array indexable with an index array, aligned with the rows of X
    batch_size : int, number of rows per batch (the last batch of a pass may be smaller)
    shuffle : if true, the row order is reshuffled with np.random before every pass

    Yields
    ------
    (X_batch, y_batch) : dense 2-D array and the matching labels. After the last
    batch of a pass the generator starts over from the beginning.
    """
    n_samples = X.shape[0]
    # Integer ceiling division. The previous np.ceil(n / batch_size) floored under Python 2
    # integer division, which silently dropped the final partial batch of every pass.
    # max(1, ...) keeps the wrap-around working for an empty X.
    number_of_batches = max(1, -(-n_samples // batch_size))
    counter = 0
    sample_index = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        # Only the current batch is densified, so the full matrix never has to fit in memory.
        X_batch = X[batch_index, :].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if counter == number_of_batches:
            # End of a pass: optionally reshuffle, then restart from the first batch.
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    """Endlessly yield dense feature batches (no labels) in row order, for prediction.

    Parameters
    ----------
    X : matrix supporting X.shape, row indexing with an index array and .toarray()
        (e.g. a scipy CSR matrix)
    batch_size : int, number of rows per batch (the last batch of a pass may be smaller)
    shuffle : accepted for signature compatibility with batch_generator; it is not used,
        rows are always yielded in their original order

    Yields
    ------
    X_batch : dense 2-D array. After the last batch of a pass the generator
    starts over from the first row.
    """
    n_samples = X.shape[0]
    # Number of batches in one pass (integer ceiling division). The previous expression,
    # n / ceil(n / batch_size), is roughly the batch size rather than the batch count,
    # so the counter never wrapped and empty batches were yielded once the rows ran out.
    number_of_batches = max(1, -(-n_samples // batch_size))
    counter = 0
    sample_index = np.arange(n_samples)
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if counter == number_of_batches:
            counter = 0

In [44]:
# 2.28 on the leaderboard (LB) - bad news, everyone: my features are bad :'(

In [43]:
from keras.layers.advanced_activations import PReLU
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.optimizers import SGD


#act = keras.layers.advanced_activations.PReLU(init='zero', weights=None)

def baseline_model():
    """Build and compile a two-hidden-layer MLP (320 and 120 units) with a 12-way softmax.

    The input width is read from the global X_train. Each hidden Dense layer is
    followed by PReLU and Dropout (0.4, then 0.3). The model is compiled with
    sparse categorical cross-entropy, the 'adadelta' optimizer and an accuracy metric.
    """
    n_features = X_train.shape[1]
    stack = [
        Dense(320, input_dim=n_features, init='normal'),
        PReLU(),
        Dropout(0.4),
        # NOTE(review): input_dim is also passed to this non-first layer, as in the
        # original; presumably only the first layer needs it - confirm.
        Dense(120, input_dim=n_features, init='normal'),
        PReLU(),
        Dropout(0.3),
        Dense(12, init='normal', activation='softmax'),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    # Integer class ids are used as targets, hence the sparse variant of the loss.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model

# Build the network and train it from the shuffling batch generator (batches of 400 rows).
model = baseline_model()

# NOTE(review): samples_per_epoch=69984 is a hard-coded constant; it is neither
# X_train.shape[0] nor a multiple of the batch size 400 - confirm it is intended.
# The small hold-out set is densified once and passed as validation data.
fit = model.fit_generator(generator=batch_generator(X_train, y_train, 400, True),
                         nb_epoch=18,
                         samples_per_epoch=69984,
                         validation_data=(X_val.todense(), y_val), verbose=2
                         )

# Predict class probabilities for the hold-out rows and report their multi-class log loss.
scores_val = model.predict_generator(generator=batch_generatorp(X_val, 400, False), val_samples=X_val.shape[0])
print('logloss val {}'.format(log_loss(y_val, scores_val)))

Epoch 1/18
29s - loss: 2.4548 - acc: 0.1289 - val_loss: 2.4273 - val_acc: 0.0933
Epoch 2/18
29s - loss: 2.4046 - acc: 0.1439 - val_loss: 2.4062 - val_acc: 0.0933
Epoch 3/18
29s - loss: 2.3837 - acc: 0.1548 - val_loss: 2.3890 - val_acc: 0.0933
Epoch 4/18
29s - loss: 2.3645 - acc: 0.1612 - val_loss: 2.3642 - val_acc: 0.0933
Epoch 5/18
29s - loss: 2.3521 - acc: 0.1651 - val_loss: 2.3409 - val_acc: 0.0933
Epoch 6/18
29s - loss: 2.3419 - acc: 0.1704 - val_loss: 2.3338 - val_acc: 0.1067
Epoch 7/18
29s - loss: 2.3320 - acc: 0.1735 - val_loss: 2.3195 - val_acc: 0.0800
Epoch 8/18
29s - loss: 2.3275 - acc: 0.1747 - val_loss: 2.3174 - val_acc: 0.1333
Epoch 9/18
29s - loss: 2.3175 - acc: 0.1771 - val_loss: 2.3107 - val_acc: 0.1333
Epoch 10/18
29s - loss: 2.3146 - acc: 0.1802 - val_loss: 2.3031 - val_acc: 0.1467
Epoch 11/18
29s - loss: 2.3046 - acc: 0.1857 - val_loss: 2.3074 - val_acc: 0.1333
Epoch 12/18
29s - loss: 2.3003 - acc: 0.1861 - val_loss: 2.3044 - val_acc: 0.1467
Epoch 13/18
29s - loss: 2



In [42]:
##################
#  Build Model
##################

from keras.layers.advanced_activations import PReLU
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.optimizers import SGD


#act = keras.layers.advanced_activations.PReLU(init='zero', weights=None)

def _build_mlp(first_units):
    """Build and compile the shared MLP; only the width of the first hidden layer varies.

    Architecture: Dense(first_units) -> PReLU -> Dropout(0.4) -> Dense(50) -> PReLU ->
    Dropout(0.2) -> Dense(12, softmax). The input width is read from the global X_train.
    """
    model = Sequential()
    model.add(Dense(first_units, input_dim=X_train.shape[1], init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.4))
    # NOTE(review): input_dim on this non-first layer is kept from the original code;
    # presumably only the first layer needs it - confirm.
    model.add(Dense(50, input_dim=X_train.shape[1], init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(12, init='normal', activation='softmax'))
    # Integer class ids are used as targets, hence the sparse variant of the loss.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model

def baseline_model():
    """Ensemble member 1: first hidden layer with 150 units.

    NOTE: another cell in this notebook also defines `baseline_model`; whichever
    cell ran last determines which definition is live in the kernel.
    """
    return _build_mlp(150)

def second_model():
    """Ensemble member 2: first hidden layer with 155 units."""
    return _build_mlp(155)

def third_model():
    """Ensemble member 3: first hidden layer with 145 units."""
    return _build_mlp(145)

# Build the three ensemble members; they differ only in the width of the first hidden layer.
model = baseline_model()
model_2 = second_model()
model_3 = third_model()

# Train each member from its own shuffling batch generator (batches of 400 rows, 18 epochs).
# NOTE(review): samples_per_epoch differs between the three fits (69984 / 69784 / 69884)
# and none of the values equals X_train.shape[0] - confirm whether this is intentional.
fit = model.fit_generator(generator=batch_generator(X_train, y_train, 400, True),
                         nb_epoch=18,
                         samples_per_epoch=69984,
                         validation_data=(X_val.todense(), y_val), verbose=2
                         )

fit_2 = model_2.fit_generator(generator=batch_generator(X_train, y_train, 400, True),
                         nb_epoch=18,
                         samples_per_epoch=69784,
                         validation_data=(X_val.todense(), y_val), verbose=2
                         )

fit_3 = model_3.fit_generator(generator=batch_generator(X_train, y_train, 400, True),
                         nb_epoch=18,
                         samples_per_epoch=69884,
                         validation_data=(X_val.todense(), y_val), verbose=2
                         )

# evaluate the model
# Hold-out class probabilities and multi-class log loss, one per ensemble member.
scores_val = model.predict_generator(generator=batch_generatorp(X_val, 400, False), val_samples=X_val.shape[0])
print('logloss val {}'.format(log_loss(y_val, scores_val)))

scores_val_2 = model_2.predict_generator(generator=batch_generatorp(X_val, 400, False), val_samples=X_val.shape[0])
print('logloss val {}'.format(log_loss(y_val, scores_val_2)))

scores_val_3 = model_3.predict_generator(generator=batch_generatorp(X_val, 400, False), val_samples=X_val.shape[0])
print('logloss val {}'.format(log_loss(y_val, scores_val_3)))

# Earlier single-model export, kept commented out for reference.
#print("# Final prediction")
#scores = model.predict_generator(generator=batch_generatorp(test_sp, 800, False), val_samples=test_sp.shape[0])
#result = pd.DataFrame(scores , columns=lable_group.classes_)
#result["device_id"] = device_id
#print(result.head(1))
#result = result.set_index("device_id")

#result.to_csv('./sub_bagofapps7_keras_10_50_pt2_10epoch.csv', index=True, index_label='device_id')
#Drop out 0.2
#Validation 2.3017
#result.to_csv('keras_' + str(log_loss(y_val, scores_val)) + '.csv', index=True, index_label='device_id')


print("Done")

Epoch 1/18
15s - loss: 2.4347 - acc: 0.1336 - val_loss: 2.4198 - val_acc: 0.0667
Epoch 2/18
15s - loss: 2.3984 - acc: 0.1486 - val_loss: 2.4183 - val_acc: 0.0667
Epoch 3/18


KeyboardInterrupt: 

In [None]:
#res, score = run_xgb(X[:train_data.shape[0]], X[train_data.shape[0]:], map_group, eta=0.07, random_state=0)
#print score
#create_submission(score, result_test, res)

In [None]:
#res1, score1 = run_xgb(X[:train_data.shape[0]], X[train_data.shape[0]:], map_group, eta=0.07, random_state=10)

In [None]:
#res2, score2 = run_xgb(X[:train_data.shape[0]], X[train_data.shape[0]:], map_group, eta=0.07, random_state=101)

In [None]:
#res3, score3 = run_xgb(X[:train_data.shape[0]], X[train_data.shape[0]:], map_group, eta=0.07, random_state=110)

In [None]:
#score_tot = (score + score1 + score2 + score3)/4

In [None]:
#res_tot = []
#for i in range(len(res)):
#    res_tot.append(list(np.mean([res[i], res1[i], res2[i], res3[i]], axis=0)))

In [None]:
#create_submission(score3, gender_test, res3)
#create_submission(score_tot, gender_test, res_tot)

In [47]:
# Device ids of the test rows, in the row order of test_data.
device_id = test_data.device_id

In [48]:
print("# Final prediction")
# Rows of X after the first train_data.shape[0] are the test devices; slice them once.
test_matrix = X[train_data.shape[0]:]
scores = model.predict_generator(generator=batch_generatorp(test_matrix, 800, False), val_samples=test_matrix.shape[0])
# One probability column per class, named after the encoder's classes.
result = pd.DataFrame(scores, columns=lable_group.classes_)
# Assign positionally (.values): a Series would be aligned on its index, which would
# produce NaN / misplaced ids if device_id's index ever differed from result's 0..n-1 index.
result["device_id"] = device_id.values
print(result.head(1))
result = result.set_index("device_id")

# Final prediction
       F23-    F24-26    F27-28    F29-32    F33-42      F43+      M22-  \
0  0.000139  0.000453  0.001182  0.006755  0.033858  0.059674  0.003295   

     M23-26    M27-28    M29-31    M32-38      M39+            device_id  
0  0.021044  0.028795  0.092346  0.213323  0.539135  1002079943728939269  


In [50]:
print("# Final prediction")
# Rows of X after the first train_data.shape[0] are the test devices; slice them once.
test_matrix = X[train_data.shape[0]:]
scores_2 = model_2.predict_generator(generator=batch_generatorp(test_matrix, 800, False), val_samples=test_matrix.shape[0])
# One probability column per class, named after the encoder's classes.
result_2 = pd.DataFrame(scores_2, columns=lable_group.classes_)
# Assign positionally (.values): a Series would be aligned on its index, which would
# produce NaN / misplaced ids if device_id's index ever differed from result_2's 0..n-1 index.
result_2["device_id"] = device_id.values
print(result_2.head(1))
result_2 = result_2.set_index("device_id")

# Final prediction
       F23-  F24-26    F27-28   F29-32    F33-42      F43+     M22-    M23-26  \
0  0.000149  0.0006  0.001351  0.00805  0.041345  0.052811  0.00107  0.015825   

    M27-28    M29-31    M32-38      M39+            device_id  
0  0.03088  0.087962  0.231555  0.528403  1002079943728939269  


In [51]:
print("# Final prediction")
# Rows of X after the first train_data.shape[0] are the test devices; slice them once.
test_matrix = X[train_data.shape[0]:]
scores_3 = model_3.predict_generator(generator=batch_generatorp(test_matrix, 800, False), val_samples=test_matrix.shape[0])
# One probability column per class, named after the encoder's classes.
result_3 = pd.DataFrame(scores_3, columns=lable_group.classes_)
# Assign positionally (.values): a Series would be aligned on its index, which would
# produce NaN / misplaced ids if device_id's index ever differed from result_3's 0..n-1 index.
result_3["device_id"] = device_id.values
print(result_3.head(1))
result_3 = result_3.set_index("device_id")

# Final prediction
      F23-    F24-26    F27-28    F29-32    F33-42      F43+      M22-  \
0  0.00008  0.000385  0.000851  0.003951  0.034338  0.047487  0.001262   

     M23-26    M27-28    M29-31    M32-38      M39+            device_id  
0  0.017092  0.024214  0.082074  0.244851  0.543415  1002079943728939269  


In [52]:
# Element-wise sum of the three models' probability tables (aligned on the device_id
# index and on the class columns); divided by 3 in the next cell.
tot = result + result_2 + result_3 

In [53]:
# Turn the sum into the mean of the three models.
# NOTE: this cell is not idempotent - re-running it divides by 3 again.
tot = tot/3

In [54]:
# Bound every averaged probability to the range [0.001, 0.999].
# Rows are not re-normalised afterwards, so they may no longer sum exactly to 1.
tot = tot.clip(lower=0.001, upper=0.999)

In [55]:
# Write the ensemble submission, indexed by device_id. The file name embeds the hold-out
# log loss of the averaged validation predictions of the three models.
# NOTE(review): scores_val / scores_val_2 / scores_val_3 must come from the same kernel
# session as model / model_2 / model_3 for this number to describe the submitted ensemble.
tot.to_csv('keras_my_features_' + str(log_loss(y_val, (scores_val + scores_val_2 + scores_val_3)/3)) + '.csv', index=True, index_label='device_id')