In [None]:
# !export LC_ALL=en_US.UTF8

In [1]:
import pandas as pd
import numpy as np

import pandas_profiling

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from create_submission import create_submission

---------

## Reading

In [3]:
# !ls ./data/

* Events

In [3]:
events = pd.read_csv("data/events.csv", dtype={'device_id': np.str})
events['counts'] = events.groupby(['device_id'])['event_id'].transform('count')
events_small = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first')

* Brands

In [None]:
phones = pd.read_csv("data/phone_brand_device_model.csv", dtype={'device_id': np.str})
phones.drop_duplicates('device_id', keep='first', inplace=True)
phones = map_column(phones, 'phone_brand')
phones = map_column(phones, 'device_model')

* Train & test

In [None]:
train = pd.read_csv("data/gender_age_train.csv", dtype={'device_id': np.str})
train = map_column(train, 'group')
train = train.drop(['age'], axis=1)
train = train.drop(['gender'], axis=1)
train = pd.merge(train, pbd, how='left', on='device_id', left_index=True)
train = pd.merge(train, events_small, how='left', on='device_id', left_index=True)
train.fillna(-1, inplace=True)

test = pd.read_csv("data/gender_age_test.csv", dtype={'device_id': np.str})
test = pd.merge(test, pbd, how='left', on='device_id', left_index=True)
test = pd.merge(test, events_small, how='left', on='device_id', left_index=True)
test.fillna(-1, inplace=True)

* Target

In [None]:
features = list(test.columns.values)
features.remove('device_id')

In [3]:
# app_labels = pd.read_csv("data/app_labels.csv")
# app_events = pd.read_csv("data/app_events.csv")
# train = pd.read_csv("data/gender_age_train.csv")
# # phones = pd.read_csv("data/phone_brand_device_model.csv")
# phones = pd.read_csv("data/phones.csv")
# label_categories = pd.read_csv("data/label_categories.csv")

# events = pd.read_csv("data/events.csv")

# sample_sub = pd.read_csv("data/sample_submission.csv")

In [5]:
# print(app_labels.shape)
# app_labels.head()

# print(app_events.shape)
# app_events.head()

print(train.shape)
# train.head()

print(test.shape)
# test.head()

print(phones.shape)
# phones.head()

print(events.shape)
# events.head()

print(label_categories.shape)
# label_categories.head()

(459943, 2)
(32473067, 4)
(74645, 4)
(187245, 3)
(3252950, 5)
(930, 2)


---------

## Cleaning

### File descriptions

* `gender_age_train.csv`, `gender_age_train.csv` - the training and test set
    * `group`: this is the target variable you are going to predict
* `events.csv`, `app_events.csv` - when a user uses TalkingData SDK, the event gets logged in this data. Each event has an event id, location (lat/long), and the event corresponds to a list of apps in `app_events`.
    * `timestamp`: when the user is using an app with TalkingData SDK
* `app_labels.csv` - apps and their labels, the `label_id`'s can be used to join with `label_categories`
* `label_categories.csv` - apps' labels and their categories in text
* `phone_brand_device_model.csv` - device ids, brand, and models
    * `phone_brand`: note that the brands are in Chinese
        * 小米 xiaomi
        * 三星 sangsung
        * 苹果 apple
        * 华为 huawei
        * 魅族 meizu
        * 酷派 coolpad
        * 天语 K-touch
        * 中兴 ZTE
        * 金立 gionee
        * 海信 Hisense
        * 联想 Lenovo
        * 索尼 sony
        * 酷比 koobee
        * 康佳 konka
        * 奇酷 qiku
        * 欧博信 opsson
* `sample_submission.csv` - a sample submission file in the correct format

In [89]:
sample_sub.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
1,-1547860181818787117,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
2,7374582448058474277,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
3,-6220210354783429585,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
4,-5893464122623104785,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833


-----
### Translating phone brands' names

In [70]:
phone_names_translation = pd.read_csv("data/phone_names_translation.csv", 
                                      header = None, 
                                      names = ['phone_brand', 'brand'])
print(phone_names_translation)

# phones = pd.merge(phones, phone_names_translation)
# del phones['phone_brand']
# phones.to_csv("phones.csv", index_label=False, encoding='UTF8')

print('\n\n========== phones\' last 10 rows')
print(phones.tail(10))

   phone_brand     brand
0           小米    xiaomi
1           三星  sangsung
2           苹果     apple
3           华为    huawei
4           魅族     meizu
5           酷派   coolpad
6           天语   K-touch
7           中兴       ZTE
8           金立    gionee
9           海信   Hisense
10          联想    Lenovo
11          索尼      sony
12          酷比    koobee
13          康佳     konka
14          奇酷      qiku
15         欧博信    opsson


                  device_id device_model   brand
145281 -4482696120138446487     IVO 6655  opsson
145282  4613939269386649923          Q3C  opsson
145283 -2477559452412485718     IVO 6666  opsson
145284 -7687510258203758013           D1  opsson
145285  1420741651959055894     IVO 6688  opsson
145286  2457053156373760353     IVO 8800  opsson
145287  8728719520886912421     IVO 6655  opsson
145288  2195155146355827586     IVO 8800  opsson
145289 -7148236089014830095     IVO 8800  opsson
145290 -4646161890445337322          Q3C  opsson


In [None]:
# app_labels['app_id'] = [str(i) for i  in app_labels.app_id]
# app_labels['label_id'] = [str(i) for i  in app_labels.label_id]

# app_events['app_id'] = [str(i) for i  in app_events.app_id]
# app_events['event_id'] = [str(i) for i  in app_events.event_id]

# label_categories['label_id'] = [str(i) for i  in label_categories.label_id]
# ga_train['device_id'] = [str(i) for i  in ga_train.device_id]
# phones['device_id'] = [str(i) for i in phones.device_id]

# events['device_id'] = [str(i) for i in events.device_id]
# events['event_id'] = [str(i) for i in events.event_id]

---------

### Merging

In [31]:
df_apps = pd.merge(app_labels, label_categories)

In [56]:
df_people = pd.merge(phones, train)

In [28]:
df = pd.merge(df_apps, df_people)

In [5]:
df_events = pd.merge(events, app_events)

In [None]:
df = pd.merge(df, df_events)

In [None]:
df = (pd.merge(pd.merge(pd.merge(pd.merge
             (pd.merge(events, app_events), app_labels), 
         label_categories), 
        train), phones))

# df = pd.merge(pd.merge(events, app_events), app_labels)
# df = (pd.merge(
#         pd.merge(app_events, 
#                         pd.merge(events, pd.merge(train, phones))), 
#         app_labels))

In [29]:
print(df.head())
print(df.shape)

   event_id            device_id            timestamp  longitude  latitude  \
0         6  1476664663289716375  2016-05-01 00:27:21        0.0       0.0   
1     58641  1476664663289716375  2016-05-02 21:08:36        0.0       0.0   
2    131844  1476664663289716375  2016-05-07 12:17:55        0.0       0.0   
3    150542  1476664663289716375  2016-05-07 12:17:01        0.0       0.0   
4    184730  1476664663289716375  2016-05-06 19:45:53        0.0       0.0   

                app_id  is_installed  is_active  label_id  \
0  5927333115845830913             1          1       549   
1  5927333115845830913             1          1       549   
2  5927333115845830913             1          1       549   
3  5927333115845830913             1          1       549   
4  5927333115845830913             1          1       549   

                category device_model   brand gender  age group  
0  Property Industry 1.0       Mate 7  huawei      M   19  M22-  
1  Property Industry 1.0       M

---------

## Exploring

----->     `    eda.ipynb`

---------

## Splitting

In [None]:
X = df.drop('group', axis=1)
y = df.group
print(X.shape)
print(y.shape)

In [36]:
from sklearn.cross_validation import train_test_split
(X_train, X_test, 
 y_train, y_test) = train_test_split(X, y, 
                                     test_size = 0.3, 
                                     random_state = 0)

              device_id gender  age   group phone_brand device_model
0  -8076087639492063270      M   35  M32-38          小米         MI 2
1  -2897161552818060146      M   35  M32-38          小米         MI 2
2  -8260683887967679142      M   35  M32-38          小米         MI 2
3  -4938849341048082022      M   30  M29-31          小米       红米note
4    245133531816851882      M   30  M29-31          小米         MI 3
(74839, 5)
(74839,)
