In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from mixed_naive_bayes import MixedNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Mixed NB

In [28]:
chunksize = 10 ** 6
num_of_chunk = 0
train = pd.DataFrame()

for chunk in pd.read_csv("train.gz", compression='gzip',chunksize=chunksize):
    num_of_chunk += 1
    train = pd.concat([train, chunk.sample(frac=.05, replace=False, random_state=123)], axis=0)
    print('Processing Chunk No. ' + str(num_of_chunk))     
    
train.reset_index(inplace=True)

# 備份train 資料長度，以便稍後df重新分割索引用途
train_len = len(train)
train_len

Processing Chunk No. 1
Processing Chunk No. 2
Processing Chunk No. 3


134189

In [29]:
train_df = train

In [30]:
test_df = pd.read_csv("test.gz", compression='gzip', header='infer')

In [31]:
train_df = train_df.drop(['index', 'id'], axis=1)

In [32]:
test_df = test_df.drop(['id'], axis=1)

In [33]:
test_df

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,14102201,5,0,978ac369,7dec9eb9,9ccfa2ea,ecda2021,2021c8a9,07d7df22,a99f214a,...,1,0,21694,201,31,2075,3,11,152,30
1,14102201,5,1,d410bf4e,dfe994f8,74073276,ecda2021,2021c8a9,07d7df22,a99f214a,...,1,0,16858,201,31,1465,3,7,201,11
2,14102201,5,0,078d3465,dd641cc7,8fd0aea4,ecda2021,2021c8a9,07d7df22,a99f214a,...,1,0,21759,201,31,2080,0,3,84,209
3,14102201,5,1,e0db3d09,b30bad9c,74073276,ecda2021,2021c8a9,07d7df22,a99f214a,...,1,0,19950,201,31,1378,3,135,76,11
4,14102201,5,1,9b971c93,983b49d4,f66779e6,ecda2021,2021c8a9,07d7df22,a99f214a,...,1,0,19950,201,31,1378,3,135,-1,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473605,14102205,5,0,8cbf056b,85c6289a,f66779e6,ecda2021,2021c8a9,07d7df22,a99f214a,...,1,0,8330,201,31,339,3,143,78,11
473606,14102205,5,1,248ed81a,7fb09894,74073276,ecda2021,2021c8a9,07d7df22,a99f214a,...,1,0,6616,201,31,154,2,3,131,20
473607,14102205,5,0,078d3465,dd641cc7,8fd0aea4,ecda2021,2021c8a9,07d7df22,a99f214a,...,1,0,21763,201,31,2080,0,3,85,209
473608,14102205,5,1,1b84e3e3,2064e137,f66779e6,ecda2021,2021c8a9,07d7df22,a99f214a,...,1,0,15705,201,31,1300,0,3,-1,67


In [34]:
def get_hour(hour):
    h = str(hour)[-2:]
    return h

train_df['hour'] = train_df.hour.apply(get_hour)
test_df['hour'] = test_df.hour.apply(get_hour)

In [35]:
len_of_feature_count = []
for i in train_df.columns[2:23].tolist():
    print(i, ':', len(train_df[i].astype(str).value_counts()))
    len_of_feature_count.append(len(train_df[i].astype(str).value_counts()))

C1 : 4
banner_pos : 5
site_id : 1568
site_domain : 1452
site_category : 20
app_id : 1
app_domain : 1
app_category : 1
device_id : 6639
device_ip : 86738
device_model : 2840
device_type : 2
device_conn_type : 2
C14 : 481
C15 : 4
C16 : 4
C17 : 143
C18 : 4
C19 : 32
C20 : 148
C21 : 33


In [36]:
train_df = train_df.drop(['app_id', 'app_domain', "app_category"], axis=1)
test_df = test_df.drop(['app_id', 'app_domain', "app_category"], axis=1)

In [37]:
train_df.columns

Index(['click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'device_id', 'device_ip', 'device_model',
       'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18',
       'C19', 'C20', 'C21'],
      dtype='object')

In [39]:
obj_features = []

for i in train_df.columns[2:]:
    if len(train_df[i].astype(str).value_counts()) > 10:
        obj_features.append(i)
obj_features

['site_id',
 'site_domain',
 'site_category',
 'device_id',
 'device_ip',
 'device_model',
 'C14',
 'C17',
 'C19',
 'C20',
 'C21']

In [40]:
for feature in obj_features:
    train_df[feature] = train_df[feature].apply(hash)
    test_df[feature] = test_df[feature].apply(hash)

In [45]:
category_features = []
for feature in train_df.columns:
    if feature not in obj_features:
        category_features.append(feature)
category_features

['click',
 'hour',
 'C1',
 'banner_pos',
 'device_type',
 'device_conn_type',
 'C15',
 'C16',
 'C18']

In [46]:
label_encoder = LabelEncoder()
for feature in category_features[1:]:
    print("Processing: " + feature)
    train_df.loc[:, feature] = label_encoder.fit_transform(train_df.loc[:, feature])
    test_df.loc[:, feature] = label_encoder.fit_transform(test_df.loc[:, feature])

Processing: hour
Processing: C1
Processing: banner_pos
Processing: device_type
Processing: device_conn_type
Processing: C15
Processing: C16
Processing: C18


In [71]:
x_train = train_df.loc[:, (train_df.columns != 'click')]
x_train = x_train.drop(['C15', 'C16'], axis=1)
y_train = train_df[["click"]]

In [72]:
category_features.remove("C15")
category_features.remove("C16")
category_features

['click', 'hour', 'C1', 'banner_pos', 'device_type', 'device_conn_type', 'C18']

In [73]:
categorical_features_input = []
for feature in category_features[1:]:
    print(feature + "  " + str(x_train.columns.get_loc(feature)))
    categorical_features_input.append(x_train.columns.get_loc(feature))

hour  0
C1  1
banner_pos  2
device_type  9
device_conn_type  10
C18  13


In [74]:
clm = MixedNB(categorical_features=categorical_features_input)

In [75]:
clm_result = clm.fit(x_train, np.ravel(y_train))

[24  4  5  2  2  4]


In [76]:
x_test = test_df
x_test = x_test.drop(['C15', 'C16'], axis=1)

prediction_test_class = clm_result.predict(x_test)

In [91]:
prediction_test_prob = clm_result.predict_proba(x_test)

In [70]:
for f in category_features[1:]:
    print(f)
    print(sorted(x_train[f].unique()))
    print(sorted(x_test[f].unique()))
    print()

hour
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
[0, 1, 2, 3, 4]

C1
[0, 1, 2, 3]
[0, 1, 2, 3]

banner_pos
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]

device_type
[0, 1]
[0, 1]

device_conn_type
[0, 1]
[0, 1]

C15
[0, 1, 2, 3]
[0, 1, 2, 3, 4]

C16
[0, 1, 2, 3]
[0, 1, 2, 3, 4, 5]

C18
[0, 1, 2, 3]
[0, 1, 2, 3]



In [90]:
x_test

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C17,C18,C19,C20,C21
0,0,2,0,-130733713507312859,4216390159525055228,8989390496182156993,8139837926009990151,2322932187405145523,-6653854128764593809,1,0,21694,2075,3,11,152,30
1,0,2,1,-8420732336032201532,-5795910976872708696,3241280909458911684,8139837926009990151,4458808066245559643,-9093417987928466091,1,0,16858,1465,3,7,201,11
2,0,2,0,-3367094153135357327,5971589140873687243,5481027797147490012,8139837926009990151,-643444275609898529,-4274244308430995516,1,0,21759,2080,0,3,84,209
3,0,2,1,2871429882650405294,1636531627246496509,3241280909458911684,8139837926009990151,8928909494443258300,-7755109868502984308,1,0,19950,1378,3,135,76,11
4,0,2,1,-4469581802049383141,1652344638219598869,-2127969792040958913,8139837926009990151,-5768234327574002625,186393860440689725,1,0,19950,1378,3,135,-2,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473605,4,2,0,-6041492807631839275,-6513112801695970150,-2127969792040958913,8139837926009990151,-2845334342924406475,186393860440689725,1,0,8330,339,3,143,78,11
473606,4,2,1,-335022880208110853,-3523361295620922062,3241280909458911684,8139837926009990151,-8182498889943577497,-9144191600576006691,1,0,6616,154,2,3,131,20
473607,4,2,0,-3367094153135357327,5971589140873687243,5481027797147490012,8139837926009990151,5437265425059495094,-950161541193647887,1,0,21763,2080,0,3,85,209
473608,4,2,1,-1093639372994759372,-3428769563543445904,-2127969792040958913,8139837926009990151,-6063138989402597750,-3919272212876345799,1,0,15705,1300,0,3,-2,67


In [92]:
prediction_test_prob

array([[2.31457213e-185, 3.53698662e-186],
       [4.89758711e-186, 3.53388742e-187],
       [1.95596514e-184, 5.17774569e-185],
       ...,
       [1.80688789e-184, 4.27961124e-185],
       [2.33259134e-185, 5.23312923e-186],
       [3.11509278e-185, 5.57908380e-186]])

In [95]:
test_df = pd.read_csv("test.gz", compression='gzip', header='infer')

In [97]:
output_dic_rfclassifier = {"id": list(test_df["id"]), "ctr": prediction_test_prob[:, 1]}
output_df_rfclassifier = pd.DataFrame.from_dict(output_dic_rfclassifier)

In [98]:
output_df_rfclassifier

Unnamed: 0,id,ctr
0,2683788,3.536987e-186
1,2683789,3.533887e-187
2,2683790,5.177746e-185
3,2683791,2.090392e-186
4,2683792,4.366177e-186
...,...,...
473605,3157393,1.439616e-186
473606,3157394,2.570375e-186
473607,3157395,4.279611e-185
473608,3157396,5.233129e-186


In [99]:
output_df_rfclassifier.to_csv("submission_v10.csv", index=False)