### 資料集簡介
<p>欄位說明:</p>
<p>FileID: 檔案識別ID</p>
<p>CustomerID: 使用者裝置識別ID</p>
<p>QueryTs: 該筆資料發生時間</p>
<p>ProductID: 使用者裝置的產品代碼</p>



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
plt.rcParams['font.family']='SimHei' #顯示中文

%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load in the train datasets
data_dir = "/data/examples/trend/data/"
train = pd.read_csv(data_dir + 'training-set.csv', encoding = "utf-8", header=None)
test = pd.read_csv(data_dir + 'testing-set.csv', encoding = "utf-8", header=None)

In [4]:
#query_log裡面被官方排除的 FileID
train_exc = pd.read_csv(data_dir + 'exception/exception_train.txt', encoding = "utf-8", header=None)
test_exc = pd.read_csv(data_dir + 'exception/exception_testing.txt', encoding = "utf-8", header=None)

In [5]:
test_exc.head(2)

Unnamed: 0,0
0,4eedf630f7160dafea969e5d57239d31
1,8e1c12f42b705cb465bab88225d03c81


In [7]:
# training set - label: 0:非惡意程式, 1:惡意程式
train.columns=['FileID','label']
train.head(2)

Unnamed: 0,FileID,label
0,0000e2398b12121a85166fed5fe2a3da,0
1,0001fe8dce14ce099aa6ca8ea5026ea7,0


In [8]:
# testing set - AUC: Area Under ROC Curve
test.columns=['FileID','AUC']
test.head(2)

Unnamed: 0,FileID,AUC
0,00008c73ee43c15b16c26b26398c1577,0.5
1,0002ded3a0b54f2ffdab0ca77a5ce2b6,0.5


In [32]:
train_exc.columns = ['FileID']
test_exc.columns = ['FileID']
train_exc.head(2)
test_exc.head(2)

Unnamed: 0,FileID
0,4eedf630f7160dafea969e5d57239d31
1,8e1c12f42b705cb465bab88225d03c81


In [9]:
#確認排除的FileID在training set裡面找不到
for item in train_exc:
    print(train[train['FileID']==item])

Empty DataFrame
Columns: [FileID, label]
Index: []


In [10]:
#取0301當天的query log來查看
log_files = os.listdir(data_dir+"query_log")
query_logs = []
for log_file in log_files:
    query_log = pd.read_csv(data_dir + 'query_log/' + log_file, encoding = "utf-8", header=None)
    query_log.columns=['FileID','CustomerID','QueryTs','ProductID']
    query_log['times'] = 1
    query_logs.append(query_log)

In [11]:
querys = pd.concat(query_logs)

In [12]:
querys.describe()

Unnamed: 0,QueryTs,times
count,83273110.0,83273110.0
mean,1492394000.0,1.0
std,2138802.0,0.0
min,1488326000.0,1.0
25%,1490234000.0,1.0
50%,1492291000.0,1.0
75%,1494531000.0,1.0
max,1496275000.0,1.0


In [13]:
querys.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83273110 entries, 0 to 563303
Data columns (total 5 columns):
FileID        object
CustomerID    object
QueryTs       int64
ProductID     object
times         int64
dtypes: int64(2), object(3)
memory usage: 20.2 GB


### 樞杻分析

In [14]:
query_pivot_productid = querys.pivot_table(values='times',index=['FileID'],columns='ProductID',aggfunc='sum')
# query_pivot_customerid = querys.pivot_table(values='times',index=['FileID'],columns='CustomerID',aggfunc='sum')

In [78]:
query_pivot_productid

ProductID,55649,218578,262880,533133,0374c4,055649,05b409,0cdb7a,20f8a5,218578,...,a310bb,aaa9c8,b93794,c105a0,c76d58,cc3a6a,d465fc,dd8d4a,e47f04,fec24f
FileID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00008c73ee43c15b16c26b26398c1577,,,,,,,,,,,...,,,,1.0,,,,,83.0,
0000e2398b12121a85166fed5fe2a3da,,,,,,38.0,,,,,...,,,,,,,,,,
0001fe8dce14ce099aa6ca8ea5026ea7,141.0,,,,,44.0,,,,,...,,,,,45.0,,,,,
00027f50019000accc492e5684efc818,2.0,,,,,11.0,,,,,...,,,,14.0,,,,,,
00028c9da3573ec50db74b44310ae507,8.0,,,,,3.0,,,,,...,,,,,,,,,,
0002ded3a0b54f2ffdab0ca77a5ce2b6,,,,,,,,,,,...,,,,,,,,,,
0003dc8130969abe688cadf5f14ea19f,2.0,,,,,20.0,,,,,...,,,,,3.0,,,,,
000462baadff8a02f7f6fef84d242eed,20.0,,,,,58.0,,,13.0,,...,,,,,,,,,,
00050a9df8e13f6ab5a3d3b3e2fc6a86,427.0,,,,,64.0,,,,,...,,,,,115.0,,,,,
0005e4c0263f52453438ddd0d55aef82,,,,,,,,,,,...,,,,,,,,,,


In [54]:
train_query_pivot_productid = query_pivot_productid.ix[train["FileID"]]
test_query_pivot_productid = query_pivot_productid.ix[test["FileID"]]

## 基本前處理

### 填入缺值

In [58]:
train_query_pivot_productid = train_query_pivot_productid.fillna(0)
test_query_pivot_productid = test_query_pivot_productid.fillna(0)

### 正規化, 標準縮放

In [130]:
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

In [131]:
# ss = StandardScaler()
# train_standard = ss.fit_transform(train_query_pivot_productid)
# test_standard = ss.transform(test_query_pivot_productid)
nl = Normalizer()
train_norm = nl.fit_transform(train_query_pivot_productid)
test_norm = nl.transform(test_query_pivot_productid)

In [160]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
# import lightgbm as lgb

In [150]:
X_train = train_norm
y_train = train['label'].values
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring="roc_auc", cv = kf))
    return(rmse)

In [163]:
GBoost = GradientBoostingClassifier(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='deviance', random_state =5, verbose=1)

In [None]:
n_folds = 10
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

### 聚合函數
count（個數）, sum（加總）, mean（平均）, median（中位數）, std（標準差）, var（變異數）, first（第一個非NA）, last（最後一個非NA）

In [21]:
# querys.groupby(['FileID','CustomerID','ProductID'])[['times']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,times
FileID,CustomerID,ProductID,Unnamed: 3_level_1
00008c73ee43c15b16c26b26398c1577,033e91e730e176cd2ac81dc48a8f5088,7acab3,3
00008c73ee43c15b16c26b26398c1577,06bea9f9c1d72c4d3151efa6bcd5ba83,e47f04,3
00008c73ee43c15b16c26b26398c1577,113597f904c023f44d1a01babb780f58,7acab3,1
00008c73ee43c15b16c26b26398c1577,12f5f66461e78a75266a1535178b0523,e47f04,3
00008c73ee43c15b16c26b26398c1577,1b3061efd83d14f3f8e5411cd9aceccf,e47f04,1
00008c73ee43c15b16c26b26398c1577,1dfe70c932c3d86bc6da193a68437590,e47f04,1
00008c73ee43c15b16c26b26398c1577,1f4d9de76b1579f5defba747152a3bd9,e47f04,2
00008c73ee43c15b16c26b26398c1577,2061e471056569886dc55d2deecf10ea,e47f04,1
00008c73ee43c15b16c26b26398c1577,20b84480b3e4d56c89cdb0f468fddb20,e47f04,1
00008c73ee43c15b16c26b26398c1577,2126efd5e40acd2fb45fc4d02e628850,885fab,1


### 其它會用到的工具

In [None]:
#AUC計算範例
import numpy as np
from sklearn import metrics
y = np.array([1, 1, 2, 2])
pred = np.array([0.9, 0.1, 0.7, 0.9])
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
metrics.auc(fpr, tpr)


In [None]:
#timestamp轉換
import datetime
print(
    datetime.datetime.fromtimestamp(
        int("1488326402")
    ).strftime('%Y-%m-%d %H:%M:%S'))
