In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import math
pd.set_option('display.max_columns', 100)
warnings.filterwarnings('ignore')



In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [3]:
#时间特征处理
def extract_dt(df):
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    # df['month'] = df['time'].dt.month
    # df['day'] = df['time'].dt.day
    df['date'] = df['time'].dt.date
    #得到准确时间
    df['hour_spe'] = df['time'].dt.hour+(df["time"].dt.minute)/60
    #得到小时，用于判断主要作业时段
    df["hour"] = df["time"].dt.hour
    # df = df.drop_duplicates(['ship','month'])
    


In [4]:
#target为带拓展特征 methodes为拓展后特征（拓展方法）
# speed  mean
#key = ship  target = x
def group_feature(df, key, target, aggs):   
    agg_dict = {}
    for ag in aggs:
        #speed_mean
        agg_dict[f'{target}_{ag}'] = ag
    print(agg_dict)
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
    return t

In [5]:
#构造特征（每一特征都建立在groupby(渔船ID)基础上的）
def extract_feature(df, train):
    #分别构建x ，y方向的最大值，最小值，均值，标准差，方差，偏度，和总和
    t = group_feature(df, 'ship','x',['max','min','mean','std','var','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','x',['count'])
    train = pd.merge(train, t, on='ship', how='left')
    
    t = group_feature(df, 'ship','y',['max','min','mean','std','var','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    
    t = group_feature(df, 'ship','v',['max','min','mean','std','var','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    
    t = group_feature(df, 'ship','g',['max','min','mean','std','var','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')    
    
    train["distance"] = (1.852*train['v_sum']/6)

    #经纬度范围
    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    #不知道什么意义的特征
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['x_max_y_min'] = train['x_max'] - train['y_min']
    #坡度
    train['slope'] = train['y_max_y_min'] / np.where(train['x_max_x_min']==0, 0.001, train['x_max_x_min'])
    #区域面积
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']
    #统计主要作业时间段
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()
    train['mode_hour'] = train['ship'].map(mode_hour)
    #对时间的统计
    t = group_feature(df, 'ship','hour',['max','min'])
    train = pd.merge(train, t, on='ship', how='left')
    # nunique()用于获取唯一值的统计个数
    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train['hour_nunique'] = train['ship'].map(hour_nunique)
    train['date_nunique'] = train['ship'].map(date_nunique)
    #
    t = df.groupby('ship')['time'].agg({'diff_time':lambda x:np.max(x)-np.min(x)}).reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    t['diff_second'] = t['diff_time'].dt.seconds
    train = pd.merge(train, t, on='ship', how='left')
    return train



In [6]:

extract_dt(train)   
extract_dt(test)

In [7]:
#drop_duplicates删除重复行，别且保留第一次出现的
train_label = train.drop_duplicates('ship')
test_label = test.drop_duplicates('ship')


In [8]:
type_map = dict(zip(train_label['type'].unique(), np.arange(3)))
type_map_rev = {v:k for k,v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)
'''
train_label.loc[train_label['type'] == '拖网','type'] = 0
train_label.loc[train_label['type'] == '围网','type'] = 1
train_label.loc[train_label['type'] == '刺网','type'] = 2
'''


"\ntrain_label.loc[train_label['type'] == '拖网','type'] = 0\ntrain_label.loc[train_label['type'] == '围网','type'] = 1\ntrain_label.loc[train_label['type'] == '刺网','type'] = 2\n"

In [9]:
#获取特征
train_label = extract_feature(train, train_label)


{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_var': 'var', 'x_skew': 'skew', 'x_sum': 'sum'}
{'x_count': 'count'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_var': 'var', 'y_skew': 'skew', 'y_sum': 'sum'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_var': 'var', 'v_skew': 'skew', 'v_sum': 'sum'}
{'g_max': 'max', 'g_min': 'min', 'g_mean': 'mean', 'g_std': 'std', 'g_var': 'var', 'g_skew': 'skew', 'g_sum': 'sum'}
{'hour_max': 'max', 'hour_min': 'min'}


In [10]:
test_label = extract_feature(test, test_label)

{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_var': 'var', 'x_skew': 'skew', 'x_sum': 'sum'}
{'x_count': 'count'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_var': 'var', 'y_skew': 'skew', 'y_sum': 'sum'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_var': 'var', 'v_skew': 'skew', 'v_sum': 'sum'}
{'g_max': 'max', 'g_min': 'min', 'g_mean': 'mean', 'g_std': 'std', 'g_var': 'var', 'g_skew': 'skew', 'g_sum': 'sum'}
{'hour_max': 'max', 'hour_min': 'min'}


In [12]:
train_label.head()

Unnamed: 0,ship,x,y,v,g,time,type,speed_033,speed_6,speed_23,g_90,g_100,g_300,date,hour_spe,hour,x_max,x_min,x_mean,x_std,x_var,x_skew,x_sum,x_count,y_max,y_min,y_mean,y_std,y_var,y_skew,y_sum,v_max,v_min,v_mean,v_std,v_var,v_skew,v_sum,g_max,g_min,g_mean,g_std,g_var,g_skew,g_sum,distance,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,slope,area,mode_hour,hour_max,hour_min,hour_nunique,date_nunique,diff_time,diff_day,diff_second
0,0,6152038.0,5124873.0,2.59,102,1900-11-10 11:58:19,0,50,10,8,55,10,0,1900-11-10,11.966667,11,6152038.0,6118352.0,6124263.0,11064.578515,122424900.0,1.582401,428698400.0,70,5130781.0,5124873.0,5129618.0,1841.41127,3390795.0,-1.298389,359073300.0,9.39,0.0,1.573,2.891904,8.363111,1.73988,110.11,129,0,27.285714,45.548554,2074.670807,1.129837,1910,33.987287,33686.667453,5907.975523,-987570.4,1027165.0,0.17538,199020000.0,9,23,0,16,4,2 days 23:48:51,2,85731
1,1,6076254.0,5061743.0,3.99,278,1900-11-10 11:40:21,0,215,18,116,276,81,3,1900-11-10,11.666667,11,6102450.0,6049472.0,6090794.0,16808.695885,282532300.0,-0.968273,2210958000.0,363,5112874.0,5042857.0,5092916.0,27152.980992,737284400.0,-0.709359,1848729000.0,10.47,0.0,1.705372,2.45116,6.008183,1.506506,619.05,336,0,59.556474,93.102059,8667.993349,1.3299,21619,191.0801,52978.013345,70016.655842,-936597.9,1059593.0,1.321617,3709343000.0,18,23,0,24,4,2 days 23:39:47,2,85187
2,10,6321032.0,5242805.0,4.48,213,1900-11-10 11:49:36,0,301,27,54,210,134,37,1900-11-10,11.816667,11,6346913.0,6246119.0,6262484.0,32280.567149,1042035000.0,1.62304,2486206000.0,397,5265810.0,5229867.0,5242458.0,5975.460236,35706130.0,2.198003,2081256000.0,10.09,0.0,1.313854,2.442825,5.967392,2.14541,521.6,359,0,108.758186,112.515081,12659.643399,0.727645,43177,161.000533,100794.674835,35942.703641,-980308.7,1117046.0,0.356593,3622833000.0,23,23,0,24,4,2 days 23:33:53,2,84833
3,100,6102751.0,5112534.0,0.0,0,1900-10-30 19:51:07,0,25,9,318,87,209,32,1900-10-30,19.85,19,6151439.0,6102326.0,6126483.0,13061.268057,170596700.0,-0.065192,2223913000.0,363,5112752.0,5069616.0,5081902.0,10618.265816,112747600.0,1.603756,1844730000.0,8.69,0.0,3.358044,1.323613,1.751952,0.109862,1218.97,353,0,183.112948,105.634368,11158.619804,-0.170257,66470,376.255407,49113.022232,43135.705758,-989574.0,1081823.0,0.878295,2118525000.0,11,23,0,24,3,2 days 19:49:49,2,71389
4,1000,6843713.0,5480538.0,2.0,216,1900-11-06 23:42:30,1,167,58,65,108,213,43,1900-11-06,23.7,23,6844414.0,6748890.0,6807524.0,26404.110865,697177100.0,-0.764797,2539207000.0,373,5540087.0,5440815.0,5464947.0,30244.632009,914737800.0,1.396089,2038425000.0,8.9,0.0,2.107936,2.654623,7.047024,1.095275,786.26,358,0,158.924933,101.729664,10348.924457,0.23048,59279,242.692253,95524.035775,99271.486171,-1208803.0,1403598.0,1.03923,9482813000.0,0,23,0,24,3,2 days 23:37:11,2,85031


In [13]:
features = [x for x in train_label.columns if x not in ['ship','type','time','diff_time','date']]
target = 'type'

In [14]:
print(len(features), ','.join(features))


55 x,y,v,g,speed_033,speed_6,speed_23,g_90,g_100,g_300,hour_spe,hour,x_max,x_min,x_mean,x_std,x_var,x_skew,x_sum,x_count,y_max,y_min,y_mean,y_std,y_var,y_skew,y_sum,v_max,v_min,v_mean,v_std,v_var,v_skew,v_sum,g_max,g_min,g_mean,g_std,g_var,g_skew,g_sum,distance,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,slope,area,mode_hour,hour_max,hour_min,hour_nunique,date_nunique,diff_day,diff_second


In [15]:
params = {
    'n_estimators': 5000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'early_stopping_rounds': 100,
}

In [16]:
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = train_label[features].copy()
y = train_label[target]
models = []

#对于三种类型分别有一个发预测值
pred = np.zeros((len(test_label),3))

oof = np.zeros((len(X), 3))
#fold.split()
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):
    #利用K折验证，分别设置测试集  验证集
    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])
    #训练模型
    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=100)
    models.append(model)
    #预测结果，预测的返回值是对每种类型的预测值
    val_pred = model.predict(X.iloc[val_idx])
    #将每一次的预测值 存在 oof中
    print(val_pred)
    oof[val_idx] = val_pred
    #真实结果
    val_y = y.iloc[val_idx]
    #预测结果取最大值
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))
    # 0.8695539641133697
    # 0.8866211724839532

    #最终预测结果，取五次均值
    test_pred = model.predict(test_label[features])
    pred += test_pred/5

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0666052	valid_1's multi_logloss: 0.243168
[200]	training's multi_logloss: 0.0147109	valid_1's multi_logloss: 0.243627
Early stopping, best iteration is:
[146]	training's multi_logloss: 0.0323676	valid_1's multi_logloss: 0.236717
[[9.97805719e-01 6.05324498e-04 1.58895602e-03]
 [7.62738735e-01 4.24659375e-03 2.33014671e-01]
 [9.96572130e-01 1.94257712e-03 1.48529254e-03]
 ...
 [9.99868989e-01 1.09076561e-04 2.19345630e-05]
 [9.98297039e-01 2.56568061e-04 1.44639316e-03]
 [3.65128388e-01 9.10165011e-03 6.25769962e-01]]
0 val f1 0.8886658636239236
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0655433	valid_1's multi_logloss: 0.261786
[200]	training's multi_logloss: 0.0145867	valid_1's multi_logloss: 0.256831
Early stopping, best iteration is:
[192]	training's multi_logloss: 0.0164012	valid_1's multi_logloss: 0.254493
[[9.73663222e-01 4.70684750e

In [28]:

'''''''''''
params = {
    'n_estimators': 5000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'early_stopping_rounds': 100,
}

X = train_label[features].copy()
y = train_label[target]
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.333, random_state=0)   # 分训练集和验证集
train = lgb.Dataset(train_x, train_y)
valid = lgb.Dataset(valid_x, valid_y, reference=train)


parameters = {
              #'max_depth': [15, 20, 25, 30, 35],
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
              
              'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
              'bagging_freq': [2, 4, 5, 6, 8],
              'lambda_l1': [0, 0.1, 0.4, 0.5, 0.6],
              'lambda_l2': [0, 10, 15, 35, 40],
              'cat_smooth': [1, 10, 15, 20, 35]
}
gbm = lgb.LGBMClassifier(boosting_type='gbdt',
                         n_estimators =5000,
                         objective = 'multiclass',
                         num_class = 3,
                         metric = 'multi_logloss',
                         verbose = 0,
                         
                         num_leaves = 35,)
                                        
# 有了gridsearch我们便不需要fit函数
gsearch = GridSearchCV(gbm, param_grid=parameters, scoring='f1_macro', cv=3)
gsearch.fit(train_x, train_y)

#print("Best score: %0.3f" % gsearch.best_score_)
#print("Best parameters set:")
#best_parameters = gsearch.best_estimator_.get_params()
#for param_name in sorted(parameters.keys()):
#    print("\t%s: %r" % (param_name, best_parameters[param_name]))


KeyboardInterrupt: 

In [18]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [26]:

gsearch.best_score_，gsearch.best_params_

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-26-9310f5176110>, line 2)

In [17]:
#oof中存储每次K折验证之后的结果
print(oof)

[[9.73663222e-01 4.70684750e-03 2.16299300e-02]
 [9.98181927e-01 1.57057260e-03 2.47500700e-04]
 [9.97210848e-01 2.69249667e-03 9.66552478e-05]
 ...
 [2.33314645e-02 8.66718761e-02 8.89996659e-01]
 [9.98297039e-01 2.56568061e-04 1.44639316e-03]
 [3.65128388e-01 9.10165011e-03 6.25769962e-01]]


In [18]:
oof = np.argmax(oof, axis=1)
print('oof f1', metrics.f1_score(oof, y, average='macro'))

oof f1 0.8748270883269705


In [19]:
#返回沿轴最大值的索引 mark argmax!!!
pred = np.argmax(pred, axis=1)
sub = test_label[['ship']]
sub['pred'] = pred

print(sub['pred'].value_counts(1))
sub['pred'] = sub['pred'].map(type_map_rev)
sub.to_csv('result.csv', index=None, header=None)

0    0.627
1    0.240
2    0.133
Name: pred, dtype: float64


In [30]:
ret = []
for index, model in enumerate(models):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    #查看特征重要性
    df['score'] = model.feature_importance()
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)

In [31]:
df = df.groupby('name', as_index=False)['score'].mean()
df = df.sort_values(['score'], ascending=False)

In [32]:
df[:10]

Unnamed: 0,name,score
32,speed_23,672.4
65,y_max_x_min,632.0
64,y_max,574.6
57,x_min,520.6
55,x_max_y_min,496.4
50,x,431.0
62,y,412.0
58,x_skew,385.4
69,y_skew,361.8
68,y_min,342.4


In [None]:
df[10:20]

In [None]:
df[20:30]

In [None]:
df[30:40]