In [1]:
import os, sys, glob
import numpy as np
import pandas as pd

import time
import datetime

from joblib import Parallel, delayed
from sklearn.metrics import f1_score, log_loss, classification_report
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
def read_feat(path, test_mode=False):
    df = pd.read_csv(path)
    df = df.iloc[::-1]
    
    if test_mode:
        df_feat = [df['渔船ID'].iloc[0], df['type'].iloc[0]]
        df = df.drop(['type'], axis=1)
    else:
        df_feat = [df['渔船ID'].iloc[0]]
        
    df['time'] = df['time'].apply(lambda x: datetime.datetime.strptime(x, "%m%d %H:%M:%S"))
    df_diff = df.diff(1).iloc[1:]
    df_diff['time_seconds'] = df_diff['time'].dt.total_seconds()
    df_diff['dis'] = np.sqrt(df_diff['x']**2 + df_diff['y']**2)
    
    df_feat.append(df['time'].dt.day.nunique())
    df_feat.append(df['time'].dt.hour.min())
    df_feat.append(df['time'].dt.hour.max())
    df_feat.append(df['time'].dt.hour.value_counts().index[0])

    df_feat.append(df['速度'].min())
    df_feat.append(df['速度'].max())
    df_feat.append(df['速度'].mean())

    # df_feat.append(df_diff['time'].min())
    # df_feat.append(df_diff['time'].max())
    # df_feat.append(df_diff['time'].mean())
    
    df_feat.append(df_diff['速度'].min())
    df_feat.append(df_diff['速度'].max())
    df_feat.append(df_diff['速度'].mean())
    df_feat.append((df_diff['速度'] > 0).mean())
    df_feat.append((df_diff['速度'] == 0).mean())

    df_feat.append(df_diff['方向'].min())
    df_feat.append(df_diff['方向'].max())
    df_feat.append(df_diff['方向'].mean())
    df_feat.append((df_diff['方向'] > 0).mean())
    df_feat.append((df_diff['方向'] == 0).mean())

    df_feat.append((df_diff['x'].abs() / df_diff['time_seconds']).min())
    df_feat.append((df_diff['x'].abs() / df_diff['time_seconds']).max())
    df_feat.append((df_diff['x'].abs() / df_diff['time_seconds']).mean())
    df_feat.append((df_diff['x'] > 0).mean())
    df_feat.append((df_diff['x'] == 0).mean())

    df_feat.append((df_diff['y'].abs() / df_diff['time_seconds']).min())
    df_feat.append((df_diff['y'].abs() / df_diff['time_seconds']).max())
    df_feat.append((df_diff['y'].abs() / df_diff['time_seconds']).mean())
    df_feat.append((df_diff['y'] > 0).mean())
    df_feat.append((df_diff['y'] == 0).mean())
    
    df_feat.append(df_diff['dis'].min())
    df_feat.append(df_diff['dis'].max())
    df_feat.append(df_diff['dis'].mean())

    df_feat.append((df_diff['dis']/df_diff['time_seconds']).min())
    df_feat.append((df_diff['dis']/df_diff['time_seconds']).max())
    df_feat.append((df_diff['dis']/df_diff['time_seconds']).mean())
    
    return df_feat

In [9]:
train_feat = []
for i in range(7000):
    train_feat.append(read_feat('./hy_round1_train_20200102/%d.csv'%(i), True))
train_feat = np.asarray(train_feat)

In [10]:
train_feat.shape

(7000, 35)

In [14]:
train_feat = pd.DataFrame(train_feat)
train_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0,拖网,4,0,23,15,0.0,9.39,0.2659661835748792,-6.800000000000001,...,2.181406345189609,0.0299934644484294,0.0193704600484261,0.9443099273607748,0.0,4745.887438051192,87.08964389083005,0.0,5.125651593855435,0.136680134429129
1,1,拖网,4,0,23,19,0.0,10.47,1.6079220779220778,-3.19,...,4.886008454358323,0.4621218310549133,0.1536458333333333,0.6328125,0.0,5828.1147917583885,494.8746985854979,0.0,5.23265708910985,0.7400346771721328
2,2,拖网,4,0,23,17,0.0,50.46,0.5951502145922748,-49.97,...,0.1870892898591031,0.0096076193716055,0.0560344827586206,0.8836206896551724,0.0,607.6779747047108,14.589569662102392,0.0,0.9421363948910244,0.0175005973120835
3,3,拖网,4,0,23,22,0.0,10.09,1.4713432835820894,-8.58,...,4.620353291302076,0.3287056214512248,0.2335329341317365,0.5568862275449101,0.0,4043.677371140993,459.5570999944612,0.0,5.125174626207763,0.7359309118891172
4,4,围网,4,0,23,23,0.0,10.09,1.412219451371571,-7.4,...,4.148412485842511,0.3976434238302269,0.2325,0.5225,0.0,5170.784247062245,382.2780746000655,0.0,4.429555955213512,0.5835072875812957


In [15]:
train_feat[1] = train_feat[1].map({'围网':0,'刺网':1,'拖网':2})

In [16]:
train_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0,2,4,0,23,15,0.0,9.39,0.2659661835748792,-6.800000000000001,...,2.181406345189609,0.0299934644484294,0.0193704600484261,0.9443099273607748,0.0,4745.887438051192,87.08964389083005,0.0,5.125651593855435,0.136680134429129
1,1,2,4,0,23,19,0.0,10.47,1.6079220779220778,-3.19,...,4.886008454358323,0.4621218310549133,0.1536458333333333,0.6328125,0.0,5828.1147917583885,494.8746985854979,0.0,5.23265708910985,0.7400346771721328
2,2,2,4,0,23,17,0.0,50.46,0.5951502145922748,-49.97,...,0.1870892898591031,0.0096076193716055,0.0560344827586206,0.8836206896551724,0.0,607.6779747047108,14.589569662102392,0.0,0.9421363948910244,0.0175005973120835
3,3,2,4,0,23,22,0.0,10.09,1.4713432835820894,-8.58,...,4.620353291302076,0.3287056214512248,0.2335329341317365,0.5568862275449101,0.0,4043.677371140993,459.5570999944612,0.0,5.125174626207763,0.7359309118891172
4,4,0,4,0,23,23,0.0,10.09,1.412219451371571,-7.4,...,4.148412485842511,0.3976434238302269,0.2325,0.5225,0.0,5170.784247062245,382.2780746000655,0.0,4.429555955213512,0.5835072875812957


In [18]:
test_feat = []
for i in range(7000,9000):
    test_feat.append(read_feat('./hy_round1_testA_20200102/%d.csv'%(i), False))
test_feat = np.asarray(test_feat)
test_feat = pd.DataFrame(test_feat)
test_feat.shape

(2000, 34)

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
n_fold = 10
skf = StratifiedKFold(n_splits = n_fold, shuffle = True)
eval_fun = f1_score

def run_oof(clf, X_train, y_train, X_test, kf):
    print(clf)
    preds_train = np.zeros((len(X_train), 3), dtype = np.float)
    preds_test = np.zeros((len(X_test), 3), dtype = np.float)
    train_loss = []; test_loss = []

    i = 1
    for train_index, test_index in kf.split(X_train, y_train):
        x_tr = X_train[train_index]; x_te = X_train[test_index]
        y_tr = y_train[train_index]; y_te = y_train[test_index]
        clf.fit(x_tr, y_tr, eval_set = [(x_te, y_te)], early_stopping_rounds = 500, verbose = False)
        
        train_loss.append(eval_fun(y_tr, np.argmax(clf.predict_proba(x_tr)[:], 1), average='macro'))
        test_loss.append(eval_fun(y_te, np.argmax(clf.predict_proba(x_te)[:], 1), average='macro'))

        preds_train[test_index] = clf.predict_proba(x_te)[:]
        preds_test += clf.predict_proba(X_test)[:]

        print('{0}: Train {1:0.7f} Val {2:0.7f}/{3:0.7f}'.format(i, train_loss[-1], test_loss[-1], np.mean(test_loss)))
        print('-' * 50)
        i += 1
    print('Train: ', train_loss)
    print('Val: ', test_loss)
    print('-' * 50)
    print('Train{0:0.5f}_Test{1:0.5f}\n\n'.format(np.mean(train_loss), np.mean(test_loss)))
    preds_test /= n_fold
    return preds_train, preds_test

params = {
    'learning_rate': 0.01,
    'min_child_samples': 5,
    'max_depth': 7,
    'lambda_l1': 2,
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'n_estimators': 2000,
    'metric': 'multi_error',
    'num_class': 3,
    'feature_fraction': .75,
    'bagging_fraction': .85,
    'seed': 99,
    'num_threads': 20,
    'verbose': -1
}

train_pred, test_pred = run_oof(lgb.LGBMClassifier(**params), 
                                train_feat.iloc[:, 2:].values, #X
                                train_feat.iloc[:, 1].values, #y
                                test_feat.iloc[:, 1:].values, 
                                skf)


LGBMClassifier(bagging_fraction=0.85, boosting='gbdt', boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.75,
               importance_type='split', lambda_l1=2, learning_rate=0.01,
               max_depth=7, metric='multi_error', min_child_samples=5,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=2000,
               n_jobs=-1, num_class=3, num_leaves=31, num_threads=20,
               objective='multiclass', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, seed=99, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0, verbose=-1)
1: Train 0.9116418 Val 0.6673047/0.6673047
--------------------------------------------------
2: Train 0.8661966 Val 0.6761415/0.6717231
--------------------------------------------------
3: Train 0.9032193 Val 0.6431954/0.6622139
--------------------------------------------------
4: Train 0.8615836 Val 0.6321717/0.6547033
---------------

In [20]:
test_feat['label'] = np.argmax(test_pred, 1)
test_feat['label'] = test_feat['label'].map({0:'围网',1:'刺网',2:'拖网'})
test_feat[[0, 'label']].to_csv('baseline.csv',index=None, header=None)