In [25]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

def get_fea_ppg(df):
    """Average the PPG signal over rows that share a ``recording_time``.

    Args:
        df: Frame with at least the columns ``recording_time`` and ``PPG``.

    Returns:
        A one-column DataFrame (``PPG``) indexed by ``recording_time``,
        holding the mean PPG value of each timestamp group.
    """
    grouped = df.groupby('recording_time')
    return grouped['PPG'].mean().to_frame()

def get_fea_acc(df):
    """Average the three motion axes over rows that share a ``recording_time``.

    Args:
        df: Frame with at least the columns ``recording_time``,
            ``Motion_dataX``, ``Motion_dataY`` and ``Motion_dataZ``.

    Returns:
        A DataFrame indexed by ``recording_time`` with one mean column per
        motion axis, in X, Y, Z order.
    """
    axes = ['Motion_dataX', 'Motion_dataY', 'Motion_dataZ']
    per_timestamp = df.groupby('recording_time')[axes].mean()
    return pd.DataFrame(per_timestamp)

def get_fea_gsr(df):
    """Average the GSR signal over rows that share a ``recording_time``.

    Args:
        df: Frame with at least the columns ``recording_time`` and ``GSR``.

    Returns:
        A one-column DataFrame (``GSR``) indexed by ``recording_time``,
        holding the mean GSR value of each timestamp group.
    """
    grouped = df.groupby('recording_time')
    return grouped['GSR'].mean().to_frame()

def manual_feature(df):
    """Build the hand-crafted feature vector for one window of sensor data.

    The statistics are computed from the ``df`` argument itself, so the
    function does not depend on any variable of the calling cell.

    Args:
        df: Frame with the columns ``Motion_dataX``, ``Motion_dataY``,
            ``Motion_dataZ``, ``GSR`` and ``PPG``.

    Returns:
        A list of 36 values, in this order:

        * for each of ``Motion_dataX``, ``Motion_dataY``, ``Motion_dataZ``:
          mean, max, min, max - min, then mean / max / min of the
          first-order difference (7 values per axis);
        * the same 7 statistics for ``GSR`` followed by the fraction of
          missing ``GSR`` rows (8 values);
        * the same 7 statistics for ``PPG``.
    """
    def _summary(series):
        # Level statistics plus statistics of the row-to-row change.
        delta = series.diff(1)
        highest, lowest = series.max(), series.min()
        return [
            series.mean(),
            highest,
            lowest,
            highest - lowest,
            delta.mean(),
            delta.max(),
            delta.min(),
        ]

    feat = []
    for axis in ('Motion_dataX', 'Motion_dataY', 'Motion_dataZ'):
        feat += _summary(df[axis])

    feat += _summary(df['GSR'])
    # Share of rows in the window whose GSR value is missing.
    feat.append(df['GSR'].isnull().mean())

    feat += _summary(df['PPG'])
    return feat

In [27]:
train_features = []
# Sorted so that the row order of the training set does not depend on the
# arbitrary order in which the file system lists the directory.
list_ = sorted(os.listdir('./training_data/'))
train_label = pd.read_csv('training_data/train_label.csv', encoding='gb2312')
# Index the label table by file name once instead of once per subject.
label_by_file = train_label.set_index('文件名')

# Walk over every subject in the training set.
for sid in list_:
    # Entries whose name contains '.csv' (such as the label table) are not
    # subject folders.
    if '.csv' in sid:
        continue

    # The three kinds of sensor recordings, each averaged per recording_time.
    df_acc = get_fea_acc(pd.read_csv(f'./training_data/{sid}/ACC.csv'))
    df_gsr = get_fea_gsr(pd.read_csv(f'./training_data/{sid}/GSR.csv'))
    df_ppg = get_fea_ppg(pd.read_csv(f'./training_data/{sid}/PPG.csv'))

    # Align the three recordings on recording_time and put them side by side.
    df = pd.concat([df_acc, df_gsr, df_ppg], axis=1).reset_index()

    # GSR readings that round to 0.0061 are turned into missing values
    # (presumably a sensor placeholder -- TODO confirm).
    df['GSR'] = df['GSR'].round(4).replace(0.0061, np.nan)

    # First column of the label table after the file name.
    label = label_by_file.loc[sid].values[0]

    # Split the recording into consecutive windows of 3000 rows; rows left
    # over after the last full window are not used.
    for idx in range(df.shape[0] // 3000):
        df_batch = df.iloc[idx*3000: (idx+1)*3000]
        feat = manual_feature(df_batch)
        # Row layout: subject id, features, label.
        feat = [sid] + feat + [label]
        train_features.append(feat)

# Build one feature row per 3000-row window of every test subject.
test_features = []
subject_ids = [name for name in os.listdir('./test_data/') if '.csv' not in name]
for sid in subject_ids:
    # Per-timestamp averages of the three sensor recordings, in ACC, GSR, PPG order.
    df_acc = get_fea_acc(pd.read_csv(f'./test_data/{sid}/ACC.csv'))
    df_gsr = get_fea_gsr(pd.read_csv(f'./test_data/{sid}/GSR.csv'))
    df_ppg = get_fea_ppg(pd.read_csv(f'./test_data/{sid}/PPG.csv'))
    sensor_frames = [df_acc, df_gsr, df_ppg]
    df = pd.concat(sensor_frames, axis=1).reset_index()

    # GSR readings that round to 0.0061 become missing values.
    df['GSR'] = df['GSR'].round(4).replace(0.0061, np.nan)

    # Report the time span covered by this subject's recording.
    times = df['recording_time']
    print(sid, times.min(), times.max())

    # Only full windows are used; rows after the last full window are dropped.
    n_windows = df.shape[0] // 3000
    for start in range(0, n_windows * 3000, 3000):
        df_batch = df.iloc[start: start + 3000]
        # Row layout: subject id followed by the features.
        test_features.append([sid] + manual_feature(df_batch))

data0033 09:41:01 16:59:59
data0129 06:00:00 16:59:59
data0157 10:23:05 16:59:59
data0172 10:43:28 16:59:59
data0179 06:00:00 16:59:59
data0191 08:23:27 16:59:59
data0302 06:00:00 16:59:59
data0369 09:10:05 16:59:59
data0377 08:38:51 16:59:59
data0418 06:00:00 16:59:59
data0461 06:00:00 16:59:59
data0541 06:00:00 16:59:59
data0548 06:00:00 16:59:59
data0562 09:11:31 16:59:59
data0645 07:19:29 16:59:59
data0667 06:00:00 16:59:59
data0671 10:01:30 16:59:59
data0699 06:00:00 16:59:59
data0702 06:00:00 16:59:59
data0856 06:00:00 16:59:59
data0882 09:41:24 16:59:59
data0943 06:00:00 16:24:55
data0975 06:00:00 16:59:59
data0982 11:10:27 16:59:59
data0984 06:00:00 16:59:59
data0999 06:00:00 16:59:59
data1428 06:00:00 16:59:59
data1576 06:00:00 16:59:59
data2381 06:00:00 16:59:59
data3862 06:00:00 16:59:59
data4722 08:58:50 16:18:37
data4998 09:26:17 16:59:59
data5166 08:11:03 16:59:59
data5468 06:00:00 16:59:59
data7329 07:34:22 16:59:59
data7431 06:00:00 16:59:59
data9057 06:00:00 16:59:59
d

In [28]:
# Tabulate the collected rows: column 0 is the subject id, the last training
# column is the label, and the columns in between are the features.
train_features = pd.DataFrame(train_features)
test_features = pd.DataFrame(test_features)

# Pick the feature columns by name. A positional slice of test_features would
# also pick up the 'label' column that this cell adds below if the cell is
# run a second time.
feature_cols = list(train_features.columns[1:-1])

model = LGBMClassifier()
model.fit(train_features[feature_cols], train_features.iloc[:, -1])
test_pred = model.predict(test_features[feature_cols])

# One prediction per window.
test_features['label'] = test_pred

# Aggregate to one prediction per subject: 1 when more than half of the
# subject's windows were predicted 1 (assumes 0/1 labels -- TODO confirm).
pred = test_features.groupby(0)['label'].mean() > 0.5

pred = pred.astype(int).reset_index()

# Two columns (subject id, prediction), written without index or header row.
pred.to_csv('lgb1.csv', index=False, header=False)
# Score: 0.92308