In [1]:
import warnings
import random
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
import tensorflow as tf
from sklearn.metrics import accuracy_score
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import concatenate
from tensorflow.keras.models import Model, Sequential 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
import tensorflow
tensorflow.random.set_seed(2020)
np.random.seed(2020)
random.seed(2020)
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)



In [2]:
def pipeline_feature(data, outliersdata, dataname):
    redundant_col = ['Unnamed: 0','入口网络编号', '出口网络编号', '出口车道编号','车种代码','免费类型代码',
                     '支付方式代码','cluster','车牌号','路径标识', 'ETC车辆电子标签OBU编号']
    
    data = data.drop(redundant_col, axis=1)
    outliersdata = outliersdata.drop(redundant_col, axis=1)
    data['入口站编号'] = data['入口站编号'].astype('str')
    data['出口站编号'] = data['出口站编号'].astype('str')
    data['轴型及轴重'] = data['轴型及轴重'].astype('str')

    outliersdata['入口站编号'] = outliersdata['入口站编号'].astype('str')
    outliersdata['出口站编号'] = outliersdata['出口站编号'].astype('str')
    outliersdata['轴型及轴重'] = outliersdata['轴型及轴重'].astype('str')


    leEntryCode = preprocessing.LabelEncoder()
    data['入口站编号'] = leEntryCode.fit_transform(data['入口站编号'])
    outliersdata['入口站编号'] = leEntryCode.fit_transform(outliersdata['入口站编号'])

    leExitCode = preprocessing.LabelEncoder()
    data['出口站编号'] = leExitCode.fit_transform(data['出口站编号'])
    outliersdata['出口站编号'] = leExitCode.fit_transform(outliersdata['出口站编号'])

    leVCode = preprocessing.LabelEncoder()
    data['轴型及轴重'] = leVCode.fit_transform(data['轴型及轴重'])
    outliersdata['轴型及轴重'] = leVCode.fit_transform(outliersdata['轴型及轴重'])
    
    data['入口日期及时间'] = pd.to_datetime(data['入口日期及时间'])
    outliersdata['入口日期及时间'] = pd.to_datetime(outliersdata['入口日期及时间'])

    data['出口日期及时间'] = pd.to_datetime(data['出口日期及时间'])
    outliersdata['出口日期及时间'] = pd.to_datetime(outliersdata['出口日期及时间'])

    data['entryweek'] = data['入口日期及时间'].dt.week
    data['entryday'] = data['入口日期及时间'].dt.day
    data['entryhour'] = data['入口日期及时间'].dt.hour

    outliersdata['entryweek'] = outliersdata['入口日期及时间'].dt.week
    outliersdata['entryday'] = outliersdata['入口日期及时间'].dt.day
    outliersdata['entryhour'] = outliersdata['入口日期及时间'].dt.hour

    data['exitweek'] = data['出口日期及时间'].dt.week
    data['exitday'] = data['出口日期及时间'].dt.day
    data['exithour'] = data['出口日期及时间'].dt.hour

    outliersdata['exitweek'] = outliersdata['出口日期及时间'].dt.week
    outliersdata['exitday'] = outliersdata['出口日期及时间'].dt.day
    outliersdata['exithour'] = outliersdata['出口日期及时间'].dt.hour
    
    df_group = data.groupby('车型代码')['超限率']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['车型代码超限率均值'] = data['车型代码'].map(building_mean)
    data['车型代码超限率中位数'] = data['车型代码'].map(building_median)
    data['车型代码超限率最小值'] = data['车型代码'].map(building_min)
    data['车型代码超限率最大值'] = data['车型代码'].map(building_max)
    data['车型代码超限率标准差'] = data['车型代码'].map(building_std)

    outliersdata['车型代码超限率均值'] = outliersdata['车型代码'].map(building_mean)
    outliersdata['车型代码超限率中位数'] = outliersdata['车型代码'].map(building_median)
    outliersdata['车型代码超限率最小值'] = outliersdata['车型代码'].map(building_min)
    outliersdata['车型代码超限率最大值'] = outliersdata['车型代码'].map(building_max)
    outliersdata['车型代码超限率标准差'] = outliersdata['车型代码'].map(building_std)
    
    df_group = data.groupby('车型代码')['行驶时间']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['车型代码行驶时间均值'] = data['车型代码'].map(building_mean)
    data['车型代码行驶时间中位数'] = data['车型代码'].map(building_median)
    data['车型代码行驶时间最小值'] = data['车型代码'].map(building_min)
    data['车型代码行驶时间最大值'] = data['车型代码'].map(building_max)
    data['车型代码行驶时间标准差'] = data['车型代码'].map(building_std)

    outliersdata['车型代码行驶时间均值'] = outliersdata['车型代码'].map(building_mean)
    outliersdata['车型代码行驶时间中位数'] = outliersdata['车型代码'].map(building_median)
    outliersdata['车型代码行驶时间最小值'] = outliersdata['车型代码'].map(building_min)
    outliersdata['车型代码行驶时间最大值'] = outliersdata['车型代码'].map(building_max)
    outliersdata['车型代码行驶时间标准差'] = outliersdata['车型代码'].map(building_std)
    
    df_group = data.groupby('车型代码')['通过车道的平均速度(km/h)']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['车型代码平均速度均值'] = data['车型代码'].map(building_mean)
    data['车型代码平均速度中位数'] = data['车型代码'].map(building_median)
    data['车型代码平均速度最小值'] = data['车型代码'].map(building_min)
    data['车型代码平均速度最大值'] = data['车型代码'].map(building_max)
    data['车型代码平均速度标准差'] = data['车型代码'].map(building_std)

    outliersdata['车型代码平均速度均值'] = outliersdata['车型代码'].map(building_mean)
    outliersdata['车型代码平均速度中位数'] = outliersdata['车型代码'].map(building_median)
    outliersdata['车型代码平均速度最小值'] = outliersdata['车型代码'].map(building_min)
    outliersdata['车型代码平均速度最大值'] = outliersdata['车型代码'].map(building_max)
    outliersdata['车型代码平均速度标准差'] = outliersdata['车型代码'].map(building_std)
    
    df_group = data.groupby('出口站编号')['通过车道的平均速度(km/h)']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['出口站编号平均速度均值'] = data['出口站编号'].map(building_mean)
    data['出口站编号平均速度中位数'] = data['出口站编号'].map(building_median)
    data['出口站编号平均速度最小值'] = data['出口站编号'].map(building_min)
    data['出口站编号平均速度最大值'] = data['出口站编号'].map(building_max)
    data['出口站编号平均速度标准差'] = data['出口站编号'].map(building_std)

    outliersdata['出口站编号平均速度均值'] = outliersdata['出口站编号'].map(building_mean)
    outliersdata['出口站编号平均速度中位数'] = outliersdata['出口站编号'].map(building_median)
    outliersdata['出口站编号平均速度最小值'] = outliersdata['出口站编号'].map(building_min)
    outliersdata['出口站编号平均速度最大值'] = outliersdata['出口站编号'].map(building_max)
    outliersdata['出口站编号平均速度标准差'] = outliersdata['出口站编号'].map(building_std)

    df_group = data.groupby('入口站编号')['通过车道的平均速度(km/h)']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['入口站编号平均速度均值'] = data['入口站编号'].map(building_mean)
    data['入口站编号平均速度中位数'] = data['入口站编号'].map(building_median)
    data['入口站编号平均速度最小值'] = data['入口站编号'].map(building_min)
    data['入口站编号平均速度最大值'] = data['入口站编号'].map(building_max)
    data['入口站编号平均速度标准差'] = data['入口站编号'].map(building_std)

    outliersdata['入口站编号平均速度均值'] = outliersdata['入口站编号'].map(building_mean)
    outliersdata['入口站编号平均速度中位数'] = outliersdata['入口站编号'].map(building_median)
    outliersdata['入口站编号平均速度最小值'] = outliersdata['入口站编号'].map(building_min)
    outliersdata['入口站编号平均速度最大值'] = outliersdata['入口站编号'].map(building_max)
    outliersdata['入口站编号平均速度标准差'] = outliersdata['入口站编号'].map(building_std)
    
    df_group = data.groupby('出口站编号')['行驶时间']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['出口站编号行驶时间均值'] = data['出口站编号'].map(building_mean)
    data['出口站编号行驶时间中位数'] = data['出口站编号'].map(building_median)
    data['出口站编号行驶时间最小值'] = data['出口站编号'].map(building_min)
    data['出口站编号行驶时间最大值'] = data['出口站编号'].map(building_max)
    data['出口站编号行驶时间标准差'] = data['出口站编号'].map(building_std)

    outliersdata['出口站编号行驶时间均值'] = outliersdata['出口站编号'].map(building_mean)
    outliersdata['出口站编号行驶时间中位数'] = outliersdata['出口站编号'].map(building_median)
    outliersdata['出口站编号行驶时间最小值'] = outliersdata['出口站编号'].map(building_min)
    outliersdata['出口站编号行驶时间最大值'] = outliersdata['出口站编号'].map(building_max)
    outliersdata['出口站编号行驶时间标准差'] = outliersdata['出口站编号'].map(building_std)
    
    
    df_group = data.groupby('入口站编号')['行驶时间']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['入口站编号行驶时间均值'] = data['入口站编号'].map(building_mean)
    data['入口站编号行驶时间中位数'] = data['入口站编号'].map(building_median)
    data['入口站编号行驶时间最小值'] = data['入口站编号'].map(building_min)
    data['入口站编号行驶时间最大值'] = data['入口站编号'].map(building_max)
    data['入口站编号行驶时间标准差'] = data['入口站编号'].map(building_std)

    outliersdata['入口站编号行驶时间均值'] = outliersdata['入口站编号'].map(building_mean)
    outliersdata['入口站编号行驶时间中位数'] = outliersdata['入口站编号'].map(building_median)
    outliersdata['入口站编号行驶时间最小值'] = outliersdata['入口站编号'].map(building_min)
    outliersdata['入口站编号行驶时间最大值'] = outliersdata['入口站编号'].map(building_max)
    outliersdata['入口站编号行驶时间标准差'] = outliersdata['入口站编号'].map(building_std)
    
    
    df_group = data.groupby('出口站编号')['超限率']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['出口站编号超限率均值'] = data['出口站编号'].map(building_mean)
    data['出口站编号超限率中位数'] = data['出口站编号'].map(building_median)
    data['出口站编号超限率最小值'] = data['出口站编号'].map(building_min)
    data['出口站编号超限率最大值'] = data['出口站编号'].map(building_max)
    data['出口站编号超限率标准差'] = data['出口站编号'].map(building_std)

    outliersdata['出口站编号超限率均值'] = outliersdata['出口站编号'].map(building_mean)
    outliersdata['出口站编号超限率中位数'] = outliersdata['出口站编号'].map(building_median)
    outliersdata['出口站编号超限率最小值'] = outliersdata['出口站编号'].map(building_min)
    outliersdata['出口站编号超限率最大值'] = outliersdata['出口站编号'].map(building_max)
    outliersdata['出口站编号超限率标准差'] = outliersdata['出口站编号'].map(building_std)
    
    df_group = data.groupby('入口站编号')['超限率']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['入口站编号超限率均值'] = data['入口站编号'].map(building_mean)
    data['入口站编号超限率中位数'] = data['入口站编号'].map(building_median)
    data['入口站编号超限率最小值'] = data['入口站编号'].map(building_min)
    data['入口站编号超限率最大值'] = data['入口站编号'].map(building_max)
    data['入口站编号超限率标准差'] = data['入口站编号'].map(building_std)

    outliersdata['入口站编号超限率均值'] = outliersdata['入口站编号'].map(building_mean)
    outliersdata['入口站编号超限率中位数'] = outliersdata['入口站编号'].map(building_median)
    outliersdata['入口站编号超限率最小值'] = outliersdata['入口站编号'].map(building_min)
    outliersdata['入口站编号超限率最大值'] = outliersdata['入口站编号'].map(building_max)
    outliersdata['入口站编号超限率标准差'] = outliersdata['入口站编号'].map(building_std)
    
    df_group = data.groupby('出口站编号')['车货总重']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['出口站编号车货总重均值'] = data['出口站编号'].map(building_mean)
    data['出口站编号车货总重中位数'] = data['出口站编号'].map(building_median)
    data['出口站编号车货总重最小值'] = data['出口站编号'].map(building_min)
    data['出口站编号车货总重最大值'] = data['出口站编号'].map(building_max)
    data['出口站编号车货总重标准差'] = data['出口站编号'].map(building_std)

    outliersdata['出口站编号车货总重均值'] = outliersdata['出口站编号'].map(building_mean)
    outliersdata['出口站编号车货总重中位数'] = outliersdata['出口站编号'].map(building_median)
    outliersdata['出口站编号车货总重最小值'] = outliersdata['出口站编号'].map(building_min)
    outliersdata['出口站编号车货总重最大值'] = outliersdata['出口站编号'].map(building_max)
    outliersdata['出口站编号车货总重标准差'] = outliersdata['出口站编号'].map(building_std)
    
    df_group = data.groupby('入口站编号')['车货总重']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['入口站编号车货总重均值'] = data['入口站编号'].map(building_mean)
    data['入口站编号车货总重中位数'] = data['入口站编号'].map(building_median)
    data['入口站编号车货总重最小值'] = data['入口站编号'].map(building_min)
    data['入口站编号车货总重最大值'] = data['入口站编号'].map(building_max)
    data['入口站编号车货总重标准差'] = data['入口站编号'].map(building_std)

    outliersdata['入口站编号车货总重均值'] = outliersdata['入口站编号'].map(building_mean)
    outliersdata['入口站编号车货总重中位数'] = outliersdata['入口站编号'].map(building_median)
    outliersdata['入口站编号车货总重最小值'] = outliersdata['入口站编号'].map(building_min)
    outliersdata['入口站编号车货总重最大值'] = outliersdata['入口站编号'].map(building_max)
    outliersdata['入口站编号车货总重标准差'] = outliersdata['入口站编号'].map(building_std)
    
    df_group = data.groupby('车型代码')['车货总重']

    building_mean = df_group.mean().astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)

    data['车型代码车货总重均值'] = data['车型代码'].map(building_mean)
    data['车型代码车货总重中位数'] = data['车型代码'].map(building_median)
    data['车型代码车货总重最小值'] = data['车型代码'].map(building_min)
    data['车型代码车货总重最大值'] = data['车型代码'].map(building_max)
    data['车型代码车货总重标准差'] = data['车型代码'].map(building_std)

    outliersdata['车型代码车货总重均值'] = outliersdata['车型代码'].map(building_mean)
    outliersdata['车型代码车货总重中位数'] = outliersdata['车型代码'].map(building_median)
    outliersdata['车型代码车货总重最小值'] = outliersdata['车型代码'].map(building_min)
    outliersdata['车型代码车货总重最大值'] = outliersdata['车型代码'].map(building_max)
    outliersdata['车型代码车货总重标准差'] = outliersdata['车型代码'].map(building_std)
    
    data['入口站编号平均速度离差'] = data['通过车道的平均速度(km/h)'] - data['入口站编号平均速度中位数']
    data['出口站编号平均速度离差'] = data['通过车道的平均速度(km/h)'] - data['出口站编号平均速度中位数']
    data['入口站编号行驶时间离差'] = data['行驶时间'] - data['入口站编号行驶时间中位数']
    data['出口站编号行驶时间离差'] = data['行驶时间'] - data['出口站编号行驶时间中位数']

    data['入口站编号平均速度离差'] = data['入口站编号平均速度离差'].apply(np.exp)
    data['出口站编号平均速度离差'] = data['出口站编号平均速度离差'].apply(np.exp)
    data['入口站编号行驶时间离差'] = data['入口站编号行驶时间离差'].apply(np.log)
    data['出口站编号行驶时间离差'] = data['出口站编号行驶时间离差'].apply(np.log)

    data['入口站编号超限率离差'] = data['超限率'] - data['入口站编号超限率中位数']
    data['出口站编号超限率离差'] = data['超限率'] - data['出口站编号超限率中位数']
    data['入口站编号车货总重离差'] = data['车货总重'] - data['入口站编号车货总重中位数']
    data['出口站编号车货总重离差'] = data['车货总重'] - data['出口站编号车货总重中位数']

    data['入口站编号超限率离差'] = np.abs(data['入口站编号超限率离差']).apply(np.log)
    data['出口站编号超限率离差'] = np.abs(data['出口站编号超限率离差']).apply(np.log)
    data['入口站编号车货总重离差'] = np.abs(data['入口站编号车货总重离差']).apply(np.log)
    data['出口站编号车货总重离差'] = np.abs(data['出口站编号车货总重离差']).apply(np.log)

    data['车型代码平均速度离差'] = data['通过车道的平均速度(km/h)'] - data['车型代码平均速度中位数']
    data['车型代码行驶时间离差'] = data['行驶时间'] - data['车型代码行驶时间中位数']
    data['车型代码超限率离差'] = data['超限率'] - data['车型代码超限率中位数']
    data['车型代码车货总重离差'] = data['车货总重'] - data['车型代码车货总重中位数']

    data['车型代码平均速度离差'] = data['车型代码平均速度离差'].apply(np.exp)
    data['车型代码行驶时间离差'] = data['车型代码行驶时间离差'].apply(np.log)
    data['车型代码超限率离差'] = np.abs(data['车型代码超限率离差']).apply(np.log)
    data['车型代码车货总重离差'] = np.abs(data['车型代码车货总重离差']).apply(np.log)

    #=====================outliers

    outliersdata['入口站编号平均速度离差'] = outliersdata['通过车道的平均速度(km/h)'] - outliersdata['入口站编号平均速度中位数']
    outliersdata['出口站编号平均速度离差'] = outliersdata['通过车道的平均速度(km/h)'] - outliersdata['出口站编号平均速度中位数']
    outliersdata['入口站编号行驶时间离差'] = outliersdata['行驶时间'] - outliersdata['入口站编号行驶时间中位数']
    outliersdata['出口站编号行驶时间离差'] = outliersdata['行驶时间'] - outliersdata['出口站编号行驶时间中位数']

    outliersdata['入口站编号平均速度离差'] = outliersdata['入口站编号平均速度离差'].apply(np.exp)
    outliersdata['出口站编号平均速度离差'] = outliersdata['出口站编号平均速度离差'].apply(np.exp)
    outliersdata['入口站编号行驶时间离差'] = outliersdata['入口站编号行驶时间离差'].apply(np.log)
    outliersdata['出口站编号行驶时间离差'] = outliersdata['出口站编号行驶时间离差'].apply(np.log)

    outliersdata['入口站编号超限率离差'] = outliersdata['超限率'] - outliersdata['入口站编号超限率中位数']
    outliersdata['出口站编号超限率离差'] = outliersdata['超限率'] - outliersdata['出口站编号超限率中位数']
    outliersdata['入口站编号车货总重离差'] = outliersdata['车货总重'] - outliersdata['入口站编号车货总重中位数']
    outliersdata['出口站编号车货总重离差'] = outliersdata['车货总重'] - outliersdata['出口站编号车货总重中位数']

    outliersdata['入口站编号超限率离差'] = np.abs(outliersdata['入口站编号超限率离差']).apply(np.log)
    outliersdata['出口站编号超限率离差'] = np.abs(outliersdata['出口站编号超限率离差']).apply(np.log)
    outliersdata['入口站编号车货总重离差'] = np.abs(outliersdata['入口站编号车货总重离差']).apply(np.log)
    outliersdata['出口站编号车货总重离差'] = np.abs(outliersdata['出口站编号车货总重离差']).apply(np.log)

    outliersdata['车型代码平均速度离差'] = outliersdata['通过车道的平均速度(km/h)'] - outliersdata['车型代码平均速度中位数']
    outliersdata['车型代码行驶时间离差'] = outliersdata['行驶时间'] - outliersdata['车型代码行驶时间中位数']
    outliersdata['车型代码超限率离差'] = outliersdata['超限率'] - outliersdata['车型代码超限率中位数']
    outliersdata['车型代码车货总重离差'] = outliersdata['车货总重'] - outliersdata['车型代码车货总重中位数']

    outliersdata['车型代码平均速度离差'] = outliersdata['车型代码平均速度离差'].apply(np.exp)
    outliersdata['车型代码行驶时间离差'] = outliersdata['车型代码行驶时间离差'].apply(np.log)
    outliersdata['车型代码超限率离差'] = np.abs(outliersdata['车型代码超限率离差']).apply(np.log)
    outliersdata['车型代码车货总重离差'] = np.abs(outliersdata['车型代码车货总重离差']).apply(np.log)
    
    data = data.replace([np.inf, -np.inf], np.nan)
    outliersdata = outliersdata.replace([np.inf, -np.inf], np.nan)
    data = data.fillna(-1)
    outliersdata = outliersdata.fillna(-1)

    data.to_pickle('cleandata/' + dataname + 'feg.pkl')
    outliersdata.to_pickle('outliers/' + dataname + 'feg.pkl')

In [3]:
save = False
if save:
    file = ['July.csv','Aug.csv','Sep.csv','Oct.csv', 'Nov.csv','Dec.csv']

    for f in file:
        data = pd.read_csv('cleandata/' + f)
        outliersdata = pd.read_csv('outliers/' + f)
        pipeline_feature(data, outliersdata, f)

In [4]:
droptime = ['入口日期及时间','出口日期及时间']
cat_feature = ['入口站编号', '出口站编号', '车型代码', '轴型及轴重', 'entryweek', 'entryday', 'entryhour', 'exitweek', 'exitday', 'exithour']
time_feature = ['entryweek', 'entryday', 'entryhour', 'exitweek', 'exitday', 'exithour']
base_feature = ['里程', '总轴数', '车货总重', '限重', '超限率', '是否ETC车道代码', '行驶时间', '通过车道的平均速度(km/h)']
target = '是否绿色通道车辆代码'

In [5]:
file = ['July.csvfeg.pkl','Aug.csvfeg.pkl','Sep.csvfeg.pkl','Oct.csvfeg.pkl', 'Nov.csvfeg.pkl','Dec.csvfeg.pkl']
batch_size = 512
for f in file:
    data = pd.read_pickle('cleandata/' + f)
    data.drop(droptime, axis=1, inplace=True)
    real_feature = [i for i in data.columns if i not in cat_feature and i != target and i not in base_feature]
    
    X = data[base_feature + cat_feature + real_feature]
    y = data[target]
    
    trn_cat = X[cat_feature]
    trn_real = X[base_feature + real_feature]
    
    sds_real = joblib.load('models/sds_real')
    trn_real = sds_real.transform(trn_real)
    
    model = tf.keras.models.load_model('models/conv1d_upsampling.h5')
    
    pred = model.predict([trn_real, trn_cat.values], batch_size=batch_size, verbose=1)
    print(accuracy_score(y, np.round(pred)))
    print("auc score:", roc_auc_score(y,pred))
    print(classification_report(y, np.round(pred)))
    
    outliers = pd.read_pickle('outliers/' + f)
    outliers.drop(droptime, axis=1, inplace=True)

    outliersX = outliers[base_feature + cat_feature + real_feature]
    outliersy = outliers[target]

    outlierstst_cat = outliersX[cat_feature]

    outlierstst_real = outliersX[base_feature + real_feature]
    outlierstst_real = sds_real.transform(outlierstst_real)
    
    outliersypred = model.predict([outlierstst_real, outlierstst_cat.values], batch_size=batch_size, verbose=1)
    print("outliers acc: {}".format(accuracy_score(outliersy, np.round(outliersypred))))

0.986825041817806
auc score: 0.9604952640025719
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    528065
           1       0.10      0.09      0.09      4005

    accuracy                           0.99    532070
   macro avg       0.54      0.54      0.54    532070
weighted avg       0.99      0.99      0.99    532070

outliers acc: 0.7877465857359636
0.897312782440284
auc score: 0.6766734659194991
              precision    recall  f1-score   support

           0       0.90      0.99      0.95     44299
           1       0.58      0.13      0.21      5269

    accuracy                           0.90     49568
   macro avg       0.74      0.56      0.58     49568
weighted avg       0.87      0.90      0.87     49568

outliers acc: 0.000249500998003992
0.9868232366611348
auc score: 0.9604902544093151
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    528068
           1       0.10 

In [6]:
file = ['July.csvfeg.pkl','Aug.csvfeg.pkl','Sep.csvfeg.pkl','Oct.csvfeg.pkl', 'Nov.csvfeg.pkl','Dec.csvfeg.pkl']
batch_size = 512

Callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=3),
    tf.keras.callbacks.ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
]

for f in file:
    data = pd.read_pickle('cleandata/' + f)
    data.drop(droptime, axis=1, inplace=True)
    real_feature = [i for i in data.columns if i not in cat_feature and i != target and i not in base_feature]
    
    X = data[base_feature + cat_feature + real_feature]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=2020)
    
    trn_cat = X_train[cat_feature]
    trn_real = X_train[base_feature + real_feature]
    
    tst_cat = X_test[cat_feature]
    tst_real = X_test[base_feature + real_feature]

    sds_real = joblib.load('models/sds_real')
    trn_real = sds_real.transform(trn_real)
    tst_real = sds_real.transform(tst_real)
    
    batch_size = 512
    model.fit([trn_real, trn_cat.values], y_train, batch_size=batch_size, verbose=0, epochs=20, validation_split=0.2, callbacks=Callbacks)

    model = tf.keras.models.load_model('models/conv1d_upsampling.h5')
    
    pred = model.predict([tst_real, tst_cat.values], batch_size=batch_size, verbose=1)
    print("acc: {}".format(accuracy_score(y_test, np.round(pred))))
    print("auc score:", roc_auc_score(y_test,pred))
    print(classification_report(y_test, np.round(pred)))
    
    outliers = pd.read_pickle('outliers/' + f)
    outliers.drop(droptime, axis=1, inplace=True)

    outliersX = outliers[base_feature + cat_feature + real_feature]
    outliersy = outliers[target]
    outlierstst_cat = outliersX[cat_feature]
    outlierstst_real = outliersX[base_feature + real_feature]
    outlierstst_real = sds_real.transform(outlierstst_real)
    
    outliersypred = model.predict([outlierstst_real, outlierstst_cat.values], batch_size=batch_size, verbose=1)
    print("outliers acc: {}".format(accuracy_score(outliersy, np.round(outliersypred))))

acc: 0.9868062472982878
auc score: 0.9601777441279159
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    369665
           1       0.10      0.09      0.09      2784

    accuracy                           0.99    372449
   macro avg       0.55      0.54      0.54    372449
weighted avg       0.99      0.99      0.99    372449

outliers acc: 0.7877465857359636
acc: 0.8982362095798029
auc score: 0.6783144861060064
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     31039
           1       0.58      0.13      0.21      3659

    accuracy                           0.90     34698
   macro avg       0.74      0.56      0.58     34698
weighted avg       0.87      0.90      0.87     34698

outliers acc: 0.000249500998003992
acc: 0.986830517757993
auc score: 0.9605693059690739
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    369692
        