In [None]:
#!pip install tsfresh

In [1]:
import warnings
warnings.filterwarnings('ignore');

In [2]:
#Import Libraries     
import pandas as pd
import json
import numpy as np
import re
import tsfresh
import random
import sys

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import accuracy_score as accuracy

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

  from numpy.core.umath_tests import inner1d


In [3]:
#Use to distance to destination
def coord_to_dist(df, dest_lat, dest_long):
    """
    Calculates the distance to the destination in miles
    Parameters
    ----------
    df : Pandas Dataframe with  Longitude and Latitude columns
    Returns
    -------
    dist_in_miles : array-like object
        Returns an array that as the distance to the destination in miles
    """
    long_dist = (df['Longitude'] - dest_long)*69
    lat_dist = (df['Latitude'] - dest_lat)*69
    dist_in_miles = (long_dist**2 + lat_dist**2)**0.5
    return dist_in_miles

In [5]:
#Import RvBsm
RvBsm = pd.read_csv('raw_data/RvBsm.csv')
RvBsm['Device & Trip'] = RvBsm['Device'].map(str) + ' ' + RvBsm['Trip'].map(str)
RvBsm = RvBsm.drop(columns=['Device', 'Trip', 'Time', 'RvRandomId', 'BsmPsId', 
                            'BasicVehClass', 'NativeFlag', 'RvDevice'])
for col in RvBsm.columns:
    if col != 'Device & Trip':
        RvBsm[col] = RvBsm[col].map(float)
RvBsm.head()

Unnamed: 0,LocalTimeMS,Latitude,Longitude,Elevation,Heading,GpsSpeed,BrakeStatus,YawRate,LongAccel,Device & Trip
0,80282393,40.283614,-83.529785,291.9,228.7125,25.5,0,-0.97,0.07,2004 12
1,80282893,40.283537,-83.529898,292.0,227.95,25.4,0,-1.46,0.01,2004 12
2,80283393,40.283458,-83.530009,292.1,226.95,25.3,0,-2.19,0.01,2004 12
3,80283893,40.283377,-83.530117,292.1,225.5,25.32,0,-2.92,0.07,2004 12
4,80284393,40.283294,-83.530222,292.2,224.0875,25.32,0,-2.92,0.07,2004 12


In [None]:
def label_and_save_extracted_features(df):
    """
    Due to the lack of labeling in our data, we will make our own inferences to label the data based on 
    psychological behaviors when driving. After labeling each DataFrame,
    
    Parameters
    ----------
    df : Pandas Dataframe w/ time column in ms and features
        Each DataFrame should be a unique device and trip sorted from by time
    """
    
    T = 30000 #Period in milliseconds (30 seconds)
    walking_distance = 0.25 #Walking Distance in miles (5 Blocks in miles)
    max_speed = 7 #Speed in m/s (About 15mph, the speed limit in a parking lot)
    end_time = max(df['LocalTimeMS']) #ending time
    path = 'features/' + df['Device & Trip'].iloc[0] + '.csv'
    
    clean_df = df.sort_values('LocalTimeMS',ascending=False) #Sort in descending order by time
    dest_lat, dest_long = clean_df['Latitude'].iloc[0], clean_df['Longitude'].iloc[0]
    clean_df['DistanceToDestination'] = coord_to_dist(clean_df, dest_lat, dest_long)
    clean_df = clean_df.drop(['Device & Trip','Latitude','Longitude'], axis=1) #Drop useless columns
    
    #Available Parking
    ap_df = clean_df[clean_df['LocalTimeMS'] > end_time - T]
    ap_df['Parking'] = np.array(['Available Parking' for i in range(len(ap_df))])
    features = impute(extract_features(ap_df,
                                       column_id='Parking',
                                       column_sort="LocalTimeMS",
                                       show_warnings=False)).reset_index()                
    
    #Looking for Parking
    nap_df = clean_df[clean_df['LocalTimeMS'] <= end_time - T] #no available parking
    too_far_fast = nap_df[(nap_df['DistanceToDestination'] > walking_distance) 
                          | (nap_df['GpsSpeed'] > max_speed)]['LocalTimeMS']
    
    if len(too_far_fast) > 0:
        too_far_fast = max(too_far_fast)
        lfp_num_T = int((end_time - T - too_far_fast) // T)
    else:
        lfp_num_T = 0
    for i in range(lfp_num_T):
        lfp_df = clean_df[(clean_df['LocalTimeMS'] <= end_time - i*T)]
        lfp_df = lfp_df[lfp_df['LocalTimeMS'] > (max(lfp_df['LocalTimeMS'])- T)]
        lfp_df['Parking'] = np.array(['Looking for Parking' for i in range(len(lfp_df))])
        features =  features.append(impute(extract_features(lfp_df,
                                                column_id='Parking',
                                                column_sort="LocalTimeMS",
                                                            show_warnings=False)).reset_index())   

        
    #Not Looking for Parking
    nlfp_end_time = end_time - (lfp_num_T+1)*T
    nlfp_num_T = int((nlfp_end_time - min(clean_df['LocalTimeMS'])) // T)
    
    for i in range(nlfp_num_T):
        nlfp_df = clean_df[clean_df['LocalTimeMS'] <= nlfp_end_time - i*T]
        nlfp_df = nlfp_df[nlfp_df['LocalTimeMS'] > (max(nlfp_df['LocalTimeMS'])- T)]
        nlfp_df['Parking'] = np.array(['Not Looking for Parking' for i in range(len(nlfp_df))])
        features = features.append(impute(extract_features(nlfp_df,
                                                column_id='Parking',
                                                column_sort="LocalTimeMS",
                                                           show_warnings=False)).reset_index())

    features.to_csv(path)
    return features

In [6]:
for dt in RvBsm['Device & Trip'].unique():
    try:
        fh = open('features/' + dt + '.csv', 'r')
    except FileNotFoundError:
        try:
            label_and_save_extracted_features(RvBsm[RvBsm['Device & Trip']==dt])
        except TypeError:
            print(dt)

In [4]:
try:
    all_features = pd.read_csv('all_features.csv')
except FileNotFoundError:
    all_dt = RvBsm['Device & Trip'].unique()
    all_features = pd.read_csv('features/'+all_dt[0]+'.csv')
    count = 0
    for dt in all_dt[1:]:
        all_features = all_features.append(pd.read_csv('features/'+dt+'.csv'))
        count += 1
        print(str(100*count/len(all_dt)) + "% done")
    all_features.to_csv('all_features.csv')
all_features.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,BrakeStatus__abs_energy,BrakeStatus__absolute_sum_of_changes,"BrakeStatus__agg_autocorrelation__f_agg_""mean""__maxlag_40","BrakeStatus__agg_autocorrelation__f_agg_""median""__maxlag_40","BrakeStatus__agg_autocorrelation__f_agg_""var""__maxlag_40","BrakeStatus__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","BrakeStatus__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""",...,YawRate__symmetry_looking__r_0.9,YawRate__symmetry_looking__r_0.9500000000000001,YawRate__time_reversal_asymmetry_statistic__lag_1,YawRate__time_reversal_asymmetry_statistic__lag_2,YawRate__time_reversal_asymmetry_statistic__lag_3,YawRate__value_count__value_-1,YawRate__value_count__value_0,YawRate__value_count__value_1,YawRate__variance,YawRate__variance_larger_than_standard_deviation
0,0,0,Available Parking,27.0,7.0,-0.057428,-0.108746,0.127073,1.238095,-0.828079,...,1.0,1.0,-11.001313,-111.894392,-417.5172,0.0,4.0,0.0,49.085905,1.0
1,1,0,Not Looking for Parking,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.01903,0.001159,0.03741159,0.0,24.0,0.0,0.239871,0.0
2,2,0,Not Looking for Parking,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,-0.00293,0.0,6.424902e-20,0.0,41.0,0.0,0.026754,0.0
3,3,0,Not Looking for Parking,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.054363,0.097551,0.1192642,0.0,27.0,0.0,0.437841,0.0
4,4,0,Not Looking for Parking,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,-0.031102,-0.094279,-0.07452204,0.0,31.0,0.0,0.525689,0.0


In [11]:
try:
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
except FileNotFoundError:
    split_index = int(len(all_features) * 0.8)

    train_y = all_features['id'].iloc[split_index:]
    test_y = all_features['id'].iloc[:split_index]
    
    train_X = all_features.iloc[split_index:].drop( 'id', axis = 1 )
    test_X = all_features.iloc[:split_index].drop( 'id', axis = 1 )

    train_features_selected = select_features( train_X, train_y, fdr_level = 0.05 )

    train = train_features_selected.copy()
    train['id'] = train_y

    test = test_X[ train_features_selected.columns ].copy()
    test['id'] = test_y

    train.to_csv('data/train.csv', index = None )
    test.to_csv('data/test.csv', index = None )

In [13]:
train_X = train.drop('id', axis = 1 ).values
test_X = test.drop('id', axis = 1 ).values

train_y = train['id'].values
test_y = test['id'].values

classifiers = [make_pipeline( StandardScaler(), LR()),
    make_pipeline( MinMaxScaler(), LR()),
    RF(n_estimators = 100, min_samples_leaf = 5)]

for clf in classifiers:
    clf.fit(train_X, train_y)
    p = clf.predict_proba(test_X)[:,1]
    p_bin = clf.predict(test_X)
    acc = accuracy(test_y, p_bin )
    print(acc)

0.7598613800835796
0.758740189583121
0.8559779838956274


In [18]:
len(test_y) * 0.8559779838956274

8398.0