# Feature Engineering

In [5]:
import os
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

In [6]:
# Base directory for the input data and for everything this notebook writes.
# Set the ELEVATOR_DATA_DIR environment variable to run on another machine;
# when it is unset the original location is used unchanged.
# Joining with '' guarantees the trailing separator that the `path + '...'`
# concatenations in this notebook rely on.
path = os.path.join(
    os.environ.get(
        'ELEVATOR_DATA_DIR',
        '/Users/daniellee/Dropbox/Data_Mining/elevator_terrorist_detection/',
    ),
    '',
)
df = pd.read_csv(path + 'simdata/simdata.csv')

In [7]:
# Add event id: reuse the row index as a per-trip identifier.
# The frame was read without `index_col`, so the index is the default
# 0..n-1 range and every trip gets a distinct value.
df['event_id'] = df.index

In [8]:
# Sanity check: (rows, columns) of the raw trip table, including `event_id`.
df.shape

(21941, 7)

## Current Trip Features 

In [9]:
# Time Variables
def create_time_variables(df):
    """Parse the trip timestamps and derive calendar features from them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timein`` and ``timeout`` columns holding values that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) in which ``timein`` and
        ``timeout`` are datetimes, with these columns added:

        * ``womin`` / ``womout``   - ISO week of the year (despite the
          ``wom`` prefix this is NOT the week of the month)
        * ``dowin`` / ``dowout``   - day of week, Monday = 0
        * ``hourin`` / ``hourout`` - hour of day
        * ``minin`` / ``minout``   - minute of hour
        * ``binned_hours``         - coarse time-of-day bucket of ``hourin``
    """
    df = df.copy()

    def week_of_year(stamps):
        # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
        # `isocalendar().week` is the replacement and yields the same numbers.
        if hasattr(stamps.dt, 'isocalendar'):
            weeks = stamps.dt.isocalendar().week
            # isocalendar() returns a nullable UInt32 column; cast back to the
            # plain dtypes `.dt.week` used to produce (float only if NaT exists).
            return weeks.astype('float64' if weeks.isna().any() else 'int64')
        return stamps.dt.week  # pandas < 1.1

    for column, suffix in (('timein', 'in'), ('timeout', 'out')):
        df[column] = pd.to_datetime(df[column])
        stamps = df[column]
        df['wom' + suffix] = week_of_year(stamps)
        df['dow' + suffix] = stamps.dt.dayofweek
        df['hour' + suffix] = stamps.dt.hour
        df['min' + suffix] = stamps.dt.minute

    # Inclusive (first_hour, last_hour, bucket) bands for the entry hour.
    hour_bands = (
        (5, 10, 0),   # Morning commute
        (11, 14, 1),  # Lunch
        (15, 16, 2),  # Rush hour commute
        (17, 23, 3),  # Evening
        (0, 4, 4),    # Midnight
    )

    def bin_hours(x):
        for first, last, bucket in hour_bands:
            if first <= x <= last:
                return bucket
        return None  # hour outside 0..23 or missing

    df['binned_hours'] = df['hourin'].apply(bin_hours)
    return df

In [10]:
# Number of people encountered during the trip
def number_of_people_encountered(df):
    """Count, for every trip, the distinct other passengers met in the elevator.

    Two trips count as shared when they use the same ``elevator`` and their
    closed intervals ``[timein, timeout]`` overlap. This covers all four
    cases: the other trip encloses this one, starts during it, ends during
    it, or lies entirely inside it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``elevator``, ``timein`` and ``timeout``.
        Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a float column ``passenger_encountered``: the
        number of distinct ``id`` values among the overlapping trips, minus
        one for the rider's own trip.

    Notes
    -----
    Every trip is compared with every other trip, so the run time grows
    quadratically with the number of rows. Progress is printed every 1000 rows.
    """
    df = df.copy()
    size = df.shape[0]
    ids = df['id']
    elevators = df['elevator']
    starts = df['timein']
    ends = df['timeout']

    counts = []
    for i, (e, _in, _out) in enumerate(zip(elevators, starts, ends)):
        # Standard closed-interval overlap test. The previous four-clause
        # version could never match a trip that fully encloses the current
        # one, so those riders were silently left out.
        shared = (elevators == e) & (starts <= _out) & (_in <= ends)
        # The current trip overlaps itself, hence the "- 1".
        counts.append(ids[shared.to_numpy()].unique().shape[0] - 1)
        if i % 1000 == 0:
            # Capped so the report never exceeds 100%.
            print(str(min(i + 1000, size) / size * 100) + '% Complete...')

    # Assign the whole column at once. Writing `df.loc[i, ...]` with a
    # positional counter appended rows when the index was not 0..n-1.
    df['passenger_encountered'] = [float(c) for c in counts]
    return df

## Passenger Day Features

In [11]:
def create_passengers_day_features(df):
    """Add per-passenger, per-day activity features to every trip.

    Trips are grouped by ``(id, dowin, womin)``, i.e. passenger, day of week
    and week number. Note that this is not a calendar date: trips from
    different years with the same week number and weekday share a group.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``id``, ``dowin``, ``womin``, ``elevator`` and datetime
        ``timein`` / ``timeout`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` inner-joined with two new columns:

        * ``work_hours`` - hours between the group's earliest ``timein`` and
          its latest ``timeout``
        * ``trip_cnts``  - number of rows in the group with a non-null
          ``elevator``
    """
    df = df.copy()
    day_keys = ['id', 'dowin', 'womin']
    per_day = df.groupby(day_keys)

    # Built-in aggregations instead of `groupby(...)['timein', 'timeout']`:
    # tuple column selection on a groupby raises in pandas >= 2.0.
    first_in = per_day['timein'].min()
    last_out = per_day['timeout'].max()

    joined_passenger_day_feats = pd.DataFrame({
        # total_seconds() keeps the day component that `.seconds` discards.
        'work_hours': (last_out - first_in).dt.total_seconds() / 3600,
        'trip_cnts': per_day['elevator'].count(),
    }).reset_index()

    # Explicit keys so the join never silently picks up extra shared columns.
    return df.merge(joined_passenger_day_feats, on=day_keys)

## Traffic Features

In [12]:
def create_traffic_features(df):
    """Attach building-traffic counts to every trip.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``id``, ``timein``, ``dowin``, ``womin`` and ``binned_hours``.

    Returns
    -------
    pandas.DataFrame
        ``df`` merged (on the shared key columns) with:

        * ``cnt_visitors`` - number of rows with a non-null ``timein`` that
          share the trip's ``(dowin, womin)``. This counts trips, not
          distinct passengers.
        * ``cnt_of_binned_visitors`` - number of distinct ``id`` values among
          rows sharing the trip's ``(dowin, womin, binned_hours)``.
    """
    trips = df.copy()
    day_keys = ['dowin', 'womin']

    # Trips per (day of week, week number).
    daily_counts = trips.groupby(day_keys)['timein'].count()
    daily_counts = daily_counts.reset_index().rename(columns={'timein': 'cnt_visitors'})

    # Distinct passengers per (day of week, week number, hour bucket).
    slot_groups = trips.groupby(day_keys + ['binned_hours'])
    slot_counts = slot_groups.apply(lambda grp: grp.id.unique().shape[0])
    slot_counts = slot_counts.reset_index().rename(columns={0: 'cnt_of_binned_visitors'})

    with_daily = pd.merge(trips, daily_counts)
    return pd.merge(with_daily, slot_counts)

## Lifetime Passenger Profile Features

In [13]:
def create_lifetime_profile_features(df):
    """Describe each passenger's overall habits and attach them to their trips.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``id``, ``dowin``, ``binned_hours``, ``floorin`` and
        ``floorout``.

    Returns
    -------
    pandas.DataFrame
        The per-passenger profile columns followed by the original trip
        columns, joined on ``id``. Profile columns:

        * ``passenger_#_trips``   - number of rows for the passenger
        * ``<d>_dow_mass``        - share of the passenger's trips on weekday ``d``
        * ``<b>hour_mass``        - share of trips in hour bucket ``b``
        * ``<f>floorin_mass``     - share of trips entering at floor ``f``
        * ``<f>floorout_mass``    - share of trips leaving at floor ``f``

        Each family of ``*_mass`` columns sums to 1 per passenger because the
        cross-tabulations are normalised by row.
    """
    trips = df.copy()

    def _row_count(group):
        return group.shape[0]

    def _mass_table(values, suffix):
        # Row-normalised frequency table of `values` per passenger id.
        table = pd.crosstab(trips.id, values, normalize='index')
        table.columns = [str(label) + suffix for label in table.columns]
        return table

    trip_totals = (
        trips.groupby('id')
        .apply(_row_count)
        .to_frame()
        .rename(columns={0: 'passenger_#_trips'})
    )

    profile_parts = [
        trip_totals,
        _mass_table(trips.dowin, '_dow_mass'),
        _mass_table(trips.binned_hours, 'hour_mass'),
        _mass_table(trips.floorin, 'floorin_mass'),
        _mass_table(trips.floorout, 'floorout_mass'),
    ]
    profile = pd.concat(profile_parts, axis=1)

    return profile.merge(trips, left_index=True, right_on='id')

## Social Network Features

In [14]:
def create_network(df):
    """Build the co-ridership graph of passengers.

    Every distinct ``id`` becomes a node. Two passengers are joined by an
    (undirected, unweighted) edge when at least one pair of their trips uses
    the same ``elevator`` with overlapping closed intervals
    ``[timein, timeout]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``elevator``, ``timein`` and ``timeout``.
        The frame is only read, never modified.

    Returns
    -------
    networkx.Graph
        The co-ridership graph. Passengers who never share a ride are kept
        as isolated nodes.

    Notes
    -----
    Every trip is compared with every other trip, so the run time grows
    quadratically with the number of rows.
    """
    ids = df['id']
    elevators = df['elevator']
    starts = df['timein']
    ends = df['timeout']

    G = nx.Graph()
    # Add the nodes first so passengers without any shared ride still appear.
    G.add_nodes_from(ids.unique())

    for passenger, e, _in, _out in zip(ids, elevators, starts, ends):
        # Standard closed-interval overlap test. The previous four-clause
        # version could never match a trip that fully encloses the current
        # one, so those edges were missing from the graph.
        shared = (
            (elevators == e)
            & (starts <= _out)
            & (_in <= ends)
            & (ids != passenger)
        )
        # A Graph ignores repeated edges, so no de-duplication is needed.
        G.add_edges_from(
            (passenger, other) for other in ids[shared.to_numpy()].unique()
        )

    return G

def create_network_features(df, G):
    """Attach each passenger's graph statistics to their trips.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip table with an ``id`` column whose values are nodes of ``G``.
    G : networkx.Graph
        Graph keyed by passenger id.

    Returns
    -------
    pandas.DataFrame
        ``df`` merged (on the shared columns) with ``degrees`` (node degree
        in ``G``) and ``centrality`` (the value ``nx.degree_centrality``
        reports for the node).
    """
    trips = df.copy()

    degree_df = pd.DataFrame(list(G.degree()), columns=['id', 'degrees'])

    centrality_by_node = nx.degree_centrality(G)
    centrality_df = pd.DataFrame(
        list(centrality_by_node.items()), columns=['id', 'centrality']
    )

    with_degree = pd.merge(trips, degree_df)
    return pd.merge(with_degree, centrality_df)

## Behavioral Change Features

In [15]:
def last_k_features(x, k):
    """Summarise one passenger's recent behaviour for each of their trips.

    For every trip, the look-back window holds the trips of ``x`` whose
    ``timein`` is no earlier than ``timeout - pd.DateOffset(k)`` and whose
    ``timeout`` is no later than the current trip's ``timeout``. The most
    frequent ``binned_hours``, ``floorin`` and ``floorout`` inside that
    window are recorded.

    Parameters
    ----------
    x : pandas.DataFrame
        Trips of a single passenger with the columns ``id``, ``event_id``,
        ``timein``, ``timeout``, ``binned_hours``, ``floorin``, ``floorout``.
        The frame is not modified.
    k : int
        Size of the look-back window, passed to ``pd.DateOffset``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``event_id``, ``window_binned_hours``,
        ``window_floorin``, ``window_floorout``, one row per input row. A
        window column is NaN when the window holds no usable value.
    """
    # Work on a copy so the caller's frame does not gain helper columns.
    frame = x.copy()
    frame['offset'] = pd.DatetimeIndex(frame['timeout']) - pd.DateOffset(k)

    window_sources = (
        ('window_binned_hours', 'binned_hours'),
        ('window_floorin', 'floorin'),
        ('window_floorout', 'floorout'),
    )

    def most_common(values):
        # value_counts() sorts by frequency, so the first label is the mode.
        # An empty window (for example a trip longer than the look-back
        # period) used to raise IndexError here.
        counts = values.value_counts()
        return counts.index[0] if len(counts) else float('nan')

    # Iterating the columns directly (instead of `.loc[i, ...]` per row)
    # also works when the index holds duplicate labels.
    for offsetdate, curdate, eventid in zip(
        frame['offset'], frame['timeout'], frame['event_id']
    ):
        window_x = frame.loc[
            (frame.timein >= offsetdate) & (frame.timeout <= curdate)
        ]
        current = frame.event_id == eventid
        for target, source in window_sources:
            frame.loc[current, target] = most_common(window_x[source])

    # With zero input rows the loop never creates the output columns.
    for target, _ in window_sources:
        if target not in frame.columns:
            frame[target] = float('nan')

    return frame[['id', 'event_id', 'window_binned_hours', 'window_floorin', 'window_floorout']]

def create_lag_features(df, k):
    """Add the look-back window features of ``last_k_features`` to every trip.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip table with the columns ``last_k_features`` needs (``id``,
        ``event_id``, ``timein``, ``timeout``, ``binned_hours``, ``floorin``,
        ``floorout``).
    k : int
        Look-back window size, forwarded to ``last_k_features``.

    Returns
    -------
    pandas.DataFrame
        ``df`` inner-joined on ``(id, event_id)`` with the columns
        ``window_binned_hours``, ``window_floorin`` and ``window_floorout``.
    """
    df = df.copy()
    window_cols = ['window_binned_hours', 'window_floorin', 'window_floorout']

    # Iterate the groups explicitly instead of using `groupby.apply`. With
    # `apply`, the result carried `id` both as an index level and as a
    # column, which makes the merge below ambiguous, and recent pandas
    # versions no longer pass the grouping column to the applied function.
    # Each group is copied so the helper can never touch `df` itself.
    per_passenger = [
        last_k_features(trips.copy(), k) for _, trips in df.groupby('id')
    ]
    if not per_passenger:
        # No groups to process: return zero rows with the expected columns.
        return df.iloc[0:0].reindex(columns=list(df.columns) + window_cols)

    lagged_df = pd.concat(per_passenger)
    return df.merge(lagged_df, on=['id', 'event_id'])

## Merge All

In [16]:
def main(df):
    """Run the feature-engineering pipeline end to end.

    The tabular stages run in a fixed order, each one receiving the previous
    stage's output. The co-ridership graph is then built from the enriched
    table and its node statistics are merged in. ``'complete'`` is printed
    after each tabular stage and once more after the network stage.
    ``create_lag_features`` is not part of this pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip table as loaded from disk, including ``event_id``.

    Returns
    -------
    tuple
        ``(features, graph)`` - the final feature table and the
        ``networkx.Graph`` it was enriched with.
    """
    tabular_stages = (
        create_time_variables,
        number_of_people_encountered,
        create_passengers_day_features,
        create_traffic_features,
        create_lifetime_profile_features,
    )

    features = df
    for stage in tabular_stages:
        features = stage(features)
        print('complete')

    graph = create_network(features)
    features = create_network_features(features, graph)
    print('complete')

    return features, graph

# Run the whole pipeline on the raw trips: `data` is the returned feature
# table and `G` the returned passenger graph. The pairwise trip comparisons
# inside make this the slow cell of the notebook.
data, G = main(df)

complete
4.557677407593091% Complete...
9.115354815186182% Complete...
13.673032222779272% Complete...
18.230709630372363% Complete...
22.788387037965453% Complete...
27.346064445558543% Complete...
31.903741853151637% Complete...
36.46141926074473% Complete...
41.019096668337816% Complete...
45.576774075930906% Complete...
50.134451483524% Complete...
54.692128891117086% Complete...
59.24980629871017% Complete...
63.80748370630327% Complete...
68.36516111389635% Complete...
72.92283852148945% Complete...
77.48051592908254% Complete...
82.03819333667563% Complete...
86.59587074426872% Complete...
91.15354815186181% Complete...
95.7112255594549% Complete...
100.268902967048% Complete...
complete
complete
complete
complete
complete


In [17]:
# Persist the engineered feature table and the passenger graph next to the data.
data.to_csv(path + 'feature_data.csv', index=False)
# `nx.write_gpickle` was removed in NetworkX 3.0. It was a thin wrapper around
# `pickle.dump` with the highest protocol, so this writes the same file format
# and works with every NetworkX version.
with open(path + 'network.gpickle', 'wb') as graph_file:
    pickle.dump(G, graph_file, protocol=pickle.HIGHEST_PROTOCOL)