# Step 1 - Create Modeling Data
1. Get data from database
2. Process data
3. Add features
4. Train/Test split - random split and by date
5. Cluster - perform all clustering methods on train data with random split and by date
6. Concatenate train and test data
7. Write processed data to database

### import packages

In [None]:
import numpy as np
import pandas as pd
import psycopg2 as pg
import datetime as dt
from sklearn import preprocessing
from collections import OrderedDict
from pprint import pprint
import cPickle as pickle
import gc
import socket
import boto3
from boto.utils import get_instance_metadata
import ast
from Segments import Segments
from Times import Times
from Cluster import Cluster
import time
import datetime
import os
import shutil
import joblib
import string
from AWS import AWS
from Utility import Utility
from sqlalchemy import create_engine

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# wall-clock start of the whole run; used by the final timing printout
start_time = time.time()

### set inputs

In [None]:
# set environment
# aws stays None unless util.isAWS() is true further down, in which case an AWS helper is created
aws = None
s3_bucket_name = 'dse-cohort3-group3'
s3_dat_dir = 'PreprocessedWazeData'

# assume connection file is always present
# paths are relative to the notebook's working directory
conn_str_file = '../conf/db_conn_str.txt'
sqlalchemy_conn_str_file = '../conf/sqlalchemy_conn_str.txt'
sampling_args_file = '../conf/pipeline_args.txt'

In [None]:
# Load the pipeline arguments. The file holds a Python literal that is parsed
# with ast.literal_eval (literals only, no code execution); the result is
# indexed by key throughout the notebook, so it is expected to be a dict.
# The context manager closes the handle even if read() raises.
with open(sampling_args_file, 'r') as fr:
    fa = fr.read()
file_args = ast.literal_eval(fa)
file_args

In [None]:
# output directory name comes from the args file; echoed for the notebook log
save_dir = file_args['save_dir']
save_dir

In [None]:
# Archive any pre-existing output directory under a timestamped name, then
# create a fresh one and keep a copy of the args file used for this run in it.
save_path = './{}'.format(save_dir)
if os.path.isdir(save_path):
    now = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    os.rename(save_path, '{}_{}'.format(save_path, now))

os.mkdir(save_dir)
shutil.copy(sampling_args_file, save_dir)

# 1. get data from db

In [None]:
# timer for step 1 (getting data from the database)
get_data_start = time.time()

### create AWS object and helper methods

In [None]:
# project helper built from the pipeline args; used below for the AWS check
# and for the train/test split
util = Utility(file_args)

In [None]:
# Only create the AWS helper when the Utility reports an AWS environment;
# otherwise `aws` keeps its initial None and later cells write locally.
if util.isAWS():
    aws = AWS(s3_bucket_name, s3_dat_dir)

# Read the psycopg2 connection string; the context manager closes the file
# handle instead of leaving it to the garbage collector.
with open(conn_str_file, 'r') as conn_file:
    pg_conn_str = conn_file.read()

### connect to database

In [None]:
# open the psycopg2 connection and share the same connection with the Utility helper
conn = pg.connect(pg_conn_str) 
util.conn = conn

### create Segments object and run queries

In [None]:
# available sampling options, for reference:
#sampling_options = ['radius', 'sample', 'bounding_box', 'street', 
# 'road_type', 'ignore', 'cum_seg_pct']
#queries = [ 'sample', 'road_type', 'cum_seg_pct']

# run the segment queries named in the args file
# NOTE(review): presumably this populates the segments_selected table read later — confirm in Segments
segments = Segments(conn, file_args['segment_queries_to_run'], file_args)
segments.run_queries()

### create Times object and run queries

In [None]:
# available sampling options, for reference:
#sampling_options = ['time_window', 'day_of_week', 'exclude_dates', 'cum_ts_pct']
#queries = ['time_window', 'cum_ts_pct']

# run the time queries named in the args file
# NOTE(review): presumably this populates the times_selected table read later — confirm in Times
times = Times(conn, file_args['time_queries_to_run'], file_args)
times.run_queries()

### create elbow charts for top N% of segments and times

In [None]:
# Elbow chart for segments: cumulative share of segments (y) against the
# cumulative share of positive instances (x). Only drawn when the
# 'cum_seg_pct' segment query was requested in the args file.
if 'cum_seg_pct' in file_args['segment_queries_to_run']:
    seg_cum_pct_df = pd.read_sql(
        'select distinct cum_seg_pct, cum_pos_pct from seg_cum_pct order by cum_seg_pct',
        con=conn)

    # x axis: cumulative pct of positive instances; y axis: cumulative pct of segments
    x = seg_cum_pct_df['cum_pos_pct'].values
    y = seg_cum_pct_df['cum_seg_pct'].values

    plt.plot(x, y)
    plt.title('pct of segments required to capture pct of incidents')
    plt.xlabel('cum positive pct')
    plt.ylabel('cum segment pct')
    plt.grid()
    plt.show()

In [None]:
# Elbow chart for timestamps: cumulative share of timestamps (y) against the
# cumulative share of positive instances (x). Only drawn when the
# 'cum_ts_pct' time query was requested in the args file.
if 'cum_ts_pct' in file_args['time_queries_to_run']:
    ts_cum_pct_df = pd.read_sql(
        'select distinct cum_ts_pct, cum_pos_pct from ts_cum_pct order by cum_ts_pct',
        con=conn)

    # x axis: cumulative pct of positive instances; y axis: cumulative pct of timestamps
    x = ts_cum_pct_df['cum_pos_pct'].values
    y = ts_cum_pct_df['cum_ts_pct'].values

    plt.plot(x, y)
    plt.title('pct of timestamps required to capture pct of incidents')
    plt.xlabel('cum positive pct')
    plt.ylabel('cum time pct')
    plt.grid()
    plt.show()

In [None]:
# report elapsed time for step 1
print('--- getting data took {0:.1f} seconds ---'.format(time.time() - get_data_start))

# 2. data processing

In [None]:
# timer for step 2 (data processing)
data_processing_start = time.time()

### cartesian product of segments and times

In [None]:
# Build one row per (selected time, selected segment) pair and attach the
# aggregated incident levels to the pairs that have any.

# create segments_df
segments_select = "segment_id, street, city, road_type, lat1, lon1, lat2, lon2"
segments_where = ''
sql_segments = 'SELECT {} FROM segments_selected {}'.format(segments_select, segments_where)
print('sql used to obtain segments dataframe:\n' + sql_segments)

segments_df = pd.read_sql(sql_segments, con=conn)
print('segments dataframe has ' + str(len(segments_df))+" rows")

# create time_df
times_select = 'time_id, date, day_of_week, month, time'
times_where = ''
sql_time = 'SELECT {} FROM times_selected {}'.format(times_select, times_where)
print('sql used to obtain time dataframe:\n' + sql_time)

time_df = pd.read_sql(sql_time, con=conn)
print('time dataframe has ' + str(len(time_df))+" rows")

# create cartesian product of segments and times to create segments_time_df
# a constant 'tmp' key on both sides makes the merge pair every time row with
# every segment row; 'tmp' (always 1) is reused later to derive level_binary
time_df['tmp'] = 1
segments_df['tmp'] = 1
segments_time_df = pd.merge(time_df, segments_df, how='outer', on=['tmp'])
print('cartesian product of segments and time has ' + str(len(segments_time_df))+" rows")

# query database to get matrix of positive traffic incidents
# the matrix table name is suffixed with the configured time resolution;
# levels are aggregated (min/max/mean/count) per segment and time slot
sql_matrix = '''
select m.segment_id, m.time_id, s.street, s.lat1, s.lon1, 
    s.lat2, s.lon2, t.date, t.time, t.day_of_week, 
    s.road_type, s.city,
    min(u.level) as level_min,
    max(u.level) as level_max,
    avg(u.level) as level_mean,
    count(u.level) as level_count
from matrix_''' + str(file_args['time_resolution']) + ''' m, times_selected t, uuid u, segments_selected s
where m.time_id = t.time_id 
    and m.uuid_instance_id = u.uuid_instance_id 
    and s.segment_id = m.segment_id
group by m.segment_id, m.time_id, s.street, s.lat1, s.lon1, 
    s.lat2, s.lon2, t.date, t.time, t.day_of_week, 
    s.road_type, s.city
'''

level_df = pd.read_sql(sql_matrix, con=conn)
print('level dataframe has ' + str(len(level_df))+" rows")

# join positive incidents to cartesian product of segments and times
# left join: pairs without incidents get NaN level_* values, filled with 0 in a later cell
segments_time_level_df = pd.merge(segments_time_df, level_df[['segment_id','date','time','level_min','level_max','level_mean','level_count']], how='left', on=['segment_id','date','time'])
print('joined segments/time/level dataframe has ' + str(len(segments_time_level_df))+" rows")

### check for duplicates

In [None]:
# check duplicates
df = level_df
# count rows that repeat an earlier row across all columns
print('{} duplicate rows'.format(str(sum(df.duplicated()))))
# group on every column to get a row count per distinct combination,
# then display the combinations that occur more than once
df_row_counts = df.groupby(df.columns.tolist(), as_index=False).size()
df_row_counts[df_row_counts.values > 1]

In [None]:
# check duplicates - they are coming from same segments having multiple road_types
# same check as above, restricted to the segment/time/date/road_type columns
df = level_df[['segment_id','time','date','road_type']]
print('{} duplicate rows'.format(str(sum(df.duplicated()))))
df_row_counts = df.groupby(df.columns.tolist(), as_index=False).size()
df_row_counts[df_row_counts.values > 1]

In [None]:
# display (segment_id, road_type) combinations that appear more than once in segments_df
segments_df_row_counts = segments_df.groupby(['segment_id','road_type'], as_index=False).size()
segments_df_row_counts[segments_df_row_counts.values > 1]

### fillna with zeros

In [None]:
# Replace NA values with zeros, on the assumption that a segment/time pair
# with no matching incident row had no congestion.
level_cols = [c for c in segments_time_level_df.columns if c.startswith('level')]

# Assign the filled columns back to the frame rather than calling
# fillna(inplace=True) on a column selection: the selection may be a temporary
# object, in which case the in-place fill would never reach the frame.
segments_time_level_df[level_cols] = segments_time_level_df[level_cols].fillna(0)

# segments_time_level_df.head()

### process the data:
1. add 'level_binary' column
1. set 'time' column
1. add number of days since earliest date
1. add number of minutes since midnight
1. encode categorical data to numeric using sklearn's labelencoder

In [None]:
# create 'target' column and set it to appropriate value based on input
# 'tmp' is the constant 1 left over from the cartesian-product merge, so the
# row-wise minimum of (tmp, level_count) is 0 when there were no incidents
# and 1 when there was at least one.
print('creating level_binary column...')
segments_time_level_df['level_binary'] = segments_time_level_df[['tmp','level_count']].min(axis=1)

# add date_idx for number of days since earliest date
print('adding date_idx for number of days since earliest date...')
td = pd.to_datetime(segments_time_level_df['date']) - pd.to_datetime(segments_time_level_df.date.min())
date_idx_vals = (td / np.timedelta64(1, 'D')).astype(int)
segments_time_level_df.loc[:,'date_idx'] = date_idx_vals

# add time_idx for number of minutes since midnight
# A list comprehension always yields a list; map() only does so on Python 2
# and would hand pandas a lazy iterator on Python 3.
print('adding time_idx for number of minutes since midnight...')
time_idx_vals = [t.hour*60 + t.minute for t in segments_time_level_df['time'].values]
segments_time_level_df.loc[:,'time_idx'] = time_idx_vals

# define features and target
# every column whose name starts with 'level' (including level_binary) is a target
print('subsetting data to date, time, features and target...')
features = ['date_idx','time_idx','day_of_week','segment_id','street','city','road_type','lat1','lon1','lat2','lon2']
targets = [c for c in segments_time_level_df.columns if c.startswith('level')]

# subset data - include date, time, features, and target
# this also drops the helper 'tmp' column
segments_time_level_df = segments_time_level_df.loc[:,['date','time'] + features + targets]

# encode categorical data using label encoder - do not encode date and time
print('encoding categorical columns as numeric...')
num_cols = segments_time_level_df._get_numeric_data().columns
cat_cols = list(set(segments_time_level_df.columns) - set(num_cols) - {'date','time'})

# one encoder instance is refit per column, so after the loop `le` only holds
# the mapping of the last column processed
le = preprocessing.LabelEncoder()

for col in cat_cols:
    # print() call with a single argument behaves the same on Python 2 and 3
    # and matches the print(...) form used in the rest of the notebook
    print('processing {} column'.format(col))
    segments_time_level_df[col] = le.fit_transform(segments_time_level_df[col])

In [None]:
# report elapsed time for step 2
print('--- data processing took {0:.1f} seconds ---'.format(time.time() - data_processing_start))

# 3. add features

In [None]:
# timer for step 3 (adding event and padres features)
add_features_start = time.time()

In [None]:
def clean_event_title(e):
    """Return event title ``e`` made safe for use in a column name.

    Every character in ``string.punctuation`` is removed, then each space is
    replaced by an underscore. Removal is done by filtering characters rather
    than with the two-argument ``str.translate`` call, which only exists for
    Python 2 byte strings and fails on unicode titles.

    Args:
        e: the event title string.

    Returns:
        The cleaned title, e.g. ``'Comic-Con 2017!'`` -> ``'ComicCon_2017'``.
    """
    punctuation = set(string.punctuation)
    e_clean = ''.join(ch for ch in e if ch not in punctuation).replace(' ', '_')
    return e_clean

In [None]:
# Add one 0/1 column per large event, set to 1 for rows whose timestamp falls
# within the event (plus a configurable buffer on either side).
if file_args['add_events']:
    # get events from database
    events_df = pd.read_sql('SELECT * FROM events', con=conn)
    events_df['event_start'] = pd.to_datetime(events_df['event_start'])
    events_df['event_end'] = pd.to_datetime(events_df['event_end'])
    
    # subset to events larger than event_attendance_threshold
    events_of_interest = events_df[events_df['exp_attendance']>=file_args['event_attendance_threshold']]

    # add datetime column to data
    segments_time_level_df['datetime'] = segments_time_level_df[['date','time']].apply(lambda row: dt.datetime.combine(row['date'], row['time']), axis=1)
    
    # add event columns to data
    event_features = segments_time_level_df[['datetime']].copy()
    
    # add columns for events
    # the new columns start out empty and become 0 via fillna;
    # note that fillna(0) is applied to the whole concatenated frame
    segments_time_level_df = pd.concat(
        [
            segments_time_level_df,
            pd.DataFrame(
                index=event_features.index, 
                columns=['event_{}'.format(clean_event_title(e)) for e in events_of_interest['event_title'].unique()]
            )
        ], axis=1
    ).fillna(0)
    
    # loop through events_of_interest and set values for that event column to 1 if event was active
    # The buffer around each event comes from the args file; previously the
    # configured value was read into `window` but a fixed 1 hour was applied.
    window = file_args['event_active_buffer'] # hours
    event_buffer = dt.timedelta(hours=window)
    for index, row in events_of_interest.iterrows():
        event = 'event_{}'.format(clean_event_title(row['event_title']))
        start = row['event_start'] - event_buffer
        end = row['event_end'] + event_buffer
        segments_time_level_df.loc[(segments_time_level_df['datetime']>=start) & (segments_time_level_df['datetime']<=end), event] = 1
    
    # drop added datetime column
    segments_time_level_df.drop('datetime', axis=1, inplace=True)

In [None]:
# report elapsed time for the events feature, then start the padres timer
print('--- adding events took {0:.1f} seconds ---'.format(time.time() - add_features_start))
add_padres_start = time.time()

In [None]:
# Add a 0/1 'padres_game' column, set to 1 for rows whose timestamp falls in a
# window around a game's start or in a window around a game's end.
if file_args['add_padres']:
    # load the games and make sure both timestamps are datetimes
    padres_df = pd.read_sql('SELECT * FROM padres_games', con=conn)
    for game_col in ('game_start', 'game_end'):
        padres_df[game_col] = pd.to_datetime(padres_df[game_col])

    # temporary combined timestamp used only for the window comparisons
    segments_time_level_df['datetime'] = segments_time_level_df[['date','time']].apply(
        lambda r: dt.datetime.combine(r['date'], r['time']), axis=1)

    # default: no game active
    segments_time_level_df['padres_game'] = 0

    # window sizes (in hours) before/after game start and game end
    padres_start_window_before = file_args['padres_start_window_before']
    padres_start_window_after = file_args['padres_start_window_after']
    padres_end_window_before = file_args['padres_end_window_before']
    padres_end_window_after = file_args['padres_end_window_after']

    for index, row in padres_df.iterrows():
        # window around first pitch (bounds inclusive)
        start = row['game_start'] - dt.timedelta(hours=padres_start_window_before)
        end = row['game_start'] + dt.timedelta(hours=padres_start_window_after)
        near_start = segments_time_level_df['datetime'].between(start, end)
        segments_time_level_df.loc[near_start, 'padres_game'] = 1

        # window around the end of the game (bounds inclusive)
        start = row['game_end'] - dt.timedelta(hours=padres_end_window_before)
        end = row['game_end'] + dt.timedelta(hours=padres_end_window_after)
        near_end = segments_time_level_df['datetime'].between(start, end)
        segments_time_level_df.loc[near_end, 'padres_game'] = 1

    # the helper timestamp is not a feature; remove it again
    del segments_time_level_df['datetime']

In [None]:
# report elapsed time for the padres feature and for step 3 as a whole
print('--- adding padres games took {0:.1f} seconds ---'.format(time.time() - add_padres_start))
print('--- adding all features took {0:.1f} seconds ---'.format(time.time() - add_features_start))

### create data structure with params and data and write to pickle file

In [None]:
# Optionally persist the processed frame together with the args that
# produced it, either to S3 (when the AWS helper exists) or to local disk.
if file_args['write_pickle_file']:
    data_to_write = {
        'parameters': file_args,
        'data': segments_time_level_df
    }
    
    filename_to_write = file_args['output_file']['filename_base']
    print('writing data to pickle file - {}...'.format(filename_to_write))
    if aws:
        print('saving pickle file to s3')
        # NOTE(review): `s3` is not used in this cell; the upload goes through aws.save_file
        s3 = boto3.resource('s3')
        aws.save_file(filename_to_write, data_to_write)
    else:
        print('saving pickle file to local disk')
        # context manager flushes and closes the file even if dump() raises
        with open(filename_to_write, 'wb') as pickle_file:
            pickle.dump(data_to_write, pickle_file)
    print('pickle file dump complete...')
else:
    print('not writing pickle file...')

# 4. train test split

In [None]:
# timer for step 4 (train/test split)
train_test_split_start = time.time()

In [None]:
print('splitting train and test data...')
# build both split flavours ('random' and 'date') from the same processed frame
# NOTE(review): split ratio / date cut-off are decided inside Utility.process_train_test — not visible here
train_data_random, test_data_random = util.process_train_test(segments_time_level_df, 'random')
train_data_date, test_data_date = util.process_train_test(segments_time_level_df, 'date')

In [None]:
# report elapsed time for step 4
print('--- train test split took {0:.1f} seconds ---'.format(time.time() - train_test_split_start))

# 5. clustering

In [None]:
# timer for step 5 (clustering)
clustering_start = time.time()

## 5.1 - clustering on random train/test split

In [None]:
# three Cluster helpers over the same random-split training data; they differ
# only in the cluster count taken from file_args['num_clusters']
clusters_rand_sparse = Cluster(conn, file_args, train_data_random, 'random', file_args['num_clusters']['sparse'])
clusters_rand_nonsparse = Cluster(conn, file_args, train_data_random, 'random', file_args['num_clusters']['nonsparse'])
clusters_rand_sparse_long = Cluster(conn, file_args, train_data_random, 'random', file_args['num_clusters']['sparse_long'])

### 5.1.1 - sparse clustering - train

In [None]:
# fit the sparse cluster model; returns the model and a frame of cluster assignments
sparse_model, sparse_clusters = clusters_rand_sparse.train_clustermodel_sparse()

In [None]:
# left-join the sparse cluster assignments onto the training rows by segment_id
train_data_random = train_data_random.merge(sparse_clusters, how='left', on='segment_id')

### 5.1.2 - nonsparse clustering - train

In [None]:
# fit the nonsparse cluster model; returns the model and a frame of cluster assignments
nonsparse_model, nonsparse_clusters = clusters_rand_nonsparse.train_clustermodel_nonsparse()

In [None]:
# nonsparse assignments are joined per (date, time, segment_id) rather than per segment
train_data_random = train_data_random.merge(nonsparse_clusters, how='left', on=['date','time','segment_id'])

### 5.1.3 - sparse long clustering - train

In [None]:
# fit the sparse-long cluster model; returns the model and a frame of cluster assignments
sparse_long_model, sparse_long_clusters = clusters_rand_sparse_long.train_clustermodel_sparse_long()

In [None]:
# left-join the sparse-long cluster assignments onto the training rows by segment_id
train_data_random = train_data_random.merge(sparse_long_clusters, how='left', on='segment_id')

### 5.1.4 - sparse clustering - test

In [None]:
# model filename is built from the configured algorithm name and today's date
# NOTE(review): presumably this must match the name the model was saved under during training — confirm in Cluster
today = datetime.date.today()
filename = file_args['cluster_algorithm'] + '_sparse_cluster_model_' + today.strftime('%Y%m%d') + '.pkl'
test_clusters_sparse = clusters_rand_sparse.test_assign_clusters_sparse(test_data_random, filename)

In [None]:
# left-join the sparse cluster assignments onto the test rows by segment_id
test_data_random = test_data_random.merge(test_clusters_sparse, how='left', on='segment_id')

### 5.1.5 - sparse long clustering - test

In [None]:
# same filename scheme as the sparse model, for the sparse-long model
today = datetime.date.today()
filename = file_args['cluster_algorithm'] + '_sparse_long_cluster_model_' + today.strftime('%Y%m%d') + '.pkl'
test_clusters_sparse_long = clusters_rand_sparse_long.test_assign_clusters_sparse_long(test_data_random, filename)

In [None]:
# left-join the sparse-long cluster assignments onto the test rows by segment_id
test_data_random = test_data_random.merge(test_clusters_sparse_long, how='left', on='segment_id')

### 5.1.6 - nonsparse clustering - test

In [None]:
# reuses `today` set in an earlier cell; same filename scheme, for the nonsparse model
filename = file_args['cluster_algorithm'] + '_nonsparse_cluster_model_' + today.strftime('%Y%m%d') + '.pkl'
test_clusters_nonsparse = clusters_rand_nonsparse.test_assign_clusters_nonsparse(test_data_random, filename)

In [None]:
# nonsparse assignments are joined per (date, time, segment_id)
test_data_random = test_data_random.merge(test_clusters_nonsparse, how='left', on=['date','time','segment_id'])

## 5.2 - clustering on train/test split by date

In [None]:
# three Cluster helpers over the same date-split training data; they differ
# only in the cluster count taken from file_args['num_clusters']
clusters_date_sparse = Cluster(conn, file_args, train_data_date, 'date', file_args['num_clusters']['sparse'])
clusters_date_nonsparse = Cluster(conn, file_args, train_data_date, 'date', file_args['num_clusters']['nonsparse'])
clusters_date_sparse_long = Cluster(conn, file_args, train_data_date, 'date', file_args['num_clusters']['sparse_long'])

### 5.2.1 - sparse clustering - train

In [None]:
# fit the sparse model on the date split and join its assignments by segment_id
# (overwrites sparse_model / sparse_clusters from the random split)
sparse_model, sparse_clusters = clusters_date_sparse.train_clustermodel_sparse()
train_data_date = train_data_date.merge(sparse_clusters, how='left', on='segment_id')

### 5.2.2 - sparse clustering - test

In [None]:
# reuses `today` set in the random-split cells; the filename is the same as in the random-split sparse test
filename = file_args['cluster_algorithm'] + '_sparse_cluster_model_' + today.strftime('%Y%m%d') + '.pkl'
test_clusters_sparse = clusters_date_sparse.test_assign_clusters_sparse(test_data_date, filename)
test_data_date = test_data_date.merge(test_clusters_sparse, how='left', on='segment_id')

### 5.2.3 - nonsparse clustering - train

In [None]:
# fit the nonsparse model on the date split and join per (date, time, segment_id)
nonsparse_model, nonsparse_clusters = clusters_date_nonsparse.train_clustermodel_nonsparse()
train_data_date = train_data_date.merge(nonsparse_clusters, how='left', on=['date','time','segment_id'])

### 5.2.4 - nonsparse clustering - test

In [None]:
# assign nonsparse clusters to the date-split test rows and join per (date, time, segment_id)
filename = file_args['cluster_algorithm'] + '_nonsparse_cluster_model_' + today.strftime('%Y%m%d') + '.pkl'
test_clusters_nonsparse = clusters_date_nonsparse.test_assign_clusters_nonsparse(test_data_date, filename)
test_data_date = test_data_date.merge(test_clusters_nonsparse, how='left', on=['date','time','segment_id'])

### 5.2.5 - sparse long clustering - train

In [None]:
# fit the sparse-long model on the date split and join its assignments by segment_id
sparse_long_model, sparse_long_clusters = clusters_date_sparse_long.train_clustermodel_sparse_long()
train_data_date = train_data_date.merge(sparse_long_clusters, how='left', on='segment_id')

### 5.2.6 - sparse long clustering - test

In [None]:
# assign sparse-long clusters to the date-split test rows and join by segment_id
filename = file_args['cluster_algorithm'] + '_sparse_long_cluster_model_' + today.strftime('%Y%m%d') + '.pkl'
test_clusters_sparse_long = clusters_date_sparse_long.test_assign_clusters_sparse_long(test_data_date, filename)
test_data_date = test_data_date.merge(test_clusters_sparse_long, how='left', on='segment_id')

In [None]:
# Suffix the cluster columns of the random split with '_random' so they stay
# distinguishable once both splits are combined into one frame.
random_cluster_names = {
    'cluster_sparse': 'cluster_sparse_random',
    'cluster_sparse_long': 'cluster_sparse_long_random',
    'cluster_nonsparse': 'cluster_nonsparse_random',
}
for frame in (train_data_random, test_data_random):
    frame.rename(columns=random_cluster_names, inplace=True)

In [None]:
# Suffix the cluster columns of the date split with '_date' so they stay
# distinguishable once both splits are combined into one frame.
date_cluster_names = {
    'cluster_sparse': 'cluster_sparse_date',
    'cluster_sparse_long': 'cluster_sparse_long_date',
    'cluster_nonsparse': 'cluster_nonsparse_date',
}
for frame in (train_data_date, test_data_date):
    frame.rename(columns=date_cluster_names, inplace=True)

In [None]:
# report elapsed time for step 5
print('--- clustering took {0:.1f} seconds ---'.format(time.time() - clustering_start))

# 6 - concatenate train and test data

In [None]:
# add train_test columns
train_data_random['train_test_random'] = 'train'
test_data_random['train_test_random'] = 'test'

train_data_date['train_test_date'] = 'train'
test_data_date['train_test_date'] = 'test'

In [None]:
# stack train and test rows back into one frame per split type
train_test_date = pd.concat([train_data_date, test_data_date])
train_test_random = pd.concat([train_data_random, test_data_random])

In [None]:
# Start from the date-split frame and left-join the random-split cluster
# columns and train/test flag, matching rows on (date_idx, time_idx, segment_id).
train_test = train_test_date.merge(train_test_random[['date_idx','time_idx','segment_id',
                                                      'cluster_sparse_random','cluster_nonsparse_random',
                                                      'cluster_sparse_long_random',
                                                      'train_test_random']],
                                  how='left', on=['date_idx','time_idx','segment_id'])

# 7 - write processed data to database

In [None]:
# timer for the database write
writing_to_db_start = time.time()

In [None]:
# write processed dataframe to database
sqlalchemy_conn_str = open(sqlalchemy_conn_str_file, 'r').read()
engine = create_engine(sqlalchemy_conn_str, paramstyle='format')
train_test.to_sql(name='modeling_data', con=engine, 
                  if_exists='replace', index=False, chunksize=1000)

In [None]:
# report elapsed time for the database write and for the whole notebook run
print('--- writing to db took {0:.1f} seconds ---'.format(time.time() - writing_to_db_start))
print('--- full data processing took {0:.1f} seconds ---'.format(time.time() - start_time))