In [1]:
import warnings
# NOTE(review): with no category argument this silences every warning for the
# rest of the session, including pandas warnings raised by later cells.
# Consider narrowing the filter while developing.
warnings.filterwarnings('ignore') # Filter out warnings

import os
import numpy as np
import pandas as pd
from PIL import Image
import tensorflow as tf
import matplotlib.pyplot as plt

import cv2

# for copying data
import copy

In [2]:
# Read the accident counts (one row per location, count in 'acci_counts')
accidents = pd.read_csv('accidents_count.csv')

# Split into zero and non-zero parts.
# Both parts receive a new 'label' column in the next cell, so take explicit
# copies: the later assignment then targets an independent frame instead of a
# filtered selection of `accidents`.
acci_zero = accidents[accidents['acci_counts'] == 0].copy()
acci_nonzero = accidents[accidents['acci_counts'] != 0].copy()

In [3]:
# Label non-zero data by quartile of the accident count, from 1 to 4:
# 1 = relatively safe, 2 = medium, 3 = relatively dangerous, 4 = dangerous.
# `assign` builds a new frame, so nothing is written into a filtered slice and
# re-running this cell gives the same result.
# NOTE(review): these labels are the strings "1".."4" while the zero label
# below is the integer 0, so the combined 'label' column holds mixed types --
# confirm that downstream code expects this before normalising it.
acci_nonzero = acci_nonzero.assign(
    label=pd.qcut(acci_nonzero['acci_counts'], 4, labels=["1", "2", "3", "4"])
)

# Label zero data by number zero (0 = safe)
acci_zero = acci_zero.assign(label=0)

# Randomly down-sample the zero-accident rows to 700.
# The seed is fixed so that the sampled subset (and everything derived from it)
# is identical on every Restart & Run All.
safety = acci_zero.sample(n=700, random_state=42)

In [4]:
# Stack the labelled non-zero rows on top of the sampled zero rows (safety),
# then restore the original row order by sorting on the index.
combine = pd.concat([acci_nonzero, safety]).sort_index()

In [5]:
# Preview the first rows of the combined frame (location, acci_counts, label)
combine.head()

Unnamed: 0,location,acci_counts,label
9,"(40.7215815, -73.9977854)",48,3
11,"(40.7214509, -73.9973979)",172,4
12,"(40.7208958, -73.9957503)",98,4
13,"(40.7211736, -73.9965747)",88,4
14,"(40.7206316, -73.9949663)",114,4


In [6]:
# The counts are now summarised by 'label', so remove the raw 'acci_counts'
# column from `combine` (modified in place).
combine.drop(columns=['acci_counts'], inplace=True)

In [7]:
# Preview again: only 'location' and 'label' should remain after the drop
combine.head()

Unnamed: 0,location,label
9,"(40.7215815, -73.9977854)",3
11,"(40.7214509, -73.9973979)",4
12,"(40.7208958, -73.9957503)",4
13,"(40.7211736, -73.9965747)",4
14,"(40.7206316, -73.9949663)",4


In [8]:
def _parse_location(text):
    """Return (latitude, longitude) as floats parsed from a '(lat, lon)' string."""
    fields = text.lstrip('(').rstrip(')').split(',')
    return float(fields[0]), float(fields[1].lstrip())


# Create a new dataframe as an independent deep copy of combine
new = combine.copy(deep=True)

# Transform the former location data to latitude and longitude columns.
# Each string is parsed once and whole columns are assigned by name, so the
# columns are created directly as floats (no int placeholder that is then
# overwritten cell by cell) and nothing depends on column positions.
coords = [_parse_location(text) for text in new['location']]
new['latitude'] = [lat for lat, _ in coords]
new['longitude'] = [lon for _, lon in coords]

# Transform the former location data to data without blank: '(lat,lon)'
new['location'] = ['(' + str(lat) + ',' + str(lon) + ')' for lat, lon in coords]

In [9]:
# Preview the rewritten 'location' strings next to the parsed coordinates
new.head()

Unnamed: 0,location,label,latitude,longitude
9,"(40.7215815,-73.9977854)",3,40.721581,-73.997785
11,"(40.7214509,-73.9973979)",4,40.721451,-73.997398
12,"(40.7208958,-73.9957503)",4,40.720896,-73.99575
13,"(40.7211736,-73.9965747)",4,40.721174,-73.996575
14,"(40.7206316,-73.9949663)",4,40.720632,-73.994966


In [10]:
# Remove the helper coordinate columns from `new` in a single in-place call;
# only 'location' and 'label' are kept.
new.drop(columns=['latitude', 'longitude'], inplace=True)

In [11]:
# Load the intersection coordinates
intersections = pd.read_csv('intersection.csv')

# Round both coordinate columns to 7 decimal places
for coord_col in ('latitude', 'longitude'):
    intersections[coord_col] = intersections[coord_col].round(7)

# Placeholder column; the real location strings are filled in by the next cell
intersections['location'] = 0

In [12]:
# Create a location column in the same '(lat,lon)' style as "combine".
# The whole column is replaced in one assignment, addressed by name: this
# avoids writing strings element by element into the integer placeholder
# column and does not depend on the positional order of the columns.
intersections['location'] = [
    '(' + str(lat) + ',' + str(lon) + ')'
    for lat, lon in zip(intersections['latitude'], intersections['longitude'])
]

In [13]:
# Keep each row's index as a regular 'index' column, for the convenience of
# linking rows back to image files later on.
intersections = intersections.assign(index=intersections.index)

In [14]:
# Preview the intersections with their 'location' string and 'index' column
intersections.head()

Unnamed: 0,latitude,longitude,location,index
0,40.690968,-73.98922,"(40.6909681,-73.9892198)",0
1,40.691787,-73.989009,"(40.6917872,-73.9890093)",1
2,40.806833,-73.934215,"(40.8068327,-73.9342151)",2
3,40.806169,-73.934859,"(40.8061691,-73.9348588)",3
4,40.805752,-73.934709,"(40.8057523,-73.9347088)",4


In [15]:
# Build `position` from an independent deep copy of `intersections`, without
# the latitude and longitude columns ('location' and 'index' are kept).
position = intersections.copy(deep=True).drop(columns=['latitude', 'longitude'])

In [16]:
# Inner-join the intersections with the labelled accident data on the
# normalised 'location' string; only locations present in both are kept.
position = pd.merge(position, new, how='inner', on='location')
position.head()

Unnamed: 0,location,index,label
0,"(40.7215815,-73.9977854)",9,3
1,"(40.7214509,-73.9973979)",11,4
2,"(40.7208958,-73.9957503)",12,4
3,"(40.7211736,-73.9965747)",13,4
4,"(40.7206316,-73.9949663)",14,4


In [17]:
# Make sure the shape is right: one row per merged location, and three columns
# (location, index, label)
position.shape

(3493, 3)

In [18]:
# Append an 'image' column filled with 0 as a placeholder for the map data
position = position.assign(image=0)
position.head(2)

Unnamed: 0,location,index,label,image
0,"(40.7215815,-73.9977854)",9,3,0
1,"(40.7214509,-73.9973979)",11,4,0


In [19]:
# Read images and plug image data into 'image' column

coord_img_dict = {}
unreadable = []

# The 'index' column holds the number that appears in each map's file name;
# it is addressed by name rather than by column position.
for row, idx in enumerate(position['index']):

    # Path of the map image for this row
    img_ = f'intersection_img_size20/testimage_big{idx}.png'

    # cv2.imread returns None instead of raising when a file is missing or
    # cannot be decoded, so remember the failures and report them below.
    image = cv2.imread(img_)
    if image is None:
        unreadable.append(img_)

    # Read map data into a dictionary keyed by row position
    coord_img_dict[row] = image

# Reported with print because warnings are globally ignored in this notebook.
if unreadable:
    print(f'{len(unreadable)} image(s) could not be read, e.g. {unreadable[0]}; '
          "their 'image' entries are None")

# Plug the dictionary into the position dataframe.
# `.values` makes the assignment positional, so the images land on the right
# rows whatever the index labels of `position` are.
maps = pd.DataFrame(list(coord_img_dict.items()), columns=['sequence', 'image'])
position['image'] = maps['image'].values

In [20]:
# Preview the frame now that the 'image' column has been filled
position.head()

Unnamed: 0,location,index,label,image
0,"(40.7215815,-73.9977854)",9,3,"[[[65, 39, 35], [65, 39, 35], [73, 47, 43], [7..."
1,"(40.7214509,-73.9973979)",11,4,"[[[65, 40, 36], [69, 36, 36], [69, 36, 36], [7..."
2,"(40.7208958,-73.9957503)",12,4,"[[[65, 51, 53], [66, 52, 79], [66, 52, 79], [7..."
3,"(40.7211736,-73.9965747)",13,4,"[[[104, 92, 100], [104, 88, 96], [100, 88, 96]..."
4,"(40.7206316,-73.9949663)",14,4,"[[[144, 136, 140], [138, 132, 137], [130, 122,..."


In [21]:
# Check the Python type of the image stored under index label 0
type(position['image'][0])

numpy.ndarray

In [22]:
# Check the array dimensions of the image stored under index label 0
position['image'][0].shape

(640, 640, 3)

In [23]:
import pickle
import os.path

file_path = "position_df.pkl"
# Upper bound on the size of a single write/read call.
# NOTE(review): presumably a workaround for platforms that fail on single
# writes of 2 GiB or more -- confirm it is still needed.
max_bytes = 2**31 - 1

## write
# Serialise the whole frame in memory, then write it out in chunks of at most
# max_bytes. Slicing a memoryview does not copy, unlike slicing the bytes
# object itself, which duplicated up to 2 GiB per chunk.
bytes_out = pickle.dumps(position)
out_view = memoryview(bytes_out)
with open(file_path, 'wb') as f_out:
    for start in range(0, len(out_view), max_bytes):
        f_out.write(out_view[start:start + max_bytes])

In [24]:
## read
# Read the pickle back in pieces of at most max_bytes, then deserialise the
# reassembled buffer. Only load pickle files you produced yourself.
input_size = os.path.getsize(file_path)
chunk_count = len(range(0, input_size, max_bytes))

bytes_in = bytearray()
with open(file_path, 'rb') as f_in:
    for _ in range(chunk_count):
        bytes_in.extend(f_in.read(max_bytes))

data2 = pickle.loads(bytes_in)