In [None]:
# from pylab import *
import os
import pandas
import tarfile

# Root directory of the data set on each machine or mount it has been copied to.
# The keys are the names accepted by the `loc` and `boxesloc` arguments of getpaths().
locations = dict(
    gfs='/l/cnets/datasets/Telecom_BDC_2015',
    diskstation='/media/diskstation/Datasets/Telecom Big Data Challenge 2015',
    frosty='/home/giovanni/data/tbdc2015',
    hdd='/media/giovanni/Multimedia/Datasets/Telecom Big Data Challenge 2015',
    repo='/home/giovanni/repos/tbdc15',
)


def getpaths(city, loc='gfs', boxesloc=None):
    """
    Build the paths of all the files used for one city.

    Parameters
    ----------
    city : str
        City name. It is used verbatim in the trips archive name; its first
        two letters, upper-cased, are used in the accidents file name.
    loc : str
        Key of `locations` selecting the root directory of the data.
    boxesloc : str or None
        Key of `locations` selecting the directory holding `city_boxes.csv`.
        When None the boxes file is looked up under the root given by `loc`.

    Returns
    -------
    dict
        Paths under the keys 'trips', 'accidents', 'boxes' and 'store'.

    Raises
    ------
    KeyError
        If `loc` or `boxesloc` is not a key of `locations`.
    """
    data_root = locations[loc]

    result = {}
    result['trips'] = '{root}/infoblu/{city}.tar.gz'.format(root=data_root, city=city)

    city_prefix = city[:2].upper()
    result['accidents'] = '{root}/unipol/BDC2015_UnipolsaiClaims2014_{city}.csv'.format(
        root=data_root, city=city_prefix)

    # the boxes file may live under a different root than the rest of the data
    if boxesloc is None:
        boxes_root = data_root
    else:
        boxes_root = locations[boxesloc]
    result['boxes'] = '{root}/city_boxes.csv'.format(root=boxes_root)

    result['store'] = '{root}/trip_accidents_store.hdf'.format(root=data_root)
    return result


def getbox(path, city):
    """
    Look up the bounding box of a city in the city boxes CSV file.

    Parameters
    ----------
    path : str
        Path of a CSV file with a `city` column, which is used as the index.
        The other columns describe the box; the readers in this notebook use
        the 'lat_min' and 'lon_min' entries.
    city : str
        City name. Only its first letter, lower-cased, is used as the lookup
        key in the `city` column.

    Returns
    -------
    dict
        Maps each non-index column name to the value found in the city's row.

    Raises
    ------
    KeyError
        If no row of the file is labelled with the city code.
    """
    city_code = city[0].lower()
    df_box = pandas.read_csv(path, index_col='city')
    # .loc is the label-based indexer; the older .ix indexer used here before
    # has been deprecated and removed from pandas.
    return df_box.loc[city_code].to_dict()


def read_trips(path, box, scale=1000.0, break_at=None):
    """
    Count trip records per grid cell, weekday and hour from a trips archive.

    Every regular file of the gzip-compressed TAR archive is read as a
    semicolon-separated table without header, whose columns are taken to be
    trip, timestamp, lat, lon, type and speed. According to the original
    notes each file holds the data of a different day.

    Parameters
    ----------
    path : str
        Path of the `.tar.gz` archive.
    box : dict
        Bounding box of the city; only 'lat_min' and 'lon_min' are used, as
        the origin of the grid.
    scale : float
        Multiplier applied to the latitude/longitude offsets before rounding
        them to the cell coordinates `i` and `j`.
    break_at : int or None
        If an integer, stop after reading that many files of the archive.
        None reads the whole archive.

    Returns
    -------
    pandas.DataFrame
        Indexed by (i, j, weekday, hour), with columns 'trips' (number of
        records in the cell) and 'trips_start' (number of trips whose first
        record falls in the cell; NaN where there is none). The counts of the
        individual files are concatenated, not summed, so the same index value
        appears once per file in which it occurs.
    """
    # Defined locally: the notebook has no module-level `index_columns`, so
    # relying on a global raises NameError on a fresh kernel.
    index_columns = ['i', 'j', 'weekday', 'hour']

    # Per-file results are collected and concatenated once at the end, instead
    # of growing a frame with DataFrame.append (quadratic, removed in pandas 2).
    daily_counts = []

    # 'r:gz' opens the archive for reading with on-the-fly gzip decompression
    with tarfile.open(path, mode='r:gz') as tf:
        for member in tf:
            if break_at is not None and len(daily_counts) == break_at:
                break

            # extractfile() returns None for directories and other
            # non-regular members: they hold no data, skip them.
            if not member.isfile():
                continue

            print(member.name)
            f = tf.extractfile(member)
            try:
                # do not load the "type" and "speed" columns, since we don't
                # need them. This saves memory.
                df = pandas.read_csv(f,
                                     names=['trip', 'timestamp', 'lat', 'lon', 'type', 'speed'],
                                     usecols=['trip', 'timestamp', 'lat', 'lon'],
                                     sep=';',
                                     parse_dates=['timestamp'])
            finally:
                f.close()

            # compute the cell, weekday (Monday is 0), and hour
            df['i'] = ((df['lat'] - box['lat_min']) * scale).round()
            df['j'] = ((df['lon'] - box['lon_min']) * scale).round()
            df['weekday'] = df['timestamp'].dt.weekday
            df['hour'] = df['timestamp'].dt.hour

            # number of records in each cell, weekday, hour
            s1 = df.groupby(index_columns).size()

            # same count, but only considering the first record of each trip
            df_ff = df.groupby('trip', as_index=False).head(1)
            s2 = df_ff.groupby(index_columns).size()

            # the two series are aligned on the union of their indexes
            daily_counts.append(pandas.DataFrame({'trips': s1, 'trips_start': s2}))

    if not daily_counts:
        # nothing was read: return an empty frame with the expected layout
        empty = pandas.DataFrame(columns=index_columns + ['trips', 'trips_start'])
        return empty.set_index(index_columns)

    return pandas.concat(daily_counts)


def read_accidents(path, box, scale=1000.0):
    """
    Count accidents per grid cell, weekday and hour from the claims CSV file.

    Parameters
    ----------
    path : str
        Path of a CSV file with (at least) the columns 'latitude',
        'longitude', 'day_type' and 'time_range'.
    box : dict
        Bounding box of the city; only 'lat_min' and 'lon_min' are used, as
        the origin of the grid.
    scale : float
        Multiplier applied to the latitude/longitude offsets before rounding
        them to the cell coordinates `i` and `j`.

    Returns
    -------
    pandas.DataFrame
        Indexed by (i, j, weekday, hour) with a single column 'accidents'
        holding the number of rows of the file in each group.
    """
    # Defined locally: the notebook has no module-level `index_columns`, so
    # relying on a global raises NameError on a fresh kernel.
    index_columns = ['i', 'j', 'weekday', 'hour']

    # NOTE(review): 'day_type' and 'time_range' are used as-is as weekday and
    # hour; presumably they follow the same encoding as the trips data
    # (weekday 0 = Monday, hour 0-23) - confirm against the data description.
    df = pandas.read_csv(path).rename(columns={'day_type': 'weekday', 'time_range': 'hour'})

    df['i'] = ((df['latitude'] - box['lat_min']) * scale).round()
    df['j'] = ((df['longitude'] - box['lon_min']) * scale).round()

    counts = df.groupby(index_columns).size()
    return pandas.DataFrame({'accidents': counts})


def make_city_frame(city,
                    loc='frosty',
                    boxesloc='frosty',
                    scale=1000.0,
                    break_at=None):
    """
    Read the trips and accidents of a city and store the joined table as HDF.

    Parameters
    ----------
    city : str
        City name; also used as the key under which the table is stored.
    loc : str
        Key of `locations` selecting where the data are read from and where
        the HDF store is written.
    boxesloc : str
        Key of `locations` selecting where the city boxes file is read from.
    scale : float
        Grid scale passed to read_trips() and read_accidents().
    break_at : int or None
        Passed to read_trips() to limit how many files of the archive are read.

    Returns
    -------
    None
        The result is written to the 'store' path returned by getpaths(). The
        stored table has one row per (i, j, weekday, hour) with at least one
        trip, and the columns 'trips', 'trips_start' and 'accidents'.
    """
    # use the arguments of this call: the previous code read the notebook
    # global `location` and always looked for the boxes under 'frosty'
    paths = getpaths(city, loc=loc, boxesloc=boxesloc)
    index_columns = ['i', 'j', 'weekday', 'hour']
    box = getbox(paths['boxes'], city)

    print("Reading trips...")
    trips = read_trips(paths['trips'], box, scale=scale, break_at=break_at)

    print("Reading accidents...")
    accidents = read_accidents(paths['accidents'], box, scale=scale)

    print("Storing data...")
    # read_trips() returns one row per archive file for each index value.
    # Sum them first, otherwise the join below repeats the accident count of a
    # cell on every one of those rows and any later sum over-counts accidents.
    trips = trips.groupby(level=index_columns).sum()

    # left join: cells with accidents but no trips are not kept
    joined_df = trips.join(accidents).fillna(0).reset_index()
    joined_df.to_hdf(paths['store'], key=city)
    print("Data saved to HDF.")

# Create dataset

## Select cities, data locations and grid scale

In [None]:
# grid scale passed to make_city_frame()
scale = 1000.0

# keys of `locations`: where the data are read from, and where the city boxes file is
location = 'diskstation'
boxes_location = 'frosty'

# cities to process
cities = [
    'bari',
    'milano',
    'napoli',
    'palermo',
    'roma',
    'torino',
    'venezia',
]

## Read data

In [None]:
# Build and store the table of every selected city.
# NOTE(review): break_at=1 makes read_trips() stop after the first file of each
# archive - this looks like a quick-test setting; use None to read everything.
for city in cities:
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print("City: {}".format(city))
    make_city_frame(city, loc=location, scale=scale, boxesloc=boxes_location, break_at=1)

# Plot the data

## Read the data from HDF 

In [None]:
# `store_path` was never defined in this notebook: derive it from the same
# location settings that were used to write the store.
# NOTE(review): `city` is the value left behind by the "Read data" loop, i.e.
# the last entry of `cities` - set it explicitly here to plot another city.
store_path = getpaths(city, loc=location, boxesloc=boxes_location)['store']
df = pandas.read_hdf(store_path, key=city)

# optional filter, disabled: weekday < 5 and hour >= 20 or <= 6
# df = df.query('(weekday < 5) & ((hour >= 20) | (hour <= 6))')

# total per grid cell, keeping only the cells with at least one accident
df = df.groupby(['i', 'j']).sum().filter(['trips', 'trips_start', 'accidents'])
df = df.query('accidents > 0')

## Scatter plots: trip starts vs trips, and trips vs accidents in each cell with average trend

In [None]:
# Work on the Axes returned by DataFrame.plot: the bare xscale()/xlim()/...
# names come from `from pylab import *`, which only runs in a later cell, so
# they are undefined when the notebook is run top to bottom.
ax = df.plot(x='trips_start', y='trips', kind='scatter', alpha=.2)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(1, 1e4)
ax.set_xlabel('Trip starts')
ax.set_ylabel('Trips');

In [None]:
%matplotlib inline
from pylab import *

# scatter plot
df.plot(x='trips', y='accidents', kind='scatter', marker='x', alpha=.2, color='k')

# trend line
bins = numpy.logspace(numpy.log10(df['trips'].min()), numpy.log10(df['trips'].max()), 20)
df.groupby(numpy.digitize(df['trips'], bins=bins)).mean().plot(x='trips', y='accidents',
                                                               color='r', linestyle='solid',
                                                               marker='o',
                                                               ax=gca(), alpha=.5, linewidth=2)
grid('off')
title(city)
xlabel('Traffic')
ylabel('Accidents')
#ylim(0, 30)
xscale('log')
#yscale('log')
tight_layout()
savefig('trips_accidents_scatter_{}.pdf'.format(city))
show()