In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline  
import mpu

In [2]:
def find_routes_with_ten_readings(df, route_numbers, min_num_readings=10, verbose=False):
    '''
    Keep only the rows of routes that have at least `min_num_readings` readings.

    Parameters:
        df: dataframe with a 'route_number' column; one row per reading.
        route_numbers: iterable of route numbers to check. The result keeps
            the qualifying routes in this order.
        min_num_readings: minimum number of rows a route needs to be kept.
        verbose: when True, print one line for every route that is rejected.

    Returns:
        A dataframe with the rows of every qualifying route, concatenated route
        by route (original index labels are kept). If no route qualifies, an
        empty dataframe with the same columns as `df` is returned.
    '''
    routes = []

    for number in route_numbers:
        route_df = df[df['route_number'] == number]

        if len(route_df) >= min_num_readings:
            routes.append(route_df)
        elif verbose:
            print('Route: ', number, ' only has ', len(route_df), ' readings!')

    print('Found', len(routes), 'routes that have', min_num_readings, 'or more readings')

    if not routes:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the same columns so callers can keep working with it.
        return df.iloc[0:0].copy()

    return pd.concat(routes)


def load_csv_as_df(file_name, sub_directories, column_numbers=None, column_names=None):
    '''
    Load a route csv as a pandas dataframe and keep only well-populated routes.

    Parameters:
        file_name: name of the csv file.
        sub_directories: directory of the file relative to the current working
            directory (e.g. '/' or '/data/'). Leading and trailing separators
            are optional.
        column_numbers: optional columns to read, passed to read_csv as usecols.
        column_names: optional replacement column names, assigned after loading.

    Returns:
        The rows of every route that has at least 10 readings. The loaded data
        (after any renaming) must contain a 'route_number' column.
    '''
    # Strip separators before joining: os.path.join would otherwise treat a
    # component starting with a separator as absolute and discard the cwd.
    separators = '/' + os.sep + (os.altsep or '')
    full_path = os.path.join(
        os.getcwd(),
        sub_directories.strip(separators),
        file_name.lstrip(separators),
    )

    # usecols=None is read_csv's default, so a single call covers both cases.
    df = pd.read_csv(full_path, usecols=column_numbers)

    if column_names is not None:
        df.columns = column_names

    route_ids = df['route_number'].unique()
    return find_routes_with_ten_readings(df, route_ids, min_num_readings=10)

In [3]:
# Load the readings for each travel direction from the current working directory.
# load_csv_as_df only returns routes with at least 10 readings and prints how many it found.
north_to_west_routes_df = load_csv_as_df('north-to-west-routes-with-cells.csv', '/')
west_to_north_routes_df = load_csv_as_df('west-to-north-routes-with-cells.csv', '/')

Found 197 routes that have 10 or more readings
Found 283 routes that have 10 or more readings


In [4]:
def lookup(s):
    """
    Convert a Series of date values to datetimes, parsing each distinct value once.

    Large inputs tend to repeat the same raw values many times, so every
    unique value is parsed a single time and the Series is then translated
    through the resulting raw-value -> datetime table.
    """
    parsed_by_raw = {}
    for raw_value in s.unique():
        parsed_by_raw[raw_value] = pd.to_datetime(raw_value)

    return s.map(parsed_by_raw)


def calculate_route_durations(df):
    """
    Compute each route's duration from its flagged start row to its flagged end row.

    The 'time' column of `df` is overwritten in place with parsed datetimes.
    For every route number, the first row with route_start == True and the
    first row with route_end == True supply the start and end times. Routes
    missing either row are reported with print and left out of the result.
    A route whose end is earlier than its start is reported but still
    recorded, so its duration is negative.

    Returns a dataframe with columns 'route_number' and 'duration_in_seconds'.
    """
    df['time'] = lookup(df['time'])
    duration_records = []

    for route_id in df['route_number'].unique():
        readings = df[df['route_number'] == route_id]
        starts = readings[readings['route_start'] == True]
        ends = readings[readings['route_end'] == True]

        # Report every missing endpoint before skipping the route.
        missing_start = len(starts) == 0
        missing_end = len(ends) == 0
        if missing_start:
            print('No start for route: ', route_id)
        if missing_end:
            print('No end for route: ', route_id)
        if missing_start or missing_end:
            continue

        started_at = starts['time'].iloc[0]
        ended_at = ends['time'].iloc[0]
        if ended_at < started_at:
            print('End time earlier than start time for route number ', route_id)
            print()

        elapsed = ended_at - started_at
        duration_records.append((route_id, elapsed.total_seconds()))

    return pd.DataFrame(duration_records, columns=['route_number', 'duration_in_seconds'])

In [5]:
# Per-route durations (seconds) for the north-to-west direction.
# Note: calculate_route_durations overwrites the 'time' column of the frame passed in.
north_to_west_duration_df = calculate_route_durations(north_to_west_routes_df)
# Last expression of the cell, so the summary statistics are displayed.
north_to_west_duration_df.describe()

Unnamed: 0,route_number,duration_in_seconds
count,197.0,197.0
mean,591202.436548,1845.994924
std,52771.982781,677.740954
min,501872.0,708.0
25%,542602.0,1320.0
50%,596777.0,1695.0
75%,632147.0,2207.0
max,682586.0,4369.0


In [7]:
# Per-route durations (seconds) for the west-to-north direction.
# Note: calculate_route_durations overwrites the 'time' column of the frame passed in.
west_to_north_duration_df = calculate_route_durations(west_to_north_routes_df)
# Last expression of the cell, so the summary statistics are displayed.
west_to_north_duration_df.describe()

Unnamed: 0,route_number,duration_in_seconds
count,283.0,283.0
mean,592998.600707,1603.75265
std,52955.039419,484.280056
min,500264.0,809.0
25%,551146.0,1288.5
50%,594651.0,1491.0
75%,639403.0,1836.0
max,682626.0,4215.0


In [8]:
def distance_between_gps(gps_one, gps_two):
    """
    Return the haversine distance between two GPS points as a non-negative number.

    Each argument is indexable; element 0 and element 1 are forwarded to
    mpu.haversine_distance as (lat, lon). The result is treated as kilometres.
    A negative result is reported with print and returned with its sign flipped.
    """
    origin = (gps_one[0], gps_one[1])
    destination = (gps_two[0], gps_two[1])
    km_distance = mpu.haversine_distance(origin, destination)

    if km_distance < 0:
        print('got negative distance that\'s weak')
        return km_distance * -1

    return km_distance

In [9]:
def calculate_route_distances(df):
    """
    Compute the travelled distance of every route as the sum of the distances
    between consecutive readings, taken in time order.

    The 'time' column of `df` is overwritten in place with parsed datetimes.
    `df` must also provide 'route_number', 'latitude' and 'longitude' columns.

    Returns a dataframe with columns 'route_number' and 'distance_in_km'.
    A route with fewer than two readings gets a distance of 0.0.
    """
    route_distances = {}
    df['time'] = lookup(df['time'])
    route_ids = df['route_number'].unique()

    for route_id in route_ids:
        # sort_values and reset_index return new frames, so the result has to
        # be assigned; otherwise the route stays in file order and keeps the
        # index labels of the full frame. A stable sort keeps the original
        # order of readings that share a timestamp.
        route_df = (
            df[df['route_number'] == route_id]
            .sort_values('time', kind='mergesort')
            .reset_index(drop=True)
        )

        distance_sum = 0.0
        previous_gps = None

        # Walk the readings in order and add up each hop from the previous
        # point. This avoids label-based lookups of the "previous" row.
        for latitude, longitude in zip(route_df['latitude'], route_df['longitude']):
            current_gps = (latitude, longitude)
            if previous_gps is not None:
                distance_sum += distance_between_gps(previous_gps, current_gps)
            previous_gps = current_gps

        route_distances[route_id] = distance_sum

    distance_df = pd.DataFrame(list(route_distances.items()), columns=['route_number', 'distance_in_km'])
    return distance_df

In [10]:
# Per-route travelled distance (km) for the north-to-west direction.
north_to_west_distance_df = calculate_route_distances(north_to_west_routes_df)
# NOTE(review): the recorded output for this cell shows a max of about 5884.7 km
# against a 75th percentile of about 17.6 km - check for an outlier route or bad GPS fix.
north_to_west_distance_df.describe()


Unnamed: 0,route_number,distance_in_km
count,197.0,197.0
mean,591202.436548,45.744501
std,52771.982781,418.144966
min,501872.0,9.99767
25%,542602.0,13.538408
50%,596777.0,16.027918
75%,632147.0,17.616633
max,682586.0,5884.723507


In [11]:
# Per-route travelled distance (km) for the west-to-north direction.
west_to_north_distance_df = calculate_route_distances(west_to_north_routes_df)
# Last expression of the cell, so the summary statistics are displayed.
west_to_north_distance_df.describe()


Unnamed: 0,route_number,distance_in_km
count,283.0,283.0
mean,592998.600707,15.766159
std,52955.039419,3.139819
min,500264.0,9.209701
25%,551146.0,13.349983
50%,594651.0,15.948325
75%,639403.0,17.084236
max,682626.0,29.37661
