In [20]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline  
import mpu

In [21]:
def find_routes_with_ten_readings(df, route_numbers, min_num_readings=10, verbose=False):
    routes = []
    
    for number in route_numbers:
        route_df = df[df['route_number'] == number]
        
        if len(route_df) >= min_num_readings:
            routes.append(route_df)
        elif verbose:
            print('Route: ', number, ' only has ', len(route_df), ' readings!')
    
    print('Found', len(routes), 'routes that have', min_num_readings, 'or more readings')
    
    return pd.concat(routes)


def load_csv_as_df(file_name, sub_directories, column_numbers=None, column_names=None):
    '''
    Load any csv as a pandas dataframe. Provide the filename, the subdirectories, and columns to read(if desired).
    '''
    base_path = os.getcwd()
    full_path = base_path + sub_directories + file_name

    if column_numbers is not None:
        df = pd.read_csv(full_path, usecols=column_numbers)
    else:
        df = pd.read_csv(full_path)

    if column_names is not None:
        df.columns = column_names
    
    route_ids = df['route_number'].unique()
    return find_routes_with_ten_readings(df, route_ids, min_num_readings=10)

In [22]:
north_to_west_routes_df = load_csv_as_df('north-to-west-routes-with-cells.csv', '/')
west_to_north_routes_df = load_csv_as_df('west-to-north-routes-with-cells.csv', '/')

Found 196 routes that have 10 or more readings
Found 283 routes that have 10 or more readings


In [23]:
def lookup(s):
    """
    This is an extremely fast approach to datetime parsing.
    For large data, the same dates are often repeated. Rather than
    re-parse these, we store all unique dates, parse them, and
    use a lookup to convert all dates.
    """
    dates = {date: pd.to_datetime(date) for date in s.unique()}
    return s.map(dates)


def calculate_route_durations(df):
    route_durations = {}
    df['time'] = lookup(df['time'])
    route_ids = df['route_number'].unique()

    for route_id in route_ids:
        route_df = df[df['route_number'] == route_id]

        start_row = route_df[route_df['route_start'] == True]
        end_row = route_df[route_df['route_end'] == True]

        has_start_and_end = True
        if len(start_row) == 0:
            print('No start for route: ', route_id)
            has_start_and_end = False

        if len(end_row) == 0:
            print('No end for route: ', route_id)
            has_start_and_end = False

        if has_start_and_end:
            start_time = start_row['time'].iloc[0]
            end_time = end_row['time'].iloc[0]

            if end_time < start_time:
                print('End time earlier than start time for route number ', route_id)
                print()

            route_duration = end_time - start_time
            # print('route_duration ', route_duration)

            duration_in_seconds = route_duration.total_seconds()

            # print('Route ', route_id, ' duration in seconds ', duration_in_seconds)

            route_durations[route_id] = duration_in_seconds

    duration_df = pd.DataFrame(list(route_durations.items()), columns=['route_number', 'duration_in_seconds'])
    return duration_df

In [24]:
north_to_west_duration_df = calculate_route_durations(north_to_west_routes_df)
north_to_west_duration_df.describe()

Unnamed: 0,route_number,duration_in_seconds
count,196.0,196.0
mean,591157.846939,1848.209184
std,52903.40181,678.761755
min,501872.0,708.0
25%,542577.5,1317.25
50%,596413.0,1706.0
75%,632940.0,2208.0
max,682586.0,4369.0


In [25]:
west_to_north_duration_df = calculate_route_durations(west_to_north_routes_df)
west_to_north_duration_df.describe()

Unnamed: 0,route_number,duration_in_seconds
count,283.0,283.0
mean,592998.600707,1603.75265
std,52955.039419,484.280056
min,500264.0,809.0
25%,551146.0,1288.5
50%,594651.0,1491.0
75%,639403.0,1836.0
max,682626.0,4215.0


In [26]:
def distance_between_gps(gps_one, gps_two):
    # mpu.haversine_distance((lat1, lon1), (lat2, lon2))
    km_distance = mpu.haversine_distance((gps_one[0], gps_one[1]), (gps_two[0], gps_two[1]))

    if km_distance < 0:
        print('got negative distance that\'s weak')
        km_distance *= -1

    return km_distance

In [27]:
def calculate_route_distances(df):
    route_distances = {}
    df['time'] = lookup(df['time'])
    route_ids = df['route_number'].unique()

    for route_id in route_ids:
        route_df = df[df['route_number'] == route_id]
        route_df.sort_values('time')
        route_df.reset_index(drop=True)

        distance_sum = 0.0
        is_first_row = True
        
        for index, row in route_df.iterrows():
            if not is_first_row:
                last_row = route_df.loc[index - 1]
                last_lat = last_row['latitude']
                last_long = last_row['longitude']
                last_gps = (last_lat, last_long)

                current_lat = row['latitude']
                current_long = row['longitude']
                current_gps = (current_lat, current_long)

                distance_between_rows = distance_between_gps(last_gps, current_gps)
                distance_sum += distance_between_rows
            else:
                is_first_row = False

        route_distances[route_id] = distance_sum

    distance_df = pd.DataFrame(list(route_distances.items()), columns=['route_number', 'distance_in_km'])
    return distance_df

In [28]:
north_to_west_distance_df = calculate_route_distances(north_to_west_routes_df)
north_to_west_distance_df.describe()


Unnamed: 0,route_number,distance_in_km
count,196.0,196.0
mean,591157.846939,15.953792
std,52903.40181,3.233245
min,501872.0,9.99767
25%,542577.5,13.532529
50%,596413.0,16.004764
75%,632940.0,17.588881
max,682586.0,26.372868


In [29]:
west_to_north_distance_df = calculate_route_distances(west_to_north_routes_df)
west_to_north_distance_df.describe()


Unnamed: 0,route_number,distance_in_km
count,283.0,283.0
mean,592998.600707,15.766159
std,52955.039419,3.139819
min,500264.0,9.209701
25%,551146.0,13.349983
50%,594651.0,15.948325
75%,639403.0,17.084236
max,682626.0,29.37661


In [30]:
nw_time_distance_df = pd.merge(north_to_west_duration_df, north_to_west_distance_df, on="route_number")
# nw_time_distance_df.to_csv('north_to_west_time_distance_df.csv', encoding='utf-8', index=False)
nw_time_distance_df.describe()

Unnamed: 0,route_number,duration_in_seconds,distance_in_km
count,196.0,196.0,196.0
mean,591157.846939,1848.209184,15.953792
std,52903.40181,678.761755,3.233245
min,501872.0,708.0,9.99767
25%,542577.5,1317.25,13.532529
50%,596413.0,1706.0,16.004764
75%,632940.0,2208.0,17.588881
max,682586.0,4369.0,26.372868


In [31]:
wn_time_distance_df = pd.merge(west_to_north_duration_df, west_to_north_distance_df, on="route_number")
# wn_time_distance_df.to_csv('west_to_north_time_distance_df.csv', encoding='utf-8', index=False)
wn_time_distance_df.describe()

Unnamed: 0,route_number,duration_in_seconds,distance_in_km
count,283.0,283.0,283.0
mean,592998.600707,1603.75265,15.766159
std,52955.039419,484.280056,3.139819
min,500264.0,809.0,9.209701
25%,551146.0,1288.5,13.349983
50%,594651.0,1491.0,15.948325
75%,639403.0,1836.0,17.084236
max,682626.0,4215.0,29.37661


In [32]:
def merge_distance_time_into_route_df(dt_df, df):
    route_dfs = []
    route_ids = df['route_number'].unique()

    for route_id in route_ids:
        route_df = df[df['route_number'] == route_id]
        distance_time_df = dt_df[dt_df['route_number'] == route_id]
        
        distance = distance_time_df['distance_in_km'].iloc[0]
        time = distance_time_df['duration_in_seconds'].iloc[0]
        
        route_df['distance_in_km'] = distance
        route_df['duration_in_seconds'] = time
        
        route_dfs.append(route_df)
    
    return pd.concat(route_dfs)

In [33]:
def reduce_dataframe_by_col(df, col_name):
    row_dfs = []
    unique_values = df[col_name].unique()

    for val in unique_values:
        val_df = df[df[col_name] == val]

        row_dfs.append(val_df.iloc[[0]])
    
    return pd.concat(row_dfs)

In [34]:
merged_north = merge_distance_time_into_route_df(nw_time_distance_df, north_to_west_routes_df)
merged_north = reduce_dataframe_by_col(merged_north, 'route_number')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [35]:
merged_west = merge_distance_time_into_route_df(wn_time_distance_df, west_to_north_routes_df)
merged_west = reduce_dataframe_by_col(merged_west, 'route_number')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [36]:
def find_fraud_routes_by_time_distance(df, avg_time, avg_distance):
    return df[(df['distance_in_km'] >= avg_distance) & (df['duration_in_seconds'] >= avg_time)]

In [42]:
merged_west.describe()

Unnamed: 0,taxi_id,longitude,latitude,occupancy_status,speed,route_number,row,column,distance_in_km,duration_in_seconds
count,283.0,283.0,283.0,283.0,283.0,283.0,283.0,283.0,283.0,283.0
mean,30444.876325,114.109491,22.545812,1.0,9.936396,592998.600707,10.293286,21.805654,15.766159,1603.75265
std,4398.303968,0.011087,0.006813,0.0,13.719435,52955.039419,0.456075,0.396398,3.139819,484.280056
min,22262.0,114.085648,22.531549,1.0,0.0,500264.0,10.0,21.0,9.209701,809.0
25%,26799.5,114.102314,22.541316,1.0,0.0,551146.0,10.0,22.0,13.349983,1288.5
50%,31304.0,114.1119,22.54545,1.0,4.0,594651.0,10.0,22.0,15.948325,1491.0
75%,34408.5,114.117058,22.551416,1.0,16.0,639403.0,11.0,22.0,17.084236,1836.0
max,36918.0,114.13028,22.559168,1.0,85.0,682626.0,11.0,22.0,29.37661,4215.0


In [39]:
north_fraud = find_fraud_routes_by_time_distance(merged_north, 1849, 16)
print(len(north_fraud))

55


In [46]:
with open('North_to_West_Suspected_Fraud_By_Time_Distance.txt', 'w') as f:
    for item in north_fraud['route_number'].unique():
        f.write("%s\n" % item)

In [43]:
west_fraud = find_fraud_routes_by_time_distance(merged_west, 1604, 16)

In [44]:
print(len(west_fraud))

78


In [47]:
with open('West_to_North_Suspected_Fraud_By_Time_Distance.txt', 'w') as f:
    for item in west_fraud['route_number'].unique():
        f.write("%s\n" % item)