In [1]:
import pandas as pd
import os

In [2]:
def load_csv_as_df(file_name, sub_directories, column_numbers=None, column_names=None):
    '''
    Load a csv located under the current working directory as a pandas dataframe.

    file_name:        name of the csv file.
    sub_directories:  directory path relative to os.getcwd(). Leading and
                      trailing separators are optional, so '/RelevantData/',
                      'RelevantData' and '/' (the working directory itself)
                      are all accepted.
    column_numbers:   optional selection passed to pd.read_csv as `usecols`;
                      None reads every column.
    column_names:     optional list that replaces the dataframe's column
                      labels after loading.

    Returns the loaded dataframe.
    '''
    # Strip separators before joining: a component that starts with a
    # separator would make os.path.join discard the working directory.
    separators = '/' + os.sep
    relative_dir = sub_directories.strip(separators)
    full_path = os.path.join(os.getcwd(), relative_dir, file_name.lstrip(separators))

    # usecols=None is read_csv's default, so no separate branch is needed.
    df = pd.read_csv(full_path, usecols=column_numbers)

    if column_names is not None:
        df.columns = column_names

    return df

In [94]:
# new_train_df = load_csv_as_df('new-train-to-air-routes-no-grids.csv', '/RelevantData/')
# new_air_df = load_csv_as_df('new-air-to-train-routes-no-grids.csv', '/RelevantData/')

# original_train_df = load_csv_as_df('train-to-air-routes-no-grids.csv', '/RelevantData/')
# original_air_df = load_csv_as_df('air-to-train-routes-no-grids.csv', '/RelevantData/')

# train_df = pd.concat([original_train_df, new_train_df])
# train_df = train_df.drop(train_df.columns[0], axis=1)
# train_df.to_csv('all-train-to-air-routes-no-grids.csv', encoding='utf-8', index=False)

# air_df = pd.concat([new_air_df, original_air_df])
# air_df = air_df.drop(air_df.columns[0], axis=1)
# # air_df.to_csv('all-air-to-train-routes-no-grids.csv', encoding='utf-8', index=False)
# air_df.head()

In [96]:
# Load the train-to-air routes csv from the working directory and preview it.
train_df = load_csv_as_df(
    file_name='all-train-to-air-routes-no-grids.csv',
    sub_directories='/',
)
train_df.head()

Unnamed: 0,latitude,longitude,occupancy_status,route_end,route_number,route_start,taxi_id,time
0,22.608,114.0326,1,False,324340,True,30907,2018-11-25 06:27:58
1,22.608601,114.032097,1,False,324340,False,30907,2018-11-25 06:28:28
2,22.607599,114.0299,1,False,324340,False,30907,2018-11-25 06:28:58
3,22.6063,114.028099,1,False,324340,False,30907,2018-11-25 06:29:29
4,22.604,114.024597,1,False,324340,False,30907,2018-11-25 06:29:59


In [95]:
# Load the air-to-train routes csv from the working directory and preview it.
# NOTE(review): the '(1)' suffix looks like a duplicate-download file name;
# confirm this is the intended input file.
air_df = load_csv_as_df(
    file_name='all-air-to-train-routes-no-grids(1).csv',
    sub_directories='/',
)
air_df.head()

Unnamed: 0,latitude,longitude,occupancy_status,route_end,route_number,route_start,taxi_id,time,cell,row,column
0,22.626467,113.81015,1,False,2199,True,dodBcDpez8w=,2016-06-13 12:03:02,313.0-405.0,313.0,405.0
1,22.625467,113.809464,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:03:24,312.0-404.0,312.0,404.0
2,22.625017,113.808647,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:03:31,312.0-404.0,312.0,404.0
3,22.614933,113.8116,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:04:53,307.0-405.0,307.0,405.0
4,22.6131,113.812599,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:05:03,306.0-406.0,306.0,406.0


In [139]:
# Bounding box used for the grid, in degrees.
min_lat, max_lat = 22.0, 23.0
min_long, max_long = 113, 115

# Extent of the bounding box along each axis.
diff_in_latitude = max_lat - min_lat
diff_in_longitude = max_long - min_long

# Edge length of one grid cell in degrees: the latitude extent split into
# 500 equal steps.
cell_size = diff_in_latitude / 500.0

In [140]:
def concat(row, col):
    '''
    Build a cell label of the form '<row>-<col>' from the string forms of
    the two arguments.
    '''
    return '-'.join((str(row), str(col)))


def map_gps_to_box(latitude, longitude):
    '''
    Map a GPS coordinate to its grid cell.

    The row and column indices are the number of whole `cell_size` steps
    between the coordinate and the module-level `min_lat` / `min_long`
    origin.

    Returns a tuple (cell_label, row_number, col_number), where cell_label
    is the string produced by concat(row_number, col_number). When either
    index is negative (the coordinate lies below min_lat or min_long) the
    sentinel (-1, -1, -1) is returned instead. No upper bound is checked.
    '''
    row_number = int((latitude - min_lat) // cell_size)
    col_number = int((longitude - min_long) // cell_size)

    # Coordinates before the grid origin have no cell.
    if row_number < 0 or col_number < 0:
        return -1, -1, -1

    return concat(row_number, col_number), row_number, col_number

In [141]:
def map_gps_to_cell(df):
    '''
    Add grid-cell columns to a dataframe of GPS points.

    For every row, the 'latitude' and 'longitude' values are passed to
    map_gps_to_box and the three results are stored in the new (or
    overwritten) columns 'cell', 'row' and 'column'.

    The dataframe is modified in place and also returned.
    '''
    # Walk the two coordinate columns directly; iterrows() would build a
    # full Series for every row only to read these two values.
    boxes = [
        map_gps_to_box(lat, long)
        for lat, long in zip(df['latitude'], df['longitude'])
    ]

    df['cell'] = [box[0] for box in boxes]
    df['row'] = [box[1] for box in boxes]
    df['column'] = [box[2] for box in boxes]

    return df

In [142]:
# Add the 'cell', 'row' and 'column' grid columns to the air routes frame
# (map_gps_to_cell also modifies the frame it is given in place).
air_df = map_gps_to_cell(air_df)

In [143]:
# Add the 'cell', 'row' and 'column' grid columns to the train routes frame
# (map_gps_to_cell also modifies the frame it is given in place).
train_df = map_gps_to_cell(train_df)

In [144]:
# Preview the air routes frame with its grid columns.
# NOTE(review): the saved output shows float-style values ('313.0-405.0',
# 313.0), while map_gps_to_box casts both indices with int() - the output
# looks stale; re-run on a fresh kernel to confirm.
air_df.head()

Unnamed: 0,latitude,longitude,occupancy_status,route_end,route_number,route_start,taxi_id,time,cell,row,column
0,22.626467,113.81015,1,False,2199,True,dodBcDpez8w=,2016-06-13 12:03:02,313.0-405.0,313.0,405.0
1,22.625467,113.809464,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:03:24,312.0-404.0,312.0,404.0
2,22.625017,113.808647,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:03:31,312.0-404.0,312.0,404.0
3,22.614933,113.8116,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:04:53,307.0-405.0,307.0,405.0
4,22.6131,113.812599,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:05:03,306.0-406.0,306.0,406.0


In [145]:
# Preview the train routes frame with its grid columns.
# NOTE(review): the saved output shows float-style values ('304.0-516.0',
# 304.0), while map_gps_to_box casts both indices with int() - the output
# looks stale; re-run on a fresh kernel to confirm.
train_df.head()

Unnamed: 0,latitude,longitude,occupancy_status,route_end,route_number,route_start,taxi_id,time,cell,row,column
0,22.608,114.0326,1,False,324340,True,30907,2018-11-25 06:27:58,304.0-516.0,304.0,516.0
1,22.608601,114.032097,1,False,324340,False,30907,2018-11-25 06:28:28,304.0-516.0,304.0,516.0
2,22.607599,114.0299,1,False,324340,False,30907,2018-11-25 06:28:58,303.0-514.0,303.0,514.0
3,22.6063,114.028099,1,False,324340,False,30907,2018-11-25 06:29:29,303.0-514.0,303.0,514.0
4,22.604,114.024597,1,False,324340,False,30907,2018-11-25 06:29:59,301.0-512.0,301.0,512.0


In [146]:
# Write both frames, including the added grid columns, to csv files in the
# working directory; index=False leaves the dataframe index out of the files.
air_df.to_csv('all-air-to-train-routes-with-500-cells.csv', encoding='utf-8', index=False)
train_df.to_csv('all-train-to-air-routes-with-500-cells.csv', encoding='utf-8', index=False)