In [None]:
import pandas as pd 
import os

In [None]:
def load_csv_as_df(file_name, sub_directories='', column_numbers=None, column_names=None):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_name: Name of the CSV file, or a full URL (any string that
            contains "://"), which is handed to pandas unchanged.
        sub_directories: Folder path between the current working directory
            and the file, e.g. '/' or '/data/'. Leading and trailing
            slashes are optional. Ignored when ``file_name`` is a URL.
        column_numbers: Optional column selection forwarded to the
            ``usecols`` argument of ``pd.read_csv``.
        column_names: Optional replacement labels assigned to the columns
            of the loaded frame.

    Returns:
        The loaded DataFrame.
    """
    if '://' in file_name:
        # Remote resource: prefixing the working directory would corrupt it.
        full_path = file_name
    else:
        # Strip the separators before joining; otherwise os.path.join would
        # treat a piece such as '/' or '/data/' as absolute and silently
        # drop the working directory in front of it.
        full_path = os.path.join(
            os.getcwd(),
            sub_directories.strip('/\\'),
            file_name.lstrip('/\\'),
        )

    # usecols=None is the pandas default, so a single call covers both the
    # "all columns" and the "selected columns" case.
    df = pd.read_csv(full_path, usecols=column_numbers)

    if column_names is not None:
        df.columns = column_names
    return df

In [None]:
# Train-to-airport routes, read from a CSV in the working directory.
train_df = load_csv_as_df(
    file_name='all-train-to-air-routes-no-grids.csv',
    sub_directories='/',
)
train_df.head()

Unnamed: 0,latitude,longitude,occupancy_status,route_end,route_number,route_start,taxi_id,time
0,22.608,114.0326,1,False,324340,True,30907,2018-11-25 06:27:58
1,22.608601,114.032097,1,False,324340,False,30907,2018-11-25 06:28:28
2,22.607599,114.0299,1,False,324340,False,30907,2018-11-25 06:28:58
3,22.6063,114.028099,1,False,324340,False,30907,2018-11-25 06:29:29
4,22.604,114.024597,1,False,324340,False,30907,2018-11-25 06:29:59


In [None]:
# Airport-to-train routes are hosted remotely, so read them with pandas
# directly from the URL.
air_df = pd.read_csv(
    "https://raw.githubusercontent.com/dtroupe18/TaxiProject/master/AirToTrain/CsvFiles/all-air-to-train-routes-no-grids(1).csv"
)
air_df.head()

Unnamed: 0,latitude,longitude,occupancy_status,route_end,route_number,route_start,taxi_id,time,cell,row,column
0,22.626467,113.81015,1,False,2199,True,dodBcDpez8w=,2016-06-13 12:03:02,313.0-405.0,313.0,405.0
1,22.625467,113.809464,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:03:24,312.0-404.0,312.0,404.0
2,22.625017,113.808647,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:03:31,312.0-404.0,312.0,404.0
3,22.614933,113.8116,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:04:53,307.0-405.0,307.0,405.0
4,22.6131,113.812599,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:05:03,306.0-406.0,306.0,406.0


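Both latitude and longitude are measured in degrees, which are in turn divided into minutes and seconds. The coordinates in these datasets are stored as decimal degrees, so the grid below is defined directly in degrees.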

In [None]:
# Bounding box of the grid, in degrees.
min_lat, max_lat = 22.0, 23.0
min_long, max_long = 113, 115

# Extent of the bounding box along each axis.
diff_in_latitude = max_lat - min_lat
diff_in_longitude = max_long - min_long

# The latitude extent is split into 20 rows; the resulting edge length is
# the size of one grid cell.
cell_size = diff_in_latitude / 20.0

In [None]:
def concat(row, col):
    """Return the "<row>-<col>" label built from the two given values."""
    return '-'.join(map(str, (row, col)))


def map_gps_to_box(latitude, longitude):
    """Locate the grid cell that contains a GPS coordinate.

    The grid starts at (min_lat, min_long) and every cell spans cell_size
    along both axes; these are module-level settings read at call time.

    Args:
        latitude: Latitude of the point, in the same units as min_lat.
        longitude: Longitude of the point, in the same units as min_long.

    Returns:
        A tuple (cell_label, row_number, col_number), where cell_label is
        the "<row>-<col>" string produced by concat. Returns (-1, -1, -1)
        when the point falls below min_lat or min_long.
    """
    row_number = int((latitude - min_lat) // cell_size)
    col_number = int((longitude - min_long) // cell_size)

    # NOTE(review): only the lower edges are checked; a point beyond
    # max_lat / max_long still receives a (large) index - confirm that
    # this is intended.
    if col_number < 0 or row_number < 0:
        return -1, -1, -1

    return concat(row_number, col_number), row_number, col_number

In [None]:
def map_gps_to_cell(df):
    """Annotate every row of a frame with the grid cell of its coordinate.

    Each ('latitude', 'longitude') pair is passed to map_gps_to_box and the
    results are stored in the 'cell', 'row' and 'column' columns. Existing
    columns with those names are overwritten.

    Args:
        df: DataFrame with 'latitude' and 'longitude' columns.

    Returns:
        The same DataFrame object that was passed in; it is modified in
        place, not copied.
    """
    # Zipping the two columns avoids building a full Series for every row,
    # which is what iterrows() does just to read two fields.
    boxes = [
        map_gps_to_box(lat, long)
        for lat, long in zip(df['latitude'], df['longitude'])
    ]

    # Pick the fields one at a time instead of using zip(*boxes), which
    # cannot be unpacked into three names when the frame is empty.
    df['cell'] = [box[0] for box in boxes]
    df['row'] = [box[1] for box in boxes]
    df['column'] = [box[2] for box in boxes]

    return df

In [None]:
# Attach the grid cell / row / column of every point to the air routes.
air_df = air_df.pipe(map_gps_to_cell)
air_df

Unnamed: 0,latitude,longitude,occupancy_status,route_end,route_number,route_start,taxi_id,time,cell,row,column
0,22.626467,113.810150,1,False,2199,True,dodBcDpez8w=,2016-06-13 12:03:02,12-16,12,16
1,22.625467,113.809464,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:03:24,12-16,12,16
2,22.625017,113.808647,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:03:31,12-16,12,16
3,22.614933,113.811600,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:04:53,12-16,12,16
4,22.613100,113.812599,1,False,2199,False,dodBcDpez8w=,2016-06-13 12:05:03,12-16,12,16
...,...,...,...,...,...,...,...,...,...,...,...
3604,22.602383,114.025581,1,False,433086,False,36373,2018-11-25 03:52:50,12-20,12,20
3605,22.605801,114.027618,1,False,433086,False,36373,2018-11-25 03:53:38,12-20,12,20
3606,22.609051,114.033279,1,False,433086,False,36373,2018-11-25 03:54:34,12-20,12,20
3607,22.608482,114.033730,1,False,433086,False,36373,2018-11-25 03:55:29,12-20,12,20


In [None]:
# Attach the grid cell / row / column of every point to the train routes.
train_df = train_df.pipe(map_gps_to_cell)
train_df

Unnamed: 0,latitude,longitude,occupancy_status,route_end,route_number,route_start,taxi_id,time,cell,row,column
0,22.608000,114.032600,1,False,324340,True,30907,2018-11-25 06:27:58,12-20,12,20
1,22.608601,114.032097,1,False,324340,False,30907,2018-11-25 06:28:28,12-20,12,20
2,22.607599,114.029900,1,False,324340,False,30907,2018-11-25 06:28:58,12-20,12,20
3,22.606300,114.028099,1,False,324340,False,30907,2018-11-25 06:29:29,12-20,12,20
4,22.604000,114.024597,1,False,324340,False,30907,2018-11-25 06:29:59,12-20,12,20
...,...,...,...,...,...,...,...,...,...,...,...
3093,22.607834,113.839966,1,False,172162,False,gB7yn7HysUY=,2016-06-13 15:26:36,12-16,12,16
3094,22.601282,113.843132,1,False,172162,False,gB7yn7HysUY=,2016-06-13 15:31:36,12-16,12,16
3095,22.601282,113.843132,1,False,172162,False,gB7yn7HysUY=,2016-06-13 15:31:44,12-16,12,16
3096,22.587133,113.854797,1,False,172162,False,gB7yn7HysUY=,2016-06-13 15:36:44,11-17,11,17


In [None]:
# Write both annotated route tables to the working directory, leaving the
# DataFrame index out of the files.
air_df.to_csv(
    path_or_buf='all-air-to-train-routes-with-500-cells.csv',
    index=False,
    encoding='utf-8',
)
train_df.to_csv(
    path_or_buf='all-train-to-air-routes-with-500-cells.csv',
    index=False,
    encoding='utf-8',
)