In [1]:
import pandas as pd
import os

In [2]:
def load_csv_as_df(file_name, sub_directories, column_numbers=None, column_names=None):
    '''
    Load a CSV file located under the current working directory as a pandas DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file, e.g. 'routes.csv'.
    sub_directories : str
        Directory of the file, relative to the current working directory.
        Leading and trailing path separators are optional, so '/', '',
        'data' and '/data/' are all accepted.
    column_numbers : list, optional
        Forwarded to ``pd.read_csv`` as ``usecols`` to read only some columns.
    column_names : list, optional
        Labels assigned to the loaded columns. One label per loaded column is
        required; labels are applied in the order the columns appear in the
        file, because ``usecols`` selects columns but does not reorder them.

    Returns
    -------
    pandas.DataFrame
        The loaded (and optionally relabelled) data.
    '''
    # Strip separators before joining: os.path.join discards everything before
    # an absolute component, and plain string concatenation (the previous
    # approach) produced a wrong path when the caller omitted a slash.
    relative_dir = sub_directories.strip('/' + os.sep)
    full_path = os.path.join(os.getcwd(), relative_dir, file_name)

    read_options = {}
    if column_numbers is not None:
        read_options['usecols'] = column_numbers
    df = pd.read_csv(full_path, **read_options)

    if column_names is not None:
        df.columns = column_names

    return df

In [3]:
# Load the two route CSVs that sit directly in the current working directory.
north_train_df = load_csv_as_df(
    file_name='north-to-west-routes-no-grids.csv',
    sub_directories='/',
)
west_train_df = load_csv_as_df(
    file_name='west-to-north-routes-no-grids.csv',
    sub_directories='/',
)

In [4]:
# Latitude / longitude bounds of the gridded area.
min_lat, max_lat = 22.0, 23.0
min_long, max_long = 113, 115

# Extent of the bounding box along each axis.
diff_in_latitude = max_lat - min_lat
diff_in_longitude = max_long - min_long

# Side length of one grid cell: the latitude extent split into 20 rows.
cell_size = diff_in_latitude / 20.0

In [5]:
def concat(row, col):
    '''
    Return the label '<row>-<col>' built from the string forms of both arguments.
    '''
    return '-'.join((str(row), str(col)))


def map_gps_to_box(latitude, longitude):
    '''
    Map a GPS coordinate to the grid cell that contains it.

    The grid starts at (min_lat, min_long) and each cell spans cell_size along
    both axes; these three module-level values are read at call time.

    Parameters
    ----------
    latitude, longitude : numeric
        Coordinate to locate.

    Returns
    -------
    tuple
        (cell_label, row_number, col_number), where cell_label is the string
        returned by concat(row_number, col_number). When the coordinate falls
        below min_lat or min_long, the sentinel (-1, -1, -1) is returned.

    No upper bound is enforced: coordinates beyond the far edges of the grid
    still receive (larger) row / column numbers.
    '''
    # Floor division counts whole cells from the origin; anything below the
    # origin floors to a negative index.
    row_number = int((latitude - min_lat) // cell_size)
    col_number = int((longitude - min_long) // cell_size)

    if row_number < 0 or col_number < 0:
        return -1, -1, -1

    return concat(row_number, col_number), row_number, col_number

In [6]:
def map_gps_to_cell(df):
    '''
    Add grid-cell information to a DataFrame of GPS points.

    Every ('latitude', 'longitude') pair is passed to map_gps_to_box, and the
    three values it returns are stored in the new columns 'cell', 'row' and
    'column' (in that order).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'latitude' and 'longitude' columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: df is modified in place.
    '''
    # Walk the two coordinate columns directly rather than df.iterrows():
    # iterrows materialises a Series for every row, which is slow, and only
    # these two values are needed.
    boxes = [
        map_gps_to_box(lat, lon)
        for lat, lon in zip(df['latitude'], df['longitude'])
    ]

    df['cell'] = [box[0] for box in boxes]
    df['row'] = [box[1] for box in boxes]
    df['column'] = [box[2] for box in boxes]

    return df

In [7]:
# Attach the 'cell', 'row' and 'column' columns to both route datasets.
north_train_df = map_gps_to_cell(df=north_train_df)
west_train_df = map_gps_to_cell(df=west_train_df)

In [8]:
# Preview the first five rows, including the new cell / row / column columns.
west_train_df.head(n=5)

Unnamed: 0,taxi_id,time,longitude,latitude,occupancy_status,speed,route_number,route_start,route_end,cell,row,column
0,22262,2018-12-08 09:42:28,114.121231,22.547068,1,5,500264,True,False,10-22,10,22
1,22262,2018-12-08 09:42:58,114.123497,22.547533,1,27,500264,False,False,10-22,10,22
2,22262,2018-12-08 09:43:26,114.123619,22.5478,1,14,500264,False,False,10-22,10,22
3,22262,2018-12-08 09:43:29,114.123581,22.547783,1,0,500264,False,False,10-22,10,22
4,22262,2018-12-08 09:43:32,114.123535,22.547783,1,0,500264,False,False,10-22,10,22


In [9]:
# Preview the first five rows, including the new cell / row / column columns.
north_train_df.head(n=5)

Unnamed: 0,taxi_id,time,longitude,latitude,occupancy_status,speed,route_number,route_start,route_end,cell,row,column
0,22391,2018-12-08 22:02:44,114.026115,22.6106,1,43,501872,True,False,12-20,12,20
1,22391,2018-12-08 22:03:04,114.023849,22.6092,1,50,501872,False,False,12-20,12,20
2,22391,2018-12-08 22:03:24,114.022003,22.607018,1,58,501872,False,False,12-20,12,20
3,22391,2018-12-08 22:03:44,114.023666,22.604549,1,58,501872,False,False,12-20,12,20
4,22391,2018-12-08 22:04:04,114.025284,22.602533,1,49,501872,False,False,12-20,12,20


In [10]:
# Write both enriched datasets next to the notebook, without the DataFrame index.
north_train_df.to_csv(
    'north-to-west-routes-with-cells.csv',
    index=False,
    encoding='utf-8',
)
west_train_df.to_csv(
    'west-to-north-routes-with-cells.csv',
    index=False,
    encoding='utf-8',
)