In [None]:
#installations
!pip install kshingle

In [None]:
!pip install prince

In [None]:
!pip install rectangle-packer

In [None]:
!pip install grandalf

In [None]:
#imports
import pandas as pd
import numpy as np
from datetime import datetime
import kshingle as ks
import random
import math
import itertools
import prince
import warnings
from functools import wraps
from itertools import combinations, product
from scipy.spatial import Voronoi
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.optimize import minimize, NonlinearConstraint
from rpack import pack
from grandalf.graphs import Vertex, Edge, Graph
from grandalf.layouts import SugiyamaLayout, DummyVertex
from netgraph_functions import get_geometric_layout, _initialise_geometric_node_layout, _flatten, _get_unique_nodes, get_fruchterman_reingold_layout, _edge_list_to_adjacency_matrix, _get_fr_repulsion, _get_fr_attraction, _fruchterman_reingold, _get_temperature_decay, _is_within_bbox, _fit_to_frame, _get_angle, _rotate
from custom_functions import title_jaccard_similarity, description_jaccard_similarity, title_and_description_jaccard_similarity, distance_from_similarity, get_edges, get_edge_lengths, normalize, get_node_positions, get_df_cols

In [None]:
#load metadata dataframe
# The file holds one row per dataset (title, description, column-name lists, ...).
# The path is relative, so the CSV must sit in the notebook's working directory.
metadata_csv_path = "taxi_metadata_2023_05_04.csv"
df = pd.read_csv(metadata_csv_path)
display(df)

Unnamed: 0,title,description,size,num_spatial,num_categorical,num_temporal,num_columns,start_date,end_date,temporal_col_names,cat_col_names,spatial_col_names,all_col_names,full_metadata
0,Taxi Medallion Transfers,Operation of a taxi cab in Chicago requires a ...,304895,0,0,1,5,2007-10-10 00:00:00,2017-07-21 00:00:00,closing_date,,,"closing_date, public_vehicle_number, sale_pric...","{'name': 'Taxi Medallion Transfers', 'source':..."
1,2017 Yellow Taxi Data,This dataset includes trip records from all tr...,13949149,0,0,1,3,2017-07-03 23:00:16,2017-11-04 03:58:56,tpep_pickup_datetime,tpep_pickup_datetime,,"tpep_pickup_datetime, pulocationid, n._trips",{'attribute_keywords': ['tpep_pickup_datetime'...
2,Green Taxi Data 2015,This dataset contains green taxi trip records ...,1066116963,4,1,2,23,2015-01-02 21:26:24,2015-03-23 08:42:40,"pickup_datetime, dropoff_datetime",store_and_fwd_flag,"pickup_longitude, pickup_latitude, dropoff_lon...","vendorid, pickup_datetime, dropoff_datetime, s...","{'attribute_keywords': ['VendorID', 'Vendor', ..."
3,Yellow Taxi Data 2015,This dataset contains the daily number of yell...,20244,0,0,1,4,2015-01-07 00:00:00,2015-09-05 00:00:00,pickup_datetime,,,"pickup_datetime, n._trips, price, distance","{'attribute_keywords': ['pickup_datetime', 'pi..."
4,2017-yellow-cab-lga,"new york, taxi, yellow cab, LaGuardia, LGA, ci...",319452961,0,1,2,18,2017-01-08 13:20:00,2017-09-04 00:10:40,"tpep_pickup_datetime, tpep_dropoff_datetime",store_and_fwd_flag,,"tpep_pickup_datetime, unnamed:_0, dolocationid...",{'attribute_keywords': ['tpep_pickup_datetime'...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,Monthly Transportation Statistics,Monthly Transportation Statistics is a compila...,492318,0,0,1,136,1948-04-01 00:00:00,1999-05-01 00:00:00,date,,,"index, date, air_safety_-_general_aviation_fat...","{'name': 'Monthly Transportation Statistics', ..."
105,2010 Census/ACS Detailed Block Group Data,detailed characteristics of people and housing...,506671,1,0,0,190,1948-04-01 00:00:00,1999-05-01 00:00:00,,county,,"county, census_tract, block_group, block_group...",{'name': '2010 Census/ACS Detailed Block Group...
106,2013-2017 American Community Survey Detailed C...,DETAILED CHARACTERISTICS OF PEOPLE AND HOUSING...,435842,0,1,0,211,1948-04-01 00:00:00,1999-05-01 00:00:00,,inside_kcmo_or_not,,"island, census_tract, inside_kcmo_or_not, east...",{'name': '2013-2017 American Community Survey ...
107,Parking - Edmonton Insight Community,This was one single topic among many as part o...,1085353,0,28,2,74,2016-02-09 13:34:56,2016-02-11 16:34:08,"response_date, completion_date",q4a_parking_it_is_easy_to_find_a_place_to_park...,,"response_date, completion_date, q1a_parking_un...","{'attribute_keywords': ['Response Date', 'Resp..."


In [None]:
#get similarity matrices
# One Jaccard similarity matrix per text measure: titles, descriptions, and both together.
# The integer argument (5 for titles, 9 otherwise) is forwarded to the helper as-is;
# presumably it is the shingle size -- TODO confirm against custom_functions.
matrix_title_sim, matrix_desc_sim, matrix_both_sim = (
    title_jaccard_similarity(df, 5),
    description_jaccard_similarity(df, 9),
    title_and_description_jaccard_similarity(df, 9),
)

In [None]:
#get distance matrices
# Convert each similarity matrix to its distance counterpart, in title / description / both order.
matrix_title_dist, matrix_desc_dist, matrix_both_dist = [
    distance_from_similarity(similarity_matrix)
    for similarity_matrix in (matrix_title_sim, matrix_desc_sim, matrix_both_sim)
]

In [None]:
#normalize distances between 0.0001 and 1.0001 rather than 0 and 1 to avoid breaking least squares optimization constraints
# The same bounds are applied to all three measures; the bounds dict is built anew
# for every call so no two calls share one dict object.
matrix_title_dist_norm, matrix_desc_dist_norm, matrix_both_dist_norm = [
    normalize(
        distance_matrix,
        {
            'actual': {'lower': 0, 'upper': 1},
            'desired': {'lower': 0.0001, 'upper': 1.0001},
        },
    )
    for distance_matrix in (matrix_title_dist, matrix_desc_dist, matrix_both_dist)
]

In [None]:
#get edges [tuples in the format (source node ID, target node ID)] and edge lengths [dictionary mapping edges to their distances] for each measure
# The edge list is computed once from the metadata and shared by all three measures;
# only the normalized distance matrix differs between the edge-length lookups.
edges = get_edges(df)

title_edge_lengths, desc_edge_lengths, both_edge_lengths = [
    get_edge_lengths(edges, normalized_distances)
    for normalized_distances in (
        matrix_title_dist_norm,
        matrix_desc_dist_norm,
        matrix_both_dist_norm,
    )
]

In [None]:
#get x and y coordinates for each measure
# Each result is later indexed as [:, 0] for x and [:, 1] for y when the
# coordinates are appended to the dataframe.
title_node_positions, desc_node_positions, both_node_positions = [
    get_node_positions(edges, edge_lengths)
    for edge_lengths in (title_edge_lengths, desc_edge_lengths, both_edge_lengths)
]

In [None]:
#separate column names by type
# get_df_cols is a project helper (custom_functions). The displayed result has one
# row per dataset and the columns cat_col_names, spatial_col_names,
# temporal_col_names and misc_col_names. This frame is the input to the MCA below.
df_cols = get_df_cols(df)
display(df_cols)

Unnamed: 0,cat_col_names,spatial_col_names,temporal_col_names,misc_col_names
0,,,closing_date,public_vehicle_number sellers_company_name sal...
1,tpep_pickup_datetime,,tpep_pickup_datetime,n._trips pulocationid
2,store_and_fwd_flag,pickup_longitude pickup_latitude dropoff_longi...,pickup_datetime dropoff_datetime,ehail_fee extra1 extra5 distance passenger_cou...
3,,,pickup_datetime,n._trips distance price
4,store_and_fwd_flag,,tpep_pickup_datetime tpep_dropoff_datetime,dolocationid pulocationid payment_type ratecod...
...,...,...,...,...
104,,,date,state_and_local_government_construction_spendi...
105,county,,,"households_with_income_$50,000-$59,999 occupie..."
106,inside_kcmo_or_not,,,"households_with_income_$50,000-$59,999 occupie..."
107,q4a_parking_it_is_easy_to_find_a_place_to_park...,,response_date completion_date,q5b_parking_the_restaurant_is_located_on_the_g...


In [None]:
#perform MCA
# Multiple correspondence analysis over the per-type column-name table. With the
# default settings the transform yields two components per dataset (columns 0 and 1).
X = df_cols
# A fixed random_state makes the decomposition repeatable across "Restart & Run All";
# without it prince's randomized SVD can return different coordinates on each run.
# The estimator gets its own name instead of being overwritten by its output.
mca_model = prince.MCA(random_state=42).fit(X)
# `mca` stays the name of the transformed coordinates because later cells read it.
mca = mca_model.transform(X)
display(mca)

Unnamed: 0,0,1
0,-0.082933,-0.133997
1,-0.092854,-0.438948
2,-0.108204,-0.482393
3,-0.081727,-0.133820
4,-0.102378,-0.384970
...,...,...
104,-0.093979,-0.167076
105,-0.110030,-0.145923
106,-0.112457,-0.142089
107,-0.195272,-0.396485


In [None]:
#normalize values between 0.05 and 0.95
# `mca` is a DataFrame, so mca.min() / mca.max() give one value per component column
# rather than a single scalar; those are handed to normalize as the 'actual' bounds.
mca_arr = mca.to_numpy()
mca_bounds = {
    'actual': {'lower': mca.min(), 'upper': mca.max()},
    'desired': {'lower': 0.05, 'upper': 0.95},
}
normalized_mca = np.array(normalize(mca_arr, mca_bounds))

In [None]:
#append coordinates to dataframe
# Each entry is (x column name, y column name, 2-D coordinate array); column 0 of the
# array is written as x and column 1 as y. Columns are added to `df` in this order.
coordinate_columns = (
    ("title_x", "title_y", title_node_positions),
    ("description_x", "description_y", desc_node_positions),
    ("title_and_description_x", "title_and_description_y", both_node_positions),
    ("column_name_x", "column_name_y", normalized_mca),
)
for x_column, y_column, positions in coordinate_columns:
    df[x_column] = positions[:,0]
    df[y_column] = positions[:,1]

# Name the index so it is labelled 'id' when the frame is displayed or exported.
df.index.name = 'id'

In [None]:
# Show the metadata frame now that the x/y coordinate columns for every measure
# have been added and the index is named 'id'.
display(df)

Unnamed: 0_level_0,title,description,size,num_spatial,num_categorical,num_temporal,num_columns,start_date,end_date,temporal_col_names,...,all_col_names,full_metadata,title_x,title_y,description_x,description_y,title_and_description_x,title_and_description_y,column_name_x,column_name_y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Taxi Medallion Transfers,Operation of a taxi cab in Chicago requires a ...,304895,0,0,1,5,2007-10-10 00:00:00,2017-07-21 00:00:00,closing_date,...,"closing_date, public_vehicle_number, sale_pric...","{'name': 'Taxi Medallion Transfers', 'source':...",0.346921,0.905951,0.476550,0.569313,0.575485,0.429288,0.077346,0.155180
1,2017 Yellow Taxi Data,This dataset includes trip records from all tr...,13949149,0,0,1,3,2017-07-03 23:00:16,2017-11-04 03:58:56,tpep_pickup_datetime,...,"tpep_pickup_datetime, pulocationid, n._trips",{'attribute_keywords': ['tpep_pickup_datetime'...,0.449688,0.650883,0.812719,0.236679,0.657022,0.310895,0.076517,0.085612
2,Green Taxi Data 2015,This dataset contains green taxi trip records ...,1066116963,4,1,2,23,2015-01-02 21:26:24,2015-03-23 08:42:40,"pickup_datetime, dropoff_datetime",...,"vendorid, pickup_datetime, dropoff_datetime, s...","{'attribute_keywords': ['VendorID', 'Vendor', ...",0.477398,0.456335,0.489782,0.669361,0.486617,0.931488,0.075235,0.075701
3,Yellow Taxi Data 2015,This dataset contains the daily number of yell...,20244,0,0,1,4,2015-01-07 00:00:00,2015-09-05 00:00:00,pickup_datetime,...,"pickup_datetime, n._trips, price, distance","{'attribute_keywords': ['pickup_datetime', 'pi...",0.634079,0.495476,0.397162,0.531212,0.516670,0.494438,0.077446,0.155220
4,2017-yellow-cab-lga,"new york, taxi, yellow cab, LaGuardia, LGA, ci...",319452961,0,1,2,18,2017-01-08 13:20:00,2017-09-04 00:10:40,"tpep_pickup_datetime, tpep_dropoff_datetime",...,"tpep_pickup_datetime, unnamed:_0, dolocationid...",{'attribute_keywords': ['tpep_pickup_datetime'...,0.528860,0.636978,0.636435,0.662199,0.316823,0.082315,0.075722,0.097926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,Monthly Transportation Statistics,Monthly Transportation Statistics is a compila...,492318,0,0,1,136,1948-04-01 00:00:00,1999-05-01 00:00:00,date,...,"index, date, air_safety_-_general_aviation_fat...","{'name': 'Monthly Transportation Statistics', ...",0.167865,0.625588,0.555632,0.486352,0.534801,0.494473,0.076423,0.147634
105,2010 Census/ACS Detailed Block Group Data,detailed characteristics of people and housing...,506671,1,0,0,190,1948-04-01 00:00:00,1999-05-01 00:00:00,,...,"county, census_tract, block_group, block_group...",{'name': '2010 Census/ACS Detailed Block Group...,0.544830,0.424492,0.498140,0.569520,0.552481,0.068512,0.075083,0.152459
106,2013-2017 American Community Survey Detailed C...,DETAILED CHARACTERISTICS OF PEOPLE AND HOUSING...,435842,0,1,0,211,1948-04-01 00:00:00,1999-05-01 00:00:00,,...,"island, census_tract, inside_kcmo_or_not, east...",{'name': '2013-2017 American Community Survey ...,0.748658,0.120384,0.950000,0.438473,0.319247,0.618242,0.074880,0.153334
107,Parking - Edmonton Insight Community,This was one single topic among many as part o...,1085353,0,28,2,74,2016-02-09 13:34:56,2016-02-11 16:34:08,"response_date, completion_date",...,"response_date, completion_date, q1a_parking_un...","{'attribute_keywords': ['Response Date', 'Resp...",0.295394,0.536956,0.726454,0.736140,0.606602,0.334006,0.067963,0.095299


In [None]:
# Export the metadata together with all scatterplot coordinates; the index is written
# as the leading 'id' column. The file goes to the working directory (where the input
# CSV is read from) instead of the Colab-only absolute "/content/" path, so the
# notebook also runs outside Colab. On Colab the default working directory is
# /content, so the output location is unchanged there.
df.to_csv("taxi_full_metadata_and_scatterplot_coordinates.csv")