In [1]:
import pandas as pd
import geopandas as gpd
import contextily as cx
import matplotlib.pyplot as plt
import numpy as np

Leitura do Dataset

In [2]:
df = pd.read_csv('datasets/LondonTrajectoriesDataset.csv', sep=',')

Remove unwanted EOF 

In [3]:
df = df[df['ID'] != ' ']

Reorganiza cada trace existente no dataset em arrays na forma [longitude, latitude]
para posterior análise das rotas

In [4]:
def reorganize_traces(coordinates):
    # copy the coordinates array
    coordinates_array = coordinates.copy()
    coordinates_plot = []
    # iterate over the coordinates
    for coordinate in coordinates_array:
        coordinates_str = coordinate.split(';')
        coordinate_2 = []
        # iterate over the string 
        for coordinate_str in coordinates_str:
            [latitude, longitude] = coordinate_str.split(':')
            coordinate_2.append(np.array([float(latitude), float(longitude)]))
        coordinates_plot.append(np.array(coordinate_2))
    return coordinates_plot

In [5]:
coordinates_plot = reorganize_traces(df[df['setID'] == 56]['coordinates'])

Convert the obtained list to a numpy array

In [6]:
coordinates_plot = np.array(coordinates_plot, dtype='object')

Obtém as maiores latitudes/longitudes de cada trace

In [7]:
def get_minimum_max_coordinates(coordinates_plot):
    longitudes_max, longitudes_min, latitudes_max, latitudes_min, longitudes_latitudes = [], [], [], [], []
    for i in range(len(coordinates_plot)):
        longitude_latitude = coordinates_plot[i].T
        longitudes_latitudes.append(longitude_latitude)
        longitudes_max.append(np.amax(longitude_latitude[0]))
        longitudes_min.append(np.amin(longitude_latitude[0]))
        latitudes_max.append(np.amax(longitude_latitude[1]))
        latitudes_min.append(np.amin(longitude_latitude[1]))

    longitude_max = np.amax(longitudes_max)
    longitude_min = np.amin(longitudes_min)

    latitude_max = np.amax(latitudes_max)
    latitude_min = np.amin(latitudes_min)
    return longitudes_latitudes, longitude_min, longitude_max,latitude_min, latitude_max

Not that we've gotten the traces for each path it's time to associate this information with the CO2 emissions.
This procedure is made by associating the polution in a route with the nearest site

In [8]:
df_sites_data = pd.read_csv('datasets/sites_data_new.csv', sep=',')

Filter the sensors to only consider data related to NO2 

In [9]:
df_sites_data = df_sites_data[df_sites_data['@SpeciesCode'] == 'NO2']

In [10]:
sensors_latitude = df_sites_data['@Latitude'].values
sensors_longitude = df_sites_data['@Longitude'].values 

After getting the data for both traces and polution, we now have to filter the paths which are too far from the sensors because no relevant metric could be extracted from these. Secondly, we apply the polution level taken into consideration the nearest sensor 

In [74]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

def filter_and_attribute_polution(trace_coordinates, sensor_coordinates, tolerance):
    points_id = []
    # for each route    
    for trace in trace_coordinates:
        distance_points = []
        smallest_distance_trace = 1000000000000000
        # for each coordinate in the route
        for point in trace:
            # convert the latitude, longitude pair to radians
            point_in_radians = list(map(radians, point))
            # for each sensor 
            smallest_distance_point = 1000000000000000
            nearestSensor = 0
            for i in range(len(sensor_coordinates)):
                # convert the obtained sensor coordinates to radians
                sensor_coordinates_in_radians = list(map(radians, np.reshape(sensor_coordinates[i], -1)))
                # compute the haversine distance between point in route and sensor
                distance = haversine_distances([point_in_radians, sensor_coordinates_in_radians]) * 6371000
                # save the registered distance
                if distance[0][1] < smallest_distance_point:
                    smallest_distance_point = distance[0][1]
                    nearestSensor = i
            if smallest_distance_point < smallest_distance_trace:
                smallest_distance_trace = smallest_distance_point
            distance_points.append(nearestSensor)
        if smallest_distance_trace > tolerance:
            return 0
        points_id.append(distance_points)
    return points_id

sensors_coordinates = np.array([sensors_longitude, sensors_latitude]).T

In [75]:
number_of_sets = 600
total_ids = []
ids_to_drop = []
for i in range(number_of_sets):
    trace_df = df[df['setID'] == float(i)]
    trace_coordinates = reorganize_traces(trace_df['coordinates'])
    id_sensor_array = filter_and_attribute_polution(trace_coordinates, sensors_coordinates, tolerance=300)
    if id_sensor_array == 0:
        ids_to_drop.append(i)
    else:
        total_ids.append(id_sensor_array)

In [76]:
df_filtered = df[~(df['setID'].isin(ids_to_drop))]

In [77]:
polution_set = []
# iterate over the sets
for id_set in total_ids:
    # store the polution associated to each set
    polution_data = []
    # iterate over the traces in each set
    for id_trace in id_set:
        # store the polution associated to each point in the trace
        trace_values = []
        # for each point in the trace
        for id_item in id_trace:
            # get the sensor with the given id
            trace_values.append(df_sites_data.iloc[id_item]['@annualMean'])
        # get the mean associated to each trace
        mean_trace = np.mean(np.array(trace_values))
        # save the mean associated with the trace
        polution_data.append(mean_trace)
    # save the mean associated   with the set
    polution_set.append(polution_data)

df_filtered['polution'] = np.hstack(polution_set)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['polution'] = np.hstack(polution_set)


In [79]:
df_filtered

Unnamed: 0,ID,crimeNorm,accidentNorm,natureNorm,attractionNorm,durationNorm,trafficNorm,lengthNorm,setID,crimes,accidents,attractions,nature,traffic,duration,length,coordinates,polution
30,30.0,0.920527,0.924528,0.212596,0.000000,1.000000,0.889155,0.428035,7.0,310580.0,61.0,0.0,837166.117353,0.936039,1454.0,18727.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,35.750511
31,31.0,1.000000,0.490566,0.051620,0.000000,0.995633,1.000000,1.000000,7.0,298705.0,84.0,0.0,823320.786870,0.945704,1455.0,17356.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,34.997732
32,32.0,0.974080,0.471698,0.688265,0.000000,0.877729,0.407789,0.776387,7.0,302578.0,85.0,0.0,878077.719383,0.894062,1482.0,17892.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,34.748889
33,33.0,0.172486,1.000000,0.152550,0.500000,0.000000,0.866310,0.000000,7.0,422353.0,57.0,1.0,832001.606618,0.934046,1683.0,19753.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,33.829126
34,34.0,0.000000,0.924528,0.000000,1.000000,0.344978,0.771865,0.095953,7.0,448126.0,61.0,2.0,818881.017743,0.925810,1604.0,19523.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,33.815324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3158,3158.0,0.865327,0.905172,0.000000,0.454545,1.000000,0.952936,1.000000,598.0,945244.0,157.0,9.0,527463.871935,0.862765,1982.0,13369.0,-0.073976:51.470078;-0.07394:51.46994;-0.07388...,30.705686
3159,3159.0,1.000000,1.000000,0.069511,0.000000,0.984866,1.000000,0.914072,598.0,890976.0,146.0,4.0,541806.384165,0.873684,1995.0,13587.0,-0.073976:51.470078;-0.07404:51.47032;-0.07423...,30.809028
3160,3160.0,0.000000,0.482759,1.000000,0.363636,0.000000,0.000000,0.000000,598.0,1293936.0,206.0,8.0,733799.038202,0.641675,2841.0,15906.0,-0.073976:51.470078;-0.07394:51.46994;-0.07388...,31.710485
3161,3161.0,0.191761,0.327586,0.284930,0.727273,0.176950,0.343876,0.511234,598.0,1216664.0,224.0,12.0,586254.998908,0.721458,2689.0,14609.0,-0.073976:51.470078;-0.07404:51.47032;-0.07423...,28.795455


In [81]:
def normalize_polution(polution_array):
    max_data = np.amax(polution_array)
    min_data = np.amin(polution_array)
    return list(map(lambda polution: (max_data - polution) / (max_data - min_data), polution_array))

In [88]:
df_filtered['polutionNorm'] = df_filtered.groupby('setID')['polution'].transform(normalize_polution)

  return list(map(lambda polution: (max_data - polution) / (max_data - min_data), polution_array))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['polutionNorm'] = df_filtered.groupby('setID')['polution'].transform(normalize_polution)


In [89]:
df_filtered

Unnamed: 0,ID,crimeNorm,accidentNorm,natureNorm,attractionNorm,durationNorm,trafficNorm,lengthNorm,setID,crimes,accidents,attractions,nature,traffic,duration,length,coordinates,polution,polutionNorm
30,30.0,0.920527,0.924528,0.212596,0.000000,1.000000,0.889155,0.428035,7.0,310580.0,61.0,0.0,837166.117353,0.936039,1454.0,18727.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,35.750511,0.177211
31,31.0,1.000000,0.490566,0.051620,0.000000,0.995633,1.000000,1.000000,7.0,298705.0,84.0,0.0,823320.786870,0.945704,1455.0,17356.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,34.997732,0.497272
32,32.0,0.974080,0.471698,0.688265,0.000000,0.877729,0.407789,0.776387,7.0,302578.0,85.0,0.0,878077.719383,0.894062,1482.0,17892.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,34.748889,0.603073
33,33.0,0.172486,1.000000,0.152550,0.500000,0.000000,0.866310,0.000000,7.0,422353.0,57.0,1.0,832001.606618,0.934046,1683.0,19753.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,33.829126,0.994132
34,34.0,0.000000,0.924528,0.000000,1.000000,0.344978,0.771865,0.095953,7.0,448126.0,61.0,2.0,818881.017743,0.925810,1604.0,19523.0,-0.2677:51.533778;-0.2677:51.53326;-0.26763:51...,33.815324,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3158,3158.0,0.865327,0.905172,0.000000,0.454545,1.000000,0.952936,1.000000,598.0,945244.0,157.0,9.0,527463.871935,0.862765,1982.0,13369.0,-0.073976:51.470078;-0.07394:51.46994;-0.07388...,30.705686,0.344696
3159,3159.0,1.000000,1.000000,0.069511,0.000000,0.984866,1.000000,0.914072,598.0,890976.0,146.0,4.0,541806.384165,0.873684,1995.0,13587.0,-0.073976:51.470078;-0.07404:51.47032;-0.07423...,30.809028,0.309245
3160,3160.0,0.000000,0.482759,1.000000,0.363636,0.000000,0.000000,0.000000,598.0,1293936.0,206.0,8.0,733799.038202,0.641675,2841.0,15906.0,-0.073976:51.470078;-0.07394:51.46994;-0.07388...,31.710485,0.000000
3161,3161.0,0.191761,0.327586,0.284930,0.727273,0.176950,0.343876,0.511234,598.0,1216664.0,224.0,12.0,586254.998908,0.721458,2689.0,14609.0,-0.073976:51.470078;-0.07404:51.47032;-0.07423...,28.795455,1.000000


In [90]:
df_filtered[df_filtered['setID'] == 56]

Unnamed: 0,ID,crimeNorm,accidentNorm,natureNorm,attractionNorm,durationNorm,trafficNorm,lengthNorm,setID,crimes,accidents,attractions,nature,traffic,duration,length,coordinates,polution,polutionNorm
294,294.0,1.0,1.0,0.837039,0.0,0.985915,1.0,0.677271,56.0,159309.0,12.0,0.0,290542.753787,0.945144,1112.0,9044.0,0.01641:51.43737;0.01765:51.43787;0.0176:51.43...,25.192857,0.023417
295,295.0,0.920751,0.714286,0.725042,0.0,1.0,0.715349,0.66167,56.0,175533.0,28.0,0.0,258559.537764,0.902439,1107.0,9095.0,0.01641:51.43737;0.01765:51.43787;0.0176:51.43...,25.190813,0.024077
296,296.0,0.92293,0.464286,0.788357,0.0,0.777465,0.747426,0.48914,56.0,175087.0,42.0,0.0,276640.563218,0.907251,1186.0,9659.0,0.01641:51.43737;0.01765:51.43787;0.0176:51.43...,25.164875,0.032446
297,297.0,0.979953,0.0,0.96894,0.0,0.0,0.116428,0.44662,56.0,163413.0,68.0,0.0,328209.585576,0.812585,1462.0,9798.0,0.01641:51.43737;0.01765:51.43787;0.0176:51.43...,24.067055,0.386669
298,298.0,0.837111,0.339286,1.0,1.0,0.873239,0.491951,0.0,56.0,192656.0,49.0,1.0,337079.532796,0.868924,1152.0,11258.0,0.01641:51.43737;0.01765:51.43787;0.0176:51.43...,25.265432,0.0
299,299.0,0.525019,1.0,0.0,0.0,0.816901,0.876536,0.934842,56.0,256548.0,12.0,0.0,51509.146338,0.926621,1172.0,8202.0,0.01641:51.43737;0.01765:51.43787;0.01775:51.4...,24.86121,0.130426
300,300.0,0.0,0.142857,0.40191,0.0,0.309859,0.0,1.0,56.0,364031.0,60.0,0.0,166282.854367,0.795118,1352.0,7989.0,0.01641:51.43737;0.01765:51.43787;0.0176:51.43...,22.166197,1.0


In [83]:
df_filtered.to_csv('datasets/filtered_dataset_gas.csv', index=None, header=True)