In [1]:
%pwd

u'/home/giltrapo/Master_Data_Science/TFM/5_Data_Visualization'

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import difflib
import urllib
import zipfile
import os
import json
import geojson

In [3]:
# Read the "DataSchools" and "StudentsByPlace" CSV files (both stored as UTF-8)
# written by the earlier analysis and munging steps.

DataSchools = pd.read_csv(
    "../4_Data_Analysis/csv_files/DataSchools.csv",
    encoding="utf8",
)
StudentsByPlace = pd.read_csv(
    "../3_Data_Munging/csv_files/StudentsByPlace.csv",
    encoding="utf8",
)

In [4]:
# URL, on the Madrid City Hall open-data site, of the ZIP archive holding the
# shapefiles of the Madrid City districts. It is downloaded in the next cell.

url = "http://datos.madrid.es/egob/catalogo/200078-9-distritos-barrios.zip"

In [5]:
# Fetch the zipped shapefiles from `url` and extract the archive next to it.

zip_path = "../5_Data_Visualization/geo_files/DISTRITOS_ETRS89.zip"
extract_dir = "../5_Data_Visualization/geo_files/"

urllib.urlretrieve(url, zip_path)
with zipfile.ZipFile(zip_path, "r") as zip_file:
    zip_file.extractall(extract_dir)

In [6]:
# Read the districts shapefile with geopandas, discard the columns found at
# positions 1, 2 and 3, and name the two remaining columns.

shapefile_path = "../5_Data_Visualization/geo_files/SHP_ETRS89/DISTRITOS.shp"
MAD_districts = gpd.read_file(shapefile_path)

unused_columns = MAD_districts.columns[[1, 2, 3]]
MAD_districts.drop(unused_columns, axis=1, inplace=True)
MAD_districts.columns = ["Distrito", "geometry"]

In [7]:
# Standardize the district names of "StudentsByPlace" against the names used in
# "MAD_districts" (via "difflib" fuzzy matching) so both dataframes can be
# merged later on "Distrito".
#
# Names without a close match (or missing names) become NaN. The previous
# version tried to evaluate `np.NaN[0]` in that case, which raised a TypeError
# instead of storing a missing value.

district_names = list(MAD_districts["Distrito"])  # candidates, read only once

standardized = []
for district in StudentsByPlace["Distrito"].str.capitalize():
    # get_close_matches returns an empty list when nothing is similar enough;
    # missing names are skipped because difflib only accepts strings.
    if pd.isnull(district):
        matches = []
    else:
        matches = difflib.get_close_matches(district, district_names, n=1)
    standardized.append(matches[0] if matches else np.nan)

# Positional assignment of the whole column: does not depend on the dataframe
# having a default 0..n-1 index, unlike label-based `.loc[i, ...]` writes.
StudentsByPlace["Distrito"] = standardized

In [8]:
# Standardize the district names of "DataSchools" against the names used in
# "MAD_districts", using "difflib" fuzzy matching.
#
# Names without a close match (or missing names) become NaN. The previous
# version tried to evaluate `np.NaN[0]` in that case, which raised a TypeError
# instead of storing a missing value.

district_names = list(MAD_districts["Distrito"])  # candidates, read only once

standardized = []
for district in DataSchools["Distrito"].str.capitalize():
    # get_close_matches returns an empty list when nothing is similar enough;
    # missing names are skipped because difflib only accepts strings.
    if pd.isnull(district):
        matches = []
    else:
        matches = difflib.get_close_matches(district, district_names, n=1)
    standardized.append(matches[0] if matches else np.nan)

# Positional assignment of the whole column: does not depend on the dataframe
# having a default 0..n-1 index, unlike label-based `.loc[i, ...]` writes.
DataSchools["Distrito"] = standardized

In [9]:
# Merge the students/places information into the districts geodataframe, using
# "Distrito" as the key. pd.merge performs an inner join by default, so only
# districts whose name appears in both dataframes are kept.

MAD_districts = pd.merge(MAD_districts, StudentsByPlace, on = "Distrito")

In [10]:
# Reproject the district geometries to EPSG:4326 (WGS84, latitude/longitude).
# NOTE(review): the source coordinates are presumably ETRS89 / UTM, judging by
# the shapefile folder name - confirm against the shapefile's .prj file.

MAD_districts = MAD_districts.to_crs({'init': 'epsg:4326'})

In [11]:
# Assign a fill color to each district based on the value of "NinosxPlaza",
# then encode the "Distrito" variable to utf-8.
#
# The bins are contiguous: every ratio above 4.5 gets the darkest color. The
# previous last bin started at 8, which left districts with a ratio in
# (4.5, 8] without any color. Districts with a missing ratio still get no
# color, because comparisons against NaN are False.

ratio = MAD_districts["NinosxPlaza"]

MAD_districts.loc[ratio <= 1.5, "Color"] = "#e0fbfc"
MAD_districts.loc[(ratio > 1.5) & (ratio <= 2.5), "Color"] = "#c2dfe3"
MAD_districts.loc[(ratio > 2.5) & (ratio <= 3.5), "Color"] = "#9db4c0"
MAD_districts.loc[(ratio > 3.5) & (ratio <= 4.5), "Color"] = "#5c6b73"
MAD_districts.loc[ratio > 4.5, "Color"] = "#253237"

# The next cell compares "Distrito" against plain string literals, so the
# names are stored utf-8 encoded.
MAD_districts["Distrito"] = MAD_districts["Distrito"].str.encode('utf-8')

In [12]:
# Add, for every district, the coordinates used to center it on the map and
# the zoom level to apply.

# The three columns start out as empty strings; districts that are not listed
# below keep that placeholder.
for view_column in ("Latitud", "Longitud", "Zoom"):
    MAD_districts[view_column] = ""

# (district name, (latitude, longitude, zoom)) - one entry per district.
district_views = [
    ("Arganzuela", (40.398, -3.698, 13.5)),
    ("Barajas", (40.479, -3.585, 12.5)),
    ("Carabanchel", (40.3815, -3.736, 13)),
    ("Centro", (40.418, -3.707, 13.9)),
    ("Chamartín", (40.461, -3.678, 13.0)),
    ("Chamberí", (40.437, -3.705, 14.0)),
    ("Ciudad Lineal", (40.450, -3.665, 12.4)),
    ("Fuencarral - El Pardo", (40.550, -3.760, 11.0)),
    ("Hortaleza", (40.480, -3.640, 12.5)),
    ("Latina", (40.390, -3.781, 12.5)),
    ("Moncloa - Aravaca", (40.440, -3.768, 12.2)),
    ("Moratalaz", (40.406, -3.644, 13.8)),
    ("Puente de Vallecas", (40.384, -3.662, 13.0)),
    ("Retiro", (40.408, -3.677, 13.8)),
    ("Salamanca", (40.432, -3.676, 13.8)),
    ("San Blas - Canillejas", (40.432, -3.592, 12.5)),
    ("Tetuán", (40.461, -3.700, 13.7)),
    ("Usera", (40.378, -3.704, 13.5)),
    ("Vicálvaro", (40.391, -3.575, 12.4)),
    ("Villa de Vallecas", (40.350, -3.621, 12.2)),
    ("Villaverde", (40.344, -3.696, 13.0)),
]

for district_name, view in district_views:
    # The label slice "Latitud":"Zoom" spans the three columns created above.
    MAD_districts.loc[MAD_districts["Distrito"] == district_name, "Latitud":"Zoom"] = view

In [13]:
# Export the districts geodataframe in GeoJSON format.
#
# DISCLAIMER: the "to_file" method of Geopandas goes through "fiona.open", and
# an error occurs if a GeoJSON file from a previous run is still present at the
# target path. For that reason any existing file is deleted first.

geojson_path = "../5_Data_Visualization/geo_files/geojson_districts"

try:
    os.remove(geojson_path)
except OSError:
    # Nothing could be removed (e.g. first run, file not there yet): carry on.
    pass

MAD_districts.to_file(geojson_path, driver="GeoJSON")

In [14]:
# Segment geodataframe based on the variable "NinosxPlaza", and save it in GeoJSON format.
#
#plazas = [1, 2, 3, 4, 8]
#
#for i in plazas:
#    lim_inf = i
#    lim_sup = i + 1
#    MAD_districts[(MAD_districts["NinosxPlaza"] > lim_inf) & (MAD_districts["NinosxPlaza"] < lim_sup)]\
#    .to_file("../4_Map/geo_files/geojson_districts_" + str(i), driver = "GeoJSON")

In [15]:
# Create list of sorted GeoJSON files.
#
#files = ["../4_Map/geo_files/" + f for f in os.listdir("../4_Map/geo_files/") if f.startswith("geojson_districts_")]
#files.sort()

In [16]:
# Load GeoJSON information by "NinosxPlaza" into dictionary.
#
#geojson_districts_groups = dict()
#
#for i, file in enumerate(files):
#    objectname = "group_" + file[-1]
#    with open(file) as data_file:
#        geojson_file = json.load(data_file)
#    geojson_districts_groups[objectname] = geojson_file

In [17]:
# Build a second table with the coordinates of each school plus its admission
# rates (admissions / applications, as a percentage) for the two school years.

schools = DataSchools[["Colegio", "Distrito", "Latitud", "Longitud", "Solicitudes_2016.2017",
                       "Admisiones_2016.2017", "Solicitudes_2017.2018", "Admisiones_2017.2018"]].copy()
schools.columns = ["Colegio", "Distrito", "Latitud", "Longitud", "Solicitudes2017", "Admisiones2017",
                   "Solicitudes2018", "Admisiones2018"]


def _add_admission_rate(frame, year, missing_label):
    """Add the numeric and the display admission-rate columns for one year.

    Reads "Solicitudes<year>" and "Admisiones<year>" from `frame` and writes,
    in place:

    * "Admisiones<year>p": admissions / applications * 100, rounded to one
      decimal (NaN when it cannot be computed).
    * "Admisiones<year>p_str": the same value as text followed by "%", or
      `missing_label` when the rate is missing.
    """
    rate = (frame["Admisiones" + year] / frame["Solicitudes" + year]) * 100
    # A school with zero applications yields +/-inf; treat it as "no data"
    # rather than exporting a meaningless "inf%" label.
    rate = rate.replace([np.inf, -np.inf], np.nan).round(1)
    frame["Admisiones" + year + "p"] = rate
    # Build the label for every row, then substitute the placeholder wherever
    # the rate is missing.
    labels = rate.apply(str) + "%"
    frame["Admisiones" + year + "p_str"] = labels.where(rate.notnull(), missing_label)


_add_admission_rate(schools, "2017", "Sin datos")
_add_admission_rate(schools, "2018", "Sin estimación")

In [18]:
# Write the schools table to a UTF-8 encoded CSV file, without the index column.
schools.to_csv("../5_Data_Visualization/geo_files/schools.csv", encoding = "utf-8", index = False)

In [19]:
# Function to convert pandas dataframe to geojson format and save it.
#
#def pandas2geojson(df):
#    features = []
#    df.apply(lambda X: features.append(
#        geojson.Feature(geometry = geojson.Point((X["Longitud"],
#                                                  X["Latitud"])),
#                        properties = dict(Colegio = X["Colegio"],
#                                          Distrito = X["Distrito"],
#                                          Admisiones = X["Admisiones_p"],
#                                          Color = X["Color"]
#                                         )
#                       )),
#             axis=1)
#    with open("../4_Map/geo_files/geojson_schools", "w") as fp:
#        geojson.dump(geojson.FeatureCollection(features), fp, sort_keys = True)
#
#pandas2geojson(schools)