In [1]:
%pwd

u'/home/giltrapo/Master_Data_Science/TFM/4_Map'

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import difflib
import urllib
import zipfile
import os
import json
import geojson

In [3]:
# Read the "DataSchools" CSV (stored under ../3_Data_Munging/csv_files) into a dataframe.

schools_csv = "../3_Data_Munging/csv_files/DataSchools.csv"
DataSchools = pd.read_csv(schools_csv)

In [4]:
# Read the "StudentsByPlace" CSV (stored under ../3_Data_Munging/csv_files) into a dataframe.

students_csv = "../3_Data_Munging/csv_files/StudentsByPlace.csv"
StudentsByPlace = pd.read_csv(students_csv)

In [5]:
# URL of the zip archive with the Madrid City district shapefiles, served by the
# City Hall site (datos.madrid.es). It is downloaded and unpacked in the next cells.

url = "http://datos.madrid.es/egob/catalogo/200078-9-distritos-barrios.zip"

In [6]:
# Download the zipped shapefile into geo_files/.

# urlretrieve lives in "urllib.request" on Python 3 and directly in "urllib" on Python 2;
# resolve it here so the cell runs on either interpreter.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# urlretrieve opens the target file itself and fails if its directory is missing.
if not os.path.isdir("geo_files"):
    os.makedirs("geo_files")

# Kept as the last expression so the cell still displays (filename, headers).
urlretrieve(url, "geo_files/DISTRITOS_ETRS89.zip")

('geo_files/DISTRITOS_ETRS89.zip',
 <httplib.HTTPMessage instance at 0x7f410d4d3518>)

In [7]:
# Extract every member of the downloaded archive into geo_files/.

archive_path = "geo_files/DISTRITOS_ETRS89.zip"
with zipfile.ZipFile(archive_path, mode = "r") as archive:
    archive.extractall(path = "geo_files/")

In [8]:
# Read the district shapefile and add "Distrito", an upper-cased copy of "NOMBRE".

districts_shp = "geo_files/SHP_ETRS89/DISTRITOS.shp"
MAD_districts = gpd.read_file(districts_shp)
upper_names = MAD_districts["NOMBRE"].str.upper()
MAD_districts["Distrito"] = upper_names

In [9]:
# Use "difflib" to replace each upper-cased district name in "MAD_districts" with the closest
# spelling found in "StudentsByPlace", so that both dataframes share the same "Distrito" values.

# Candidate spellings: the distinct, non-missing district names of "StudentsByPlace".
known_districts = StudentsByPlace["Distrito"].dropna().unique().tolist()

def closest_district(name):
    """Return the entry of known_districts most similar to name, or NaN when there is none."""
    if pd.isnull(name):
        return np.nan
    matches = difflib.get_close_matches(name, known_districts, n = 1)
    # get_close_matches returns an empty list when no candidate passes its similarity
    # cutoff; that case is reported as a missing value instead of raising.
    return matches[0] if matches else np.nan

MAD_districts["Distrito"] = MAD_districts["Distrito"].map(closest_district)

In [10]:
# Join the "StudentsByPlace" figures onto the districts through "Distrito" and keep only
# the name, geometry, matched district and "NinosxPlaza" columns.

kept_columns = ["NOMBRE", "geometry", "Distrito", "NinosxPlaza"]
merged_districts = pd.merge(left = MAD_districts, right = StudentsByPlace,
                            how = "inner", on = "Distrito")
MAD_districts = merged_districts[kept_columns]

In [11]:
# Reproject the district geometries to WGS84 (EPSG:4326, longitude/latitude).
# The EPSG code is passed directly instead of the {'init': 'epsg:4326'} dict, a form
# that newer pyproj/geopandas releases deprecate.

MAD_districts = MAD_districts.to_crs(epsg = 4326)

In [12]:
# Assign a fill color to each district according to its "NinosxPlaza" value, then drop the
# matching helper column and expose the original district name ("NOMBRE") as "Distrito".

# (lower bound of a one-unit-wide bin, color). Only these bins receive a color; a district
# whose value falls outside all of them keeps a missing "Color".
bin_colors = [
    (1, "#eef5db"),
    (2, "#c7efcf"),
    (3, "#d6d1b1"),
    (4, "#f0b67f"),
    (8, "#fe5f55"),
]

ratio = MAD_districts["NinosxPlaza"]
for lower, color in bin_colors:
    # Half-open interval [lower, lower + 1): a value that is exactly an integer belongs
    # to the bin it opens instead of falling between two strict inequalities.
    MAD_districts.loc[(ratio >= lower) & (ratio < lower + 1), "Color"] = color

# "Distrito" (the fuzzy-matched merge key) is no longer needed; the original "NOMBRE"
# column takes over its name. Renaming by label avoids depending on column order.
del MAD_districts["Distrito"]
MAD_districts = MAD_districts.rename(columns = {"NOMBRE": "Distrito"})

In [13]:
# Write the district geodataframe to disk using the GeoJSON driver.

districts_geojson = "geo_files/geojson_districts"
MAD_districts.to_file(districts_geojson, driver = "GeoJSON")

In [None]:
# Segment geodataframe based on the variable "NinosxPlaza", and save it in GeoJSON format.
#
#plazas = [1, 2, 3, 4, 8]
#
#for i in plazas:
#    lim_inf = i
#    lim_sup = i + 1
#    MAD_districts[(MAD_districts["NinosxPlaza"] > lim_inf) & (MAD_districts["NinosxPlaza"] < lim_sup)]\
#    .to_file("geo_files/geojson_districts_" + str(i), driver = "GeoJSON")

In [None]:
# Create list of sorted GeoJSON files.
#
#files = ["geo_files/" + f for f in os.listdir("geo_files/") if f.startswith("geojson_districts_")]
#files.sort()

In [None]:
# Load GeoJSON information by "NinosxPlaza" into dictionary.
#
#geojson_districts_groups = dict()
#
#for i, file in enumerate(files):
#    objectname = "group_" + file[-1]
#    with open(file) as data_file:
#        geojson_file = json.load(data_file)
#    geojson_districts_groups[objectname] = geojson_file

In [14]:
# Build the dataframe behind a second GeoJSON file, this one with the school coordinates
# and the 2016-2017 application / admission figures.

source_columns = ["Colegio", "Barrio", "Distrito", "Latitud", "Longitud",
                  "Solicitudes_2016-2017", "Admisiones_2016-2017"]
short_names = {"Solicitudes_2016-2017": "Solicitudes",
               "Admisiones_2016-2017": "Admisiones"}
df = DataSchools[source_columns].copy().rename(columns = short_names)

# Admissions as a percentage of applications.
admission_rate = df["Admisiones"] / df["Solicitudes"] * 100
df["Admisiones_p"] = admission_rate

# Missing values in every column become 0 before the counts are cast to integers.
df = df.fillna(0)
count_columns = ["Solicitudes", "Admisiones"]
df[count_columns] = df[count_columns].astype(int)

# Final display form: one decimal followed by a percent sign.
df["Admisiones_p"] = df["Admisiones_p"].round(1).astype(str) + "%"

In [16]:
# Function to convert pandas dataframe to geojson format.

def pandas2geojson(df, path = "geo_files/geojson_schools"):
    """Write the rows of ``df`` to ``path`` as a GeoJSON FeatureCollection of points.

    Each row becomes one Point feature placed at (Longitud, Latitud). Its
    properties are the school name, neighbourhood, district, applications,
    admissions and admission percentage, stored under lower-case keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "Colegio", "Barrio", "Distrito", "Latitud",
        "Longitud", "Solicitudes", "Admisiones" and "Admisiones_p".
    path : str, optional
        Output file. Defaults to "geo_files/geojson_schools".

    Returns
    -------
    None
    """
    # The features are built with a plain iteration rather than by appending from
    # inside DataFrame.apply: apply exists to compute values, and pandas documents
    # that it may invoke the function more than once on the first row, which would
    # duplicate that feature when the function has side effects.
    features = [
        geojson.Feature(
            geometry = geojson.Point((row["Longitud"], row["Latitud"])),
            properties = dict(colegio = row["Colegio"],
                              barrio = row["Barrio"],
                              distrito = row["Distrito"],
                              solicitudes = row["Solicitudes"],
                              admisiones = row["Admisiones"],
                              admisiones_p = row["Admisiones_p"]),
        )
        for _, row in df.iterrows()
    ]
    with open(path, "w") as fp:
        geojson.dump(geojson.FeatureCollection(features), fp, sort_keys = True)

# Export the schools dataframe "df" built above; the function writes it to
# "geo_files/geojson_schools".
pandas2geojson(df)