# Data exploration
1. Full models:
    - **Global** and **local** measures
    - Topological attributes. (No. dead ends , average street lenght, average campus area/perimeter...)
1. Selection of certain local areas - to discuss/agree with Mahdi.
    - **Global** and **local** measures
    - This could be created on the basis of: 
        - Surrounding/buffer areas from selected Campus categories
        - Political breaks of the city - In which case we need to get such boundaries.
1. Summarize information into tables
1. Plot and visualize the correspondant/complementary maps.
    - This will greatly differ depending on the number of them.

In [1]:
import geopandas as gpd
import pandas as pd
from numpy import log, nan

from pyproj import CRS
from shapely.geometry import Point, MultiPoint, LineString, MultiLineString, Polygon, MultiPolygon
#from shapely.wkb import dumps, loads #this is used to flatten geoms
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#modules needed for connecting to PostGIS
from sqlalchemy.engine.url import URL
from sqlalchemy import create_engine #needs to have psycopg2 in the environment but no need to import it
from geoalchemy2 import WKTElement, Geometry #to modify Shapely geometries into WKT before uploading to DB

In [None]:
#importing custom function to normalize measures
#from Syntax_functions import Syntax_normalizer 

# Importing SYNTAX data

In [58]:
#importing the text files from Depthmap
#full model
fp = "cl_data/processing_networks/netAll_sg_pr1.txt"
riyadh_fullDF = pd.read_table(fp, index_col ="Ref")
#base model
fp2 = "cl_data/processing_networks/netBse_sg_pr2.txt"
riyadh_baseDF = pd.read_table(fp2, index_col ="Ref")

print("The shape of BASE syntax file: ", riyadh_baseDF.shape)
print("The shape of FULL syntax file: ", riyadh_fullDF.shape)

The shape of BASE syntax file:  (227470, 37)
The shape of FULL syntax file:  (252591, 26)


In [59]:
for data in [riyadh_baseDF, riyadh_fullDF]:
    # Zip the coordinates into a point object and convert to a GeoDataFrame
    geom_org = [Point(xy) for xy in zip(data.x1, data.y1)]
    geom_dest = [Point(xy) for xy in zip(data.x2, data.y2)]

    # Creating the line segments
    geom_line = [LineString([(p1.x,p1.y),(p2.x,p2.y)]) for p1, p2 in zip(geom_org,geom_dest)]
    #Adding the column to the DataFrame
    data["geometry"] = geom_line

#converting into GeoDataFrames
riyadh_base = gpd.GeoDataFrame(riyadh_baseDF, geometry="geometry", crs=20438)
riyadh_full = gpd.GeoDataFrame(riyadh_fullDF, geometry="geometry", crs=20438)

#removing unnecessary columns
colsB = ['x1', 'y1', 'x2', 'y2', 'Axial Line Ref', 'Metric Step Depth']
colsF = ['x1', 'y1', 'x2', 'y2', 'Axial Line Ref', 'Drawing Layer']

riyadh_base.drop(colsB, axis = 1, inplace=True)
riyadh_full.drop(colsF, axis = 1, inplace=True)

#quick verification
synt_base = [i for i in riyadh_base.columns if (i[:5]=="T1024")]
synt_full = [i for i in riyadh_full.columns if (i[:5]=="T1024")]
print("VERIFICATION STEP:\nBoth tables have the same radii of anlysis: ", len(synt_base) == len(synt_full))

VERIFICATION STEP:
Both tables have the same radii of anlysis:  False


In [67]:
#renaming "N radius" columns
naming_dict= {'T1024 Choice': "T1024 Choice Rn metric",
              'T1024 Integration': "T1024 Intergration Rn metric",
              'T1024 Node Count': "T1024 Node Count Rn metric",
              'T1024 Total Depth': "T1024 Total Depth Rn metric"} 

riyadh_base.rename(columns=naming_dict, inplace=True)
riyadh_full.rename(columns=naming_dict, inplace=True)

## Catchment areas

In [47]:
# DB parameters
HOST = 'localhost'
DB = 'sdb_course'
USER = 'postgres'
PORT = 5433
PWD = 'Dedalo1.'
# Database info
db_url = URL(drivername='postgresql+psycopg2', host=HOST, database=DB, username=USER, port=PORT, password=PWD)
# Create engine
engine = create_engine(db_url)
engine 

Engine(postgresql+psycopg2://postgres:***@localhost:5433/sdb_course)

In [6]:
#importing the CATCHMENTS
sqlquery = "SELECT * FROM riyadh.campus_catchments;"
#note: no CRS given because the data is already projcted in the correct CRS (20438)
catchments = gpd.read_postgis(sqlquery, engine, geom_col='geom', crs=None, index_col='gid')
#creating a spatial index
catchments.sindex
catchments.head(2)

Unnamed: 0_level_0,geom,id,cost_level
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"POLYGON ((662356.667 2736056.540, 662366.673 2...",9,5000.0
2,"POLYGON ((660705.690 2748066.314, 660709.354 2...",9,5000.0


In [7]:
#importing the CAMPUSES
sqlquery = "SELECT * FROM riyadh.campuses_new;"
campuses = gpd.read_postgis(sqlquery, engine, geom_col='geom', crs=None)
#creating a spatial index
campuses.sindex
campuses.tail(2)

Unnamed: 0,id,geom,category,cat_name,name_alias
318,319,"MULTIPOLYGON (((669485.937 2728433.255, 669462...",GOV,Governmental,
319,320,"MULTIPOLYGON (((669454.810 2728941.664, 669923...",E,Educational,


In [8]:
#define a dataframe from only the focus campuses
mask = campuses.name_alias.notnull()
focus_campuses = campuses[mask]

In [9]:
#join the catchments to the correct focus campuses they belong
catchments = gpd.sjoin(catchments, focus_campuses, how='inner', op='intersects', rsuffix='focus')

In [10]:
#selecting the catments according to the desired distance from the entrances
selected_di = 1000 #this is according to what Mahdi asked
selected_catchments = catchments[catchments.cost_level == selected_di]

#cleaning joined data
useless_cols = ["id_left", "id_focus", "cat_name"]
selected_catchments.drop(useless_cols, axis=1, inplace=True)
selected_catchments

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0_level_0,geom,cost_level,index_focus,category,name_alias
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
56,"POLYGON ((664297.816 2733071.357, 664307.822 2...",1000.0,289,SCLSUP,Diplomatic Quarter Riyadh
55,"POLYGON ((671842.282 2728938.658, 671843.418 2...",1000.0,296,SCRT,Royal Guard Barracks
57,"POLYGON ((662756.904 2745215.185, 662766.910 2...",1000.0,225,R,Telal Al-Riyadh
58,"POLYGON ((672412.620 2746997.806, 672422.626 2...",1000.0,313,E,Al-Imam University


## Selecting the lines in the models that belong to the campuses or the catchment areas

In [11]:
#create the join with the lines that are WITHIN the campus
# Note that this is only done on the FULL model as the other one does not have such lines
riyadh_full = gpd.sjoin(riyadh_full,focus_campuses[["geom","category","name_alias"]],
                        how="left", op="within", rsuffix= "campus")

In [63]:
#create the join with the lines that intersect the selected catchemnts
# BASE model
riyadh_base = gpd.sjoin(riyadh_base,selected_catchments[["geom","index_focus","category","name_alias"]], 
                        how="left", op="within")
riyadh_base.drop("index_right",axis=1, inplace=True)

# FULL model
riyadh_full = gpd.sjoin(riyadh_full,selected_catchments[["geom","index_focus"]], 
                        how="left", op="within")
riyadh_full.drop("index_right",axis=1, inplace=True)

In [13]:
# On the FULL model:
# assigning NAN to the rows that are simultaneously catched by the campus polygons and the catchments
mask = (riyadh_full.index_campus.notnull()) & (riyadh_full.index_focus.notnull())
riyadh_full.loc[mask , "index_focus"] = nan #this comes from Numpy.nan imported earlier

#populating index_focus with the correct index focus to group later based on that column
for area in riyadh_full.loc[riyadh_full.index_campus.notnull(),"index_campus"].unique():
    #create a mask whether the line belongs to catchment OR the campus
    mask = (riyadh_full.index_campus == area) | (riyadh_full.index_focus == area)
    #asigning the same value to be able to group later
    riyadh_full.loc[mask, "index_focus"] = area

In [64]:
#renaming the catchments label for reporting
mask = (riyadh_full.index_focus.notnull()) & (riyadh_full.index_campus.isnull())
riyadh_full.loc[mask, "name_alias"] = " Catchment" 

#renaming the alias to be precise when reporting
mask = riyadh_base.index_focus.notnull()
riyadh_base.loc[mask, "name_alias"] = [(str(d) +" catchment area") for d in riyadh_base.loc[mask, "name_alias"]]

print("VERIFICATION STEP!")
print("BASE MODEL\nUnique focus_index labels:\n", riyadh_base.loc[riyadh_base.index_focus.notnull(),"name_alias"].unique())
print("\nFULL MODEL\nUnique focus_index labels:\n", riyadh_full.loc[riyadh_full.index_focus.notnull(),"name_alias"].unique())

VERIFICATION STEP!
BASE MODEL
Unique focus_index labels:
 ['Diplomatic Quarter Riyadh catchment area'
 'Royal Guard Barracks catchment area' 'Al-Imam University catchment area'
 'Telal Al-Riyadh catchment area']


### ================
## Function for Syntax normalization !
### ================

In [65]:
#creating a function to normalize all syntax values
def Syntax_normalizer(df):
    
    # based on Syntax literature 
    # NaCH = logCH+1/logTD+3
    # NaIN =  NC^1.2/TD
    #extracting the useful columns
    useful_cols = [i[6:] for i in df.columns if (i[:5]=="T1024")]
    
    #extracting the radii that need to be calculated
    radii = [r.split()[1][1:] for r in useful_cols if r[:6]=="Choice"]

    #creating the column names
    NaCh_colnames = ["NaCh_"+str(r) for r in radii]
    NaIn_colnames = ["NaIn_"+str(r) for r in radii]
    
    # empty dictionary to store all calculation with their keys
    normalized_dct = {}
    for rad, NaCh_col_name, NaIn_col_name in zip(radii, NaCh_colnames, NaIn_colnames):
        #slice the dataframe with relevant columns
        choice = df["T1024 Choice R%s metric" % str(rad)]
        tdepth = df["T1024 Total Depth R%s metric" % str(rad)]
        ncount = df["T1024 Node Count R%s metric" % str(rad)]

        NaCh_vals = log(choice+1) / log(tdepth+3)
        NaIn_vals = ncount**1.2 / tdepth

        normalized_dct[NaCh_col_name] = NaCh_vals
        normalized_dct[NaIn_col_name] = NaIn_vals
    
    #convert dictionary to dataframe 
    newdf = pd.DataFrame(normalized_dct)
    #join the new dataframe with the input DataFrame (based on index)
    df = df.join(newdf)
    
    print("The following columns have been added successfully:\n",NaCh_colnames,"\n",NaIn_colnames,"\n")
    return df

In [70]:
#call the new function Syntax_normalizer()
riyadh_base = Syntax_normalizer(riyadh_base)
riyadh_full = Syntax_normalizer(riyadh_full)

The following columns have been added successfully:
 ['NaCh_n', 'NaCh_10000', 'NaCh_1200', 'NaCh_2000', 'NaCh_400', 'NaCh_5000', 'NaCh_800'] 
 ['NaIn_n', 'NaIn_10000', 'NaIn_1200', 'NaIn_2000', 'NaIn_400', 'NaIn_5000', 'NaIn_800'] 



In [71]:
riyadh_base.columns

Index(['Angular Connectivity', 'Connectivity', 'Segment Length',
       'T1024 Choice Rn metric', 'T1024 Choice R10000 metric',
       'T1024 Choice R1200 metric', 'T1024 Choice R2000 metric',
       'T1024 Choice R400 metric', 'T1024 Choice R5000 metric',
       'T1024 Choice R800 metric', 'T1024 Intergration Rn metric',
       'T1024 Integration R10000 metric', 'T1024 Integration R1200 metric',
       'T1024 Integration R2000 metric', 'T1024 Integration R400 metric',
       'T1024 Integration R5000 metric', 'T1024 Integration R800 metric',
       'T1024 Node Count Rn metric', 'T1024 Node Count R10000 metric',
       'T1024 Node Count R1200 metric', 'T1024 Node Count R2000 metric',
       'T1024 Node Count R400 metric', 'T1024 Node Count R5000 metric',
       'T1024 Node Count R800 metric', 'T1024 Total Depth Rn metric',
       'T1024 Total Depth R10000 metric', 'T1024 Total Depth R1200 metric',
       'T1024 Total Depth R2000 metric', 'T1024 Total Depth R400 metric',
       'T1024 To

In [19]:
#clean columns
useless_cols = ['T1024 Choice Rn metric', 
                'T1024 Choice R10000 metric','T1024 Choice R1200 metric', 
                'T1024 Choice R2000 metric','T1024 Choice R400 metric', 
                'T1024 Choice R5000 metric','T1024 Choice R800 metric', 
                
                'T1024 Intergration Rn metric',
                'T1024 Integration R10000 metric', 'T1024 Integration R1200 metric',
                'T1024 Integration R2000 metric', 'T1024 Integration R400 metric',
                'T1024 Integration R5000 metric', 'T1024 Integration R800 metric',
                
                'T1024 Total Depth Rn metric',
                'T1024 Total Depth R10000 metric', 'T1024 Total Depth R1200 metric',
                'T1024 Total Depth R2000 metric', 'T1024 Total Depth R400 metric',
                'T1024 Total Depth R5000 metric', 'T1024 Total Depth R800 metric']


riyadh_base.drop(useless_cols, axis=1, inplace = True)
riyadh_full.drop(useless_cols, axis=1, inplace = True)



## create a for loop for both MODELS 
ncdict = {'T1024 Node Count Rn metric': "NC_n",
          'T1024 Node Count R10000 metric': "NC_10000",
          'T1024 Node Count R1200 metric': "NC_1200",
          'T1024 Node Count R2000 metric': "NC_2000", 
          'T1024 Node Count R400 metric': "NC_400", 
          'T1024 Node Count R5000 metric': "NC_5000",
          'T1024 Node Count R800 metric': "NC_800"} 

riyadh_base.rename(columns=ncdict, inplace=True)
riyadh_full.rename(columns=ncdict, inplace=True)
riyadh_base.dtypes

Angular Connectivity     float64
Connectivity               int64
Segment Length           float64
NC_1200                    int64
NC_2000                    int64
NC_400                     int64
NC_800                     int64
geometry                geometry
index_focus              float64
category                  object
name_alias                object
NaCh_1200                float64
NaIn_1200                float64
NaCh_2000                float64
NaIn_2000                float64
NaCh_400                 float64
NaIn_400                 float64
NaCh_800                 float64
NaIn_800                 float64
dtype: object

In [20]:
riyadh_full.dtypes

Angular Connectivity     float64
Connectivity               int64
Segment Length           float64
NC_1200                    int64
NC_2000                    int64
NC_400                     int64
NC_800                     int64
geometry                geometry
index_campus             float64
category                  object
name_alias                object
index_focus              float64
NaCh_1200                float64
NaIn_1200                float64
NaCh_2000                float64
NaIn_2000                float64
NaCh_400                 float64
NaIn_400                 float64
NaCh_800                 float64
NaIn_800                 float64
dtype: object

## ===================
# Summary tables

In [30]:
# Topological description of the focus areas to study
cols= ["index_focus","name_alias",
       'Connectivity', 'Segment Length']

summary_cols = [(  'Connectivity', 'count'),
                (  'Connectivity',   'min'),
                (  'Connectivity',   'max'),
                (  'Connectivity',  'mean'),
                (  'Connectivity',   'std'),
                
                ('Segment Length',   'min'),
                ('Segment Length',   'max'),
                ('Segment Length',  'mean'),
                ('Segment Length',   'std')]

riyadh_full[cols].groupby(["index_focus","name_alias"]).describe()[summary_cols].round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Connectivity,Connectivity,Connectivity,Connectivity,Connectivity,Segment Length,Segment Length,Segment Length,Segment Length
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,std,min,max,mean,std
index_focus,name_alias,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
225.0,Catchment,622.0,2.0,6.0,4.07,0.58,12.02,537.8,97.57,58.36
225.0,Telal Al-Riyadh,117.0,2.0,6.0,4.05,0.74,11.59,379.79,113.43,69.4
289.0,Catchment,219.0,1.0,6.0,3.92,0.78,2.18,955.88,106.91,92.55
289.0,Diplomatic Quarter Riyadh,870.0,1.0,7.0,3.19,1.1,10.4,369.87,70.91,46.05
296.0,Catchment,760.0,2.0,7.0,4.33,0.78,10.59,393.74,79.91,50.66
296.0,Royal Guard Barracks,158.0,1.0,6.0,3.44,0.89,13.23,232.92,73.65,39.27
313.0,Catchment,614.0,1.0,6.0,3.95,0.68,10.18,589.0,104.03,82.32
313.0,Al-Imam University,243.0,1.0,7.0,3.31,0.99,12.08,462.72,101.62,82.24


In [44]:
# Syntax values described per area
nachcols = ["index_focus","name_alias",
            'NaCh_400', 'NaCh_800', 'NaCh_1200', 'NaCh_2000', 'NaCh_5000', 'NaCh_10000','NaCh_n']  
           
naincols = ["index_focus","name_alias",
            'NaIn_400','NaIn_800','NaIn_1200','NaIn_2000','NaIn_5000','NaIn_10000','NaIn_n']

riyadh_base[nachcols].groupby(["index_focus"]).describe().round(2)

KeyError: "['NaCh_5000', 'NaCh_n', 'NaCh_10000'] not in index"

In [None]:
#dead-end counting
#riyadh_full.hist(column="Connectivity", by=None, grid=False, 
                   xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None, 
                   sharex=True, sharey=True, 
                   figsize=None, layout=None, 
                   bins=6, backend=None);

# ============
# Exporting data to SHAPEFILES

In [21]:
riyadh_full.head()

Unnamed: 0_level_0,Angular Connectivity,Connectivity,Segment Length,NC_1200,NC_2000,NC_400,NC_800,geometry,index_campus,category,name_alias,index_focus,NaCh_1200,NaIn_1200,NaCh_2000,NaIn_2000,NaCh_400,NaIn_400,NaCh_800,NaIn_800
Ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,3.999999,4,72.142868,503,1326,51,196,"LINESTRING (677211.890 2732125.950, 677279.804...",,,,,0.838583,0.913652,0.91053,1.109171,0.701303,1.047362,0.793922,0.918492
1,2.0,2,322.59747,473,1580,13,140,"LINESTRING (667716.660 2737252.031, 668009.000...",,,,,0.0,0.971687,0.0,1.148057,0.0,1.190296,0.0,0.93519
2,4.0,4,361.17941,492,1738,18,164,"LINESTRING (668563.242 2735412.081, 668888.583...",,,,,0.900683,1.097474,0.88827,1.244066,0.0,1.14516,0.804627,1.002788
3,4.035942,4,79.431976,594,1406,46,223,"LINESTRING (673497.867 2732113.537, 673509.839...",,,,,0.67559,0.940326,0.683851,1.020288,0.66729,1.065732,0.600491,0.916454
4,4.0,4,38.248165,725,2056,70,281,"LINESTRING (672769.816 2728393.074, 672804.186...",,,,,0.680611,0.967052,0.577335,1.012421,0.619248,0.984078,0.697764,0.981902


In [23]:
fp = "cl_data/processing_networks/riyadhFull_pr_cl(test).shp"
riyadh_full.to_file(fp)

In [22]:
riyadh_base.head()

Unnamed: 0_level_0,Angular Connectivity,Connectivity,Segment Length,NC_1200,NC_2000,NC_400,NC_800,geometry,index_focus,category,name_alias,NaCh_1200,NaIn_1200,NaCh_2000,NaIn_2000,NaCh_400,NaIn_400,NaCh_800,NaIn_800
Ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,3.999999,4,72.142868,503,1314,51,196,"LINESTRING (677211.890 2732125.950, 677279.804...",,,,0.838583,0.913652,0.908137,1.111783,0.701303,1.047362,0.793922,0.918492
1,2.0,2,322.59747,473,1504,13,140,"LINESTRING (667716.660 2737252.031, 668009.000...",,,,0.0,0.971687,0.0,1.144033,0.0,1.190296,0.0,0.93519
2,4.0,4,361.17941,472,1645,18,164,"LINESTRING (668563.242 2735412.081, 668888.583...",,,,0.907285,1.101621,0.892167,1.23473,0.0,1.14516,0.804627,1.002788
3,4.035942,4,79.431976,480,1168,31,149,"LINESTRING (673497.867 2732113.537, 673509.839...",,,,0.68022,0.950467,0.688842,1.027597,0.351272,1.263644,0.589073,0.976714
4,4.0,4,38.248165,638,1714,70,280,"LINESTRING (672769.816 2728393.074, 672804.186...",,,,0.695564,0.996979,0.579235,1.030078,0.619248,0.984078,0.698142,0.981319


In [24]:
fp = "cl_data/processing_networks/riyadhBase_pr_cl(test).shp"
riyadh_base.to_file(fp)

# =================== 
## Experiments with visualization
**NEED TO OPEN A NEW NOTEBOOK**

In [None]:
#bounds of one of the areas 
bbox = (662863.4908694762, 2728254.491706719, 665918.6425654521, 2732369.2887877855)
ofsett = 1000

ax = riyadh_full.plot(figsize =(8,8),
                linewidth= 1, 
                cmap = "Dark2",
                column = "index_focus")

riyadh_full.plot(ax = ax,
                linewidth= 1, cmap= "Pastel2",
                column = "index_campus")

#ax.set_xlim(bbox[0]-ofsett,bbox[2]+ofsett)
#ax.set_ylim(bbox[1]-ofsett,bbox[3]+ofsett)
ax.axis("off")
plt.tight_layout()