In [None]:
import copy
import os
import time
from math import radians, sin, cos, sqrt, atan2

import folium
import geocoder
import geopandas as gpd
import numpy as np
import pandas as pd
from folium import plugins
from geopy.distance import geodesic
from shapely.geometry import Point
from tqdm import tqdm
from tqdm import tqdm  # tqdm provides the progress bar

In [None]:
# Import all dataframes with the addresses

directory_path = '/Users/dilettaferri/Desktop/UNIPI/SNA - Project/Project/dataframe con indirizzi'
# Read every .csv file in the directory. os.listdir() returns entries in
# arbitrary order, so the names are sorted to make the concatenation order
# (and therefore the later keep='first' de-duplication) reproducible across runs.
csv_files = sorted(file for file in os.listdir(directory_path) if file.endswith('.csv'))
dataframes = [pd.read_csv(os.path.join(directory_path, file)) for file in csv_files]

df = pd.concat(dataframes, ignore_index=True) # Unites all dataframes

In [None]:
# Preview the concatenated dataframe
df

In [None]:
# Number of missing values in each column of the raw dataframe
conteggio_none = df.isnull().sum(axis=0)
conteggio_none

In [None]:
# Keep only the records where every added geographic column is populated:
# a row is discarded as soon as one of these four columns is missing.
colonne_geografiche_aggiunte = ['stato', 'provincia', 'comune', 'indirizzo']
df_senza_na = df.dropna(axis=0, how='any', subset=colonne_geografiche_aggiunte)

df_senza_na

In [None]:
# Records per organization type ("Sezione") after removing incomplete rows
df_senza_na.value_counts(subset='Sezione')

In [None]:
# Drop the record(s) with "-" in the Sezione column (it would be a node without label).
# The rows are looked up in df_senza_na itself: taking the index from the
# unfiltered df and passing it to drop() raises KeyError whenever that row was
# already removed by the dropna step. Re-assigning (instead of inplace=True)
# also makes the cell safe to run more than once.
riga_no_sezione = df_senza_na.index[df_senza_na['Sezione'] == '-']
df_senza_na = df_senza_na.drop(index=riga_no_sezione)

df_senza_na

In [None]:
# Check: the "-" category should no longer appear among the Sezione counts
df_senza_na.value_counts(subset='Sezione')

In [None]:
# Work on an independent deep copy so later in-place edits leave df_senza_na untouched
df1 = df_senza_na.copy(deep=True)

In [None]:
# Preview the working copy
df1

In [None]:
# Rows whose geocoded province or municipality differs from the original
# register value are discarded: there is no easy way to tell which one is right.
provincia_diversa = df1['Provincia'].ne(df1['provincia'])
comune_diverso = df1['Comune'].ne(df1['comune'])

righe_da_eliminare = df1[provincia_diversa | comune_diverso]
df1.drop(index=righe_da_eliminare.index, inplace=True)


In [None]:
# Columns that are either not needed for the network or duplicated by the
# lower-case geographic columns
colonne_superflue = [
    'Provincia',
    'Comune',
    'Regione',
    'Repertorio',
    'Codice fiscale',
    'Data iscrizione',
    'Rete',
    '5x1000',
]
df1.drop(columns=colonne_superflue, inplace=True)

In [None]:
# Preview df1 after removing the unneeded columns
df1

In [None]:
# Short code for every "Sezione" value - used as node label in the network
mappatura = dict([
    ('ASSOCIAZIONI DI PROMOZIONE SOCIALE', 'APS'),
    ('ORGANIZZAZIONI DI VOLONTARIATO', 'OV'),
    ('IMPRESE SOCIALI', 'IS'),
    ('ALTRI ENTI DEL TERZO SETTORE', 'AE'),
    ('ENTI FILANTROPICI', 'EF'),
    ("SOCIETA' DI MUTUO SOCCORSO", 'SMS'),
])

# Derive the new "etichetta" column by looking up each Sezione in the mapping
sezione = df1['Sezione']
df1['etichetta'] = sezione.map(mappatura)

In [None]:
# Preview df1 with the new "etichetta" column
df1

<h3>Add latitude and longitude from the addresses</h3>

In [None]:
# Build the full address as "indirizzo,comune,provincia,stato"
parti_indirizzo = ['indirizzo', 'comune', 'provincia', 'stato']

indirizzo_completo = df1[parti_indirizzo[0]]
for colonna in parti_indirizzo[1:]:
    indirizzo_completo = indirizzo_completo + ',' + df1[colonna]

df1['indirizzo_completo'] = indirizzo_completo
df1

In [None]:
# Geocode every full address with ArcGIS and collect the coordinates.
# When a lookup yields no result, g.latlng is empty and indexing it fails with
# an opaque TypeError. Each address is therefore retried a few times (with a
# short, growing pause); if it still fails the loop stops with an explicit
# error that names the address, so lat/long never get out of step with df1.
MAX_TENTATIVI = 3

lat = []
long = []

for i in tqdm(df1['indirizzo_completo'], desc="Geocoding"):
    for tentativo in range(1, MAX_TENTATIVI + 1):
        g = geocoder.arcgis(i)
        coordinate = g.latlng
        if coordinate:
            break
        if tentativo < MAX_TENTATIVI:
            time.sleep(tentativo)  # wait a little longer before each new attempt
    else:
        # No attempt produced coordinates
        raise RuntimeError(f"Geocoding failed after {MAX_TENTATIVI} attempts for address: {i!r}")
    lat.append(coordinate[0])
    long.append(coordinate[1])

In [None]:
# Attach the geocoded coordinates to df1 (one value per row, in row order)
for nome_colonna, valori in (('Latitudine', lat), ('Longitudine', long)):
    df1[nome_colonna] = valori
df1

In [None]:
# Persist the full dataframe, coordinates included (row index not written)
nome_file_coordinate = 'dataframe_coordinate.csv'
df1.to_csv(nome_file_coordinate, index=False)

<h3>Visualization with Folium map</h3>

In [None]:
# Summary statistics of the numeric columns; the mean of the latitude and
# longitude is what is used to center the map
df1.describe()

In [None]:
# Base map centered on fixed (latitude, longitude) coordinates
centro_mappa = (45.532592, 9.474202)
mappa_lombardia = folium.Map(location=centro_mappa, zoom_start=9)

In [None]:
# Marker cluster layer that will hold all the points; add_to() returns the
# cluster itself, so it can be attached in a separate step
datapoints = plugins.MarkerCluster()
datapoints.add_to(mappa_lombardia)

In [None]:
# Loop through the dataframe and add each data point to the marker cluster,
# using the name of the organization as popup label.
# The loop variables deliberately avoid the name `lat`: it is used elsewhere in
# this notebook for the coordinate lists/arrays, and rebinding it here to a
# single float would corrupt a re-run of the cells that read it.
for latitudine, longitudine, denominazione in zip(df1.Latitudine, df1.Longitudine, df1.Denominazione):
    folium.Marker(
        location=[latitudine, longitudine],
        icon=None,
        popup=denominazione,
    ).add_to(datapoints)

In [None]:
# Render the map with the clustered markers
mappa_lombardia

In [None]:
# It's possible to see from the Folium map visualization that there are some points outside of the boundaries of the Lombardia region

In [None]:
# GeoJSON file with the boundaries of Lombardia
geojson_path = "Limiti amministrativi Regione Lombardia 2019 con aggiornamenti DbT_PGT.geojson"


def _stile_confini(x):
    """Style applied to every boundary feature: thin blue outline, light fill."""
    return {'color': 'blue', 'weight': 2, 'fillOpacity': 0.1}


# Overlay the boundaries on the map
confini_lombardia = folium.GeoJson(
    geojson_path,
    name='Confini Lombardia',
    style_function=_stile_confini,
)
confini_lombardia.add_to(mappa_lombardia)

# Layer control to switch the boundary overlay on and off
controllo_layer = folium.LayerControl()
controllo_layer.add_to(mappa_lombardia)

# Write the map to an .html file so it can be opened in a browser
mappa_lombardia.save("mappa_lombardia_con_confini.html")

I'll create a new column that tells whether each point (identified by lat, long) is inside or outside the border of Lombardia. <br>
The points that fall outside the border will be dropped, because they would produce wrong distances.

In [None]:
# Load the GeoJSON with the Lombardia boundaries into a GeoDataFrame
# (the geometries are used below for the point-in-region test)
geojson_path = "Limiti amministrativi Regione Lombardia 2019 con aggiornamenti DbT_PGT.geojson"
lombardia_gdf = gpd.read_file(geojson_path)

In [None]:
def _riga_a_punto(row):
    """Build the Point for one record; shapely expects (x, y) = (longitude, latitude)."""
    return Point(row['Longitudine'], row['Latitudine'])


# One "Point" geometry per record of the DataFrame
df1['geometry'] = df1.apply(_riga_a_punto, axis=1)

In [None]:
# Wrap df1 in a GeoDataFrame so the 'geometry' column supports spatial predicates.
# NOTE(review): no crs is passed, so the points carry no CRS; presumably they are
# plain lon/lat degrees matching the boundary file - confirm.
df1_gdf = gpd.GeoDataFrame(data=df1, geometry='geometry')

In [None]:
def _dentro_lombardia(punto):
    """Return True when at least one geometry of lombardia_gdf contains the point."""
    return lombardia_gdf.contains(punto).any()


# Flag every record: True if its point lies inside the boundaries, False otherwise
df1_gdf['inside_lombardia'] = df1_gdf['geometry'].apply(_dentro_lombardia)

In [None]:
# Preview the GeoDataFrame with the inside_lombardia flag
df1_gdf

In [None]:
# Keep only the records whose point falls inside the Lombardia boundaries
dentro = df1_gdf['inside_lombardia'].eq(True)
df2_gdf = df1_gdf[dentro]
df2_gdf

In [None]:
# Remove the geometry column to go back to a plain tabular dataframe
df2 = df2_gdf.drop(columns=['geometry'])
df2

In [None]:
# Visualize again the map to check if all the points are within the borders

# Create map centered in Lombardia
mappa_lombardia2 = folium.Map(location=(45.532592, 9.474202), zoom_start=9)

datapoints2 = plugins.MarkerCluster().add_to(mappa_lombardia2)

# Add points from df2, using the name of the organization as popup label.
# The loop variables deliberately avoid the name `lat`: it is used elsewhere in
# this notebook for the coordinate lists/arrays, and rebinding it here to a
# single float would corrupt a re-run of the cells that read it.
for latitudine, longitudine, denominazione in zip(df2.Latitudine, df2.Longitudine, df2.Denominazione):
    folium.Marker(
        location=[latitudine, longitudine],
        icon=None,
        popup=denominazione,
    ).add_to(datapoints2)

# Add borders
folium.GeoJson(
    lombardia_gdf.geometry,
    name="Confini Lombardia",
    style_function=lambda x: {'color': 'blue', 'weight': 2, 'fillOpacity': 0.1}
).add_to(mappa_lombardia2)


# Add control layer
folium.LayerControl().add_to(mappa_lombardia2)

# Save the map as .html so it can be opened in a browser
mappa_lombardia2.save("mappa_lombardia_con_confini_2.html")

In [None]:
# Preview the records that lie inside Lombardia
df2

In [None]:
# For each repeated "Denominazione" keep only its first occurrence
# (keep='first' is the pandas default)
df2_no_duplicates = df2.drop_duplicates(subset='Denominazione')

In [None]:
# Renumber the rows from 0 after all the filtering (old index discarded)
df2_no_duplicates = df2_no_duplicates.reset_index(drop=True)
df2_no_duplicates

# Final dataset has 16828 records (from the original 17687)

In [None]:
# Destination of the final, de-duplicated dataset
file_path = '/Users/dilettaferri/Desktop/UNIPI/SNA - Project/Project/df_finale_coordinate.csv'

# Save the csv (row index not written)
df2_no_duplicates.to_csv(path_or_buf=file_path, index=False)

<h3>Distances</h3>

In [None]:
# Reload the final dataset from the exact file written by the save step.
# A bare relative name would be resolved against the current working directory,
# which is the same file only when the notebook runs from the project folder.
df_finale = pd.read_csv('/Users/dilettaferri/Desktop/UNIPI/SNA - Project/Project/df_finale_coordinate.csv')
df_finale

In [None]:
# Function to compute the Haversine distance vectorized with NumPy
def haversine_vectorized(lat1, lon1, lat2, lon2):
    """Great-circle (Haversine) distance in kilometres between two sets of points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
        Inputs follow NumPy broadcasting, so a scalar point can be compared
        with whole arrays of coordinates.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) in km, computed on a sphere of radius 6371 km.
    """
    # Earth radius in km
    R = 6371.0

    # Convert coordinates from degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally above 1 (near-antipodal
    # points), which would make sqrt(1 - a) NaN; keep it inside [0, 1].
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Distance in km
    distance = R * c
    return distance


In [None]:
# Number of organizations and their coordinates as plain NumPy arrays
n = len(df_finale)
lat = df_finale['Latitudine'].to_numpy()
lon = df_finale['Longitudine'].to_numpy()

# n x n matrix of zeros, filled in with the pairwise distances afterwards
distanze = np.zeros(shape=(n, n))

In [None]:
# Fill the matrix one row at a time: for organization i only the distances to
# the organizations after it are computed, then mirrored below the diagonal.
for i in tqdm(range(n), desc="Calcolo distanze"):
    dopo_i = slice(i + 1, None)
    riga_parziale = haversine_vectorized(lat[i], lon[i], lat[dopo_i], lon[dopo_i])

    distanze[i, dopo_i] = riga_parziale
    distanze[dopo_i, i] = riga_parziale  # symmetric part of the matrix

# Wrap the NumPy matrix in a DataFrame labelled by organization name on both axes
etichette = df_finale['Denominazione']
df_distanze = pd.DataFrame(distanze, index=etichette, columns=etichette)

In [None]:
# Preview the pairwise distance matrix (km)
df_distanze

In [None]:
# Destination of the distance matrix
file_path = '/Users/dilettaferri/Desktop/UNIPI/SNA - Project/Project/df_distanze.csv'

# Save .csv; with index=False only the header row carries the organization
# names, and the rows follow the same order as the columns
df_distanze.to_csv(path_or_buf=file_path, index=False)