In [14]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
from sklearn.neighbors import BallTree

Load Data

In [2]:
def load_data(file_path, encoding='latin1'):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    encoding : str, default 'latin1'
        Text encoding used to decode the file.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(filepath_or_buffer=file_path, encoding=encoding)
    return frame

Add 'urbano' Column

In [3]:
def add_urbano_column(df, population_column='Poblacion', threshold=50000):
    """Flag rows whose population is strictly above ``threshold``.

    A new integer column ``'urbano'`` is written onto ``df`` itself
    (the input frame is modified in place): 1 where the value in
    ``population_column`` exceeds ``threshold``, 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``population_column``.
    population_column : str, default 'Poblacion'
        Name of the column compared against ``threshold``.
    threshold : number, default 50000
        Strict lower bound for a row to be flagged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``'urbano'`` column.
    """
    is_above_threshold = df[population_column].gt(threshold)
    df['urbano'] = is_above_threshold.astype(int)
    return df


Haversine Function (Vectorized Great-Circle Distance in Meters)

In [4]:
def haversine(lat1, lon1, lat2, lon2, earth_radius_m=6371000.0):
    """Great-circle distance in meters between two points given in degrees.

    Works element-wise on scalars, NumPy arrays or pandas Series, because
    every operation is a NumPy ufunc.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
    earth_radius_m : float, default 6371000.0
        Sphere radius in meters. The default matches the radius used to
        convert the search radius in ``calculate_competitors`` so both
        distance computations in this notebook agree.

    Returns
    -------
    float or array-like
        Distance(s) in meters.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal or identical points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    central_angle = 2 * np.arcsin(np.sqrt(a))
    return earth_radius_m * central_angle


Calculate Competitors

In [5]:
def calculate_competitors(df, radius, feature_columns=None):
    """Count, for every row, the nearby rows that share the same feature values.

    A row ``j`` is counted as a competitor of row ``i`` when ``j != i``,
    ``j`` lies within ``radius`` of ``i`` (great-circle distance, computed by
    a ``BallTree`` with the haversine metric), and every compared feature
    value of ``j`` equals that of ``i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'latitude'`` and ``'longitude'`` columns; they are
        converted with ``np.radians``, so they are expected in degrees.
    radius : float
        Search radius. It is divided by 6371000 (Earth radius in meters) to
        obtain the angular radius, so it is expected in meters.
    feature_columns : sequence of str, optional
        Column names compared for equality. When omitted, the columns at
        positions 34 through 54 (``df.iloc[:, 34:55]``) are used, which is
        the original behaviour.
        NOTE(review): the positional default silently depends on the column
        layout of the input file - confirm it selects the intended columns.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``len(df)`` with the competitor count per row.
    """
    competitors_count = np.zeros(len(df), dtype=int)
    if len(df) == 0:
        # BallTree cannot be built from zero samples; nothing to count anyway.
        return competitors_count

    # The haversine metric expects [latitude, longitude] pairs in radians.
    coords = np.radians(df[['latitude', 'longitude']].values)
    tree = BallTree(coords, metric='haversine')
    # Convert the radius from meters to radians on the unit sphere.
    indices = tree.query_radius(coords, r=radius / 6371000)

    if feature_columns is None:
        features = df.iloc[:, 34:55].values
    else:
        features = df[list(feature_columns)].values

    for i, neighbors in enumerate(indices):
        # query_radius returns the query point itself; drop it.
        valid_neighbors = neighbors[neighbors != i]
        matches = np.all(features[i] == features[valid_neighbors], axis=1)
        competitors_count[i] = np.sum(matches)

    return competitors_count


Add Competitors Columns

In [6]:
def add_competitors_columns(df, radii):
    """Attach one competitor-count column per radius to ``df``.

    For each value in ``radii`` the counts returned by
    ``calculate_competitors`` are stored in a column named
    ``competitors_<radius>m``. The columns are written onto ``df`` itself,
    in the order the radii are given.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed to ``calculate_competitors``.
    radii : iterable
        Radii forwarded one at a time to ``calculate_competitors``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    for meters in radii:
        column_name = f'competitors_{meters}m'
        counts = calculate_competitors(df, meters)
        df[column_name] = counts
    return df


Calculate Sector Ratios

In [7]:
def calculate_sector_ratios(df):
    """Replace the per-sector count columns with their ratio to population.

    For ``'Sector A'`` through ``'Sector U'`` plus ``'Sin Actividad'`` and
    ``'Total'``, a ``'<name> Ratio'`` column is added holding the column
    divided by ``'Poblacion'``; the source columns are then dropped. All
    changes are made on ``df`` in place. No guard against a zero population
    is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'Poblacion'`` and every column listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after the replacement.
    """
    first_code, last_code = ord('A'), ord('U')
    sector_columns = []
    for code in range(first_code, last_code + 1):
        sector_columns.append(f"Sector {chr(code)}")
    sector_columns.extend(['Sin Actividad', 'Total'])

    for name in sector_columns:
        ratio_name = name + ' Ratio'
        df[ratio_name] = df[name].div(df['Poblacion'])

    # Only the ratio columns are kept; the source columns are removed.
    df.drop(columns=sector_columns, inplace=True)
    return df


Update Unemployment Metrics

In [8]:
def update_unemployment_metrics(df):
    """Mask each sector ratio with the row's matching sector indicator.

    For every letter A through U, ``'Sector <letter> Ratio'`` is overwritten
    with ``'sector_<letter>'`` multiplied by its current value. The frame is
    modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain both ``'sector_<letter>'`` and
        ``'Sector <letter> Ratio'`` for every letter A through U.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ratio columns updated.
    """
    for letter in 'ABCDEFGHIJKLMNOPQRSTU':
        indicator_name = f'sector_{letter}'
        ratio_name = f'Sector {letter} Ratio'
        df[ratio_name] = df[indicator_name] * df[ratio_name]
    return df


Calculate Business Density

In [9]:
def business_density(df):
    """Add per-municipality density columns for every ``sector_`` column.

    Every column whose name contains ``'sector_'`` (except
    ``'sector_count'``) is summed per ``'municipio'`` and divided by that
    municipality's mean ``'Poblacion'``. The results are named
    ``'<column>_density'`` and joined back onto the rows with a left merge
    on ``'municipio'``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'municipio'`` and ``'Poblacion'``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus one ``*_density`` column per sector column.

    Raises
    ------
    ValueError
        If ``'municipio'`` or ``'Poblacion'`` is missing from ``df``.
    """
    required_columns = ('municipio', 'Poblacion')
    if any(name not in df.columns for name in required_columns):
        raise ValueError("DataFrame must contain 'municipio' and 'Poblacion' columns.")

    sector_columns = []
    for name in df.columns:
        if 'sector_' in name and name != 'sector_count':
            sector_columns.append(name)

    by_municipio = df.groupby('municipio')
    totals_per_municipio = by_municipio[sector_columns].sum()
    mean_population = by_municipio['Poblacion'].mean()

    # Row-wise division: each municipality's totals over its own population.
    density = totals_per_municipio.div(mean_population, axis=0)
    density.columns = [f'{name}_density' for name in density.columns]

    return df.merge(density, on='municipio', how='left')


Update Sector Density Metrics

In [10]:
def update_sector_density_metrics(df):
    """Mask each sector density with the row's matching sector indicator.

    For every letter A through U, ``'sector_<letter>_density'`` is
    overwritten with ``'sector_<letter>'`` multiplied by its current value.
    The frame is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain both ``'sector_<letter>'`` and
        ``'sector_<letter>_density'`` for every letter A through U.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the density columns updated.
    """
    for letter in 'ABCDEFGHIJKLMNOPQRSTU':
        indicator_name = f'sector_{letter}'
        density_name = f'sector_{letter}_density'
        df[density_name] = df[indicator_name] * df[density_name]
    return df


Save Data

In [11]:
def save_data(df, file_path, encoding='latin1'):
    """Write ``df`` to a CSV file without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise.
    file_path : str or path-like
        Destination passed to ``DataFrame.to_csv``.
    encoding : str, default 'latin1'
        Text encoding of the written file.

    Returns
    -------
    None
    """
    df.to_csv(path_or_buf=file_path, encoding=encoding, index=False)
    

Main Script

In [12]:
def process_valencia_data(file_path_input, file_path_output, radii=(500, 1000, 3000)):
    """Run the full feature-engineering pipeline and write the result to disk.

    The steps are executed in a fixed order because later steps read columns
    that earlier steps create (for example the ``'Sector <letter> Ratio'``
    columns and the ``'sector_<letter>_density'`` columns).

    Parameters
    ----------
    file_path_input : str or path-like
        CSV file read by ``load_data``.
    file_path_output : str or path-like
        CSV file written by ``save_data``.
    radii : iterable of numbers, default (500, 1000, 3000)
        Search radii, in meters, for which competitor-count columns are
        added. The default reproduces the values previously hard-coded here.

    Returns
    -------
    None
        The processed frame is written to ``file_path_output``.
    """
    # Step 1: Load data
    valencia_data = load_data(file_path_input)

    # Step 2: Add 'urbano' column
    valencia_data = add_urbano_column(valencia_data)

    # Step 3: Calculate competitors within the requested radii (meters)
    valencia_data = add_competitors_columns(valencia_data, radii)

    # Step 4: Calculate sector ratios
    valencia_data = calculate_sector_ratios(valencia_data)

    # Step 5: Update unemployment metrics
    valencia_data = update_unemployment_metrics(valencia_data)

    # Step 6: Calculate business density
    valencia_data_dense = business_density(valencia_data)

    # Step 7: Update sector density metrics
    valencia_data_dense = update_sector_density_metrics(valencia_data_dense)

    # Step 8: Save the cleaned data
    save_data(valencia_data_dense, file_path_output)


In [15]:
# Source and destination CSV files for this run. These are absolute,
# machine-specific paths; adjust them before running on another machine.
file_path_input = '/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Processed/valencia_data_cleaned.csv'
file_path_output = '/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Processed/valencia_data_feature_engineer.csv'

# Execute the whole pipeline: read the input CSV, engineer the features,
# and write the result to the output CSV.
process_valencia_data(
    file_path_input=file_path_input,
    file_path_output=file_path_output,
)