In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_data(file_path, encoding='latin1'):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_path : path or file-like object
        Location of the CSV; forwarded unchanged to ``pandas.read_csv``.
    encoding : str, default 'latin1'
        Text encoding used to decode the file.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with every other ``read_csv`` option left at its
        default.
    """
    frame = pd.read_csv(filepath_or_buffer=file_path, encoding=encoding)
    return frame

In [3]:
def clean_capital_column(df, column_name='capital'):
    """Convert a European-formatted amount column to float, in place.

    Text values such as ``'1.234,56'`` use ``.`` as the thousands separator
    and ``,`` as the decimal separator. The thousands separators are removed,
    the decimal comma becomes a dot, and the column is cast to float.

    If the column already has a numeric dtype it is only cast to float. This
    keeps the function safe to re-run on the same frame (it overwrites the
    column on ``df`` itself) and avoids the ``AttributeError`` that the
    ``.str`` accessor raises on non-string data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str, default 'capital'
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column_name`` stored as float.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    ValueError
        If a text value cannot be parsed as a float after the separators
        have been normalised.
    """
    column = df[column_name]

    # Already numeric: nothing to parse. Stripping '.' from the text form of
    # a float would silently corrupt it, so just normalise the dtype.
    if pd.api.types.is_numeric_dtype(column):
        df[column_name] = column.astype(float)
        return df

    df[column_name] = (
        column
        .str.replace('.', '', regex=False)   # drop the thousands separator
        .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
        .astype(float)
    )
    return df

In [4]:
def drop_unnecessary_columns(df, columns_to_drop):
    """Return a new frame without the listed columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    columns_to_drop : label or list of labels
        Column names to remove. Every label must exist in ``df``; pandas
        raises ``KeyError`` otherwise.

    Returns
    -------
    pandas.DataFrame
        A frame containing the remaining columns.
    """
    trimmed = df.drop(labels=columns_to_drop, axis='columns')
    return trimmed

In [5]:
def filter_data(df, column_name='duration', min_value=0):
    """Keep only the rows whose ``column_name`` value is at least ``min_value``.

    Rows where the value is missing (NaN) are dropped as well, because the
    ``>=`` comparison evaluates to False for NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    column_name : str, default 'duration'
        Column the threshold is applied to.
    min_value : scalar, default 0
        Inclusive lower bound.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, keeping the original index
        labels.
    """
    keep = df[column_name] >= min_value
    # Return an explicit copy: the result is an ordinary frame that callers
    # can add or overwrite columns on without pandas emitting
    # SettingWithCopyWarning for writing to a slice of ``df``.
    return df[keep].copy()

In [6]:
def apply_log_transform(df, column_name, new_column_name=None, add_constant=False):
    """Store the natural log of a column in ``df`` and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source column; the result column is written onto
        this same object.
    column_name : str
        Column to transform.
    new_column_name : str, optional
        Name of the column that receives the result. When omitted it defaults
        to ``"<column_name>_log"`` rather than creating a column whose label
        is ``None``.
    add_constant : bool, default False
        If True use ``np.log1p`` (``log(1 + x)``), which is defined at zero;
        otherwise use ``np.log``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the result column added or overwritten.

    Notes
    -----
    With ``add_constant=False`` NumPy yields ``-inf`` for zeros and NaN for
    negative values instead of raising.
    """
    if new_column_name is None:
        new_column_name = f"{column_name}_log"

    transform = np.log1p if add_constant else np.log
    df[new_column_name] = transform(df[column_name])
    return df

In [7]:
def cap_upper_outliers(df, column_name, upper_quantile=0.99):
    """Cap a column at one of its own quantiles, in place.

    Every value above the ``upper_quantile`` quantile of the column is
    replaced by that quantile; values at or below it are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; the column is overwritten on this object.
    column_name : str
        Column to cap.
    upper_quantile : float, default 0.99
        Quantile, passed to ``Series.quantile``, that defines the cap.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the capped column.
    """
    values = df[column_name]
    ceiling = values.quantile(upper_quantile)
    # Element-wise minimum against the scalar ceiling performs the capping.
    capped = np.minimum(values, ceiling)
    df[column_name] = capped
    return df

In [8]:
def save_data(df, file_path, encoding='latin1'):
    """Write a DataFrame to CSV without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    file_path : path or file-like object
        Destination, forwarded unchanged to ``DataFrame.to_csv``.
    encoding : str, default 'latin1'
        Text encoding of the output file.

    Returns
    -------
    None
    """
    df.to_csv(path_or_buf=file_path, encoding=encoding, index=False)

In [9]:
def process_valencia_data(file_path_input, file_path_output):
    """Run the Valencia cleaning pipeline from an input CSV to an output CSV.

    The steps, in order:

    1. Load the CSV with ``load_data``.
    2. Parse the 'capital' column to float with ``clean_capital_column``.
    3. Drop the columns listed in ``dropped_columns``.
    4. Keep only rows with 'duration' >= 0.
    5. Add 'capital_log' as ``log(1 + capital)``.
    6. Cap 'capital_log' at the default upper quantile of
       ``cap_upper_outliers``.
    7. Add 'Poblacion_log' as the plain natural log of 'Poblacion'.
    8. Write the result with ``save_data``.

    Parameters
    ----------
    file_path_input : str
        Path of the CSV to read.
    file_path_output : str
        Path the cleaned CSV is written to.

    Returns
    -------
    None
    """
    dropped_columns = [
        'address',
        'cleaned_address',
        'date',
        'distance_to_municipio',
        'objeto_social',
        'registration_number',
    ]

    cleaned = (
        load_data(file_path_input)                                    # step 1
        .pipe(clean_capital_column, 'capital')                        # step 2
        .pipe(drop_unnecessary_columns, dropped_columns)              # step 3
        .pipe(filter_data, 'duration', 0)                             # step 4
        .pipe(apply_log_transform, 'capital', 'capital_log',
              add_constant=True)                                      # step 5
        .pipe(cap_upper_outliers, 'capital_log')                      # step 6
        .pipe(apply_log_transform, 'Poblacion', 'Poblacion_log')      # step 7
    )

    save_data(cleaned, file_path_output)                              # step 8

In [10]:
# Input and output CSV locations for the pipeline run below.
# NOTE(review): both are absolute, machine-specific paths; consider deriving
# them from a configurable base directory so the notebook runs elsewhere.
file_path_input = '/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Processed/valencia_data_final_survival.csv'
file_path_output = '/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Processed/valencia_data_cleaned.csv'

# Read the input CSV, clean it, and write the result to the output path.
process_valencia_data(file_path_input, file_path_output)