In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Lift pandas' display limits so frames are rendered without truncating
# rows or columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Directory that holds the processed datasets. The default is the path the
# notebook was originally written against; set the VALENCIA_DATA_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
DATA_DIR = Path(os.environ.get(
    'VALENCIA_DATA_DIR',
    '/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Processed',
))

# The CSV is decoded as latin1, as in the original load call.
valencia_data = pd.read_csv(DATA_DIR / 'valencia_data_feature_engineer.csv', encoding='latin1')


In [None]:
def distribution_checker(data, xlabel, additional_layers=None):
    """Summarise the failure rate within each group of ``data``.

    Rows are grouped by ``xlabel`` (plus any ``additional_layers``) and, for
    each group, the number of rows and the percentage of failed / surviving
    companies is reported.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``xlabel``, every column named in ``additional_layers``
        and a ``failure`` column where 1 marks a dissolved company and 0 a
        continuing one.
    xlabel : str
        Primary column to group by.
    additional_layers : list of str, optional
        Extra columns to group by, nested after ``xlabel``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the grouping column(s), with columns ``total`` (rows in
        the group), ``perc_failure`` and ``perc_no_failure`` (percentages
        rounded to one decimal place).
    """
    group_keys = [xlabel] + list(additional_layers or [])

    # Rows without a failure label cannot be attributed to either class.
    labelled = data[data['failure'].notna()]
    failed = labelled['failure'].eq(1).astype(int)

    # Rows are counted directly instead of through a helper column such as
    # 'company_name', so the function keeps working after that column has
    # been dropped and does not skip rows whose name is missing. Deriving
    # both classes from one 0/1 flag also means a dataset containing a single
    # class still yields both percentage columns.
    per_group = failed.groupby([labelled[key] for key in group_keys])
    total = per_group.size()
    failures = per_group.sum()

    summary = pd.DataFrame({'total': total})
    summary['perc_failure'] = ((failures / total) * 100).round(1)
    summary['perc_no_failure'] = (((total - failures) / total) * 100).round(1)

    return summary

In [None]:
def plot_distributions(data, continuous_columns, hue='failure', kind='kde', fill=True, height=6, aspect=1.5, palette='deep'):
    """Draw one distribution plot per column on a two-column grid of subplots.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column in ``continuous_columns`` and ``hue``.
    continuous_columns : iterable of str
        Columns to plot, one subplot each. Nothing is drawn when it is empty.
    hue : str
        Column used to split and colour each distribution.
    kind : {'kde', 'hist', 'ecdf'}
        Which seaborn axes-level plot to draw (``kdeplot``, ``histplot`` or
        ``ecdfplot``).
    fill : bool
        Passed to the plotting function; ignored for ``kind='ecdf'``, which
        is called without it.
    height : float
        Height of each subplot, in inches.
    aspect : float
        Width of each subplot as a multiple of ``height``.
    palette : str
        Seaborn palette used for the ``hue`` levels.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported values.
    """
    # Y-axis label for each supported plot kind; also the set of valid kinds.
    y_labels = {'kde': 'Density', 'hist': 'Count', 'ecdf': 'Proportion'}
    if kind not in y_labels:
        raise ValueError(f"kind must be one of {sorted(y_labels)}, got {kind!r}")

    continuous_columns = list(continuous_columns)
    if not continuous_columns:
        # plt.subplots cannot build a grid with zero rows; nothing to draw.
        return

    draw = getattr(sns, f'{kind}plot')
    extra_kwargs = {} if kind == 'ecdf' else {'fill': fill}

    num_cols = 2  # Number of columns in the subplot grid
    num_rows = -(-len(continuous_columns) // num_cols)  # Ceiling division

    fig, axes = plt.subplots(
        num_rows,
        num_cols,
        figsize=(num_cols * height * aspect, num_rows * height),
        constrained_layout=True,
    )

    # Flatten the 2-D grid so subplots can be addressed by position.
    axes = axes.ravel()

    for ax, column in zip(axes, continuous_columns):
        draw(data=data, x=column, hue=hue, palette=palette, ax=ax, **extra_kwargs)
        ax.set_title(f'Distribution of {column} by {hue}')
        ax.set_xlabel(column)
        ax.set_ylabel(y_labels[kind])

    # Hide the trailing subplot left over when the column count is odd.
    for unused_ax in axes[len(continuous_columns):]:
        unused_ax.axis('off')

    plt.show()

Urbano

In [None]:
# Failure / continuation shares for each value of the 'urbano' column.
distribution_checker(data=valencia_data, xlabel='urbano')

There is no significant difference between our target classes for `urbano`.

Competitors

In [None]:
# KDE of the number of competitors within 500 m, one curve per failure class.
sns.displot(
    data=valencia_data, x='competitors_500m', hue='failure',
    kind='kde', fill=True, palette='deep',
    height=6, aspect=1.5,
)

# displot leaves its single facet as the current axes, so label it there.
plt.gca().set(
    title='Distribution of Competitors 500m Radius Disolution and Continuation',
    xlabel='Competitors within 500m',
    ylabel='Density',
)

plt.show()

In [None]:
# KDE of the number of competitors within 1000 m, one curve per failure class.
sns.displot(
    data=valencia_data, x='competitors_1000m', hue='failure',
    kind='kde', fill=True, palette='deep',
    height=6, aspect=1.5,
)

# displot leaves its single facet as the current axes, so label it there.
plt.gca().set(
    title='Distribution of Competitors 1000m Radius Disolution and Continuation',
    xlabel='Competitors within 1000m',
    ylabel='Density',
)

plt.show()

In [None]:
# KDE of the number of competitors within 3000 m, one curve per failure class.
sns.displot(
    data=valencia_data, x='competitors_3000m', hue='failure',
    kind='kde', fill=True, palette='deep',
    height=6, aspect=1.5,
)

# displot leaves its single facet as the current axes, so label it there.
plt.gca().set(
    title='Distribution of Competitors 3000m Radius Disolution and Continuation',
    xlabel='Competitors within 3000m',
    ylabel='Density',
)

plt.show()

There is no apparent significant difference in failure rates for any of the competitor indicators.

Sector Unemployment Ratios

In [None]:
# One ratio column per sector letter A..U, followed by the two aggregate ratios.
sector_columns = [
    *("Sector {} Ratio".format(chr(code_point)) for code_point in range(ord('A'), ord('U') + 1)),
    'Sin Actividad Ratio',
    'Total Ratio',
]
plot_distributions(valencia_data, sector_columns, hue='failure')

Unfortunately, there don't appear to be any significant differences in failure based on the distribution of the ratio of unemployed individuals per sector per municipio over the population of the municipio.

Business Density Sector Ratios

In [None]:
# One business-density column per sector letter A..U.
density_columns = [
    "sector_{}_density".format(chr(code_point))
    for code_point in range(ord('A'), ord('U') + 1)
]
plot_distributions(valencia_data, density_columns, hue='failure')

It is difficult to see whether there is any significant difference between classes based on the distributions of business density.

Multicollinearity

In [None]:
# Feature matrix for the multicollinearity check: every numeric column except
# the targets. errors='ignore' tolerates either target being absent.
numeric_columns = valencia_data.select_dtypes(include=[np.number]).columns
X = valencia_data[numeric_columns].drop(columns=['duration', 'failure'], errors='ignore')

# variance_inflation_factor regresses each column on the others exactly as
# given, so the intercept has to be part of the matrix.
X = sm.add_constant(X)

# Convert to an array once; calling X.values inside the loop would rebuild
# the array for every feature.
design_matrix = X.values
vif = pd.DataFrame({
    'Feature': X.columns,
    'VIF': [
        variance_inflation_factor(design_matrix, column_idx)
        for column_idx in range(design_matrix.shape[1])
    ],
})

# Last expression of the cell: rendered with the notebook's rich table display.
vif

In [None]:
# First 100 rows of the VIF table.
vif.iloc[:100]

We have some extremely high VIF values. We will likely remove some of these columns with high multicollinearity further on.

In [None]:
# Rebind instead of mutating in place, and tolerate the column already being
# gone, so re-running this cell does not raise a KeyError.
valencia_data = valencia_data.drop(columns=['company_name'], errors='ignore')