In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the cleaned addition/alteration building-permits CSV into `df`
# and show its first row as a quick sanity check of the columns.
permits_csv_path = "../data/clean/building_permits_addition_alteration_clean.csv"
df = pd.read_csv(permits_csv_path)
display(df.iloc[:1])

## Numerical Data Analysis

In [None]:
# Keep only the numeric columns; the correlation and cost cells below read from this frame.
df_numerical = df.select_dtypes(include="number")
# Preview a handful of rows instead of rendering the entire frame.
df_numerical.head()

In [None]:
# Absolute pairwise correlations between the numeric columns.
# The matrix is symmetric (corr(x, y) == corr(y, x)) and corr(x, x) == 1.
corr = df_numerical.corr().abs()

# Mask the upper triangle, diagonal included, so each pair is drawn only once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Create the axes explicitly and hand it to seaborn so the heatmap is
# guaranteed to land on this 20x20 figure.
f, ax = plt.subplots(figsize=(20, 20))

# The diverging palette that used to be built here was never passed to the
# heatmap, so it has been dropped; the seaborn default colormap is used,
# exactly as before.
sns.heatmap(
    corr,
    mask=mask,
    vmax=1,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,  # annotate each visible cell with its correlation value
    ax=ax,
)

plt.show()

### Costs

In [None]:
# Names of the numeric columns that mention "cost" (case-insensitive match).
cost_columns = list(filter(lambda name: 'cost' in name.lower(), df_numerical.columns))
display(cost_columns)

#### Costs distribution for the different subsystems

In [None]:
# One log-scaled histogram (with KDE) per cost column.
# The grid keeps three panels per row and at least three rows; extra rows (and
# proportional figure height) are added when there are more than nine cost
# columns, so plt.subplot never receives an out-of-range panel index.
hist_cols = 3
hist_rows = max(3, -(-len(cost_columns) // hist_cols))  # ceiling division

plt.figure(figsize=(15, 10 * hist_rows / 3))
for i, col in enumerate(cost_columns):
    plt.subplot(hist_rows, hist_cols, i + 1)
    amounts = df[col].dropna()
    # A log axis cannot represent zero or negative amounts, so only strictly
    # positive values are drawn. Log scale is used because costs are skewed.
    sns.histplot(amounts[amounts > 0], kde=True, log_scale=True)
    plt.title(f'Distribution of {col}')
# Fix the spacing once, after every panel exists, rather than on each iteration.
plt.tight_layout()
plt.show()

#### Comparison of costs across subsystems

In [None]:
# Reshape the cost columns to long form: one row per amount, labelled with the
# cost column it came from, so a single boxplot call can compare them all.
cost_data = pd.melt(df[cost_columns], var_name='Cost Type', value_name='Amount')

plt.figure(figsize=(14, 8))
cost_box_ax = sns.boxplot(data=cost_data, x='Cost Type', y='Amount')
cost_box_ax.set_yscale('log')  # log axis keeps the wide range of amounts readable
cost_box_ax.set_title('Cost Comparison Across Different Subsystems')
plt.setp(cost_box_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

#### Correlation matrix for costs

In [None]:
# Absolute pairwise correlations between the cost columns only.
# The matrix is symmetric (corr(x, y) == corr(y, x)) and corr(x, x) == 1.
corr = df[cost_columns].corr().abs()

# Mask the upper triangle, diagonal included, so each pair is drawn only once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Create the axes explicitly and hand it to seaborn so the heatmap is
# guaranteed to land on this 20x20 figure.
f, ax = plt.subplots(figsize=(20, 20))

# The diverging palette that used to be built here was never passed to the
# heatmap, so it has been dropped; the seaborn default colormap is used,
# exactly as before.
sns.heatmap(
    corr,
    mask=mask,
    vmax=1,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,  # annotate each visible cell with its correlation value
    ax=ax,
)

plt.show()

#### Comparison of Building Cost vs Total Cost

In [None]:
# Building cost against calculated total cost, one point per permit,
# with both axes on a log scale.
plt.figure(figsize=(10, 6))
scatter_ax = plt.gca()
scatter_ax.scatter(df['building_cost'], df['calc_total_cost'], alpha=0.5)
scatter_ax.set_xscale('log')
scatter_ax.set_yscale('log')
scatter_ax.set_xlabel('Building Cost ($)')
scatter_ax.set_ylabel('Total Cost ($)')
scatter_ax.set_title('Building Cost vs Total Cost')
plt.tight_layout()
plt.show()

#### Test Pairplot Building Cost vs Total Cost

In [None]:
# Pairwise view of total cost and building cost, coloured by permit status.
# No log scaling is applied to these panels.
pairplot_columns = ["calc_total_cost", "building_cost", "status"]
sns.pairplot(df[pairplot_columns], hue='status')
plt.show()

#### Total Cost by Number of Units

In [None]:
# Keep only permits with at most 10 units so the boxplot stays readable.
units_filter = df.loc[df['number_of_units'].le(10)]

plt.figure(figsize=(12, 6))
units_ax = sns.boxplot(data=units_filter, x='number_of_units', y='calc_total_cost')
units_ax.set_title('Total Cost by Number of Units')
units_ax.set_yscale('log')
plt.tight_layout()
plt.show()

#### Average costs by year

In [None]:
# Mean of every cost column per issue year.
yearly_costs = df.groupby('issue_year')[cost_columns].mean()

# DataFrame.plot opens a figure of its own unless it is given an axes, which
# would leave a separately created 16x8 figure blank. Build the axes first and
# pass it in so the requested size is actually used.
yearly_fig, yearly_ax = plt.subplots(figsize=(16, 8))
yearly_costs.plot(marker='o', ax=yearly_ax)
yearly_ax.set_title('Average Costs by Year')
yearly_ax.set_ylabel('Cost ($)')
yearly_ax.grid(True)
# Anchor the legend's upper-left corner at the right edge of the axes, mid-height.
yearly_ax.legend(loc='upper left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

#### Number of Permits by Year

In [None]:
# Number of permit rows per issue year.
permits_by_year = df.groupby('issue_year').size()

plt.figure(figsize=(10, 6))
permits_ax = permits_by_year.plot.bar()
permits_ax.set_title('Number of Permits by Year')
permits_ax.set_xlabel('Year')
permits_ax.set_ylabel('Count')
permits_ax.grid(True, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Quick look at the working frame; show only the first rows instead of the whole table.
df.head()

#### Monthly average costs

In [None]:

# Mean calculated total cost for each value of `issue_month`.
monthly_costs = df.groupby('issue_month')['calc_total_cost'].mean()

plt.figure(figsize=(14, 6))
monthly_costs.plot(kind='line', marker='o')
plt.title('Average Total Cost by Month')
plt.xlabel('Year-Month')
plt.ylabel('Average Total Cost ($)')
plt.grid(True)
plt.tight_layout()
# Render explicitly, as every other plotting cell in this notebook does.
plt.show()


#### Total Cost by Permit Status

In [None]:
# Distribution of total cost for each permit status, on a log cost axis.
plt.figure(figsize=(12, 6))
status_box_ax = sns.boxplot(data=df, x='status', y='calc_total_cost')
status_box_ax.set_title('Total Cost by Permit Status')
status_box_ax.set_yscale('log')
plt.setp(status_box_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

## Categorical Data Analysis

In [None]:
# For every non-numeric column, print how many distinct values it has and show them.
columns = df.select_dtypes(exclude="number").columns

for column in columns:
    # Skip boolean flag columns by dtype. The earlier check compared only the
    # first row's value to False, so a boolean column whose first value was
    # True was still listed, and the label lookup failed on a frame without
    # index label 0.
    # NOTE(review): skipping boolean columns is the presumed intent - confirm.
    if pd.api.types.is_bool_dtype(df[column]):
        continue
    print("Column:", column, "| Unique Values:", df[column].nunique())
    display(df[column].unique())
    print("-------------------------")

### status

In [None]:
# Bar chart of how many permits fall under each status value.
status_counts = df['status'].value_counts()

plt.figure(figsize=(10, 6))
status_bar_ax = status_counts.plot.bar()
status_bar_ax.set_title('Distribution of Permit Status')
status_bar_ax.set_xlabel('Status')
status_bar_ax.set_ylabel('Count')
plt.setp(status_bar_ax.get_xticklabels(), rotation=45)
status_bar_ax.grid(True, axis='y')
plt.tight_layout()
plt.show()

### building_use

In [None]:
# Per building use: number of permits (non-null record numbers) and the mean
# calculated total cost.
# NOTE(review): `total_cost_avg` previously held the max of `building_cost`,
# which contradicted its name; it now matches the name - confirm this is the
# statistic that was intended.
(
    df.groupby(["building_use"])
    .agg(n=("record_number", "count"), total_cost_avg=("calc_total_cost", "mean"))
)



In [None]:
# Non-null count of every column, grouped by current property use.
(
    df
    .groupby(["current_property_use"])
    .count()
)

In [None]:
# Non-null count of every column for each (current property use, building use)
# pair, ordered by current property use in descending order.
(
    df
    .groupby(["current_property_use", "building_use"])
    .count()
    .sort_values(by='current_property_use', ascending=False)
)

#### Average Total Cost by Building Use

In [None]:
# Mean calculated total cost per building use, largest first.
mean_cost_per_use = df.groupby('building_use')['calc_total_cost'].mean()
avg_cost_by_use = mean_cost_per_use.sort_values(ascending=False)

plt.figure(figsize=(12, 6))
use_ax = avg_cost_by_use.plot.bar()
use_ax.set_title('Average Total Cost by Building Use')
use_ax.set_xlabel('Building Use')
use_ax.set_ylabel('Average Total Cost ($)')
plt.setp(use_ax.get_xticklabels(), rotation=45, ha='right')
use_ax.grid(True, axis='y')
plt.tight_layout()
plt.show()

### Boolean cases

#### Percentage of True values for Boolean features

In [None]:
# Columns stored with a boolean dtype.
boolean_columns = df.select_dtypes(include=['bool']).columns

# The mean of a boolean column is its share of True values; scale it to percent.
bool_percentages = df[boolean_columns].mean().mul(100)

plt.figure(figsize=(12, 8))
pct_ax = bool_percentages.sort_values().plot.barh()
pct_ax.set_title('Percentage of True Values in Boolean Features')
pct_ax.set_xlabel('Percentage (%)')
pct_ax.grid(True, axis='x')
plt.tight_layout()
plt.show()

#### Total Cost by Boolean feature value

In [None]:
# Boxplots of total cost (log axis) split by the value of each boolean feature.
# The grid keeps four panels per row and at least three rows; extra rows (and
# proportional figure height) are added when there are more than twelve boolean
# columns, so plt.subplot never receives an out-of-range panel index.
grid_cols = 4
grid_rows = max(3, -(-len(boolean_columns) // grid_cols))  # ceiling division

plt.figure(figsize=(15, 10 * grid_rows / 3))
for i, col in enumerate(boolean_columns):
    plt.subplot(grid_rows, grid_cols, i + 1)
    sns.boxplot(x=col, y='calc_total_cost', data=df)
    plt.title(f'{col}')
    plt.yscale('log')
plt.tight_layout()
plt.show()

### building_construction_type

In [None]:
# Construction types in order of first appearance in the data. Missing values
# are dropped first: value_counts() ignores them by default, so keeping NaN
# here would add a category that has no count when this order is reused.
custom_order = df['building_construction_type'].dropna().unique()

In [None]:
# Permits per construction type, then rearranged to follow `custom_order`.
type_counts = df['building_construction_type'].value_counts()
ordered_type_counts = type_counts.reindex(custom_order)

plt.figure(figsize=(12, 6))
type_ax = ordered_type_counts.plot.bar()
type_ax.set_title('Building Construction Types')
type_ax.set_xlabel('Construction Type')
type_ax.set_ylabel('Count')
plt.setp(type_ax.get_xticklabels(), rotation=0, ha='center')
type_ax.grid(True, axis='y')
plt.tight_layout()
plt.show()

#### Total Cost by Building Construction Type

In [None]:
# Keep only rows whose construction type appears in the `type_counts` index,
# then draw total cost (log axis) per type in the `custom_order` sequence.
has_counted_type = df['building_construction_type'].isin(type_counts.index)
rows_with_counted_type = df[has_counted_type]

plt.figure(figsize=(12, 6))
type_box_ax = sns.boxplot(
    data=rows_with_counted_type,
    x='building_construction_type',
    y='calc_total_cost',
    order=custom_order,
)
type_box_ax.set_title('Total Cost by Building Construction Type')
type_box_ax.set_xlabel('Construction Type')
type_box_ax.set_ylabel('Total Cost ($)')
type_box_ax.set_yscale('log')
plt.setp(type_box_ax.get_xticklabels(), rotation=0, ha='center')
plt.tight_layout()
plt.show()