## CALIFORNIA

In [16]:
from scipy.stats import f_oneway
import pandas as pd

df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4) or (year in [2021, 2022] and month <= 9):
        return 'COVID'
    elif (year == 2022 and month >= 10) or (year == 2023):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)


# Filter data for California
california_data = df[df['State'] == 'California']

# Define the selected utility providers
selected_providers = ['Pacific Gas & Electric Co', 'San Diego Gas & Electric Co', 'Southern California Edison Co',
                     'Southern California Gas Company']  # Replace with actual provider names
california_data = california_data[california_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []

# Perform ANOVA for each utility provider
for provider in california_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = california_data[california_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform ANOVA if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        f_stat, p_value = f_oneway(pre_covid_rates, covid_rates, post_covid_rates)
        provider_results.append({
            'Utility Name': provider,
            'F-statistic': f_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for selected providers
print("\nANOVA Results for Selected Providers in California:")
print(provider_results_df)



ANOVA Results for Selected Providers in California:
                      Utility Name  F-statistic       P-value  \
0        Pacific Gas & Electric Co    91.375288  9.654610e-20   
1      San Diego Gas & Electric Co   717.410262  4.993523e-45   
2    Southern California Edison Co   212.158038  1.816509e-29   
3  Southern California Gas Company   223.561691  1.047450e-27   

  Significant Difference  
0                    Yes  
1                    Yes  
2                    Yes  
3                    Yes  


In [17]:
from scipy.stats import kruskal
import pandas as pd

df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4) or (year in [2021, 2022] and month <= 9):
        return 'COVID'
    elif (year == 2022 and month >= 10) or (year == 2023):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)


# Filter data for California
california_data = df[df['State'] == 'California']

# Define the selected utility providers
selected_providers = ['Pacific Gas & Electric Co', 'San Diego Gas & Electric Co', 'Southern California Edison Co',
                     'Southern California Gas Company']  # Replace with actual provider names
california_data = california_data[california_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []

# Perform ANOVA for each utility provider
for provider in california_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = california_data[california_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
        
     # Perform kruskal if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)
        provider_results.append({
            'Utility Name': provider,
            'H-statistic': h_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for selected providers
print("\nKruskal Results for Selected Providers in California:")
print(provider_results_df)



Kruskal Results for Selected Providers in California:
                      Utility Name  H-statistic       P-value  \
0        Pacific Gas & Electric Co    51.656805  6.065509e-12   
1      San Diego Gas & Electric Co    57.666896  3.004641e-13   
2    Southern California Edison Co    59.690285  1.092494e-13   
3  Southern California Gas Company    50.587029  1.035538e-11   

  Significant Difference  
0                    Yes  
1                    Yes  
2                    Yes  
3                    Yes  


In [18]:
import pandas as pd
import scikit_posthocs as sp
import numpy as np
from scipy.stats import kruskal

df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4) or (year in [2021, 2022] and month <= 9):
        return 'COVID'
    elif (year == 2022 and month >= 10) or (year == 2023):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for California
california_data = df[df['State'] == 'California']

# Define the selected utility providers
selected_providers = ['Pacific Gas & Electric Co', 'San Diego Gas & Electric Co', 'Southern California Edison Co',
                     'Southern California Gas Company']  # Replace with actual provider names
california_data = california_data[california_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []
dunn_results_all = {}

# Perform Kruskal-Wallis test and Dunn's test for each utility provider
for provider in california_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = california_data[california_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform Kruskal-Wallis test if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)
        
        # Combine the data for Dunn's test
        all_rates = np.concatenate([pre_covid_rates, covid_rates, post_covid_rates])
        groups = (
            ['Pre-COVID'] * len(pre_covid_rates) +
            ['COVID'] * len(covid_rates) +
            ['Post-COVID'] * len(post_covid_rates)
        )
        
        # Create a DataFrame for Dunn's test
        dunn_df = pd.DataFrame({
            'Disconnection Rate': all_rates,
            'Period': groups
        })
        
        # Perform Dunn's test
        dunn_results = sp.posthoc_dunn(dunn_df, val_col='Disconnection Rate', group_col='Period', p_adjust='bonferroni')
        
        # Append Kruskal-Wallis results
        provider_results.append({
            'Utility Name': provider,
            'H-statistic': h_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })
        
        # Store Dunn's test results
        dunn_results_all[provider] = dunn_results

# Convert Kruskal-Wallis results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)


# Display the results for Dunn's test
for provider, dunn_results in dunn_results_all.items():
    print(f"\nDunn's Test Results for {provider}:")
    print(dunn_results)


# Combine data for all providers in California for Dunn's test
combined_data = california_data[['Disconnection Rate', 'Period']].dropna()

# Perform Dunn's test on the combined data
dunn_results_combined = sp.posthoc_dunn(
    combined_data,
    val_col='Disconnection Rate',
    group_col='Period',
    p_adjust='bonferroni'
)

# Display aggregated Dunn's test results for California
print("\nAggregated Dunn's Test Results for California (All Selected Providers):")
print(dunn_results_combined)


Dunn's Test Results for Pacific Gas & Electric Co:
                   COVID  Post-COVID     Pre-COVID
COVID       1.000000e+00    0.000063  5.426162e-12
Post-COVID  6.343806e-05    1.000000  2.647923e-01
Pre-COVID   5.426162e-12    0.264792  1.000000e+00

Dunn's Test Results for San Diego Gas & Electric Co:
                   COVID  Post-COVID     Pre-COVID
COVID       1.000000e+00    0.730651  1.063025e-12
Post-COVID  7.306510e-01    1.000000  1.466914e-06
Pre-COVID   1.063025e-12    0.000001  1.000000e+00

Dunn's Test Results for Southern California Edison Co:
                   COVID  Post-COVID     Pre-COVID
COVID       1.000000e+00    0.037303  4.702233e-14
Post-COVID  3.730306e-02    1.000000  1.958745e-04
Pre-COVID   4.702233e-14    0.000196  1.000000e+00

Dunn's Test Results for Southern California Gas Company:
                   COVID  Post-COVID     Pre-COVID
COVID       1.000000e+00    0.464651  8.278111e-12
Post-COVID  4.646511e-01    1.000000  5.438418e-06
Pre-COVID   8.2

## UTAH

In [19]:
from scipy.stats import f_oneway
import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 7 ):
        return 'COVID'
    elif (year == 2020 and month >= 8) or (year == [2021,2022]):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for Utah
utah_data = df[df['State'] == 'Utah']

# Define the selected utility provider
selected_provider = 'Rocky Mountain Power'
utah_data = utah_data[utah_data['Utility Name'] == selected_provider]

# Initialize results list
provider_results = []

# Perform ANOVA for "Rocky Mountain Power"
# Extract rates for each period
pre_covid_rates = utah_data[utah_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
covid_rates = utah_data[utah_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
post_covid_rates = utah_data[utah_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()

# Perform ANOVA if there are rates for all periods
if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
    f_stat, p_value = f_oneway(pre_covid_rates, covid_rates, post_covid_rates)
    provider_results.append({
        'Utility Name': selected_provider,
        'F-statistic': f_stat,
        'P-value': p_value,
        'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
    })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for "Rocky Mountain Power"
print("\nANOVA Results for Rocky Mountain Power in Utah:")
print(provider_results_df)



ANOVA Results for Rocky Mountain Power in Utah:
           Utility Name  F-statistic       P-value Significant Difference
0  Rocky Mountain Power    28.586604  6.261825e-08                    Yes


In [20]:
from scipy.stats import kruskal
import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 7 ):
        return 'COVID'
    elif (year == 2020 and month >= 8) or (year == [2021,2022]):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for Utah
utah_data = df[df['State'] == 'Utah']

# Define the selected utility provider
selected_provider = 'Rocky Mountain Power'
utah_data = utah_data[utah_data['Utility Name'] == selected_provider]

# Initialize results list
provider_results = []

# Perform Kruskal-Wallis test for the selected provider
for provider in utah_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = utah_data[utah_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform Kruskal-Wallis test if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)
        provider_results.append({
            'Utility Name': provider,
            'H-statistic': h_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for the selected provider
print("\nKruskal Results for Rocky Mountain Power in Utah:")
print(provider_results_df)



Kruskal Results for Rocky Mountain Power in Utah:
           Utility Name  H-statistic   P-value Significant Difference
0  Rocky Mountain Power    10.405116  0.005502                    Yes


In [21]:
import pandas as pd
import scikit_posthocs as sp
import numpy as np
from scipy.stats import kruskal

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 7 ):
        return 'COVID'
    elif (year == 2020 and month >= 8) or (year == [2021,2022]):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for Utah
utah_data = df[df['State'] == 'Utah']

# Focus on the provider "Rocky Mountain Power"
utah_data = utah_data[utah_data['Utility Name'] == 'Rocky Mountain Power']

# Initialize results list
provider_results = []
dunn_results_all = {}

# Extract rates for each period
pre_covid_rates = utah_data[utah_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
covid_rates = utah_data[utah_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
post_covid_rates = utah_data[utah_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()

# Perform Kruskal-Wallis test if there are rates for all periods
if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
    h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)

    # Combine the data for Dunn's test
    all_rates = np.concatenate([pre_covid_rates, covid_rates, post_covid_rates])
    groups = (
        ['Pre-COVID'] * len(pre_covid_rates) +
        ['COVID'] * len(covid_rates) +
        ['Post-COVID'] * len(post_covid_rates)
    )

    # Create a DataFrame for Dunn's test
    dunn_df = pd.DataFrame({
        'Disconnection Rate': all_rates,
        'Period': groups
    })

    # Perform Dunn's test
    dunn_results = sp.posthoc_dunn(dunn_df, val_col='Disconnection Rate', group_col='Period', p_adjust='bonferroni')

    # Append Kruskal-Wallis results
    provider_results.append({
        'Utility Name': 'Rocky Mountain Power',
        'H-statistic': h_stat,
        'P-value': p_value,
        'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
    })

    # Store Dunn's test results
    dunn_results_all['Rocky Mountain Power'] = dunn_results

# Convert Kruskal-Wallis results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display Dunn's test results
print("\nDunn's Test Results for Rocky Mountain Power in Utah:")
print(dunn_results_all['Rocky Mountain Power'])



Dunn's Test Results for Rocky Mountain Power in Utah:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.039669   0.004019
Post-COVID  0.039669    1.000000   1.000000
Pre-COVID   0.004019    1.000000   1.000000


## IOWA

In [22]:
from scipy.stats import f_oneway
import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 6 ):
        return 'COVID'
    elif (year == 2020 and month >= 7) or (year == [2021,2022]):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for Iowa
iowa_data = df[df['State'] == 'Iowa']

# Define the selected utility providers
selected_providers = [
    'Alliant Energy (Interstate Power & Light) - Electric',
    'Alliant Energy (Interstate Power & Light) - Gas',
    'Black Hills Energy',
    'Liberty Utilities',
    'MidAmerican Energy - Electric',
    'MidAmerican Energy - Gas'
]
iowa_data = iowa_data[iowa_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []

# Perform ANOVA for each utility provider
for provider in iowa_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = iowa_data[iowa_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform ANOVA if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        f_stat, p_value = f_oneway(pre_covid_rates, covid_rates, post_covid_rates)
        provider_results.append({
            'Utility Name': provider,
            'F-statistic': f_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for selected providers
print("\nANOVA Results for Selected Providers in Iowa:")
print(provider_results_df)



ANOVA Results for Selected Providers in Iowa:
                                        Utility Name  F-statistic   P-value  \
0  Alliant Energy (Interstate Power & Light) - El...     2.522069  0.096136   
1    Alliant Energy (Interstate Power & Light) - Gas     2.840747  0.073173   
2                                 Black Hills Energy     2.283432  0.118303   
3                                  Liberty Utilities     2.808338  0.075216   
4                      MidAmerican Energy - Electric     2.962092  0.066030   
5                           MidAmerican Energy - Gas     1.827784  0.177158   

  Significant Difference  
0                     No  
1                     No  
2                     No  
3                     No  
4                     No  
5                     No  


In [23]:
from scipy.stats import kruskal
import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 6):
        return 'COVID'
    elif (year == 2020 and month >= 7) or (year in [2021, 2022]):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for Iowa
iowa_data = df[df['State'] == 'Iowa']

# Define the selected utility providers
selected_providers = [
    'Alliant Energy (Interstate Power & Light) - Electric',
    'Alliant Energy (Interstate Power & Light) - Gas',
    'Black Hills Energy',
    'Liberty Utilities',
    'MidAmerican Energy - Electric',
    'MidAmerican Energy - Gas'
]
iowa_data = iowa_data[iowa_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []

# Perform Kruskal-Wallis test for each provider
for provider in iowa_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = iowa_data[iowa_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform Kruskal-Wallis test if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)
        provider_results.append({
            'Utility Name': provider,
            'H-statistic': h_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for the selected providers
print("\nKruskal Results for Selected Providers in Iowa:")
print(provider_results_df)



Kruskal Results for Selected Providers in Iowa:
                                        Utility Name  H-statistic   P-value  \
0  Alliant Energy (Interstate Power & Light) - El...     8.914225  0.011596   
1    Alliant Energy (Interstate Power & Light) - Gas     9.692966  0.007856   
2                                 Black Hills Energy     9.593419  0.008257   
3                                  Liberty Utilities     7.183126  0.027555   
4                      MidAmerican Energy - Electric     9.710857  0.007786   
5                           MidAmerican Energy - Gas    11.691823  0.002892   

  Significant Difference  
0                    Yes  
1                    Yes  
2                    Yes  
3                    Yes  
4                    Yes  
5                    Yes  


In [31]:
import pandas as pd
import scikit_posthocs as sp
import numpy as np
from scipy.stats import kruskal

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 6):
        return 'COVID'
    elif (year == 2020 and month >= 7) or (year in [2021, 2022]):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for Iowa
iowa_data = df[df['State'] == 'Iowa']

# Define the selected utility providers
selected_providers = [
    'Alliant Energy (Interstate Power & Light) - Electric',
    'Alliant Energy (Interstate Power & Light) - Gas',
    'Black Hills Energy',
    'Liberty Utilities',
    'MidAmerican Energy - Electric',
    'MidAmerican Energy - Gas'
]
iowa_data = iowa_data[iowa_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []
dunn_results_all = {}

# Perform Kruskal-Wallis test and Dunn's test for each provider
for provider in iowa_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = iowa_data[iowa_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform Kruskal-Wallis test if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)
        
        # Combine the data for Dunn's test
        all_rates = np.concatenate([pre_covid_rates, covid_rates, post_covid_rates])
        groups = (
            ['Pre-COVID'] * len(pre_covid_rates) +
            ['COVID'] * len(covid_rates) +
            ['Post-COVID'] * len(post_covid_rates)
        )
        
        # Create a DataFrame for Dunn's test
        dunn_df = pd.DataFrame({
            'Disconnection Rate': all_rates,
            'Period': groups
        })
        
        # Perform Dunn's test
        dunn_results = sp.posthoc_dunn(dunn_df, val_col='Disconnection Rate', group_col='Period', p_adjust='bonferroni')
        
        # Append Kruskal-Wallis results
        provider_results.append({
            'Utility Name': provider,
            'H-statistic': h_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })
        
        # Store Dunn's test results
        dunn_results_all[provider] = dunn_results

# Convert Kruskal-Wallis results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display Dunn's test results for each provider
for provider, dunn_results in dunn_results_all.items():
    print(f"\nDunn's Test Results for {provider}:")
    print(dunn_results)
    
    
# Combine data for all providers in California for Dunn's test
combined_data = iowa_data[['Disconnection Rate', 'Period']].dropna()

# Perform Dunn's test on the combined data
dunn_results_combined = sp.posthoc_dunn(
    combined_data,
    val_col='Disconnection Rate',
    group_col='Period',
    p_adjust='bonferroni'
)

# Display aggregated Dunn's test results for Iowa
print("\nAggregated Dunn's Test Results for Iowa (All Selected Providers):")
print(dunn_results_combined)



Dunn's Test Results for Alliant Energy (Interstate Power & Light) - Electric:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.066595   0.011573
Post-COVID  0.066595    1.000000   0.494237
Pre-COVID   0.011573    0.494237   1.000000

Dunn's Test Results for Alliant Energy (Interstate Power & Light) - Gas:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.043985   0.007236
Post-COVID  0.043985    1.000000   0.512183
Pre-COVID   0.007236    0.512183   1.000000

Dunn's Test Results for Black Hills Energy:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.075487   0.009815
Post-COVID  0.075487    1.000000   0.318714
Pre-COVID   0.009815    0.318714   1.000000

Dunn's Test Results for Liberty Utilities:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.038595   0.022424
Post-COVID  0.038595    1.000000   1.000000
Pre-COVID   0.022424    1.000000   1.000000

Dunn's Test Results for MidAmerican Energy - Electric

## MISSOURI

In [25]:
from scipy.stats import f_oneway
import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2019 and month >= 8) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 6):
        return 'COVID'
    elif (year == 2020 and month >= 7) or (year ==2021) or (year == 2022 and month <= 6):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for Missouri
missouri_data = df[df['State'] == 'Missouri']

# Define the selected utility providers
selected_providers = [
    'Ameren Missouri',
    'City Utilities of Springfield',
    'Empire',
    'Evergy Metro',
    'Evergy West',
    'Liberty Utilities',
    'Spire',
    'Summit'
]
missouri_data = missouri_data[missouri_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []

# Perform ANOVA for each utility provider
for provider in missouri_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = missouri_data[missouri_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform ANOVA if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        f_stat, p_value = f_oneway(pre_covid_rates, covid_rates, post_covid_rates)
        provider_results.append({
            'Utility Name': provider,
            'F-statistic': f_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for selected providers in Missouri
print("\nANOVA Results for Selected Providers in Missouri:")
print(provider_results_df)



ANOVA Results for Selected Providers in Missouri:
                    Utility Name  F-statistic       P-value  \
0                Ameren Missouri     3.399657  4.583687e-02   
1  City Utilities of Springfield    36.466797  9.299547e-09   
2                         Empire     3.279466  5.062903e-02   
3                   Evergy Metro     2.185759  1.288893e-01   
4                    Evergy West     1.847551  1.740451e-01   
5              Liberty Utilities     3.091790  5.920704e-02   
6                          Spire     2.542907  9.442232e-02   
7                         Summit     1.055756  3.597361e-01   

  Significant Difference  
0                    Yes  
1                    Yes  
2                     No  
3                     No  
4                     No  
5                     No  
6                     No  
7                     No  


In [26]:
from scipy.stats import kruskal
import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2019 and month >= 8) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 6):
        return 'COVID'
    elif (year == 2020 and month >= 7) or (year ==2021) or (year == 2022 and month <= 6):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for Missouri
missouri_data = df[df['State'] == 'Missouri']

# Define the selected utility providers
selected_providers = [
    'Ameren Missouri',
    'City Utilities of Springfield',
    'Empire',
    'Evergy Metro',
    'Evergy West',
    'Liberty Utilities',
    'Spire',
    'Summit'
]
missouri_data = missouri_data[missouri_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []

# Perform Kruskal-Wallis test for each provider
for provider in missouri_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = missouri_data[missouri_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform Kruskal-Wallis test if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)
        provider_results.append({
            'Utility Name': provider,
            'H-statistic': h_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for the selected providers
print("\nKruskal Results for Selected Providers in Missouri:")
print(provider_results_df)



Kruskal Results for Selected Providers in Missouri:
                    Utility Name  H-statistic   P-value Significant Difference
0                Ameren Missouri     8.029895  0.018044                    Yes
1  City Utilities of Springfield     8.537549  0.013999                    Yes
2                         Empire     9.768543  0.007565                    Yes
3                   Evergy Metro     6.349802  0.041798                    Yes
4                    Evergy West     6.193017  0.045207                    Yes
5              Liberty Utilities     9.009146  0.011058                    Yes
6                          Spire     8.018778  0.018144                    Yes
7                         Summit     7.536448  0.023093                    Yes


In [27]:
import pandas as pd
import scikit_posthocs as sp
import numpy as np
from scipy.stats import kruskal

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2019 and month >= 8) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 6):
        return 'COVID'
    elif (year == 2020 and month >= 7) or (year ==2021) or (year == 2022 and month <= 6):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for Missouri
missouri_data = df[df['State'] == 'Missouri']

# Define the selected utility providers
selected_providers = [
    'Ameren Missouri',
    'City Utilities of Springfield',
    'Empire',
    'Evergy Metro',
    'Evergy West',
    'Liberty Utilities',
    'Spire',
    'Summit'
]
missouri_data = missouri_data[missouri_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []
dunn_results_all = {}

# Perform Kruskal-Wallis test and Dunn's test for each provider
for provider in missouri_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = missouri_data[missouri_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform Kruskal-Wallis test if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)
        
        # Combine the data for Dunn's test
        all_rates = np.concatenate([pre_covid_rates, covid_rates, post_covid_rates])
        groups = (
            ['Pre-COVID'] * len(pre_covid_rates) +
            ['COVID'] * len(covid_rates) +
            ['Post-COVID'] * len(post_covid_rates)
        )
        
        # Create a DataFrame for Dunn's test
        dunn_df = pd.DataFrame({
            'Disconnection Rate': all_rates,
            'Period': groups
        })
        
        # Perform Dunn's test
        dunn_results = sp.posthoc_dunn(dunn_df, val_col='Disconnection Rate', group_col='Period', p_adjust='bonferroni')
        
        # Append Kruskal-Wallis results
        provider_results.append({
            'Utility Name': provider,
            'H-statistic': h_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })
        
        # Store Dunn's test results
        dunn_results_all[provider] = dunn_results

# Convert Kruskal-Wallis results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display Dunn's test results for each provider
for provider, dunn_results in dunn_results_all.items():
    print(f"\nDunn's Test Results for {provider}:")
    print(dunn_results)

# Combine data for all providers in Missouri for Dunn's test
combined_data = missouri_data[['Disconnection Rate', 'Period']].dropna()

# Perform Dunn's test on the combined data
dunn_results_combined = sp.posthoc_dunn(
    combined_data,
    val_col='Disconnection Rate',
    group_col='Period',
    p_adjust='bonferroni'
)

# Display aggregated Dunn's test results for Missouri
print("\nAggregated Dunn's Test Results for Missouri (All Selected Providers):")
print(dunn_results_combined)



Dunn's Test Results for Ameren Missouri:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.017178   0.028342
Post-COVID  0.017178    1.000000   1.000000
Pre-COVID   0.028342    1.000000   1.000000

Dunn's Test Results for City Utilities of Springfield:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.010808   0.088324
Post-COVID  0.010808    1.000000   1.000000
Pre-COVID   0.088324    1.000000   1.000000

Dunn's Test Results for Empire:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.009422   0.231647
Post-COVID  0.009422    1.000000   0.400791
Pre-COVID   0.231647    0.400791   1.000000

Dunn's Test Results for Evergy Metro:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.035218   0.128276
Post-COVID  0.035218    1.000000   1.000000
Pre-COVID   0.128276    1.000000   1.000000

Dunn's Test Results for Evergy West:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.040171   0.094029


## NORTH CAROLINA

In [28]:
from scipy.stats import f_oneway
import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 10):
        return 'COVID'
    elif (year == 2020 and month >= 11) or (year == 2021) or (year == 2022 and month <= 6):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for North Carolina
nc_data = df[df['State'] == 'North Carolina']

# Define the selected utility providers
selected_providers = [
    'City of New Bern',
    'Dominion Energy North Carolina',
    'Duke Energy Carolinas',
    'Duke Energy Progress',
    'Frontier Natural Gas',
    'New River Light & Power',
    'Piedmont Natural Gas',
    'Toccoa Natural Gas'
]
nc_data = nc_data[nc_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []

# Perform ANOVA for each utility provider
for provider in nc_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = nc_data[nc_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform ANOVA if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        f_stat, p_value = f_oneway(pre_covid_rates, covid_rates, post_covid_rates)
        provider_results.append({
            'Utility Name': provider,
            'F-statistic': f_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for selected providers in North Carolina
print("\nANOVA Results for Selected Providers in North Carolina:")
print(provider_results_df)



ANOVA Results for Selected Providers in North Carolina:
                     Utility Name  F-statistic       P-value  \
0                City of New Bern    10.328642  1.713565e-04   
1  Dominion Energy North Carolina     7.427045  1.476316e-03   
2           Duke Energy Carolinas    77.358987  3.585894e-16   
3            Duke Energy Progress    12.414994  4.191392e-05   
4            Frontier Natural Gas     2.452472  9.617426e-02   
5         New River Light & Power    10.033389  2.169934e-04   
6            Piedmont Natural Gas    12.357840  4.354681e-05   
7              Toccoa Natural Gas     1.262368  2.922083e-01   

  Significant Difference  
0                    Yes  
1                    Yes  
2                    Yes  
3                    Yes  
4                     No  
5                    Yes  
6                    Yes  
7                     No  


In [29]:
from scipy.stats import kruskal
import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 10):
        return 'COVID'
    elif (year == 2020 and month >= 11) or (year == 2021) or (year == 2022 and month <= 6):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for North Carolina
nc_data = df[df['State'] == 'North Carolina']

# Define the selected utility providers
selected_providers = [
    'City of New Bern',
    'Dominion Energy North Carolina',
    'Duke Energy Carolinas',
    'Duke Energy Progress',
    'Frontier Natural Gas',
    'New River Light & Power',
    'Piedmont Natural Gas',
    'Toccoa Natural Gas'
]
nc_data = nc_data[nc_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []

# Perform Kruskal-Wallis test for each provider
for provider in nc_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = nc_data[nc_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform Kruskal-Wallis test if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)
        provider_results.append({
            'Utility Name': provider,
            'H-statistic': h_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })

# Convert results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display the results for the selected providers
print("\nKruskal Results for Selected Providers in North Carolina:")
print(provider_results_df)



Kruskal Results for Selected Providers in North Carolina:
                     Utility Name  H-statistic       P-value  \
0                City of New Bern    21.364990  2.294306e-05   
1  Dominion Energy North Carolina    14.983288  5.577254e-04   
2           Duke Energy Carolinas    38.945253  3.492575e-09   
3            Duke Energy Progress    20.338187  3.833705e-05   
4            Frontier Natural Gas     9.149384  1.030947e-02   
5         New River Light & Power    15.452956  4.409945e-04   
6            Piedmont Natural Gas    23.766951  6.903547e-06   
7              Toccoa Natural Gas     3.925204  1.404924e-01   

  Significant Difference  
0                    Yes  
1                    Yes  
2                    Yes  
3                    Yes  
4                    Yes  
5                    Yes  
6                    Yes  
7                     No  


In [30]:
import pandas as pd
import scikit_posthocs as sp
import numpy as np
from scipy.stats import kruskal

# Load the dataset
df = pd.read_csv("C:/Users/gokul/Downloads/hard_fix_september_11_2024.csv")

# Ensure 'Year' column is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Map month names to numeric values
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month'] = df['Month'].map(month_mapping)

# Categorize years and months into specific periods
def categorize_period(year, month):
    if pd.isna(year) or pd.isna(month):
        return 'Unknown'  # Handle missing values
    if (year == 2018) or (year == 2019) or (year == 2020 and month <= 3):
        return 'Pre-COVID'
    elif (year == 2020 and month >= 4 and month <= 10):
        return 'COVID'
    elif (year == 2020 and month >= 11) or (year == 2021) or (year == 2022 and month <= 6):
        return 'Post-COVID'

# Apply the function to create the Period column
df['Period'] = df.apply(lambda row: categorize_period(row['Year'], row['Month']), axis=1)

# Filter data for North Carolina
nc_data = df[df['State'] == 'North Carolina']

# Define the selected utility providers
selected_providers = [
    'City of New Bern',
    'Dominion Energy North Carolina',
    'Duke Energy Carolinas',
    'Duke Energy Progress',
    'Frontier Natural Gas',
    'New River Light & Power',
    'Piedmont Natural Gas',
    'Toccoa Natural Gas'
]
nc_data = nc_data[nc_data['Utility Name'].isin(selected_providers)]

# Initialize results list
provider_results = []
dunn_results_all = {}

# Perform Kruskal-Wallis test and Dunn's test for each provider
for provider in nc_data['Utility Name'].unique():
    # Filter data for the current provider
    provider_data = nc_data[nc_data['Utility Name'] == provider]
    
    # Extract rates for each period
    pre_covid_rates = provider_data[provider_data['Period'] == 'Pre-COVID']['Disconnection Rate'].dropna()
    covid_rates = provider_data[provider_data['Period'] == 'COVID']['Disconnection Rate'].dropna()
    post_covid_rates = provider_data[provider_data['Period'] == 'Post-COVID']['Disconnection Rate'].dropna()
    
    # Perform Kruskal-Wallis test if there are rates for all periods
    if not (pre_covid_rates.empty or covid_rates.empty or post_covid_rates.empty):
        h_stat, p_value = kruskal(pre_covid_rates, covid_rates, post_covid_rates)
        
        # Combine the data for Dunn's test
        all_rates = np.concatenate([pre_covid_rates, covid_rates, post_covid_rates])
        groups = (
            ['Pre-COVID'] * len(pre_covid_rates) +
            ['COVID'] * len(covid_rates) +
            ['Post-COVID'] * len(post_covid_rates)
        )
        
        # Create a DataFrame for Dunn's test
        dunn_df = pd.DataFrame({
            'Disconnection Rate': all_rates,
            'Period': groups
        })
        
        # Perform Dunn's test
        dunn_results = sp.posthoc_dunn(dunn_df, val_col='Disconnection Rate', group_col='Period', p_adjust='bonferroni')
        
        # Append Kruskal-Wallis results
        provider_results.append({
            'Utility Name': provider,
            'H-statistic': h_stat,
            'P-value': p_value,
            'Significant Difference': 'Yes' if p_value < 0.05 else 'No'
        })
        
        # Store Dunn's test results
        dunn_results_all[provider] = dunn_results

# Convert Kruskal-Wallis results to a DataFrame
provider_results_df = pd.DataFrame(provider_results)

# Display Dunn's test results for each provider
for provider, dunn_results in dunn_results_all.items():
    print(f"\nDunn's Test Results for {provider}:")
    print(dunn_results)

# Combine data for all providers in North Carolina for Dunn's test
combined_data = nc_data[['Disconnection Rate', 'Period']].dropna()

# Perform Dunn's test on the combined data
dunn_results_combined = sp.posthoc_dunn(
    combined_data,
    val_col='Disconnection Rate',
    group_col='Period',
    p_adjust='bonferroni'
)

# Display aggregated Dunn's test results for North Carolina
print("\nAggregated Dunn's Test Results for North Carolina (All Selected Providers):")
print(dunn_results_combined)



Dunn's Test Results for City of New Bern:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    1.000000   0.012607
Post-COVID  1.000000    1.000000   0.000048
Pre-COVID   0.012607    0.000048   1.000000

Dunn's Test Results for Dominion Energy North Carolina:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.039191   0.000417
Post-COVID  0.039191    1.000000   0.224078
Pre-COVID   0.000417    0.224078   1.000000

Dunn's Test Results for Duke Energy Carolinas:
                   COVID  Post-COVID     Pre-COVID
COVID       1.000000e+00    0.262968  7.347616e-07
Post-COVID  2.629678e-01    1.000000  3.210913e-06
Pre-COVID   7.347616e-07    0.000003  1.000000e+00

Dunn's Test Results for Duke Energy Progress:
               COVID  Post-COVID  Pre-COVID
COVID       1.000000    0.020619   0.000033
Post-COVID  0.020619    1.000000   0.064282
Pre-COVID   0.000033    0.064282   1.000000

Dunn's Test Results for Frontier Natural Gas:
               COVID  Post