In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# gervasio file_path
file_path = r"C:\Users\gerva\OneDrive\Documentos\GitHub\ST498_CAPSTONE\FLASH\toShare"


In [2]:
pre_start = pd.Timestamp("2023-02-01 00:00:00")
pre_end   = pd.Timestamp("2024-01-31 23:59:59")

post_start = pd.Timestamp("2024-04-01 00:00:00")
post_end   = pd.Timestamp("2024-12-31 23:59:59")

def preprocess(file_name, start_date, end_date):
    df = pd.read_csv(os.path.join(file_path, file_name))
    df = df.rename(columns={'Unnamed: 0': 'Time'})
    
    # Convert Time column to datetime
    df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d %H:%M:%S")
    print(f"Processing {file_name} -> Min Time: {df['Time'].min()}, Max Time: {df['Time'].max()}")
    df = df[(df['Time'] >= start_date) & (df['Time'] <= end_date)] # Filter by time range (pre or post)
  
    df_long = df.melt(id_vars=['Time'], var_name='ANON_ID', value_name='ELEC_KWH') # Convert from wide to long format
    df_long.drop_duplicates(subset=['Time', 'ANON_ID'], inplace=True) # Remove duplicates after melting

    return df_long



In [3]:
df_controlpre        = preprocess("controlePreConsolide.csv", pre_start, pre_end)
df_interventionpre   = preprocess("interventionPreConsolide.csv", pre_start, pre_end)
print(df_controlpre.shape)
print(df_interventionpre.shape)
df_pre = pd.concat([df_controlpre,df_interventionpre],ignore_index=True)
df_pre

Processing controlePreConsolide.csv -> Min Time: 2023-02-02 00:00:00, Max Time: 2024-01-31 23:30:00
Processing interventionPreConsolide.csv -> Min Time: 2023-02-02 00:00:00, Max Time: 2024-01-31 23:30:00
(5540480, 3)
(6722577, 3)


Unnamed: 0,Time,ANON_ID,ELEC_KWH
0,2023-02-02 00:00:00,4dd3317694364b953434dc42eff7f9950095c4ab537c3b...,0.140
1,2023-02-02 00:30:00,4dd3317694364b953434dc42eff7f9950095c4ab537c3b...,0.138
2,2023-02-02 01:00:00,4dd3317694364b953434dc42eff7f9950095c4ab537c3b...,0.128
3,2023-02-02 01:30:00,4dd3317694364b953434dc42eff7f9950095c4ab537c3b...,0.149
4,2023-02-02 02:00:00,4dd3317694364b953434dc42eff7f9950095c4ab537c3b...,0.137
...,...,...,...
12263052,2024-01-31 21:30:00,0ac6b700dd2b349baeca58a73029d17346d6d2396c9d77...,0.182
12263053,2024-01-31 22:00:00,0ac6b700dd2b349baeca58a73029d17346d6d2396c9d77...,0.113
12263054,2024-01-31 22:30:00,0ac6b700dd2b349baeca58a73029d17346d6d2396c9d77...,0.084
12263055,2024-01-31 23:00:00,0ac6b700dd2b349baeca58a73029d17346d6d2396c9d77...,0.027


In [5]:
df_controlpost        = preprocess("controleConsolide.csv", post_start, post_end)
df_interventionpost   = preprocess("interventionConsolide.csv", post_start, post_end)
print(df_controlpost.shape)
print(df_interventionpost.shape)
df_post = pd.concat([df_controlpost,df_interventionpost],ignore_index=True)
df_post

Processing controleConsolide.csv -> Min Time: 2024-04-01 00:00:00, Max Time: 2024-12-31 23:30:00
Processing interventionConsolide.csv -> Min Time: 2024-04-01 00:00:00, Max Time: 2024-12-31 23:30:00
(5706668, 3)
(7497828, 3)


Unnamed: 0,Time,ANON_ID,ELEC_KWH
0,2024-04-01 00:00:00,8d606eeffaed80dba63c8b58b3c8c5268854bb561079c9...,0.020
1,2024-04-01 00:30:00,8d606eeffaed80dba63c8b58b3c8c5268854bb561079c9...,0.063
2,2024-04-01 01:00:00,8d606eeffaed80dba63c8b58b3c8c5268854bb561079c9...,0.047
3,2024-04-01 01:30:00,8d606eeffaed80dba63c8b58b3c8c5268854bb561079c9...,0.015
4,2024-04-01 02:00:00,8d606eeffaed80dba63c8b58b3c8c5268854bb561079c9...,0.051
...,...,...,...
13204491,2024-12-31 21:30:00,de09a2d4d2f5c6284ec02524272ba0ece1ffbdb6557348...,0.048
13204492,2024-12-31 22:00:00,de09a2d4d2f5c6284ec02524272ba0ece1ffbdb6557348...,0.056
13204493,2024-12-31 22:30:00,de09a2d4d2f5c6284ec02524272ba0ece1ffbdb6557348...,0.095
13204494,2024-12-31 23:00:00,de09a2d4d2f5c6284ec02524272ba0ece1ffbdb6557348...,0.039


In [9]:
import pandas as pd
import os

pre_start = pd.Timestamp("2023-02-01 00:00:00")
pre_end    = pd.Timestamp("2024-01-31 23:59:59")

post_start = pd.Timestamp("2024-04-01 00:00:00")
post_end    = pd.Timestamp("2024-12-31 23:59:59")

file_path = r"C:\Users\gerva\OneDrive\Documentos\GitHub\ST498_CAPSTONE\FLASH\toShare"

def preprocess(file_name, start_date, end_date, treatment_value, time_value):
    df = pd.read_csv(os.path.join(file_path, file_name))
    df = df.rename(columns={'Unnamed: 0': 'Time'})

    # Convert Time column to datetime
    df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d %H:%M:%S")
    print(f"Processing {file_name} -> Min Time: {df['Time'].min()}, Max Time: {df['Time'].max()}")
    df = df[(df['Time'] >= start_date) & (df['Time'] <= end_date)] # Filter by time range (pre or post)

    df_long = df.melt(id_vars=['Time'], var_name='ANON_ID', value_name='ELEC_KWH') # Convert from wide to long format
    df_long.drop_duplicates(subset=['Time', 'ANON_ID'], inplace=True) # Remove duplicates after melting

    # Add 'Treatment' and 'Posterior' columns
    df_long['Treatment'] = treatment_value
    df_long['Posterior'] = time_value

    return df_long

df_controlpre    = preprocess("controlePreConsolide.csv", pre_start, pre_end, 0, 0)
df_interventionpre = preprocess("interventionPreConsolide.csv", pre_start, pre_end, 1, 0)
print(df_controlpre.shape)
print(df_interventionpre.shape)
df_pre = pd.concat([df_controlpre,df_interventionpre],ignore_index=True)
print(df_pre.head())

df_controlpost    = preprocess("controleConsolide.csv", post_start, post_end, 0, 1)
df_interventionpost = preprocess("interventionConsolide.csv", post_start, post_end, 1, 1)
print(df_controlpost.shape)
print(df_interventionpost.shape)
df_post = pd.concat([df_controlpost, df_interventionpost], ignore_index=True)
print(df_post.head())

df_all = pd.concat([df_pre, df_post], ignore_index=True)
print(df_all.head())

# Add the 'Special_Month' column
def assign_special_month(row):
    month = row['Time'].month
    if month == 12:
        return 1
    elif month in [4, 5]:
        return 2
    elif month in [6, 7, 9]:
        return 3
    elif month == 8:
        return 4
    elif month == 10:
        return 5
    elif month == 11:
        return 6
    else:
        return 0

df_all['Special_Month'] = df_all.apply(assign_special_month, axis=1)
print(df_all.head())

# Create the dataframe averaging lectures between 16 and 19 hours
df_lectures = df_all[(df_all['Time'].dt.hour >= 16) & (df_all['Time'].dt.hour < 19)].copy()
df_lectures_avg = df_lectures.groupby(['ANON_ID', 'Treatment', 'Posterior', 'Special_Month'])['ELEC_KWH'].mean().reset_index()

print(df_lectures_avg.head())

Processing controlePreConsolide.csv -> Min Time: 2023-02-02 00:00:00, Max Time: 2024-01-31 23:30:00
Processing interventionPreConsolide.csv -> Min Time: 2023-02-02 00:00:00, Max Time: 2024-01-31 23:30:00
(5540480, 5)
(6722577, 5)
                 Time                                            ANON_ID  \
0 2023-02-02 00:00:00  4dd3317694364b953434dc42eff7f9950095c4ab537c3b...   
1 2023-02-02 00:30:00  4dd3317694364b953434dc42eff7f9950095c4ab537c3b...   
2 2023-02-02 01:00:00  4dd3317694364b953434dc42eff7f9950095c4ab537c3b...   
3 2023-02-02 01:30:00  4dd3317694364b953434dc42eff7f9950095c4ab537c3b...   
4 2023-02-02 02:00:00  4dd3317694364b953434dc42eff7f9950095c4ab537c3b...   

   ELEC_KWH  Treatment  Posterior  
0     0.140          0          0  
1     0.138          0          0  
2     0.128          0          0  
3     0.149          0          0  
4     0.137          0          0  
Processing controleConsolide.csv -> Min Time: 2024-04-01 00:00:00, Max Time: 2024-12-31 23:30:00


In [16]:
import pandas as pd
import os

pre_start = pd.Timestamp("2023-04-01 00:00:00")
pre_end    = pd.Timestamp("2023-12-31 23:59:59")

post_start = pd.Timestamp("2024-04-01 00:00:00")
post_end    = pd.Timestamp("2024-12-31 23:59:59")

file_path = r"C:\Users\gerva\OneDrive\Documentos\GitHub\ST498_CAPSTONE\FLASH\toShare"

def preprocess(file_name, start_date, end_date, treatment_value, time_value):
    df = pd.read_csv(os.path.join(file_path, file_name))
    df = df.rename(columns={'Unnamed: 0': 'Time'})

    # Convert Time column to datetime
    df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d %H:%M:%S")
    print(f"Processing {file_name} -> Min Time: {df['Time'].min()}, Max Time: {df['Time'].max()}")
    df = df[(df['Time'] >= start_date) & (df['Time'] <= end_date)] # Filter by time range (pre or post)

    df_long = df.melt(id_vars=['Time'], var_name='ANON_ID', value_name='ELEC_KWH') # Convert from wide to long format
    df_long.drop_duplicates(subset=['Time', 'ANON_ID'], inplace=True) # Remove duplicates after melting

    # Add 'Treatment' and 'Posterior' columns
    df_long['Treatment'] = treatment_value
    df_long['Posterior'] = time_value

    return df_long

df_controlpre    = preprocess("controlePreConsolide.csv", pre_start, pre_end, 0, 0)
df_interventionpre = preprocess("interventionPreConsolide.csv", pre_start, pre_end, 1, 0)
df_pre = pd.concat([df_controlpre, df_interventionpre], ignore_index=True)
print(f"Shape of combined pre data: {df_pre.shape}")

df_controlpost    = preprocess("controleConsolide.csv", post_start, post_end, 0, 1)
df_interventionpost = preprocess("interventionConsolide.csv", post_start, post_end, 1, 1)
df_post = pd.concat([df_controlpost, df_interventionpost], ignore_index=True)
print(f"Shape of combined post data: {df_post.shape}")

# Get unique ANON_IDs from the Post data
pre_unique_ids = df_pre['ANON_ID'].unique().tolist()
print(f"Number of unique ANON_IDs in Post data: {len(pre_unique_ids)}")

# Filter the Pre data to keep only ANON_IDs present in Post
df_post_filtered = df_post[df_post['ANON_ID'].isin(pre_unique_ids)].reset_index(drop=True)
print(f"Shape of filtered pre data: {df_post_filtered.shape}")

# Combine the filtered Pre data with the original Post data
df_all = pd.concat([df_pre, df_post_filtered], ignore_index=True)
print(df_all.head())

# Add the 'Special_Month' column
def assign_special_month(row):
    month = row['Time'].month
    if month == 12:
        return 1
    elif month in [4, 5]:
        return 2
    elif month in [6, 7, 9]:
        return 3
    elif month == 8:
        return 4
    elif month == 10:
        return 5
    elif month == 11:
        return 6
    else:
        return 0

df_all['Special_Month'] = df_all.apply(assign_special_month, axis=1)
print(df_all.head())

# Create the dataframe averaging lectures between 16 and 19 hours
df_lectures = df_all[(df_all['Time'].dt.hour >= 16) & (df_all['Time'].dt.hour < 19)].copy()
df_lectures_avg = df_lectures.groupby(['Time', 'ANON_ID', 'Treatment', 'Posterior', 'Special_Month'])['ELEC_KWH'].mean().reset_index()

print(df_lectures_avg.head())

Processing controlePreConsolide.csv -> Min Time: 2023-02-02 00:00:00, Max Time: 2024-01-31 23:30:00
Processing interventionPreConsolide.csv -> Min Time: 2023-02-02 00:00:00, Max Time: 2024-01-31 23:30:00
Shape of combined pre data: (12263057, 5)
Processing controleConsolide.csv -> Min Time: 2024-04-01 00:00:00, Max Time: 2024-12-31 23:30:00
Processing interventionConsolide.csv -> Min Time: 2024-04-01 00:00:00, Max Time: 2024-12-31 23:30:00
Shape of combined post data: (13204496, 5)
Number of unique ANON_IDs in Post data: 707
Shape of filtered pre data: (7954548, 5)
                 Time                                            ANON_ID  \
0 2023-02-02 00:00:00  4dd3317694364b953434dc42eff7f9950095c4ab537c3b...   
1 2023-02-02 00:30:00  4dd3317694364b953434dc42eff7f9950095c4ab537c3b...   
2 2023-02-02 01:00:00  4dd3317694364b953434dc42eff7f9950095c4ab537c3b...   
3 2023-02-02 01:30:00  4dd3317694364b953434dc42eff7f9950095c4ab537c3b...   
4 2023-02-02 02:00:00  4dd3317694364b953434dc42e

In [20]:
import statsmodels.formula.api as smf

# baseline FE model, HH + calendar‑month FE
fml = ('kWh ~ Post*Treatment*C(Special_Month)'
       ' + C(month) + C(ANON_ID)')
mod = smf.ols(fml, data=monthly).fit(
          cov_type='cluster',
          cov_kwds={'groups': monthly['ANON_ID']})

print(mod.summary().tables[1])


NameError: name 'monthly' is not defined