In [1]:
import pandas as pd

# Load Dataset
df = pd.read_csv('../data/raw/electricity_marketing_dataset.csv')

# Display the first few rows of the dataframe
print("First 5 rows of the data:")
print(df.head())

# Get a summary of the data, including data types and missing values
print("\nData Info:")
df.info()

First 5 rows of the data:
             timestamp  temperature  humidity  is_weekend  is_holiday  \
0  2024-01-01 00:00:00     0.526922  0.265203           0           1   
1  2024-01-01 01:00:00     0.437412  0.539677           0           1   
2  2024-01-01 02:00:00     0.548204  0.632031           0           1   
3  2024-01-01 03:00:00     0.671595  0.254353           0           1   
4  2024-01-01 04:00:00     0.423895  0.135163           0           1   

  consumer_type  price_signal  historical_avg_demand  voltage_level  \
0   residential      0.469818               0.080994       0.538437   
1   residential      0.431329               0.080842       0.412133   
2    commercial      0.712748               0.434790       0.784449   
3    commercial      0.708677               0.641761       0.596124   
4   residential      0.537836               0.585632       0.535468   

   grid_frequency  energy_source_mix demand_category  
0        0.228592           0.440540          Medium 

In [2]:
print("Columns:")
print(df.columns.to_list())

print("\nData types:")
print(df.dtypes)

Columns:
['timestamp', 'temperature', 'humidity', 'is_weekend', 'is_holiday', 'consumer_type', 'price_signal', 'historical_avg_demand', 'voltage_level', 'grid_frequency', 'energy_source_mix', 'demand_category']

Data types:
timestamp                 object
temperature              float64
humidity                 float64
is_weekend                 int64
is_holiday                 int64
consumer_type             object
price_signal             float64
historical_avg_demand    float64
voltage_level            float64
grid_frequency           float64
energy_source_mix        float64
demand_category           object
dtype: object


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create a directory to save the figures if it doesn't exist
import os
if not os.path.exists('figures'):
    os.makedirs('figures')
    
# --- DATA PREPARATION ---
# Convert 'timestamp' column to datetime objects for proper plotting
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Set the timestamp as the index to facilitate time-series analysis
df.set_index('timestamp', inplace=True)


# --- GRAPH 1: TIME-SERIES PLOT OF OVERALL DEMAND ---
print("Generating Time-Series Plot...")
plt.figure(figsize=(12, 6))
df['historical_avg_demand'].plot(linewidth=1)
plt.title('Historical Average Energy Demand Over Time', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Average Demand (kWh)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
# Save the figure to the 'figures' subfolder
plt.savefig('figures/demand_over_time.png', dpi=300, bbox_inches='tight')
print("Saved 'demand_over_time.png' to the 'figures' folder.")
# plt.show() # You can uncomment this to display the plot interactively


# --- GRAPH 2: BOX PLOT OF DEMAND BY CONSUMER TYPE ---
print("\nGenerating Box Plot by Consumer Type...")
plt.figure(figsize=(10, 6))
sns.boxplot(x='consumer_type', y='historical_avg_demand', data=df)
plt.title('Energy Demand Distribution by Consumer Type', fontsize=16)
plt.xlabel('Consumer Type', fontsize=12)
plt.ylabel('Average Demand (kWh)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6, axis='y')
# Save the figure to the 'figures' subfolder
plt.savefig('figures/demand_boxplot.png', dpi=300, bbox_inches='tight')
print("Saved 'demand_boxplot.png' to the 'figures' folder.")
# plt.show() # You can uncomment this to display the plot interactively

print("\nScript finished. Your graphs are ready to be inserted into your LaTeX document.")


KeyError: 'timestamp'

In [5]:
print("\n[3/3] Generating Correlation Heatmap...")
plt.figure(figsize=(10, 8))
numeric_cols = df.select_dtypes(include=['float64', 'int64'])
sns.heatmap(numeric_cols.corr(), annot=True, fmt=".2f", cmap='coolwarm', linewidths=.5)
plt.title('Correlation Matrix of Numerical Variables', fontsize=16)
plt.savefig('figures/correlation_heatmap.png', dpi=300, bbox_inches='tight')
print(" -> Saved 'correlation_heatmap.png' to 'figures' folder.")
plt.close()


[3/3] Generating Correlation Heatmap...
 -> Saved 'correlation_heatmap.png' to 'figures' folder.


In [7]:
from scipy import stats
# Table A: Descriptive Statistics
print("\n--- LaTeX Code for Descriptive Statistics Table (Table \\ref{tab:desc_stats}) ---")
desc_stats = df[['historical_avg_demand', 'temperature', 'humidity', 'price_signal']].describe()
# Using .to_latex() to generate a ready-to-paste table
print(desc_stats.to_latex(float_format="%.2f", caption="Descriptive Statistics of Key Numerical Variables", label="tab:desc_stats"))
print("-" * 50)


# Numbers for T-Test and Formula Simulation
print("\n--- Numbers for T-Test Table (Table \\ref{tab:ttest_results}) and Formula Simulation ---")
weekday_demand = df[df['is_weekend'] == 0]['historical_avg_demand']
weekend_demand = df[df['is_weekend'] == 1]['historical_avg_demand']
t_stat, p_value = stats.ttest_ind(weekday_demand, weekend_demand, nan_policy='omit')

print(f"Weekday Mean: {weekday_demand.mean():.2f}, Weekend Mean: {weekend_demand.mean():.2f}")
print(f"Numerator for formula simulation (X1_bar - X2_bar): {weekday_demand.mean() - weekend_demand.mean():.2f}")
print("\n---\n")
print(f"To fill Table \\ref{{tab:ttest_results}}:")
print(f"  Weekday N: {len(weekday_demand)}, Mean: {weekday_demand.mean():.2f}")
print(f"  Weekend N: {len(weekend_demand)}, Mean: {weekend_demand.mean():.2f}")
print(f"  t-statistic: {t_stat:.2f}")
print(f"  p-value: {'<0.001' if p_value < 0.001 else f'{p_value:.3f}'}") # Format p-value nicely
print("-" * 50)


# --- SCRIPT FINISH ---
print("\n--- Analysis Script Finished Successfully ---")
print("All graphs are saved in the 'figures' folder.")
print("Copy and paste the LaTeX code from the console into your .tex file.")


--- LaTeX Code for Descriptive Statistics Table (Table \ref{tab:desc_stats}) ---
\begin{table}
\caption{Descriptive Statistics of Key Numerical Variables}
\label{tab:desc_stats}
\begin{tabular}{lrrrr}
\toprule
 & historical_avg_demand & temperature & humidity & price_signal \\
\midrule
count & 720.00 & 720.00 & 720.00 & 720.00 \\
mean & 0.46 & 0.46 & 0.51 & 0.52 \\
std & 0.19 & 0.14 & 0.29 & 0.20 \\
min & 0.00 & 0.00 & 0.00 & 0.00 \\
25% & 0.32 & 0.36 & 0.24 & 0.38 \\
50% & 0.49 & 0.46 & 0.52 & 0.52 \\
75% & 0.61 & 0.55 & 0.75 & 0.67 \\
max & 1.00 & 1.00 & 1.00 & 1.00 \\
\bottomrule
\end{tabular}
\end{table}

--------------------------------------------------

--- Numbers for T-Test Table (Table \ref{tab:ttest_results}) and Formula Simulation ---
Weekday Mean: 0.52, Weekend Mean: 0.29
Numerator for formula simulation (X1_bar - X2_bar): 0.23

---

To fill Table \ref{tab:ttest_results}:
  Weekday N: 528, Mean: 0.52
  Weekend N: 192, Mean: 0.29
  t-statistic: 16.84
  p-value: <0.001
----