In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib
import phik
from phik import resources, report

# Import necessary libraries

# Load the dataset
data = pd.read_csv('../data/electric-consumption (2).csv')

# Define features and target variable
features = ['oc3', 'al4', 'pet2', 'can', 'gbl', 'sachet', 'enmix', 'date_type', 'status_process']
target = 'electric_consume'

# Split the data into training and testing sets
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline for numerical and categorical features
numeric_features = ['oc3', 'al4', 'pet2', 'can', 'gbl', 'sachet', 'enmix']
categorical_features = ['date_type', 'status_process']

preprocessor = ColumnTransformer(
  transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(), categorical_features)
  ])

# Create a pipeline with preprocessing and model
model = Pipeline(steps=[
  ('preprocessor', preprocessor),
  ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Save the model
joblib.dump(model, 'electric_consumption_model.pkl')
# Perform correlation analysis using phik

# Calculate the correlation matrix
phik_matrix = data.phik_matrix(interval_cols=numeric_features)

# Display the correlation matrix
print(phik_matrix)

# Save the correlation matrix to a CSV file
phik_matrix.to_csv('phik_correlation_matrix.csv')
# Data exploration
print("Data Head:")
print(data.head())

print("\nData Info:")
print(data.info())

print("\nData Description:")
print(data.describe())

print("\nMissing Values:")
print(data.isnull().sum())

print("\nUnique Values in Categorical Features:")
for col in categorical_features:
  print(f"{col}: {data[col].nunique()} unique values")

Mean Absolute Error: 12123.323630183251




                  date       day       oc3       al4      pet2       can  \
date               1.0  1.000000  1.000000  1.000000  1.000000  1.000000   
day                1.0  1.000000  0.000000  0.000000  0.269114  0.273931   
oc3                1.0  0.000000  1.000000  0.089138  0.111741  0.495239   
al4                1.0  0.000000  0.089138  1.000000  0.092701  0.100432   
pet2               1.0  0.269114  0.111741  0.092701  1.000000  0.216725   
can                1.0  0.273931  0.495239  0.100432  0.216725  1.000000   
gbl                1.0  0.275802  0.027664  0.046758  0.306762  0.113006   
sachet             1.0  0.377470  0.170276  0.175343  0.204076  0.200066   
enmix              1.0  0.645439  0.000000  0.148316  0.305503  0.252534   
date_type          1.0  0.761246  0.243230  0.142893  0.329724  0.344650   
status_process     1.0  0.223569  0.303069  0.413792  0.200920  0.204542   
electric_consume   1.0  0.347886  0.000000  0.598071  0.935525  0.891306   

           

In [3]:
# Load the saved model
loaded_model = joblib.load('electric_consumption_model.pkl')

# Make predictions on the test set
new_predictions = loaded_model.predict(X_test)

# Display the predictions
print(new_predictions)

[ 87051.04        97803.2         94893.92       105765.6
  73635.52        71108.96        89912.48        85754.72
  52665.55296721  75970.72        98465.76       105210.24
  52665.55296721  71430.24        99821.44        83024.
  66610.24        77311.36        71411.52        71497.28
  88987.04        95072.32        83670.24       103077.44
  95055.2         70855.68        47960.29243054 100294.88
 107229.76        64411.50628571  75359.04       110749.44
  87022.56        83743.84        87527.04        78370.24
  78088.          98385.44        85896.48        69901.76
  89303.36        57288.05818182  80765.6        100990.24
  86617.76        86568.64        52665.55296721  87072.64
 117291.2         74340.48       103316.48        91373.92
  75830.24        91391.2         62593.76       101081.6
  82884.48        80147.84        97589.36       103028.16
  75549.76        68609.12        93919.68        97163.04
 102061.12        79015.2         99729.92        74838.4
  

In [5]:
# Create dummy data
sample_data_input = pd.DataFrame({
  'oc3': [0.5, 0.7],
  'al4': [0.3, 0.6],
  'pet2': [0.2, 0.8],
  'can': [0.1, 0.4],
  'gbl': [0.5, 0.9],
  'sachet': [0.3, 0.7],
  'enmix': [0.4, 0.6],
  'date_type': ['workday', 'weekend'],
  'status_process': [1, 1]
})

# Make predictions on the dummy data
dummy_predictions = loaded_model.predict(sample_data_input)

# Display the predictions
print(dummy_predictions)

[57288.05818182 65476.09777778]
