# [Go to Training Code](#training-code)
# [Go to Inference Code](#inference-code)

### Ignore Future Warnings

In [None]:
import warnings

# Ignore all future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Training Code


# Import Dependencies

# <a id="training-code"></a>
# Your training code starts here

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import joblib
import lightgbm as lgb

# Reading the Dataset

In [None]:
# Read the raw training and test tables from CSV files in the working directory
train_data = pd.read_csv(filepath_or_buffer='Train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

### Encoding:
- Encoding is the most crucial preprocessing step. Our models need numeric data, so converting the categorical columns to numeric ones is essential. We used the `OneHotEncoder`.

In [None]:
# Categorical columns that are replaced by numeric indicator columns
columns_to_encode = ['connection_type', 'location']

# Build an encoder that returns a dense array. drop='first' removes one level
# per column to avoid multicollinearity.
# scikit-learn 1.2 renamed the dense-output flag from `sparse` to
# `sparse_output` and 1.4 removed the old name, so try the current spelling
# first and fall back only on installations that predate it.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, drop='first')

# Learn the categories from the training data and encode it in one step.
# The fitted encoder is reused later in the notebook to transform the test data.
encoded_columns = onehot_encoder.fit_transform(train_data[columns_to_encode])

# Column names generated by the encoder, e.g. 'location_LAHORE'
encoded_column_names = onehot_encoder.get_feature_names_out(columns_to_encode)

# Wrap the encoded array in a DataFrame that carries train_data's index, so the
# column-wise concatenations below line rows up correctly even if train_data
# does not have a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoded_column_names,
    index=train_data.index,
)

# Preview frame: original columns minus the categorical ones, plus the indicators
df_encoded = pd.concat([train_data.drop(columns=columns_to_encode), encoded_df], axis=1)

# Display the resulting DataFrame
df_encoded.head(2)



Unnamed: 0,timestamp,system_id,generation_W,load_W,panels_capacity,load_capacity,date,tavg,tmin,tmax,...,location_MARDAN,location_MULTAN,location_PATTOKI,location_PESHAWAR,location_QUETTA,location_RAWALPINDI,location_SHEIKHUPURA,location_SIALKOT,location_SUKKUR,location_SWAT
0,2023-08-01 11:00:00,3,586.00003,7784.506767,10.35,10.0,2023-08-01,28.6,27.1,30.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-08-01 11:10:00,3,573.791696,7783.528568,10.35,10.0,2023-08-01,28.6,27.1,30.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Concatenate the encoded columns onto the original dataset before continuing**

In [None]:
# Append the one-hot indicator columns to the right of the training frame
train_data = pd.concat(objs=[train_data, encoded_df], axis='columns')

**After encoding, the original categorical columns are redundant, so they are dropped**

In [None]:
# Remove the raw categorical columns (now one-hot encoded) and the 'date' column
train_data = train_data.drop(columns=['connection_type', 'location', 'date'])

In [None]:
# Parse the timestamp column into datetimes so the .dt accessor can be used on it
train_data['timestamp'] = train_data['timestamp'].pipe(pd.to_datetime)

## Feature Engineering

In [None]:
# Calendar components read straight from the parsed timestamp
train_ts = train_data['timestamp'].dt
train_data['year'] = train_ts.year
train_data['month'] = train_ts.month
train_data['day'] = train_ts.day
train_data['hour'] = train_ts.hour
train_data['minute'] = train_ts.minute
train_data['day_of_week'] = train_ts.dayofweek

# Cyclical encodings: map hour (period 24) and month (period 12) onto a circle
train_hour_angle = 2 * np.pi * train_data['hour'] / 24
train_data['hour_sin'] = np.sin(train_hour_angle)
train_data['hour_cos'] = np.cos(train_hour_angle)
train_month_angle = 2 * np.pi * train_data['month'] / 12
train_data['month_sin'] = np.sin(train_month_angle)
train_data['month_cos'] = np.cos(train_month_angle)

# The raw timestamp is no longer needed once its components are extracted
train_data = train_data.drop(columns=['timestamp'])

In [None]:
# Preview the first two rows to check the engineered time features
train_data.head(2)

Unnamed: 0,system_id,generation_W,load_W,panels_capacity,load_capacity,tavg,tmin,tmax,prcp,wdir,...,year,month,day,hour,minute,day_of_week,hour_sin,hour_cos,month_sin,month_cos
0,3,586.00003,7784.506767,10.35,10.0,28.6,27.1,30.5,0.6,249.0,...,2023,8,1,11,0,1,0.258819,-0.965926,-0.866025,-0.5
1,3,573.791696,7783.528568,10.35,10.0,28.6,27.1,30.5,0.6,249.0,...,2023,8,1,11,10,1,0.258819,-0.965926,-0.866025,-0.5


**Split the dataset**

In [None]:
# Feature matrix: every column except the two prediction targets
X = train_data.drop(columns=['generation_W', 'load_W'])

# One target series per model (generation and load)
y_generation, y_load = (train_data[target] for target in ('generation_W', 'load_W'))

In [None]:
# Split features and both targets in a single call so that every returned piece
# is guaranteed to come from the same 80/20 row partition. Splitting each target
# separately only stayed aligned because test_size and random_state were
# repeated identically.
X_train, X_test, y_train_gen, y_test_gen, y_train_load, y_test_load = train_test_split(
    X, y_generation, y_load, test_size=0.2, random_state=42
)

## Defining the model with hyperparameters

In [None]:
# Depth limit shared by both models. num_leaves is derived from it (2^max_depth)
# so the two settings cannot drift apart when one is edited.
max_depth = 8
num_leaves = 2 ** max_depth

# NOTE(review): the training log saved in this notebook reports trees with
# depth 10 and roughly 960 leaves, which these settings cannot produce —
# confirm which configuration produced the saved models.

# Hyperparameters common to the generation and load regressors.
# verbosity=2 makes LightGBM emit its [Debug] log lines during training.
lgb_params = dict(
    n_estimators=8000,
    learning_rate=0.1,
    max_depth=max_depth,
    num_leaves=num_leaves,
    random_state=42,
    verbosity=2,
    n_jobs=-1,
)

# Two independent regressors with identical settings, one per target
lgb_model_gen = lgb.LGBMRegressor(**lgb_params)
lgb_model_load = lgb.LGBMRegressor(**lgb_params)

### Fit Gen Model

In [None]:
# Time the fit of lgb_model_gen. perf_counter is a monotonic clock intended for
# measuring elapsed intervals, so the result is unaffected by system clock changes.
start_time_gen = time.perf_counter()

# Fit the generation model. The hold-out split is passed as eval_set with the
# MAE metric; no early-stopping callback is passed here.
lgb_model_gen.fit(
    X_train,
    y_train_gen,
    eval_set=[(X_test, y_test_gen)],
    eval_metric='mae'
)

end_time_gen = time.perf_counter()
training_time_gen = end_time_gen - start_time_gen
print(f"Training time for lgb_model_gen: {training_time_gen/60} minutes")

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.070991
[LightGBM] [Debug] init for col-wise cost 0.000015 seconds, init for row-wise cost 0.162477 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2041
[LightGBM] [Info] Number of data points in the train set: 3854797, number of used features: 40
[LightGBM] [Info] Start training from score 1503.238151
[LightGBM] [Debug] Trained a tree with leaves = 958 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 967 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 967 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 982 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 991 and depth = 10
[LightGBM] [Debug] Trained a tre

### Save Gen Model

In [None]:
# Persist the fitted generation model to disk with joblib
model_filename = 'lgb_model_gen_20keh10n.pkl'
joblib.dump(value=lgb_model_gen, filename=model_filename)
print(f"Model saved to {model_filename}")

## Fit Load Model

In [None]:
# Time the fit of lgb_model_load. perf_counter is a monotonic clock intended for
# measuring elapsed intervals, so the result is unaffected by system clock changes.
start_time_load = time.perf_counter()

# Fit the load model. The hold-out split is passed as eval_set with the
# MAE metric; no early-stopping callback is passed here.
lgb_model_load.fit(
    X_train,
    y_train_load,
    eval_set=[(X_test, y_test_load)],
    eval_metric='mae'
)

end_time_load = time.perf_counter()
training_time_load = end_time_load - start_time_load
print(f"Training time for lgb_model_load: {training_time_load/60} minutes")

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.070991
[LightGBM] [Debug] init for col-wise cost 0.000009 seconds, init for row-wise cost 0.179227 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065320 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2041
[LightGBM] [Info] Number of data points in the train set: 3854797, number of used features: 40
[LightGBM] [Info] Start training from score 1591.642914
[LightGBM] [Debug] Trained a tree with leaves = 977 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 970 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 963 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 974 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 970 and depth = 10
[LightGBM] [Debug] Trained a tre

## Save Load Model

In [None]:
# Persist the fitted load model to disk with joblib
model_filename = 'lgb_model_load_20keh10n.pkl'
joblib.dump(value=lgb_model_load, filename=model_filename)
print(f"Model saved to {model_filename}")

### Make predictions

In [None]:
# Score the hold-out features with each fitted model (generation first, then load)
y_pred_gen, y_pred_load = (
    model.predict(X_test) for model in (lgb_model_gen, lgb_model_load)
)

### Calculate MAE for generation and load

In [None]:
# Mean absolute error of each model on the hold-out split
mae_gen = mean_absolute_error(y_true=y_test_gen, y_pred=y_pred_gen)
mae_load = mean_absolute_error(y_true=y_test_load, y_pred=y_pred_load)

# Display both scores as the cell output
(mae_gen, mae_load)

(167.00286409694405, 213.8922177874193)

### Inference Code


# <a id="inference-code"></a>
# Your inference code starts here

### Reading the Data

In [None]:
# Reload the raw test table so inference starts from an unmodified frame
test_data = pd.read_csv(filepath_or_buffer='test.csv')
test_data.head(n=5)

### Encoding

In [None]:
# Categorical columns to one-hot encode (same columns as in training)
columns_to_encode = ['connection_type', 'location']

# Transform only: reuse the encoder already fitted on the training data so the
# test frame gets exactly the same indicator columns. It must not be refitted here.
encoded_columns = onehot_encoder.transform(test_data[columns_to_encode])

# Column names generated by the encoder
encoded_column_names = onehot_encoder.get_feature_names_out(columns_to_encode)

# Wrap the encoded array in a DataFrame that carries test_data's index, so the
# column-wise concatenations below line rows up correctly even if test_data
# does not have a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoded_column_names,
    index=test_data.index,
)

# Preview frame: original columns minus the categorical ones, plus the indicators
df_encoded = pd.concat([test_data.drop(columns=columns_to_encode), encoded_df], axis=1)

# Display the resulting DataFrame
df_encoded.head(2)

Unnamed: 0,test_id,system_id,timestamp,generation_W,load_W,panels_capacity,load_capacity,date,tavg,tmin,...,location_MARDAN,location_MULTAN,location_PATTOKI,location_PESHAWAR,location_QUETTA,location_RAWALPINDI,location_SHEIKHUPURA,location_SIALKOT,location_SUKKUR,location_SWAT
0,8RLHZP9Q,32,2023-08-14 02:00:00,0.0,1954.66668,10.465,10.0,2023-08-14,27.6,25.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AOOUUP1G,32,2023-08-14 02:10:00,0.0,1947.9,10.465,10.0,2023-08-14,27.6,25.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Append the one-hot indicator columns to the right of the test frame
test_data = pd.concat(objs=[test_data, encoded_df], axis='columns')

In [None]:
# Remove the raw categorical columns (now one-hot encoded) and the 'date' column
test_data = test_data.drop(columns=['connection_type', 'location', 'date'])

In [None]:
# Parse the timestamp column into datetimes so the .dt accessor can be used on it
test_data['timestamp'] = test_data['timestamp'].pipe(pd.to_datetime)

### Feature Engineering

In [None]:
# Calendar components read straight from the parsed timestamp
test_ts = test_data['timestamp'].dt
test_data['year'] = test_ts.year
test_data['month'] = test_ts.month
test_data['day'] = test_ts.day
test_data['hour'] = test_ts.hour
test_data['minute'] = test_ts.minute
test_data['day_of_week'] = test_ts.dayofweek

# Cyclical encodings: map hour (period 24) and month (period 12) onto a circle
test_hour_angle = 2 * np.pi * test_data['hour'] / 24
test_data['hour_sin'] = np.sin(test_hour_angle)
test_data['hour_cos'] = np.cos(test_hour_angle)
test_month_angle = 2 * np.pi * test_data['month'] / 12
test_data['month_sin'] = np.sin(test_month_angle)
test_data['month_cos'] = np.cos(test_month_angle)

In [None]:
# List the training frame's columns for comparison with the test frame's columns
train_data.columns

Index(['system_id', 'generation_W', 'load_W', 'panels_capacity',
       'load_capacity', 'tavg', 'tmin', 'tmax', 'prcp', 'wdir', 'wspd', 'pres',
       'connection_type_RESIDENTIAL', 'location_DERA ISMAIL KHAN',
       'location_FAISALABAD', 'location_GUJRANWALA', 'location_HYDERABAD',
       'location_ISLAMABAD', 'location_KAMRA', 'location_KARACHI',
       'location_LAHORE', 'location_LARKANA', 'location_MARDAN',
       'location_MULTAN', 'location_PATTOKI', 'location_PESHAWAR',
       'location_QUETTA', 'location_RAWALPINDI', 'location_SHEIKHUPURA',
       'location_SIALKOT', 'location_SUKKUR', 'location_SWAT', 'year', 'month',
       'day', 'hour', 'minute', 'day_of_week', 'hour_sin', 'hour_cos',
       'month_sin', 'month_cos'],
      dtype='object')

In [None]:
# List the test frame's columns; the model's feature columns must all be present here
test_data.columns

Index(['test_id', 'system_id', 'timestamp', 'generation_W', 'load_W',
       'panels_capacity', 'load_capacity', 'tavg', 'tmin', 'tmax', 'prcp',
       'wdir', 'wspd', 'pres', 'connection_type_RESIDENTIAL',
       'location_DERA ISMAIL KHAN', 'location_FAISALABAD',
       'location_GUJRANWALA', 'location_HYDERABAD', 'location_ISLAMABAD',
       'location_KAMRA', 'location_KARACHI', 'location_LAHORE',
       'location_LARKANA', 'location_MARDAN', 'location_MULTAN',
       'location_PATTOKI', 'location_PESHAWAR', 'location_QUETTA',
       'location_RAWALPINDI', 'location_SHEIKHUPURA', 'location_SIALKOT',
       'location_SUKKUR', 'location_SWAT', 'year', 'month', 'day', 'hour',
       'minute', 'day_of_week', 'hour_sin', 'hour_cos', 'month_sin',
       'month_cos'],
      dtype='object')

In [None]:
# Use exactly the columns, in exactly the order, that the models were fitted on.
# Deriving the list from X_train (the frame passed to .fit) keeps inference in
# sync with training if the encoder categories or engineered features change,
# instead of relying on a hand-copied list of names.
features = X_train.columns.tolist()

# Fail early with a clear message if the test frame lacks any model feature
missing_features = [name for name in features if name not in test_data.columns]
assert not missing_features, f"test_data is missing model features: {missing_features}"

## Inference and the submission file creation

In [None]:
df_test = test_data

# Rows to predict: those where BOTH targets carry the -1 mask value.
# NOTE(review): rows where only one of the two targets is masked are not
# selected — confirm that this is intended.
is_masked = (df_test['generation_W'] == -1) & (df_test['load_W'] == -1)

# Take an explicit copy so the assignments below write into an independent frame
# rather than a slice of df_test (which raises SettingWithCopyWarning).
masked_data = df_test.loc[is_masked].copy()

# Restrict to the model's feature columns, in the order given by `features`
masked_data_features = masked_data[features]

# Predict generation and load once, only for the masked rows
masked_data['generation_W'] = lgb_model_gen.predict(masked_data_features)
masked_data['load_W'] = lgb_model_load.predict(masked_data_features)

# Prepare the submission file, including only the necessary columns
submission = masked_data[['test_id', 'system_id', 'timestamp', 'generation_W', 'load_W']]

# Save the submission file in the required format
submission.to_csv('light_8keh10n.csv', index=False)

print("Submission file created successfully.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  masked_data['generation_W'] = lgb_model_gen.predict(masked_data_features)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  masked_data['load_W'] = lgb_model_load.predict(masked_data_features)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  masked_data['generation_W'] = lgb_model_gen.predict(masked_dat

Submission file created successfully.
