In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'renew-power-hiring-hackathon:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2434258%2F4118350%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240413%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240413T172314Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dae8267c635ca6a2fb3262ce8c9f66de51dce354f115a6eacec0da54e29ce21bd5aa6ef0a0ce11b027f6600204c4ef74f7b49c18d68c52a15d0449b1c005d9e1fb38f85d30e018435728e166751a8d8e569c0073763b4e5c95decf5db683dc0f229e219edb7a2f29a8d2771732788a580ddc2b7c5ca5638f6b63f44359c74748a9cb0c3bdfd20f08a4c8d3d134bc4df5361c2ef062f3aa9fd643fccc04738fef581851e3b44e60182a8254b66c07604b73582f01c33229daf1e56365870791423b55a96b046eb1f28700902f31e3fb404a8330d18214c769c7957b8c52fadc81a1075b8bb834aa505789bb32da97745fb3d0a9101e6661fb2915b9dab79919d86'

KAGGLE_INPUT_PATH='content/kaggle/input'
KAGGLE_WORKING_PATH='content/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from a clean input directory, then (re)create the input and working trees.
shutil.rmtree(KAGGLE_INPUT_PATH, ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working (the data-loading cells read
# from ../input). A relative symlink target is resolved against the directory
# that contains the link (".."), not against the current working directory, so
# the target is made absolute; otherwise the link would dangle.
for _source_dir, _link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
    try:
        os.symlink(os.path.abspath(_source_dir), os.path.join("..", _link_name), target_is_directory=True)
    except FileExistsError:
        # The link is already there (e.g. the cell was run before); keep it.
        pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING into
# KAGGLE_INPUT_PATH/<directory> and unpack it (zip or tar, decided by the URL path).
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent (e.g. chunked transfer); in that case the
            # progress bar stays empty instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure every byte is on disk before the archive is reopened by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Do not bind the archive to the name `tarfile`: that would replace
                # the imported module and break the next loop iteration / re-run.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


![Add a heading (1).jpg](attachment:bcaaf972-a5b8-4c23-a9fd-f1f3e71d173e.jpg)

## <h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> INTRODUCTION </h2>

<div style="color:white;
           display:fill;
           background-color:#4E944F;
           padding: 10px;
           font-family:Verdana;
           letter-spacing:0.5px">
    <h3 style="color:white;padding-left:20px"><b>Problem Statement 🧠</b></h3>
       <p style="color:white;font-size:110%;padding-left:50px">
           1. Unplanned downtime of wind turbines can result in a significant loss of revenue and energy and can easily scale to millions of dollars a year. <br>
           2. It is therefore pivotal that flagging of the failure of components is made to prevent further loss and perform maintenance. <br>
           3. It, however, involves replacement of components and higher costs.
 <br>
           4. The company has shared minute-wise normalised data of wind speed, power and temperature data for multiple components of a wind turbine.  <br>
           5. The company is looking to create a model to get an ideally functioning turbine’s expected rotor bearing temperature. It will then use the model to check the deviation of the actual rotor bearing temperature of the faulty turbine from the expected temperature.
        </p>
    </div>
    
<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    📌 &nbsp;It is to be noted that the DATA is not a time series-based prediction, as data is divided randomly.
</div>

## <p style='color:white;font-size:1%;'>Import Libraries & Dataset</p>
## <h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> IMPORTING LIBRARIES & DATASETS </h2>

In [None]:
!pip install --pre pycaret --q
!pip install optuna --q
!pip install flaml --q

In [None]:
import pandas as pd # Python library for data analysis and data frame
import numpy as np # Numerical Python library for linear algebra and computations
pd.set_option('display.max_columns', None) # code to display all columns

# Visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Findout best performing models
from pycaret.regression import *

# all the required models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import SGDRegressor

# for evaluation
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
from sklearn.model_selection import cross_val_score

# for hyperparameter tuning
import optuna

#auto ml
from flaml import AutoML

# to save the trained model
import pickle

import warnings
warnings.filterwarnings("ignore") # To prevent kernel from showing any warning

# set the color palette
palette = sns.color_palette(["#E9EFC0",'#B4E197','#83BD75','#4E944F'])
sns.palplot(palette) # print color palette

Loading the train, test and sample submission datasets.

In [None]:
train_df = pd.read_csv('../input/renew-power-hiring-hackathon/ReNew_Participants_Data/train.csv')
train_df.sample(5)

In [None]:
test_df = pd.read_csv('../input/renew-power-hiring-hackathon/ReNew_Participants_Data/test.csv')
test_df.sample(5)

In [None]:
submission_df = pd.read_csv('../input/renew-power-hiring-hackathon/ReNew_Participants_Data/submission.csv')
submission_df.sample(5)

## <p style='color:white;font-size:1%;'>Combine Train & Test Dataset</p>
## <h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> COMBINE TRAIN & TEST DATA </h2>

<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    📌 &nbsp;Combining both the datasets, so I won't have to perform all the preprocessing/transformation steps separately on train and test data sets.
</div>


In [None]:
# Keep the label aside before the feature frames are merged.
target = train_df['Target']

# Drop the timestamp and the label from train so it lines up column-wise with test.
train_df1 = train_df.drop(columns=['timestamp', 'Target'])

# Stack train on top of test and renumber the rows 0..n-1.
df = pd.concat([train_df1, test_df], ignore_index=True)

Since this is a large dataset, computation will take a long time and may hit the RAM and CPU limits. It is therefore better to shrink the data by assigning each column the smallest suitable data type.

In [None]:
def reduce_mem_usage(data):
    """Downcast the columns of ``data`` in place to smaller dtypes and report the saving.

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range contains the column's min and max;
    * float columns are cast to the narrowest of float16 / float32 / float64
      whose range contains the column's min and max;
    * every other column (bool, datetime, category, other extension dtypes) is
      left untouched, so the function can safely be run more than once.

    NOTE: float16 only keeps about three significant decimal digits; the range
    check guards against overflow, not against loss of precision.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    None
    """
    start_mem = data.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # numpy dtype kind -> (limits lookup, candidate types from narrowest to widest)
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
        'f': (np.finfo, (np.float16, np.float32, np.float64)),
    }

    for col in data.columns:
        col_type = data[col].dtype

        if col_type == object:
            data[col] = data[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, ...) and numpy kinds other
        # than int/uint/float (bool, datetime, ...) have no meaningful downcast here.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        limits_of, candidates = candidates_by_kind[col_type.kind]
        c_min = data[col].min()
        c_max = data[col].max()
        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type's limit still fits.
            if c_min >= limits.min and c_max <= limits.max:
                data[col] = data[col].astype(candidate)
                break
        # If nothing matched (all-NaN or infinite values) the column keeps its dtype.

    end_mem = data.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return None




In [None]:
reduce_mem_usage(df)

As you can see, the function above helped a lot: it reduced the data size by 75%! <br>
Now let's try to understand the data...

In [None]:
# what data looks like
df.head()

In [None]:
# whats the size of the data
df.shape

In [None]:
# what kinda datatypes data has
df.info()

In [None]:
# number of unique categories
df['turbine_id'].value_counts()

In [None]:
# number of null values in data
df.isnull().sum()

In [None]:
# basic stats of data
df.describe()

<div style="color:white;
           display:fill;
           background-color:#4E944F;
           padding: 10px;
           font-family:Verdana;
           letter-spacing:0.5px">
    <h3 style="color:white;padding-left:20px"><b>Observations 👀</b></h3>
       <p style="color:white;font-size:110%;padding-left:50px">
           1. Data contains 1212806 rows and 14 columns <br>
           2. It has one string/object column, remaining all columns have float dtype <br>
           3. There are no null values within data <br>
        </p>
    </div>

## <p style='color:white;font-size:1%;'>EDA</p>
<h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> EDA </h2>

Performing EDA on the train data, because it contains the target column.

In [None]:
# Correlate the numeric columns only: train_df also carries non-numeric columns
# (e.g. turbine_id), and DataFrame.corr() raises on those in pandas >= 2.0.
corr = train_df.select_dtypes(include='number').corr()
plt.figure(figsize=(20, 8))
sns.heatmap(corr, cmap="YlGnBu", annot=True)
plt.show()

In [None]:
# One panel per float column on a 7x2 grid (room for up to 14 columns).
_, ax1 = plt.subplots(7,2, figsize=(25,40))
for i, col in enumerate(train_df.columns[train_df.dtypes==float]):
    # histplot(stat='density', kde=True) replaces the deprecated sns.distplot:
    # a density-normalised histogram with a KDE curve on top.
    sns.histplot(train_df[col], ax=ax1[i//2, i%2], color=palette[1], kde=True, stat='density')

plt.show()

In [None]:
# One horizontal box plot per float column on a 7x2 grid (room for up to 14 columns).
_, ax1 = plt.subplots(7,2, figsize=(25,40))
for i, col in enumerate(train_df.columns[train_df.dtypes==float]):
    # Pass the values as x= explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(x=train_df[col], ax=ax1[i//2, i%2], color=palette[1])

plt.show()

<div style="color:white;
           display:fill;
           background-color:#4E944F;
           padding: 10px;
           font-family:Verdana;
           letter-spacing:0.5px">
    <h3 style="color:white;padding-left:20px"><b>Observations 👀</b></h3>
       <p style="color:white;font-size:110%;padding-left:50px">
           Since I will be using tree based models, I won't <br>
                1.scale the data<br>
                2.transform the features<br>
                3.handle the outliers<br>
           Tree based models are not affected by the outliers, scale and distribution of the data <br>
        </p>
    </div>
    

## <p style='color:white;font-size:1%;'>Preprocessing</p>
<h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> PREPROCESSING </h2>

Encoding categorical feature

In [None]:
# One-hot encode the non-numeric columns of the combined frame; drop_first=True
# omits the first level of each encoded column so the indicator columns are not redundant.
df = pd.get_dummies(df, drop_first=True)
df.head()

Create some new features

In [None]:
# Gap between the raw active power and the value calculated by the converter.
df['difference_power_raw_conv'] = df['active_power_raw'] - df['active_power_calculated_by_converter']

# Gap between the nacelle temperature and the nc1 inside temperature.
df['difference_temp_nc'] = df['nacelle_temp'] - df['nc1_inside_temp']

# Gap between the converter-calculated reactive power and the measured one
# ('reactice' is the column's spelling in the dataset, not a typo introduced here).
df['difference_rc_power_raw_conv'] = df['reactice_power_calculated_by_converter'] - df['reactive_power']

# Ratio of the same two temperatures.
# NOTE(review): rows where nc1_inside_temp is 0 would yield inf/NaN here, and the
# columns may be float16 after reduce_mem_usage — confirm the result is finite
# before fitting models that reject infinite values.
df['nc_ratio'] = df['nacelle_temp']/df['nc1_inside_temp']

# Bucket the wind direction into bins of width 45, numbered from 1.
# NOTE(review): presumably wind_direction_raw is in degrees (0-360) — confirm, since
# the problem statement describes the data as normalised.
df['wind_direction_binned'] = df.wind_direction_raw//45 + 1

## <p style='color:white;font-size:1%;'>Split Data</p>
<h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> SPLIT THE DATA </h2>

In [None]:
# Split by position: train was concatenated first, so the first len(train_df)
# rows of df are the training rows and the rest are the test rows. Slicing
# positionally does not depend on what the index labels of train_df happen to be.
n_train_rows = len(train_df)
train_final = df.iloc[:n_train_rows, :]
test_final = df.iloc[n_train_rows:, :].reset_index(drop=True)

In [None]:
train_final.shape

In [None]:
test_final.shape

## <p style='color:white;font-size:1%;'>Model Selection</p>
<h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> MODEL SELECTION </h2>

Only passing 100,000 (1 lakh) rows to make the process faster.

In [None]:
# Initialise the PyCaret regression experiment on the first 100,000 training rows only
# (features and Target joined column-wise), evaluated with 5-fold cross-validation.
_ = setup(data=pd.concat([train_final.iloc[:100000], target.iloc[:100000]], axis=1), target='Target', fold=5) # limited data, so computation won't take much time

In [None]:
%%time
compare_models(sort='MAPE')

**Top 5 Best Performing Models**
* Extra Trees Regressor
* Random Forest Regressor
* CatBoost Regressor
* xgboost
* Light Gradient Boosting Machine

## <p style='color:white;font-size:1%;'>Baseline Models</p>
<h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> BASELINE MODELS </h2>

<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    📌 &nbsp;Many procedures, evaluations and models took too much time and memory and crashed the notebook, since we have a lot of data. So I have not run many of the code blocks, but you can run them for better results if you have a high-configuration machine.
</div>

**ExtraTreesRegressor**

In [None]:
%%time
extr_model = ExtraTreesRegressor(random_state=0, n_jobs=-1)
extr_model.fit(train_final, target)

In [None]:
-(cross_val_score(extr_model, train_final, target, n_jobs=-1, cv=5,scoring='neg_mean_absolute_percentage_error').mean())

In [None]:
extr_prediction = extr_model.predict(test_final)
extr_submission = pd.Series(extr_prediction, name='Target')
extr_submission.to_csv('./extr_submission.csv', index=False, header=True) # saving the file

**RandomForestRegressor**

In [None]:
%%time
rf_model = RandomForestRegressor(random_state=0, n_jobs=-1)
rf_model.fit(train_final, target)

In [None]:
# Mean 5-fold MAPE of the random forest (the scorer returns the negated error, hence the leading minus).
# The estimator fitted above is named rf_model; `rf` was never defined.
-(cross_val_score(rf_model, train_final, target, n_jobs=-1, cv=5,scoring='neg_mean_absolute_percentage_error').mean())

In [None]:
rf_prediction = rf_model.predict(test_final)
rf_submission = pd.Series(rf_prediction, name='Target')
rf_submission.to_csv('./rf_submission.csv', index=False, header=True)

**CatBoostRegressor**

In [None]:
%%time
catboost_model = CatBoostRegressor(verbose=0, random_state=0)
catboost_model.fit(train_final, target)

In [None]:
-(cross_val_score(catboost_model, train_final, target, n_jobs=-1, cv=5,scoring='neg_mean_absolute_percentage_error').mean())

In [None]:
catboost_prediction = catboost_model.predict(test_final)
catboost_submission = pd.Series(catboost_prediction, name='Target')
catboost_submission.to_csv('./catboost_submission.csv', index=False, header=True)

**XGBRegressor**

In [None]:
%%time
xgboost_model = XGBRegressor(n_jobs=-1, random_state=0)
xgboost_model.fit(train_final, target)

In [None]:
-(cross_val_score(xgboost_model, train_final, target, n_jobs=-1, cv=5,scoring='neg_mean_absolute_percentage_error').mean())

In [None]:
xgboost_prediction = xgboost_model.predict(test_final)
xgboost_submission = pd.Series(xgboost_prediction, name='Target')
xgboost_submission.to_csv('./xgboost_submission.csv', index=False, header=True)

**LGBMRegressor**

In [None]:
%%time
lgbm_model = LGBMRegressor(n_jobs=-1, random_state=0)
lgbm_model.fit(train_final, target)

In [None]:
lgbm_prediction = lgbm_model.predict(test_final)
lgbm_submission = pd.Series(lgbm_prediction, name='Target')
lgbm_submission.to_csv('./lgbm_submission.csv', index=False, header=True)

## <p style='color:white;font-size:1%;'>Hyperparameter Tuning</p>
<h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> HYPERPARAMETER TUNING </h2>

**ExtraTreesRegressor Parameter Tuning**  
You can do the same for the other models; however, it requires a lot of memory, so I haven't done it.

In [None]:
def objective(trial):
    """Optuna objective for tuning an ExtraTreesRegressor.

    Samples ``n_estimators`` and ``max_samples`` from fixed candidate lists,
    builds a bootstrapped ExtraTreesRegressor and scores it with 5-fold
    cross-validation on the notebook-level ``train_final`` / ``target``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.

    Returns
    -------
    float
        Mean negative MAPE over the folds (closer to zero is better, so the
        study must maximise it).
    """
    n_estimators = trial.suggest_categorical('n_estimators', [100, 300, 500])
    max_samples = trial.suggest_categorical('max_samples', [0.50, 0.70, 0.90])
    # Use the names imported at the top of the notebook; the bare `sklearn`
    # module is never imported, so `sklearn.ensemble...` raised a NameError.
    optuna_extr = ExtraTreesRegressor(
        n_estimators=n_estimators,
        random_state=42,
        max_samples=max_samples,
        bootstrap=True,  # max_samples only applies to bootstrapped trees
        oob_score=True,
        n_jobs=-1,
        verbose=1,
    )
    return cross_val_score(optuna_extr, train_final, target, n_jobs=-1, cv=5,scoring='neg_mean_absolute_percentage_error').mean()

In [None]:
# The objective returns the negative MAPE, so the study maximises it.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

trial = study.best_trial

# trial.value is the negated error, not an accuracy; flip the sign to report the actual MAPE.
print('Best CV MAPE: {}'.format(-trial.value))
print("Best hyperparameters: {}".format(trial.params))

## <p style='color:white;font-size:1%;'>Stacking</p>
<h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> STACKING </h2>

In [None]:
estimators = [
        ('xgboost', XGBRegressor(n_jobs=-1, random_state=0)),
        ('lgbm', LGBMRegressor(n_jobs=-1, random_state=0)),
        ('catboost', CatBoostRegressor(verbose=0, random_state=0))
]

stacking = StackingRegressor(
                             estimators=estimators,
                             final_estimator=SGDRegressor(random_state=0))

stacking.fit(train_final, target)

In [None]:
stacking_prediction = stacking.predict(test_final)
stacking_submission = pd.Series(stacking_prediction, name='Target')
stacking_submission.to_csv('./stacking_submission.csv', index=False, header=True)

Combining the trained models' predictions

In [None]:
# Previously saved submission files, one per model.
# NOTE(review): this dataset is not listed in DATA_SOURCE_MAPPING in the first cell,
# so it has to be attached/provided separately for this cell to run.
TRAINED_OUTPUTS_DIR = '../input/my-trained-models-outputs'

extr = pd.read_csv(f'{TRAINED_OUTPUTS_DIR}/new_extr_submission.csv')
catboost = pd.read_csv(f'{TRAINED_OUTPUTS_DIR}/catboost_submission.csv')
lgbm = pd.read_csv(f'{TRAINED_OUTPUTS_DIR}/lgbm_submission.csv')
xgboost = pd.read_csv(f'{TRAINED_OUTPUTS_DIR}/xgboost_submission.csv')

In [None]:
combination_submission = 0.4*extr + 0.2*catboost + 0.2*lgbm + 0.2*xgboost

In [None]:
combination_submission.to_csv('./combination_submission.csv', index=False, header=True)

## <p style='color:white;font-size:1%;'>Auto ML</p>
<h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> AUTO ML </h2>

In [None]:
lgbm_auto_ml = AutoML()

In [None]:
%%time
lgbm_auto_ml.fit(train_final, target, task="regression", metric='mape', estimator_list = ['lgbm'],
          n_splits=5, early_stop=True, n_jobs=-1, time_budget=4500, sample=True, seed=0, ensemble= {
        "final_estimator": SGDRegressor(random_state=0),
        "passthrough": False})

In [None]:
lgbm_auto_ml_prediction = lgbm_auto_ml.predict(test_final)
lgbm_auto_ml_submission = pd.Series(lgbm_auto_ml_prediction, name='Target')
lgbm_auto_ml_submission.to_csv('./lgbm_tuned_submission.csv', index=False, header=True) # saving the file

<h2 style="background-color:#4E944F;font-family:verdana;color:white;font-size:200%;text-align:center;letter-spacing:0.5px;padding: 10px"> If You Liked The NoteBook, Please Upvote✌  </h2>