In [1]:
import helper as h

import pandas as pd
import numpy as np
np.random.seed(0)

import re
import sys

import matplotlib.pyplot as plt

from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

import xgboost as xgb
import lightgbm as lgb
import catboost as ctb

from sklearn.preprocessing import minmax_scale
from tqdm import tqdm

import seaborn as sns

from functools import partial
from hyperopt import hp
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

%matplotlib inline

In [2]:
def get_feats(df):
    """Return the labels of the numeric feature columns of ``df``.

    Columns are picked with ``select_dtypes(include=[int, float])``; the
    ``'CO'``, ``'id'`` and ``'CO_log'`` columns are then removed.

    Returns:
        NumPy array of column labels, in the frame's column order.
    """
    excluded = ['CO', 'id', 'CO_log']
    numeric_cols = df.select_dtypes(include=[int, float]).columns
    keep_mask = ~numeric_cols.isin(excluded)
    return numeric_cols[keep_mask].values

def get_X(df):
    """Return the feature matrix of ``df``: the columns chosen by ``get_feats``, as a NumPy array."""
    feature_names = get_feats(df)
    feature_frame = df[feature_names]
    return feature_frame.values

def get_y(df, target_var='CO'):
    """Return the values of the ``target_var`` column of ``df`` (``'CO'`` by default)."""
    target = df[target_var]
    return target.values

def get_models():
    """Return the baseline models as a list of ``(name, estimator)`` pairs.

    One ``DummyRegressor`` is built per strategy ('mean', 'median') and named
    ``'dummy_<strategy>'``.
    """
    strategies = ('mean', 'median')
    return [
        ('dummy_' + strategy, DummyRegressor(strategy=strategy))
        for strategy in strategies
    ]

def run_cv(model, X, y, folds=4, target_log=False,cv_type=KFold, success_metric=mean_absolute_error):
    """Cross-validate ``model`` and return the mean and std of the per-fold scores.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refit
            on the training part of every fold.
        X: Feature matrix. NumPy arrays are used as-is; pandas objects are
            converted to arrays.
        y: Target values, aligned with the rows of ``X``.
        folds: Passed as ``n_splits`` to ``cv_type``.
        target_log: If True, the model is fitted on ``np.log(y_train)`` and its
            predictions are mapped back with ``np.exp`` before scoring, so the
            score is always computed on the original target scale.
        cv_type: Splitter class, instantiated as ``cv_type(n_splits=folds)``.
        success_metric: Callable ``(y_true, y_pred) -> float``.

    Returns:
        Tuple ``(mean_score, std_score)`` over the folds.

    Raises:
        ValueError: If ``target_log`` is True and a training target is <= 0,
            because its logarithm would be NaN or -inf.
    """
    # The splitter yields positional indices, while plain ``[]`` on a
    # DataFrame/Series is label-based; convert pandas inputs to arrays first.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.values
    if isinstance(y, (pd.DataFrame, pd.Series)):
        y = y.values

    cv = cv_type(n_splits=folds)

    scores = []
    for train_idx, test_idx in cv.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if target_log:
            if np.any(y_train <= 0):
                raise ValueError(
                    "target_log=True requires strictly positive training targets"
                )
            y_train = np.log(y_train)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        if target_log:
            # Back to the original scale. exp() never returns a negative
            # number, so no clipping at zero is needed afterwards.
            y_pred = np.exp(y_pred)

        scores.append(success_metric(y_test, y_pred))

    return np.mean(scores), np.std(scores)


def plot_learning_curve(model, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), target_log=False):
    """Draw the learning curve of ``model`` and return the ``plt`` module.

    The plotted score is the mean absolute error on the training and
    cross-validation parts, with a band of one standard deviation around each
    mean. With ``target_log=True`` the model is fitted on ``np.log(y)`` and both
    targets and predictions are mapped back with ``np.exp`` before scoring.

    Args:
        model: Estimator passed to ``learning_curve``.
        title: Figure title.
        X, y: Training data and targets.
        ylim: Optional ``(low, high)`` limits for the y axis.
        cv, n_jobs, train_sizes: Forwarded to ``learning_curve``.
        target_log: Whether to fit on the log of the target.
    """
    plt.figure(figsize=(12,8))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    fit_target = np.log(y) if target_log else y

    def mae_scorer(estimator, X_part, y_part):
        # Scores on the original target scale, undoing the log when it was applied.
        predicted = estimator.predict(X_part)
        if target_log:
            y_part = np.exp(y_part)
            predicted = np.exp(predicted)
            predicted[predicted < 0] = 0
        return mean_absolute_error(y_part, predicted)

    sizes, train_scores, cv_scores = learning_curve(
        model, X, fit_target, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=mae_scorer)

    # (mean, std, colour, legend label) for each curve, training first.
    curves = [
        (np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1), colour, label)
        for fold_scores, colour, label in (
            (train_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt


def run(train, plot_lc=False, folds=3, ylim=(0, 2), target_log=False):
    """Cross-validate every model from ``get_models()`` on ``train`` and print its score.

    Args:
        train: Training frame; features come from ``get_X`` and the target from ``get_y``.
        plot_lc: Accepted for the learning-curve plot; currently has no effect.
        folds: Number of cross-validation folds passed to ``run_cv``.
        ylim: Accepted for the learning-curve plot; currently has no effect.
        target_log: Forwarded to ``run_cv``.
    """
    features, target = get_X(train), get_y(train)

    for name, estimator in get_models():
        mean_score, std_score = run_cv(estimator, features, target, folds=folds, target_log=target_log)
        # flush=True prints the result immediately instead of leaving it buffered.
        print(f"[{name}]: {mean_score} +/-{std_score}", flush=True)
        # NOTE(review): learning-curve plotting is disabled here, so ``plot_lc``
        # and ``ylim`` are not used.

In [3]:
## Adding both sqrt and log1p features worsened the score from 0.41010 to 0.42367.
## TODO: check whether sqrt alone or log1p alone helps (two more submissions, tomorrow).

In [4]:
def feature_engineering(df):
    """Add the engineered ratio and sum columns to ``df`` and return it.

    The frame is modified in place; the returned object is the same ``df``.
    Seven ratio columns (``<name> = numerator / denominator``) and the sum
    column ``AH_NOX_SUM = AH + NOX`` are added.

    Args:
        df: Frame with the columns AT, AP, AH, AFDP, GTEP, TIT, TAT, TEY and NOX.

    Returns:
        The same frame, with the eight new columns appended.

    Raises:
        KeyError: If one of the required source columns is missing.
    """
    # (new column, numerator, denominator). Columns are read with bracket
    # indexing so a label can never be mistaken for a DataFrame attribute.
    ratio_features = (
        ('AT_AH', 'AT', 'AH'),
        ('AT_AP', 'AT', 'AP'),
        ('AFDP_GTEP', 'AFDP', 'GTEP'),
        ('TIT_TAT', 'TIT', 'TAT'),
        ('GTEP_TEY', 'GTEP', 'TEY'),
        ('NOX_AP', 'NOX', 'AP'),
        ('TEY_NOX', 'TEY', 'NOX'),
    )
    for new_name, numerator, denominator in ratio_features:
        df[new_name] = df[numerator] / df[denominator]

    df['AH_NOX_SUM'] = df['AH'] + df['NOX']

    # A skew-based sqrt/log1p expansion of the features was tried here and is disabled.
    return df

In [5]:
# Load the train and test frames from the HDF5 inputs.
train = pd.read_hdf("../input/train_power_plant.h5")
test = pd.read_hdf("../input/test_power_plant.h5")
# Shifted log of the target; predictions made on this scale are mapped back with exp(...) - 2.
train['CO_log'] = np.log( train['CO'] + 2 )
# Add the engineered ratio/sum columns to the training frame.
train = feature_engineering(train)
# Feature matrix and raw target: get_y defaults to the 'CO' column, not 'CO_log'.
X_train,y_train = get_X(train), get_y(train)
def get_models():
    """Return the ``(name, estimator)`` pairs evaluated by ``run``.

    Redefines the earlier ``get_models``: the dummy baselines are replaced by a
    single ExtraTrees regressor with 200 trees and a fixed random state.
    """
    extra_trees = ExtraTreesRegressor(n_estimators=200, random_state=0)
    return [('et-n200', extra_trees)]

# 3-fold cross-validation with a log-transformed target.
# NOTE(review): the plotting code inside run() is commented out, so plot_lc=True has no effect.
# NOTE(review): run_cv transforms the target with np.log(y), while the 'CO_log' column is np.log(CO + 2).
run(train,folds=3, plot_lc=True, target_log = True)

[et-n200]: 0.4469605258527911 +/-0.023463477390586008


In [6]:
# Show the column labels of the training frame after feature engineering.
train.columns

Index(['id', 'AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP',
       'NOX', 'CO', 'CO_log', 'AT_AH', 'AT_AP', 'AFDP_GTEP', 'TIT_TAT',
       'GTEP_TEY', 'NOX_AP', 'TEY_NOX', 'AH_NOX_SUM'],
      dtype='object')

In [7]:
# Add the same engineered columns to the test frame (feature_engineering modifies the frame it is given and returns it).
test = feature_engineering(test)

In [8]:
# Feature columns are taken from the training frame; get_feats leaves out 'CO', 'id' and 'CO_log'.
feats = get_feats(train)
X_train = train[feats].values
# The final model is trained on the shifted-log target, log(CO + 2).
y_train = train["CO_log"].values

# The test frame must already contain every column listed in feats.
X_test = test[feats].values

model = ExtraTreesRegressor(n_estimators=200,max_depth=45, min_samples_leaf = 1,random_state=0)
model.fit(X_train, y_train)
# Predictions are on the log(CO + 2) scale.
test["CO_log"] = model.predict(X_test)
# Undo the transform: exp, then subtract the same offset of 2.
test['CO'] = np.exp(test['CO_log']) - 2
# Write the submission file with the id and the predicted CO only.
test[ ["id", "CO"] ].to_csv("../output/etregressor_log_fe_20220401_08.csv", index=False)