In [21]:
!pip3 install catboost



In [22]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import math
import joblib


from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.width', 500)
pd.set_option("display.max_columns", None)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate

#Data Analysis

In [23]:

df = pd.read_csv("hitters.csv")
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [24]:

def grab_col_names(dataframe, cat_th=10, car_th=20):
    """



    Parameters
    ------
        dataframe: dataframe

        cat_th: int, optional numeric but categoric variables
        car_th: int, optinal categoric but cardinal variables

    Returns
    ------
        cat_cols: list
                categoric variables list
        num_cols: list
                numeric variables list
        cat_but_car: list
                carsdnal variables but looks like categoric

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))


    Notes
    ------
        cat_cols + num_cols + cat_but_car = total variables
        cat_cols >num_but_cat .


    """

    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    # print(f"Observations: {dataframe.shape[0]}")
    # print(f"Variables: {dataframe.shape[1]}")
    # print(f'cat_cols: {len(cat_cols)}')
    # print(f'num_cols: {len(num_cols)}')
    # print(f'cat_but_car: {len(cat_but_car)}')
    # print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

In [25]:
def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

In [26]:
def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe


In [27]:

def outlier_thresholds(dataframe, col_name, q1=0.1, q3=0.9):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


In [28]:

def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit


In [29]:
    def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
        dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=False)
        return dataframe

In [30]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [31]:

def hitters_data_prep(dataframe):

    ############ Specifying variable types ############

    cat_cols, num_cols, cat_but_car = grab_col_names(dataframe, cat_th=5, car_th=20)

    ############ We replace with thresholds ############

    for col in num_cols:
        replace_with_thresholds(dataframe, col)

    ############ remove salary bigger than up limit ############

    q3 = 0.90
    salary_up = int(dataframe["Salary"].quantile(q3))
    dataframe = dataframe[(dataframe["Salary"] < salary_up)]

    ############ Feature engineering  ############

    # New variables were created with the most appropriate variables according to their proportions.
    dataframe["new_Hits/CHits"] = dataframe["Hits"] / dataframe["CHits"]
    dataframe["new_OrtCHits"] = dataframe["CHits"] / dataframe["Years"]
    dataframe["new_OrtCHmRun"] = dataframe["CHmRun"] / dataframe["Years"]
    dataframe["new_OrtCruns"] = dataframe["CRuns"] / dataframe["Years"]
    dataframe["new_OrtCRBI"] = dataframe["CRBI"] / dataframe["Years"]
    dataframe["new_OrtCWalks"] = dataframe["CWalks"] / dataframe["Years"]

    dataframe["New_Average"] = dataframe["Hits"] / dataframe["AtBat"]
    dataframe['new_PutOutsYears'] = dataframe['PutOuts'] * dataframe['Years']
    dataframe["new_RBIWalksRatio"] = dataframe["RBI"] / dataframe["Walks"]
    dataframe["New_CHmRunCAtBatRatio"] = dataframe["CHmRun"] / dataframe["CAtBat"]
    dataframe["New_BattingAverage"] = dataframe["CHits"] / dataframe["CAtBat"]
    dataframe.dropna(inplace=True)

    ############ Binary Encoding ############
    # label encoding of categorical features (League, Division, NewLeague) with two class
    binary_cols = [col for col in dataframe.columns if dataframe[col].dtype not in
                   [int, float] and dataframe[col].nunique() == 2]

    for col in binary_cols:
        labelencoder = LabelEncoder()
        dataframe[col] = labelencoder.fit_transform(dataframe[col])

    ############ One-Hot Encoding ############
    cat_cols, num_cols, cat_but_car = grab_col_names(dataframe)
    dataframe = one_hot_encoder(dataframe, cat_cols)

    ############ MODEL ############

    y = dataframe["Salary"]
    X = dataframe.drop(["Salary"], axis=1)


    ############ Scaler ############
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y

X, y = hitters_data_prep(df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["new_Hits/CHits"] = dataframe["Hits"] / dataframe["CHits"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["new_OrtCHits"] = dataframe["CHits"] / dataframe["Years"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["new_OrtCHmRun"] = dataframe["CHmRun"] / dataframe["Years"

In [32]:


def plot_importance(model, features, num=len(X), save=False):
  feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
  plt.figure(figsize=(10, 10))
  sns.set(font_scale=1)
  sns.barplot(X="Value", y="Feature", data=feature_imp.sort_values(by="Value",
                                                                    ascending=False)[0:num])
  plt.title('Features')
  plt.tight_layout()
  plt.show()
  if save:
      plt.savefig('importances.png')
  return feature_imp



In [33]:
def base_models(X, y, scoring="roc_auc"):
    print("Base Models....")
    models = [('LR', LinearRegression()),
          ("Ridge", Ridge()),
          ("Lasso", Lasso()),
          ("ElasticNet", ElasticNet()),
          ('KNN', KNeighborsRegressor()),
          ('CART', DecisionTreeRegressor()),
          ('RF', RandomForestRegressor()),
          ('SVR', SVR()),
          ('GBM', GradientBoostingRegressor()),
          ("XGBoost", XGBRegressor(objective='reg:squarederror')),
          ("LightGBM", LGBMRegressor()),
          # ("CatBoost", CatBoostRegressor(verbose=False))
          ]
    for name, regressor in models:
        rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=10, scoring="neg_mean_squared_error")))
        print(f"RMSE: {round(rmse, 4)} ({name}) ")
  ######################################################
# Automated Hyperparameter Optimization
######################################################

cart_params = {'max_depth': range(1, 20),  # ne kadar dallanacak
               "min_samples_split": range(2, 30)}

rf_params = {"max_depth": [5, 8, 15, None],
             "max_features": [5, 7, "auto"],
             "min_samples_split": [3, 5, 8, 15, 20],
             "n_estimators": [600, 650, 1000]}

xgboost_params = {"learning_rate": [0.1, 0.01, 0.01],
                  "max_depth": [5, 8, 12, 20],
                  "n_estimators": [100, 200, 300, 500],
                  "colsample_bytree": [0.5, 0.8, 1]}

lightgbm_params = {"learning_rate": [0.001, 0.01, 0.1, 0.001],
                   "n_estimators": [250, 300, 500, 1500, 2500,3000],
                   "colsample_bytree": [0.1, 0.3, 0.5, 0.7, 1]}

regressors = [("CART", DecisionTreeRegressor(), cart_params),
              ("RF", RandomForestRegressor(), rf_params),
              ('XGBoost', XGBRegressor(objective='reg:squarederror'), xgboost_params),
              ('LightGBM', LGBMRegressor(), lightgbm_params)]





In [34]:
def hyperparameter_optimization(X, y, cv=10, scoring="neg_mean_squared_error"):
  print("Hyperparameter Optimization....")
  best_models = {}
  for name, regressor, params in regressors:
      print(f"########## {name} ##########")
      rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=cv, scoring=scoring)))
      print(f"RMSE: {round(rmse, 4)} ({name}) ")

      gs_best = GridSearchCV(regressor, params, cv=3, n_jobs=-1, verbose=False).fit(X, y)

      final_model = regressor.set_params(**gs_best.best_params_)
      rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y, cv=10, scoring="neg_mean_squared_error")))
      print(f"RMSE (After): {round(rmse, 4)} ({name}) ")

      print(f"{name} best params: {gs_best.best_params_}", end="\n\n")

      best_models[name] = final_model
  return best_models

In [35]:

# Stacking & Ensemble Learning
def voting_regressor(best_models, X, y):
    print("Voting Regressor...")
    voting_reg = VotingRegressor(estimators=[('RF', best_models["RF"]),
                                         ('LightGBM', best_models["LightGBM"])])
    voting_reg.fit(X, y)



    neg_mean_squared = np.mean(np.sqrt(-cross_val_score(voting_reg,
                                 X, y,
                                 cv=10,
                                 scoring="neg_mean_squared_error")))
    print(f"neg_mean_squared_error: {neg_mean_squared.mean()}")
    return voting_reg

In [36]:
def main():
    df = pd.read_csv("hitters.csv")
    X, y = hitters_data_prep(df)
    base_models(X, y)
    best_models = hyperparameter_optimization(X, y)
    voting_reg = voting_regressor(best_models, X, y)

    print("Voting_reg has been created")
    return voting_reg

if __name__ == "__main__":
    main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["new_Hits/CHits"] = dataframe["Hits"] / dataframe["CHits"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["new_OrtCHits"] = dataframe["CHits"] / dataframe["Years"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["new_OrtCHmRun"] = dataframe["CHmRun"] / dataframe["Years"

[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1661
[LightGBM] [Info] Number of data points in the train set: 213, number of used features: 33
[LightGBM] [Info] Start training from score 410.876258
neg_mean_squared_error: 153.17728353449675
Voting_reg has been created
