In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### ✏️ **Let's analyze the total length**

In [None]:
# Load the possum dataset from the Kaggle input directory and display it.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/openintro-possum/possum.csv')
df

In [None]:
# Number of missing values in each column.
df.isna().sum(axis=0)

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-check the per-column missing-value counts after dropping rows.
df.isna().sum(axis=0)

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated(keep='first').sum()

In [None]:
# Work on an independent copy so the cleaned `df` stays untouched.
pre_df = df.copy(deep=True)

In [None]:
# Remove the 'Pop', 'sex' and 'case' columns.
# The result is derived from `df` (not from `pre_df` itself) so this cell can be
# re-run safely; dropping from `pre_df` a second time would raise KeyError.
pre_df = df.drop(columns=['Pop', 'sex', 'case'])

In [None]:
# Display the frame after the column drop.
pre_df

In [None]:
# Correlation of every column with 'totlngth', sorted from highest to lowest.
# The first entry of the sorted result is skipped (presumably totlngth's
# correlation with itself — verify if ties at 1.0 are possible).
target_corr = pre_df.corr()['totlngth']
target_corr.sort_values(ascending=False).iloc[1:]

In [None]:
import seaborn as sns

# Heatmap of the pairwise correlation matrix of all columns.
corr_matrix = pre_df.corr()
sns.heatmap(data=corr_matrix)

In [None]:
# Select the columns in this order, with 'totlngth' placed last
# (later cells take the last column as the regression target).
columns = [
    'site', 'age', 'hdlngth', 'skullw', 'taill', 'footlgth',
    'earconch', 'eye', 'chest', 'belly',
    'totlngth',
]
pre_df = pre_df[columns]
pre_df

In [None]:
# One histogram per column (10 bins each) on a 15x15 figure.
pre_df.hist(bins=10, figsize=(15, 15))

In [None]:
# Summary statistics, transposed so that each column becomes a row.
pre_df.describe().transpose()

In [None]:
# Print the column dtypes and non-null counts.
pre_df.info()

### 📌 Let's Start Regression

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression

# Every column except the last is a feature; the last column is the target.
features = pre_df.iloc[:, :-1]
targets = pre_df.iloc[:, -1]

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=105
)

# Baseline: ordinary least-squares linear regression on the training split.
LR = LinearRegression()
LR.fit(X_train, y_train)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

def get_evaluation(y_test, prediction):
    """Print MSE, RMSE, MSLE, RMSLE and R2 for a set of predictions.

    Args:
        y_test: true target values.
        prediction: predicted target values.

    Returns:
        None. The five metrics are printed on one line with four decimals.
    """
    mse = mean_squared_error(y_test, prediction)
    msle = mean_squared_log_error(y_test, prediction)
    r2 = r2_score(y_test, prediction)
    # The root metrics are derived from the values computed above.
    rmse, rmsle = np.sqrt(mse), np.sqrt(msle)
    print('MSE: {:.4f}, RMSE: {:.4f}, MSLE: {:.4f}, RMSLE: {:.4f}, R2: {:.4f}'\
          .format(mse, rmse, msle, rmsle, r2))

In [None]:
# Score the baseline linear regression on the held-out split.
prediction = LR.predict(X_test)
get_evaluation(y_test=y_test, prediction=prediction)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Every column except the last is a feature; the last column is the target.
features = pre_df.iloc[:, :-1]
targets = pre_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=105
)

# Tree-based and boosting regressors to compare.
dt_r = DecisionTreeRegressor(random_state=105)
rf_r = RandomForestRegressor(n_estimators=105, random_state=105)
gb_r = GradientBoostingRegressor(random_state=105)
xgb_r = XGBRegressor()
lgb_r = LGBMRegressor(n_estimators=100)

# Fit each model on the training split and report its metrics on the test split.
models = [dt_r, rf_r, gb_r, xgb_r, lgb_r]
for model in models:
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(type(model).__name__)
    get_evaluation(y_test, prediction)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Build features/targets BEFORE they are used, so this cell no longer depends
# on a `features` variable left over from an earlier cell.
features, targets = pre_df.iloc[:, :-1], pre_df.iloc[:, -1]

# Expand the features with all degree-2 polynomial and interaction terms.
poly_features = PolynomialFeatures(degree=2).fit_transform(features)

X_train, X_test, y_train, y_test = \
train_test_split(poly_features, targets, test_size=0.2, random_state=105)

# Linear regression on the polynomial feature matrix.
l_r = LinearRegression()
l_r.fit(X_train, y_train)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

def get_evaluation(y_test, prediction):
    """Print MSE, RMSE, MSLE, RMSLE and R2 for a set of predictions.

    Args:
        y_test: true target values.
        prediction: predicted target values.

    Returns:
        None. The five metrics are printed on one line with four decimals.
    """
    mse = mean_squared_error(y_test, prediction)
    msle = mean_squared_log_error(y_test, prediction)
    r2 = r2_score(y_test, prediction)
    # The root metrics are derived from the values computed above.
    rmse, rmsle = np.sqrt(mse), np.sqrt(msle)
    print('MSE: {:.4f}, RMSE: {:.4f}, MSLE: {:.4f}, RMSLE: {:.4f}, R2: {:.4f}'\
          .format(mse, rmse, msle, rmsle, r2))

In [None]:
# Score the polynomial-feature linear regression on the held-out split.
prediction = l_r.predict(X_test)
get_evaluation(y_test=y_test, prediction=prediction)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=105
)

rf_r = RandomForestRegressor(random_state=105)

# Hyper-parameter grid explored by the search.
parameters = {
    'max_depth': [1, 2, 3],
    'min_samples_split': [5, 6, 7],
    'n_estimators': [10, 50, 100],
}
# 10-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=105)

# Candidates are scored by R2 ('neg_mean_squared_error' was the alternative considered).
grid_rf_r = GridSearchCV(estimator=rf_r, param_grid=parameters, scoring='r2', cv=kfold)
grid_rf_r.fit(X_train, y_train)

In [None]:
# Score the grid-searched random forest on the held-out split.
prediction = grid_rf_r.predict(X_test)
get_evaluation(y_test=y_test, prediction=prediction)

In [None]:
from statsmodels.api import OLS

# Every column except the last is a feature; the last column is the target.
features = pre_df.iloc[:, :-1]
targets = pre_df.iloc[:, -1]

# NOTE(review): `features` is passed as-is, with no constant column added;
# confirm whether fitting without an intercept term is intended here.
model = OLS(endog=targets, exog=features)
ols_result = model.fit()
print(ols_result.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def get_vif(features):
    """Compute the variance inflation factor of every column in `features`.

    Args:
        features: DataFrame whose columns are the explanatory variables.

    Returns:
        DataFrame with a 'vif_score' column and a 'feature' column
        (one row per column of `features`, in the same order).
    """
    # NOTE(review): the matrix is used as-is, without an added constant column —
    # confirm that this is the intended design matrix.
    design = features.values
    scores = [variance_inflation_factor(design, idx) for idx in range(features.shape[1])]
    return pd.DataFrame({'vif_score': scores, 'feature': features.columns})

In [None]:
# VIF table for the current feature set.
get_vif(features=features)

### 💡 1 Cycle 결과
> LinearRegression (선형회귀) 모델에서 R2 스코어가 0.8080으로 가장 높게 나왔다.


### ✏️ 2 Cycle
> 전처리 작업 및 일반화 작업 후 분석 

In [None]:
# Display the frame used as the starting point of the second cycle.
pre_df

In [None]:
# One histogram per column (10 bins each) on a 10x10 figure.
pre_df.hist(bins=10, figsize=(10, 10))

In [None]:
# Preview the column distributions after a log1p transform.
log_preview = np.log1p(pre_df)
log_preview.hist(figsize=(10, 10))

In [None]:
# log1p-transformed copy of every column of pre_df.
log_df = pre_df.apply(np.log1p)
log_df

In [None]:
from sklearn.preprocessing import PowerTransformer

# box-cox: positive values only
# yeo-johnson: also handles zero and negative values (the default method)
# Alternative for strictly positive data:
#   PowerTransformer(method='box-cox', standardize=False)

ptf = PowerTransformer(standardize=False)
result = ptf.fit_transform(pre_df[['totlngth']])

# Attach the transformed target with assign(), which builds a new frame instead
# of writing into a frame that came from a column selection (that in-place
# write can trigger SettingWithCopyWarning). ravel() flattens the (n, 1) output.
pre_df = pre_df.assign(Target_yeo=result.ravel())
pre_df['Target_yeo'].hist()

In [None]:
# Histogram of the log1p-transformed target, for comparison with the transformed 'Target_yeo' column.
log_target = np.log1p(pre_df['totlngth'])
log_target.hist(figsize=(4, 4))

### 📌 log vs PowerTransformer 
#### ✨ 로그 치환을 사용하자

In [None]:
# One histogram per log-transformed column (10 bins each) on a 10x10 figure.
log_df.hist(bins=10, figsize=(10, 10))

In [None]:
# Outlier removal

# z-score of the log-transformed 'totlngth' column
log_totlngth = log_df['totlngth']
z_scores = (log_totlngth - log_totlngth.mean()) / log_totlngth.std()

# keep only the rows whose absolute z-score is at most 3
log_df = log_df[z_scores.abs() <= 3]
log_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

# Every column except the last is a feature; the last column is the target.
features = log_df.iloc[:, :-1]
targets = log_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=105
)

# Linear regression on the log-transformed data.
l_r = LinearRegression()
l_r.fit(X_train, y_train)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

def get_evaluation(y_test, prediction):
    """Print MSE, RMSE, MSLE, RMSLE and R2 for a set of predictions.

    Args:
        y_test: true target values.
        prediction: predicted target values.

    Returns:
        None. The five metrics are printed on one line with four decimals.
    """
    mse = mean_squared_error(y_test, prediction)
    msle = mean_squared_log_error(y_test, prediction)
    r2 = r2_score(y_test, prediction)
    # The root metrics are derived from the values computed above.
    rmse, rmsle = np.sqrt(mse), np.sqrt(msle)
    print('MSE: {:.4f}, RMSE: {:.4f}, MSLE: {:.4f}, RMSLE: {:.4f}, R2: {:.4f}'\
          .format(mse, rmse, msle, rmsle, r2))

In [None]:
# Score the log-data linear regression on the held-out split.
prediction = l_r.predict(X_test)
get_evaluation(y_test=y_test, prediction=prediction)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Every column except the last is a feature; the last column is the target.
features = log_df.iloc[:, :-1]
targets = log_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=105
)

# Tree-based and boosting regressors to compare on the log-transformed data.
dt_r = DecisionTreeRegressor(random_state=124)
rf_r = RandomForestRegressor(n_estimators=100, random_state=124)
gb_r = GradientBoostingRegressor(random_state=124)
xgb_r = XGBRegressor()
lgb_r = LGBMRegressor(n_estimators=100)

# Fit each model on the training split and report its metrics on the test split.
models = [dt_r, rf_r, gb_r, xgb_r, lgb_r]
for model in models:
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(type(model).__name__)
    get_evaluation(y_test, prediction)

### log 치환 미적용 (without log transform)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the columns so the outlier filter can use a common threshold.
# The scratch column 'Target_yeo' (added during the PowerTransformer
# experiment and dropped again a few cells later) is excluded so that it does
# not take part in the outlier filter; errors='ignore' keeps the cell working
# when that column is not present.
std = StandardScaler()
scaling_input = pre_df.drop(columns=['Target_yeo'], errors='ignore')
result = std.fit_transform(scaling_input)
std_pre_df = pd.DataFrame(result, columns=scaling_input.columns)
std_pre_df

In [None]:
# A row is kept only when every standardized column lies within [-1.96, 1.96].
condition = std_pre_df.apply(lambda col: col.between(-1.96, 1.96)).all(axis=1)

std_pre_df = std_pre_df[condition]
std_pre_df

In [None]:
# std_pre_df was built from a plain array, so its index labels are row
# positions; use them to pick the surviving rows of pre_df positionally.
kept_positions = std_pre_df.index
pre_df = pre_df.iloc[kept_positions].reset_index(drop=True)
pre_df

In [None]:
# One histogram per column (10 bins each) after the outlier filter.
pre_df.hist(bins=10, figsize=(10, 10))

In [None]:
# Remove the scratch 'Target_yeo' column so that 'totlngth' is the last column again.
# errors='ignore' makes the cell safe to re-run (a second plain drop would raise KeyError).
pre_df = pre_df.drop(columns=['Target_yeo'], errors='ignore')

In [None]:
# Display the filtered frame that the following regression cells use.
pre_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

# Every column except the last is a feature; the last column is the target.
features = pre_df.iloc[:, :-1]
targets = pre_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=105
)

# Linear regression on the outlier-filtered (not log-transformed) data.
l_r = LinearRegression()
l_r.fit(X_train, y_train)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

def get_evaluation(y_test, prediction):
    """Print MSE, RMSE, MSLE, RMSLE and R2 for a set of predictions.

    Args:
        y_test: true target values.
        prediction: predicted target values.

    Returns:
        None. The five metrics are printed on one line with four decimals.
    """
    mse = mean_squared_error(y_test, prediction)
    msle = mean_squared_log_error(y_test, prediction)
    r2 = r2_score(y_test, prediction)
    # The root metrics are derived from the values computed above.
    rmse, rmsle = np.sqrt(mse), np.sqrt(msle)
    print('MSE: {:.4f}, RMSE: {:.4f}, MSLE: {:.4f}, RMSLE: {:.4f}, R2: {:.4f}'\
          .format(mse, rmse, msle, rmsle, r2))

In [None]:
# Score the filtered-data linear regression on the held-out split.
prediction = l_r.predict(X_test)
get_evaluation(y_test=y_test, prediction=prediction)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Every column except the last is a feature; the last column is the target.
features = pre_df.iloc[:, :-1]
targets = pre_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=105
)

# Tree-based and boosting regressors to compare on the outlier-filtered data.
dt_r = DecisionTreeRegressor(random_state=124)
rf_r = RandomForestRegressor(n_estimators=100, random_state=124)
gb_r = GradientBoostingRegressor(random_state=124)
xgb_r = XGBRegressor()
lgb_r = LGBMRegressor(n_estimators=100)

# Fit each model on the training split and report its metrics on the test split.
models = [dt_r, rf_r, gb_r, xgb_r, lgb_r]
for model in models:
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(type(model).__name__)
    get_evaluation(y_test, prediction)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Feature columns only (the target 'totlngth' is left out of the plot).
columns = [
    'site', 'age', 'hdlngth', 'skullw', 'taill',
    'footlgth', 'earconch', 'eye', 'chest', 'belly',
]

# Pairwise scatter plots of the selected features.
sns.pairplot(data=pre_df[columns])
plt.show()

In [None]:
from statsmodels.api import OLS

# Every column except the last is a feature; the last column is the target.
features = pre_df.iloc[:, :-1]
targets = pre_df.iloc[:, -1]

# NOTE(review): `features` is passed as-is, with no constant column added;
# confirm whether fitting without an intercept term is intended here.
model = OLS(endog=targets, exog=features)
ols_result = model.fit()
print(ols_result.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def get_vif(features):
    """Compute the variance inflation factor of every column in `features`.

    Args:
        features: DataFrame whose columns are the explanatory variables.

    Returns:
        DataFrame with a 'vif_score' column and a 'feature' column
        (one row per column of `features`, in the same order).
    """
    # NOTE(review): the matrix is used as-is, without an added constant column —
    # confirm that this is the intended design matrix.
    design = features.values
    scores = [variance_inflation_factor(design, idx) for idx in range(features.shape[1])]
    return pd.DataFrame({'vif_score': scores, 'feature': features.columns})

In [None]:
# VIF table for the outlier-filtered feature set.
get_vif(features=features)

#### Highest R2 Score = 0.8080
#### 분포 변환 및 일반화 작업을 따로 하지 않았을 때 성능이 가장 좋았던 것을 확인