In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields (directory, sub-directory names, file names); the
# sub-directory names are not needed here, hence the throwaway `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
import random


In [None]:
# Load the Google Play Store apps table. read_csv already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
google = pd.read_csv('/kaggle/input/google-play-store-apps/googleplaystore.csv')

In [None]:
google.head()

In [None]:
google.info()

In [None]:
# Drop, in place, every row that has a missing value in any column.
# Rows whose Rating is missing are removed by this step as well.
google.dropna(inplace=True)

In [None]:
# Histogram + KDE of Rating; the distribution is asymmetric (skewed).
# sns.distplot is deprecated; histplot with kde=True and a density-normalised
# y-axis is its documented replacement.
sns.histplot(google['Rating'], kde=True, stat='density')

In [None]:
# Log-transform Rating
# NOTE(review): log_Rating is not referenced again in the visible notebook —
# presumably a leftover; confirm before removing.
log_Rating = google['Rating']
# np.log1p computes log(1 + x); the result is stored in a new 'Rating_lo' column.
google['Rating_lo'] = np.log1p(google['Rating'])

In [None]:
google.head()

In [None]:
google.info()

In [None]:
google['Category'].value_counts()

In [None]:
# Label-encode Category: every distinct value receives an integer id,
# numbered in the order the values first appear in the column.

categoryVal = google["Category"].unique()
categoryValCount = len(categoryVal)
category_dict = {name: code for code, name in enumerate(categoryVal)}
google["Category_c"] = google["Category"].map(category_dict).astype(int)

In [None]:
google.tail()

In [None]:
google['Size'].value_counts()

In [None]:
#scaling and cleaning size of installation

def change_size(size):
    """Convert a Play Store size string into a plain number.

    Parameters
    ----------
    size : str
        A value such as ``'19M'`` or ``'201k'``.

    Returns
    -------
    float or None
        The numeric prefix multiplied by 1,000,000 for an ``M`` suffix or by
        1,000 for a ``k`` suffix. ``None`` for anything else (a value without
        a recognised unit suffix, an empty string, or a non-string such as NaN).

    Raises
    ------
    ValueError
        If the text before a recognised suffix is not a valid number.
    """
    # Missing values arrive as float NaN; report them as "unknown" instead of
    # raising TypeError on the suffix lookup.
    if not isinstance(size, str):
        return None
    multipliers = {'M': 1000000, 'k': 1000}
    # Only the final character is the unit. The numeric part is everything
    # before it, so a unit letter elsewhere in the string must not match.
    factor = multipliers.get(size[-1:])
    if factor is None:
        return None
    return float(size[:-1]) * factor

# Replace the raw Size strings with numbers; values for which change_size
# returns None become missing and are filled in a later cell.
google["Size"] = google["Size"].map(change_size)


In [None]:
# Fill missing Size values by carrying the previous row's value forward.
# The result is assigned back to the frame: calling fillna(..., inplace=True)
# on the `google.Size` accessor operates on an intermediate Series and is not
# guaranteed to write through (it does not under pandas Copy-on-Write), and
# fillna(method=...) is deprecated in favour of ffill().
google['Size'] = google['Size'].ffill()

In [None]:
google.head()

In [None]:
google['Installs'].value_counts()

In [None]:
# Convert install counts such as '1,000,000+' to integers.
# The '+' and ',' characters are stripped explicitly instead of assuming the
# last character is always '+', so a bare value like '0' still parses.
google['Installs'] = (
    google['Installs']
    .str.replace(r'[+,]', '', regex=True)
    .astype('int64')
)

In [None]:
google['Type'].value_counts()

In [None]:
#Converting Type classification into binary

def type_cat(types):
    """Encode an app Type as a binary flag: 0 for 'Free', 1 for anything else."""
    return 0 if types == 'Free' else 1

# Overwrite the Type column with its 0/1 encoding produced by type_cat.
google['Type'] = google['Type'].map(type_cat)

In [None]:
google['Content Rating'].value_counts()

In [None]:
# Label-encode Content Rating: each distinct value is replaced by an integer
# id assigned in order of first appearance.

RatingL = google['Content Rating'].unique()
RatingDict = {label: code for code, label in enumerate(RatingL)}
google['Content Rating'] = google['Content Rating'].map(RatingDict).astype(int)

In [None]:
google.head()

In [None]:
# Drop columns that are not used as model features (update date, version
# strings and the app name). The frame is modified in place.

google.drop(labels = ['Last Updated','Current Ver','Android Ver','App'], axis = 1, inplace = True)

In [None]:
google.tail()

In [None]:
google['Price'].value_counts()

In [None]:
#Cleaning prices

def price_clean(price):
    """Convert a Play Store price string to a float.

    Parameters
    ----------
    price : str
        ``'0'`` for free apps, otherwise an amount such as ``'$4.99'``.

    Returns
    -------
    float
        The amount with a leading ``'$'`` removed, if one was present.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # Drop the first character only when it really is the currency sign;
    # slicing unconditionally turned a value like '10' into 0.0.
    if price.startswith('$'):
        price = price[1:]
    return float(price)

# Overwrite Price with its numeric value and make the column dtype float.
google['Price'] = google['Price'].map(price_clean).astype(float)

In [None]:
# Label-encode Genres into a new Genres_c column; ids follow the order in
# which each genre string first appears. The original Genres text is kept.

GenresL = google.Genres.unique()
GenresDict = {genre: code for code, genre in enumerate(GenresL)}
google['Genres_c'] = google['Genres'].map(GenresDict).astype(int)

In [None]:
# Convert the Reviews column to integers.
# astype(int) raises if any remaining value is not a plain integer string.

google['Reviews'] = google['Reviews'].astype(int)

In [None]:
google.info()

In [None]:
# One-hot encode Category into a new frame, google2. The Category column is
# replaced by one indicator column per category; `google` itself is unchanged.

google2 = pd.get_dummies(google, columns=['Category'])

In [None]:
# Compare the frame shape before and after one-hot encoding
# (the Korean labels read "data shape before / after get_dummies").
print('get_dummies 수행 전 데이터 Shape: ', google.shape)
print('get_dummies 수행 후 데이터 Shape: ', google2.shape)

In [None]:
google2.head()

In [None]:
# Linear regression models: training / prediction / evaluation

In [None]:
def get_rmse(model):
    """Print and return a fitted model's RMSE on the held-out test split.

    Relies on the notebook-level ``X_test`` / ``y_test`` and on
    ``mean_squared_error`` being defined by the time the function is called.
    """
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    # The printed label is Korean for "log-transformed RMSE".
    print(model.__class__.__name__,'로그 변환된 RMSE:',np.round(rmse,3))
    return rmse

def get_rmses(models):
    """Return the test-set RMSE of each model, in input order (via get_rmse)."""
    return [get_rmse(model) for model in models]

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Target is the log-transformed rating. The target itself, the raw Rating and
# the text Genres column are excluded from the feature matrix.
y_target = google2['Rating_lo']
X_features = google2.drop(columns=['Rating_lo','Rating','Genres'])
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.2, random_state=156
)

# Fit LinearRegression, Ridge and Lasso with default hyper-parameters
# (fit returns the estimator itself, so construction and fitting can be chained).
lr_reg = LinearRegression().fit(X_train, y_train)
ridge_reg = Ridge().fit(X_train, y_train)
lasso_reg = Lasso().fit(X_train, y_train)

# Report each model's RMSE on the test split
models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)

In [None]:
# Visualize the regression coefficients per feature

In [None]:
def get_top_bottom_coef(model, n=10):
    """Return the n largest and the n smallest coefficients of a fitted model.

    Coefficients are labelled with the notebook-level ``X_features`` column
    names. Both returned Series are in descending coefficient order.
    """
    ranked = pd.Series(model.coef_, index=X_features.columns).sort_values(ascending=False)
    return ranked.head(n), ranked.tail(n)

In [None]:
def visualize_coefficient(models):
    """Draw one bar chart per model showing its largest and smallest coefficients.

    Parameters
    ----------
    models : sequence
        Fitted estimators exposing ``coef_``. One subplot column is created
        per model; nothing is drawn for an empty sequence.
    """
    n_models = len(models)
    if n_models == 0:
        return

    # Size the grid from len(models) instead of a hard-coded 3 columns, and
    # use squeeze=False so `axes` stays two-dimensional for a single model.
    # 8 inches per panel reproduces the previous 24x10 figure for three models.
    fig, axes = plt.subplots(figsize=(8 * n_models, 10), nrows=1,
                             ncols=n_models, squeeze=False)
    fig.tight_layout()

    for ax, model in zip(axes[0], models):
        coef_high, coef_low = get_top_bottom_coef(model)
        coef_concat = pd.concat([coef_high, coef_low])
        # Title text fixed: separating space added, spelling corrected.
        ax.set_title(model.__class__.__name__ + ' Coefficients', size=25)
        # Negative pad draws the y tick labels inside the axes area.
        ax.tick_params(axis='y', direction='in', pad=-120)
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=coef_concat.values, y=coef_concat.index, ax=ax)
        
# Plot the coefficients of the three fitted linear models side by side.
models = [lr_reg, ridge_reg, lasso_reg]
visualize_coefficient(models)

In [None]:
from sklearn.model_selection import cross_val_score

def get_avg_rmse_cv(models):
    """Print each model's 5-fold cross-validated RMSE, per fold and averaged.

    Cross-validation runs on the full notebook-level ``X_features`` /
    ``y_target`` rather than on the train split.
    NOTE(review): cv=5 uses consecutive, unshuffled folds; if the rows are
    grouped (e.g. by category) the folds are not comparable — confirm.
    """
    for model in models:
        model_name = model.__class__.__name__
        neg_mse = cross_val_score(model, X_features, y_target,
                                  scoring='neg_mean_squared_error', cv=5)
        # Scores are negated MSE, so flip the sign before taking the root
        rmse_list = np.sqrt(-neg_mse)
        rmse_avg = np.mean(rmse_list)
        print('\n{0} CV RMSE 값 리스트: {1}'.format(model_name, np.round(rmse_list,3)))
        print('{0} CV 평균 RMSE 값:{1}'.format(model_name, np.round(rmse_avg,3)))
        
# Print the CV RMSE values of the lr_reg, ridge_reg and lasso_reg models
models = [lr_reg, ridge_reg, lasso_reg]
get_avg_rmse_cv(models)

In [None]:
from sklearn.model_selection import GridSearchCV

def print_best_params(model,params):
    """Grid-search ``params`` for ``model`` with 5-fold CV and print the outcome.

    The search is fitted on the notebook-level ``X_features`` / ``y_target``.
    It prints the best mean RMSE and the best parameter setting; nothing is
    returned.
    """
    search = GridSearchCV(model, param_grid=params,
                          scoring='neg_mean_squared_error', cv=5)
    search.fit(X_features, y_target)
    # best_score_ is a negated MSE; negate it before taking the square root
    best_rmse = np.sqrt(-1 * search.best_score_)
    message = '{0} 5 CV 시 최적 평균 RMSE 값:{1}, 최적 alpha:{2}'
    print(message.format(model.__class__.__name__,
                         np.round(best_rmse, 4), search.best_params_))
    
# Candidate regularisation strengths to search for each model.
ridge_params = {'alpha':[0.05,0.1,1,5,8,10,12,15,20]}
lasso_params = {'alpha':[0.001,0.005,0.008,0.05,0.03,0.1,0.5,1,5,10]}
# Run the 5-fold grid search for Ridge and for Lasso and print the best alpha.
print_best_params(ridge_reg, ridge_params)
print_best_params(lasso_reg, lasso_params)


In [None]:
# Train on the training data with the tuned alpha values from above, then
# predict and evaluate on the test data.
# NOTE(review): alpha=20 and alpha=0.001 are hard-coded — presumably copied
# from the grid-search output; re-check them if the data or grids change.
lr_reg = LinearRegression()
lr_reg.fit(X_train,y_train)
ridge_reg = Ridge(alpha=20)
ridge_reg.fit(X_train,y_train)
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train,y_train)

# Print the RMSE of every model
models = [lr_reg,ridge_reg,lasso_reg]
get_rmses(models)

# Visualize the regression coefficients of every model
visualize_coefficient(models)

In [None]:
# 08. Comparing regression models using regression trees

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

# Target is the log-transformed rating; features are every column except
# Rating, Genres and the target itself.
y_target = google2['Rating_lo']
X_data = google2.drop(columns=['Rating','Genres','Rating_lo'])

rf = RandomForestRegressor(random_state=0, n_estimators=1000)
neg_mse_scores = cross_val_score(
    rf, X_data, y_target, scoring='neg_mean_squared_error', cv=5
)
# cross_val_score reports negated MSE; negate it to obtain per-fold RMSE
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

# Korean labels: per-fold negative MSE, per-fold RMSE, mean RMSE over 5 folds
print('5 교차 검증의 개별 Negative MSE scores: ', np.round(neg_mse_scores,2))
print('5 교차 검증의 개별 RMSE scores: ', np.round(rmse_scores,2))
print('5 교차 검증의 평균 RMSE: {0:.3f}'.format(avg_rmse))

In [None]:
def get_model_cv_prediction(model, X_data, y_target):
    """Print a model's class name and its mean RMSE over 5-fold cross-validation."""
    fold_neg_mse = cross_val_score(model, X_data, y_target,
                                   scoring='neg_mean_squared_error', cv=5)
    # Scores are negated MSE: negate, take the root per fold, then average
    mean_rmse = np.mean(np.sqrt(-1 * fold_neg_mse))
    print('### ',model.__class__.__name__,'####')
    print('5 교차 검증의 평균 RMSE: {0:.3f}'.format(mean_rmse))

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Tree-based regressors to compare. The decision tree is depth-limited to 4;
# the ensemble models each use 1000 estimators.
# NOTE(review): xgb_reg and lgb_reg are created without an explicit seed,
# unlike the scikit-learn models — confirm whether that is intended.
dt_reg = DecisionTreeRegressor(random_state=0, max_depth=4)
rf_reg = RandomForestRegressor(random_state=0, n_estimators =1000)
gb_reg = GradientBoostingRegressor(random_state=0, n_estimators=1000)
xgb_reg = XGBRegressor(n_estimators=1000)
lgb_reg = LGBMRegressor(n_estimators=1000)

# Iterate over the tree-based regression models and evaluate each one
models = [dt_reg,rf_reg, gb_reg, xgb_reg, lgb_reg]
for model in models:
    get_model_cv_prediction(model, X_data, y_target)

In [None]:
import seaborn as sns
%matplotlib inline

# Seed the forest so the importance ranking is reproducible from run to run,
# matching the random_state=0 used for the other forests in this notebook.
rf_reg = RandomForestRegressor(random_state=0, n_estimators=1000)

# Fit on the X_data / y_target data set built in the earlier cell.
rf_reg.fit(X_data, y_target)

# Rank features by importance (highest first) and plot them as horizontal bars.
feature_series = pd.Series(data=rf_reg.feature_importances_, index=X_data.columns)
feature_series = feature_series.sort_values(ascending=False)
sns.barplot(x=feature_series, y=feature_series.index)

In [None]:
# Feature importance is highest for Reviews, Size, Installs, Genres_c and Category_c, in that order