# Outline
- change learning rate from 0.05 to 0.07 -> score increased from 5560 to 5590
- outliers are currently not removed, since removing them makes the output worse
- add interaction terms and change parameters -> 5590 to 5593
    - add square, cubic and quartic (4th-power) terms for interaction terms -> 5593 -> 5663
    - change interaction terms (add a few more compared with the old set) and add two more features -> 5713
        - add good factor -> 5713 -> 5717

In [None]:
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import re

from scipy.stats import norm, stats
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.metrics import r2_score

In [None]:
# Load the training data from the local competition dataset folder.
df = pd.read_csv(
    '/Users/chloe/Documents/GitHub/Kaggle_competition/TBrain/Yushan/Dataset/train.csv'
)

In [None]:
# Preview the first five rows (the default for head) of the training frame.
df.head(n=5)

<h1>Handle Missing value</h1>

In [None]:
# Percentage of missing values per column, as a Series indexed by column name.
train_na = (df.isnull().sum() / len(df)) * 100
# Keep only columns that actually have gaps: worst first, at most 30 of them.
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)[:30]

print("Missing value percentage by column: \n")
print(train_na)

f, ax = plt.subplots(figsize=(5, 4))
# Rotation is given as a number: Matplotlib documents this argument as a float
# (or 'vertical' / 'horizontal'); a numeric string such as '-40' is not a
# documented value.
plt.xticks(rotation=-40)
sns.barplot(x=train_na.index, y=train_na)

<h3>Remove "parking_area" because over 94% of its values are missing</h3>

In [None]:
# Discard the parking_area column entirely.
df = df.drop(columns=['parking_area'])

<h3>Fill NA value by features</h3>

In [None]:
# Fill the remaining gaps column by column.
# Author's note: every row with a null parking_price has parking_way=2, so the
# nulls can be replaced with zero directly.
income_mean = df['village_income_median'].mean()
for column, fill_value in (
    ('parking_price', 0),
    ('village_income_median', income_mean),
    ('txn_floor', 0),
):
    df[column] = df[column].fillna(fill_value)

<h1>Observe total_price distribution</h1>

In [None]:
# Plot the total_price distribution with a fitted normal curve overlaid.
sns.distplot(a=df['total_price'], fit=norm)

<h3>total_price's distribution is skewed right and has outliers</h3>

In [None]:
# Handle the right skew: replace total_price with log(1 + total_price).
df["total_price"] = df["total_price"].transform(np.log1p)

In [None]:
# Re-plot total_price against a fitted normal curve.
sns.distplot(a=df['total_price'], fit=norm)

In [None]:
# Handle outliers (inspection only): rows at or above the 99th percentile of
# total_price are filtered into a separate frame. The filter is NOT applied to
# df because, per the author's note, removing outliers made results worse.
q = df["total_price"].quantile(0.99)

test = df[df["total_price"] < q]
#df = df[df["total_price"] < q]

# Count the excluded rows against the actual frame size instead of a
# hard-coded 60000, so the number stays correct if the data changes.
print("# of data exclude outlier:", len(df) - test.shape[0])
sns.distplot(test["total_price"], fit=norm)

<h3>Add features</h3>

In [None]:
# Build pairwise interaction features: every column in main_col multiplied by
# every "*_rate" column, stored as "<main>_<rate>_IT".
# main_col = ["city", "town", "village"]
main_col = ["village_income_median", "town_population", "town_area",
            "city", "town", "village", "land_area", "building_area"]

# Columns whose name ends in "<letter>_rate" (case-insensitive). The pattern is
# a raw string compiled once, and the match is tested with `is not None`.
rate_pattern = re.compile(r"[a-zA-Z]_rate$", re.IGNORECASE)
interact_col = [col for col in df.columns if rate_pattern.search(col) is not None]
#interact_col = interact_col+["death_date"] # meaning death_rate -> better not include the death_rate

for m_col in main_col:
    for i_col in interact_col:
        df[m_col + "_" + i_col + "_IT"] = df[m_col] * df[i_col]

In [None]:
# new_col = [col for col in df.columns if re.search("_IT$", col, re.IGNORECASE)!=None]
#list_to_square = interact_col+["village_income_median"]

In [None]:
# Add a squared copy ("<name>_s") of every rate column in interact_col.
# Cubic ("_c") and 4th-power ("_q") variants are left disabled below.
for col in interact_col:
    squared_name = col + "_s"
    df[squared_name] = df[col].pow(2)
    #df[col+"_c"] = df[col]**3
    #df[col+"_q"] = df[col]**4

# for col in list_to_square:
#     df[col+"_s"] = df[col]**2

In [None]:
# Ratio features: transaction floor relative to total floors, and parking
# price relative to the village's median income.
df["ratio_floor"] = df["txn_floor"].div(df["total_floor"])
df["ratio_parking_price"] = df["parking_price"].div(df["village_income_median"])

In [None]:
# "good_factor" is the sum of the marriage, master and bachelor rates;
# "good_factor_s" is its square.
df["good_factor"] = df["marriage_rate"].add(df["master_rate"]).add(df["bachelor_rate"])
df["good_factor_s"] = df["good_factor"].pow(2)

In [None]:
# def get_land_ratio(row):
#     if row["land_area"]>row["building_area"]:
#         return row["land_area"]/row["building_area"]
#     elif row["land_area"]==0:
#         return row["land_area"]/row["building_area"]
#     else:
#         return row["land_area"]/row["building_area"]   
        
# df["land_ratio"] = df.apply(get_land_ratio, axis=1)

<h1>Create training and testing set</h1>

In [None]:
# Remove building_id, which the author considers a useless feature.
df = df.drop(columns=['building_id'])

In [None]:
# Split into target (df_price) and feature frame (df).
df_price = df['total_price']
df = df.drop(columns=['total_price'])

In [None]:
# Fixed seed so the train/test split is the same on every run.
random_seed = 5
# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_price,
    test_size=0.2,
    random_state=random_seed,
)

<h1>Use random search to find best parameters for lightGBM</h1>

In [None]:
# hyper_space = {'n_estimators': sp_randint(1000, 7000),
#                'max_depth':  [15, 16, 17, 18, 19, 20 ,-1],
#                'num_leaves': [20,30,40, 50, 60, 70, 80],
#                'subsample': sp_uniform(0.6, 0.4),
#                'colsample_bytree': sp_uniform(0.6, 0.4),
#                 'learning_rate': [0.03, 0.05, 0.07, 0.1]}

# gbm = lgb.LGBMRegressor(objective='regression',num_leaves=250,learning_rate=0.05,n_estimators=1500)

# rs = RandomizedSearchCV(gbm, hyper_space, n_iter=60, scoring='r2', cv=4, 
#                          verbose=1, random_state=2018)
# rs_results = rs.fit(X_train, y_train)
# print("BEST PARAMETERS: " + str(rs_results.best_params_))
# print("BEST CV SCORE: " + str(rs_results.best_score_))

In [None]:
# LightGBM regressor configured with the parameters used for the
# additional-feature set.
# NOTE(review): LightGBM documents that bagging (subsample) only takes effect
# when subsample_freq > 0; it is not set here, so subsample may be ignored —
# confirm against the LightGBM parameter docs.
gbm = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=6769,
    learning_rate=0.03,  # 0.01 was also tried
    num_leaves=80,
    max_depth=18,
    subsample=0.7241144257909466,
    colsample_bytree=0.6886781648348815,
)

In [None]:
# Train on the training split, then predict the held-out split.
# fit() returns the fitted estimator, so the two calls can be chained.
y_pred = gbm.fit(X_train, y_train).predict(X_test)

<h3>Evaluate model by hit rate</h3>

In [None]:
# Hit rate: share of held-out rows whose predicted price is within 10% of the
# true price.
#
# Experiment log:
# baseline: 0.54; using exclude outlier: 0.55 but public score low
# changing learning rate to 0.01 -> 0.495; 0.1 -> 0.538; 0.07 -> 0.548; 0.09 -> 0.5386; 0.08 -> 0.5378

# (exclude outlier) -> 0.07 -> 0.5396; 0.05 -> 0.540; 0.1 -> 0.538; 0.03 -> 0.529
# with additional features -> 0.03 -> 0.5527; 0.05 -> 0.551; 0.01 -> 0.534

# The target was transformed with np.log1p, so np.expm1 (not np.exp) is the
# exact inverse that recovers prices on the original scale.
actual_price = np.expm1(y_test)
predicted_price = np.expm1(y_pred)

# Relative absolute error per row. A true price of exactly 0 would give
# inf/nan here and is counted as a miss.
p = np.abs((predicted_price - actual_price) / actual_price)
np.sum(p <= .1) / len(y_pred)

<h1>Predict test value for submission</h1>

In [None]:
# Load the submission test set and drop the two columns that were also
# removed from the training data.
test_df = pd.read_csv(
    '/Users/chloe/Documents/GitHub/Kaggle_competition/TBrain/Yushan/Dataset/test.csv'
)
test_df = test_df.drop(columns=['building_id', 'parking_area'])

<h3>Fit the shape of training data</h3>

In [None]:
# Fill missing values in the test set the same way as in the training set.
test_df['parking_price'] = test_df['parking_price'].fillna(0)
# Impute with the TRAINING mean rather than the test-set mean, so test rows
# receive the same fill value the model saw during training. df's column was
# already filled with its own mean, which leaves that mean unchanged.
test_df['village_income_median'] = test_df['village_income_median'].fillna(df['village_income_median'].mean())
test_df['txn_floor'] = test_df['txn_floor'].fillna(0)

In [None]:
# Recreate the "<main>_<rate>_IT" interaction columns on the test frame, using
# the same main_col / interact_col lists as for the training frame.
for m_col in main_col:
    base_values = test_df[m_col]
    for i_col in interact_col:
        feature_name = m_col + "_" + i_col + "_IT"
        test_df[feature_name] = base_values.mul(test_df[i_col])

In [None]:
# Add the squared ("<name>_s") copy of every rate column to the test frame.
# Cubic and 4th-power variants stay disabled.
for col in interact_col:
    squared_name = col + "_s"
    test_df[squared_name] = test_df[col].pow(2)
    #test_df[col+"_c"] = test_df[col]**3
    #test_df[col+"_q"] = test_df[col]**4

In [None]:
# Ratio features on the test frame: floor ratio and parking price relative to
# the village's median income.
test_df["ratio_floor"] = test_df["txn_floor"].div(test_df["total_floor"])
test_df["ratio_parking_price"] = test_df["parking_price"].div(test_df["village_income_median"])

In [None]:
# "good_factor" (sum of marriage, master and bachelor rates) and its square,
# computed on the test frame.
test_df["good_factor"] = test_df["marriage_rate"].add(test_df["master_rate"]).add(test_df["bachelor_rate"])
test_df["good_factor_s"] = test_df["good_factor"].pow(2)

<h3>Predict...</h3>

In [None]:
# Predict value (the model was trained on a log1p-transformed target)
y_sub_GBM = gbm.predict(test_df)

# Get inverse of log value: the target was transformed with np.log1p, so
# np.expm1 (not np.exp) is the exact inverse.
y_sub_GBM = np.expm1(y_sub_GBM)

In [None]:
# Re-read the test file and keep only the building_id column as the
# skeleton of the submission frame.
output = pd.read_csv(
    '/Users/chloe/Documents/GitHub/Kaggle_competition/TBrain/Yushan/Dataset/test.csv'
)[["building_id"]]

In [None]:
# Attach the predicted prices as the total_price column.
output = output.assign(total_price=y_sub_GBM)

In [None]:
# Write the submission file without the index column.
output.to_csv(
    path_or_buf="/Users/chloe/Desktop/submission_learning_rate_20190623.csv",
    index=False,
)