In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
from scipy.stats import norm, skew 
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv('train.csv')

In [3]:
test = pd.read_csv('test.csv')

In [4]:
y_train = df.Habitability_score.values

In [5]:
df_to_train = df

In [6]:
def transform_df(df):
    df['Dust_and_Noise'] = df['Dust_and_Noise'].fillna(df['Dust_and_Noise'].mode()[0])
    df['Furnishing'] = df['Furnishing'].fillna(df['Furnishing'].mode()[0])
    df['Crime_Rate'] = df['Crime_Rate'].fillna(df['Crime_Rate'].mode()[0])
    df["Number_of_Windows"] = df["Number_of_Windows"].fillna(0)
    df["Frequency_of_Powercuts"] = df["Frequency_of_Powercuts"].fillna(0)

In [7]:
transform_df(df_to_train)
transform_df(test)

In [8]:
# check missing data
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
Property_ID,0,0.0
Property_Type,0,0.0
Property_Area,0,0.0
Number_of_Windows,0,0.0
Number_of_Doors,0,0.0
Furnishing,0,0.0
Frequency_of_Powercuts,0,0.0
Power_Backup,0,0.0
Water_Supply,0,0.0
Traffic_Density_Score,0,0.0


In [9]:
cleanup_nums = {"Furnishing":     {"Semi_Furnished": 1, "Unfurnished": 0, "Fully Furnished": 2},
                    "Power_Backup": {"No": 0, "Yes": 1, "NOT MENTIONED": 0},
                    "Crime_Rate": {"Well below average": 1, "Slightly below average": 0.8, "Well above average": 0, "Slightly above average": 0},
                    "Dust_and_Noise": {"High": 0, "Medium": 0.7, "Low": 1}
                    }
df_to_train = df_to_train.replace(cleanup_nums)
test = test.replace(cleanup_nums)

In [10]:
def fix_skew(df):
    from scipy.special import boxcox1p
    skewed_features = ["Air_Quality_Index","Property_Area"]
    lam = 0.15
    for feat in skewed_features:
        #all_data[feat] += 1
        df[feat] = boxcox1p(df[feat], lam)

In [11]:
fix_skew(test)
fix_skew(df_to_train)

In [12]:
def check_skew(df):
    numeric_feats = df.dtypes[df.dtypes != "object"].index

    # Check the skew of all numerical features
    skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    print("\nSkew in numerical features: \n")
    skewness = pd.DataFrame({'Skew' :skewed_feats})
    print(skewness.head(15))

In [13]:
check_skew(test)
check_skew(df_to_train)


Skew in numerical features: 

                            Skew
Number_of_Windows       1.701516
Frequency_of_Powercuts  1.655591
Power_Backup            1.355206
Number_of_Doors         0.842488
Property_Area           0.543492
Furnishing              0.056805
Traffic_Density_Score  -0.592793
Neighborhood_Review    -0.724206
Air_Quality_Index      -1.147192
Crime_Rate             -1.341579
Dust_and_Noise         -2.456831

Skew in numerical features: 

                            Skew
Number_of_Windows       1.678325
Frequency_of_Powercuts  1.660208
Power_Backup            1.305035
Number_of_Doors         0.838949
Property_Area           0.620820
Furnishing              0.078435
Traffic_Density_Score  -0.551835
Neighborhood_Review    -0.730654
Air_Quality_Index      -0.739742
Crime_Rate             -1.360013
Habitability_score     -1.553639
Dust_and_Noise         -2.378065


In [14]:
df_to_train = df_to_train.iloc[: , 1:] #drop first column
df_to_train = df_to_train.iloc[: , :-1] # drop last column
# df_to_train2 = df_to_train[['Furnishing','Frequency_of_Powercuts','Power_Backup','Crime_Rate','Neighborhood_Review','Property_Area','Dust_and_Noise']]
df_to_train = pd.get_dummies(df_to_train)

In [15]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [16]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(df_to_train.values)
    rmse= np.sqrt(-cross_val_score(model, df_to_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [17]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [18]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [19]:
from sklearn.metrics import r2_score
def sasta_score(actual,predicted):
    score = max(0,100*(r2_score(actual, predicted)))
    return score
#     score = max(0, 100*(r2_score(actual , predicted))

In [20]:
model_xgb.fit(df_to_train, y_train)
xgb_train_pred = model_xgb.predict(df_to_train)
# xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))
print(sasta_score(y_train, xgb_train_pred))

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


6.456339513940977
79.0882191811469


In [23]:
df_to_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39499 entries, 0 to 39498
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Property_Area                         39499 non-null  float64
 1   Number_of_Windows                     39499 non-null  float64
 2   Number_of_Doors                       39499 non-null  int64  
 3   Furnishing                            39499 non-null  int64  
 4   Frequency_of_Powercuts                39499 non-null  float64
 5   Power_Backup                          39499 non-null  int64  
 6   Traffic_Density_Score                 39499 non-null  float64
 7   Crime_Rate                            39499 non-null  float64
 8   Dust_and_Noise                        39499 non-null  float64
 9   Air_Quality_Index                     39499 non-null  float64
 10  Neighborhood_Review                   39499 non-null  float64
 11  Property_Type_#

In [24]:
test_id = test['Property_ID']

In [25]:
test_id.head()

0    0x6e93
1    0x8787
2    0x6c17
3    0x9dbd
4    0xbfde
Name: Property_ID, dtype: object

In [26]:
to_test = test.iloc[: , 1:]
to_test = pd.get_dummies(to_test)

In [27]:
to_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Property_Area                         10500 non-null  float64
 1   Number_of_Windows                     10500 non-null  float64
 2   Number_of_Doors                       10500 non-null  int64  
 3   Furnishing                            10500 non-null  int64  
 4   Frequency_of_Powercuts                10500 non-null  float64
 5   Power_Backup                          10500 non-null  int64  
 6   Traffic_Density_Score                 10500 non-null  float64
 7   Crime_Rate                            10500 non-null  float64
 8   Dust_and_Noise                        10500 non-null  float64
 9   Air_Quality_Index                     10500 non-null  float64
 10  Neighborhood_Review                   10500 non-null  float64
 11  Property_Type_#

In [28]:
# Gotta fix test before i can pred it
xgb_pred = model_xgb.predict(to_test)

In [35]:
prediction = pd.DataFrame(xgb_pred)

In [38]:
prediction.insert(loc=0, column='Property_ID', value=test_id)

In [39]:
prediction.head()

Unnamed: 0,Property_ID,0
0,0x6e93,20.953127
1,0x8787,79.141479
2,0x6c17,66.007217
3,0x9dbd,71.774963
4,0xbfde,74.878807


In [42]:
prediction.rename(columns = {0:'Habitability_score'}, inplace = True)

In [43]:
prediction.head()

Unnamed: 0,Property_ID,Habitability_score
0,0x6e93,20.953127
1,0x8787,79.141479
2,0x6c17,66.007217
3,0x9dbd,71.774963
4,0xbfde,74.878807


In [44]:
prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Property_ID         10500 non-null  object 
 1   Habitability_score  10500 non-null  float32
dtypes: float32(1), object(1)
memory usage: 123.2+ KB


In [45]:
prediction.to_csv('submission.csv')