Trying to identify the top features to utilize in the model

In [None]:
import pandas as pd
import sklearn.preprocessing as pre
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the training set from the CSV file in the input directory.
train_csv_path = "../input/train.csv"
train_data = pd.read_csv(train_csv_path)

In [None]:
# Summarise missing data per column.
#   Total      - number of NaN entries in the column
#   Percentage - NaN count divided by the row count (a 0-1 fraction)
# Both series are ordered from the most to the least affected column.
null_flags = train_data.isnull()
total = null_flags.sum().sort_values(ascending=False)
percent = (null_flags.sum() / null_flags.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=["Total", "Percentage"],
)
print(missing_data)

In [None]:
# Drop every column that contains at least one NaN for now (whole columns are
# removed, not individual rows or values).
# `columns=` is used instead of the positional axis argument `1`, which
# pandas 2.0 no longer accepts.
cols_with_nan = missing_data[missing_data["Total"] > 0].index
train_data = train_data.drop(columns=cols_with_nan)

In [None]:
# Identify the top features using a basic XGBoost model.
# Thanks to the notebook below, which showed me a simple way to build a
# feature-importance plot:
# https://www.kaggle.com/sudalairajkumar/sberbank-russian-housing-market/simple-exploration-notebook-sberbank

# Label-encode every object-typed column in place so the frame can be fed to
# XGBoost as numeric data.
for f in train_data:
    if train_data[f].dtype == "object":
        lbl = pre.LabelEncoder()
        train_data[f] = lbl.fit_transform(list(train_data[f].values))

xgb_params = {
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Target is the sale price; the id, timestamp and target columns are excluded
# from the feature matrix.
y_train = train_data["price_doc"]
x_train = train_data.drop(["id", "timestamp", "price_doc"], axis=1)
dtrain = xgb.DMatrix(x_train, y_train, feature_names=x_train.columns.values)
# `silent` is overridden to 0 for this training run only; xgb_params itself
# is left untouched.
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=100)

# Plot the 50 most important features.
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
# plt.show must be *called*; the bare name `plt.show` is a no-op expression.
plt.show()

The top 10 variables by XGBoost feature importance are as follows (the number in parentheses is the F score shown in the plot above):

 1. full_sq(2023)
 2. metro_min_avto(344)
 3. sub_area(332)
 4. kindergarten_km(294)
 5. green_zone_km(224)
 6. school_km(221)
 7. metro_km_avto(213)
 8. park_km(206)
 9. industrial_km(197)
 10. area_m(195)

Out of these, area_m seems similar to full_sq, and metro_km_avto seems similar to metro_min_avto. So I am removing area_m and metro_km_avto from my analysis and keeping full_sq and metro_min_avto.

In [None]:
# Replace the price column with its natural logarithm.
# Note: this overwrites price_doc in place, so running the cell a second time
# applies the log again.
log_price = np.log(train_data["price_doc"])
train_data["price_doc"] = log_price

# Target plus the selected features to inspect in the plots below.
cols = [
    "price_doc",
    "full_sq",
    "metro_min_avto",
    "sub_area",
    "kindergarten_km",
    "green_zone_km",
    "school_km",
    "park_km",
    "industrial_km",
]

In [None]:
# Check the pairwise plots of the target against each selected variable.
# The correlation matrix is computed over the whole frame; it is not used in
# this cell (presumably it is consumed further down - verify).
corrmat = train_data.corr()
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts.
sns.pairplot(train_data[cols], height=2.5)
plt.show()