In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)

In [2]:
train = pd.read_csv("./input/train.csv", sep=",")
test = pd.read_csv("./input/test.csv", sep=",")
sample_submit = pd.read_csv("./input/sample_submit.csv", sep=",", header=None)

In [3]:
train_shape = train.shape
test_shape = test.shape
print("train",train.shape)
print("train",train.columns)
print("test",test.shape)
print("test",test.columns)

train (55583, 29)
train Index(['id', 'accommodates', 'amenities', 'bathrooms', 'bed_type', 'bedrooms',
       'beds', 'cancellation_policy', 'city', 'cleaning_fee', 'description',
       'first_review', 'host_has_profile_pic', 'host_identity_verified',
       'host_response_rate', 'host_since', 'instant_bookable', 'last_review',
       'latitude', 'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'property_type', 'review_scores_rating', 'room_type', 'thumbnail_url',
       'zipcode', 'y'],
      dtype='object')
test (18528, 28)
test Index(['id', 'accommodates', 'amenities', 'bathrooms', 'bed_type', 'bedrooms',
       'beds', 'cancellation_policy', 'city', 'cleaning_fee', 'description',
       'first_review', 'host_has_profile_pic', 'host_identity_verified',
       'host_response_rate', 'host_since', 'instant_bookable', 'last_review',
       'latitude', 'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'property_type', 'review_scores_rating', 'room_type',

In [4]:
train.head()

Unnamed: 0,id,accommodates,amenities,bathrooms,bed_type,bedrooms,beds,cancellation_policy,city,cleaning_fee,description,first_review,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,last_review,latitude,longitude,name,neighbourhood,number_of_reviews,property_type,review_scores_rating,room_type,thumbnail_url,zipcode,y
0,0,6,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",2.0,Real Bed,1.0,4.0,flexible,LA,t,My place is meant for family and a few friends...,2016-07-27,t,f,,2016-07-13,f,2016-07-27,33.788931,-118.154761,The Penthouse,,1,Apartment,60.0,Private room,,90804.0,138.0
1,1,2,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,strict,DC,t,This is a new listing for a lovely guest bedro...,2016-09-12,t,t,100%,2015-12-30,f,2017-03-31,38.93481,-76.97819,Guest Bedroom in Brookland,Brookland,9,House,100.0,Private room,https://a0.muscache.com/im/pictures/e4d8b51f-6...,20018.0,42.0
2,2,2,"{TV,Internet,""Wireless Internet"",Kitchen,""Indo...",2.0,Real Bed,1.0,1.0,strict,NYC,t,We're looking forward to your stay at our apt....,2016-06-15,t,f,100%,2016-05-21,t,2017-08-13,40.695118,-73.92624,Clean Modern Room in Lux Apt 1 Block From J Train,Bushwick,27,Apartment,83.0,Private room,https://a0.muscache.com/im/pictures/5ffecc9b-d...,,65.0
3,3,2,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,strict,SF,t,BEST CITY VIEWS - - ROOF DECK W/ BBQ & WiFi - ...,2014-03-15,t,t,100%,2012-06-19,t,2017-09-03,37.796728,-122.411906,BEST views + reviews! 5/5 stars*****,Nob Hill,38,Apartment,95.0,Private room,,94133.0,166.0
4,4,2,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.0,Real Bed,1.0,1.0,strict,NYC,t,Charming Apartment on the upper west side of M...,2015-08-05,t,t,100%,2015-03-25,f,2017-09-10,40.78505,-73.974691,Charming 1-bedroom - UWS Manhattan,Upper West Side,5,Apartment,100.0,Entire home/apt,https://a0.muscache.com/im/pictures/92879730/5...,10024.0,165.0


In [5]:
test.head()

Unnamed: 0,id,accommodates,amenities,bathrooms,bed_type,bedrooms,beds,cancellation_policy,city,cleaning_fee,description,first_review,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,last_review,latitude,longitude,name,neighbourhood,number_of_reviews,property_type,review_scores_rating,room_type,thumbnail_url,zipcode
0,0,6,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",2.0,Real Bed,2.0,2.0,strict,Boston,t,Feel free to book INSTANTLY. You can check-in ...,2017-01-09,t,f,100%,2016-08-23,t,2017-09-25,42.359278,-71.069962,Gorgeous 2BR/2BA Duplex in Beacon Hill,Beacon Hill,58,House,90.0,Entire home/apt,https://a0.muscache.com/im/pictures/7e4808b4-5...,2114.0
1,1,3,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,moderate,LA,t,The guest house is close to: Equinox West Holl...,2016-08-17,t,t,100%,2014-09-03,f,2017-05-02,34.084747,-118.367355,Luxury 1 Bedroom West Hollywood City Center,West Hollywood,4,Guesthouse,100.0,Entire home/apt,https://a0.muscache.com/im/pictures/5392fbd6-6...,90046.0
2,2,2,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",1.0,Real Bed,0.0,1.0,flexible,NYC,f,Private room in a three bedroom apartment in N...,,t,t,100%,2012-10-17,f,,40.720541,-73.959192,Bedroom with Patio in Prime Williamsburg Locat...,Williamsburg,0,Apartment,,Private room,https://a0.muscache.com/im/pictures/544d3b89-d...,11249.0
3,3,4,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,2.0,strict,NYC,f,The apartment is located in historic Bed Stuy ...,,t,t,,2013-01-23,f,,40.681117,-73.944091,Cozy apartment in Brooklyn,Bedford-Stuyvesant,0,Apartment,,Entire home/apt,https://a0.muscache.com/im/pictures/26baf7ba-0...,11216.0
4,4,3,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.5,Real Bed,1.0,2.0,strict,LA,t,"Our cozy, pet friendly one bedroom apartment/l...",2015-08-01,t,t,100%,2014-12-28,f,2016-09-11,34.150995,-118.409359,"Cozy, sunny, pet friendly loft/apt",,6,Loft,92.0,Entire home/apt,https://a0.muscache.com/im/pictures/86107545/9...,91604.0


In [6]:
total_data = pd.concat([train,test], axis=0)
print(total_data.shape)
total_data

(74111, 29)


Unnamed: 0,id,accommodates,amenities,bathrooms,bed_type,bedrooms,beds,cancellation_policy,city,cleaning_fee,description,first_review,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,last_review,latitude,longitude,name,neighbourhood,number_of_reviews,property_type,review_scores_rating,room_type,thumbnail_url,zipcode,y
0,0,6,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",2.0,Real Bed,1.0,4.0,flexible,LA,t,My place is meant for family and a few friends...,2016-07-27,t,f,,2016-07-13,f,2016-07-27,33.788931,-118.154761,The Penthouse,,1,Apartment,60.0,Private room,,90804,138.0
1,1,2,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,strict,DC,t,This is a new listing for a lovely guest bedro...,2016-09-12,t,t,100%,2015-12-30,f,2017-03-31,38.934810,-76.978190,Guest Bedroom in Brookland,Brookland,9,House,100.0,Private room,https://a0.muscache.com/im/pictures/e4d8b51f-6...,20018,42.0
2,2,2,"{TV,Internet,""Wireless Internet"",Kitchen,""Indo...",2.0,Real Bed,1.0,1.0,strict,NYC,t,We're looking forward to your stay at our apt....,2016-06-15,t,f,100%,2016-05-21,t,2017-08-13,40.695118,-73.926240,Clean Modern Room in Lux Apt 1 Block From J Train,Bushwick,27,Apartment,83.0,Private room,https://a0.muscache.com/im/pictures/5ffecc9b-d...,,65.0
3,3,2,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,strict,SF,t,BEST CITY VIEWS - - ROOF DECK W/ BBQ & WiFi - ...,2014-03-15,t,t,100%,2012-06-19,t,2017-09-03,37.796728,-122.411906,BEST views + reviews! 5/5 stars*****,Nob Hill,38,Apartment,95.0,Private room,,94133,166.0
4,4,2,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.0,Real Bed,1.0,1.0,strict,NYC,t,Charming Apartment on the upper west side of M...,2015-08-05,t,t,100%,2015-03-25,f,2017-09-10,40.785050,-73.974691,Charming 1-bedroom - UWS Manhattan,Upper West Side,5,Apartment,100.0,Entire home/apt,https://a0.muscache.com/im/pictures/92879730/5...,10024,165.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18523,18523,4,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.0,Real Bed,2.0,2.0,strict,NYC,t,"The Greenhouse, located on Green Street, is a ...",,t,t,100%,2009-11-16,f,,40.734555,-73.954892,Spacious 2BR Greenpoint Getaway,Greenpoint,0,Apartment,,Entire home/apt,https://a0.muscache.com/im/pictures/57338613/6...,11222,
18524,18524,2,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",1.0,Real Bed,2.0,1.0,flexible,Chicago,f,"Two bedroom, one bathroom with large dining/li...",2017-01-16,t,f,100%,2017-01-08,f,2017-04-11,41.945939,-87.672018,Walk up Apartment in Lakeview/Wrigleyville,Lakeview,9,Apartment,90.0,Entire home/apt,,60657,
18525,18525,5,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",1.5,Real Bed,2.0,2.0,flexible,Chicago,t,Happy Holidays! If you're looking for a big op...,,t,f,100%,2014-09-02,f,,41.933123,-87.708087,Beautiful Logan Square Home,Avondale,0,House,,Entire home/apt,https://a0.muscache.com/im/pictures/361642af-e...,60618,
18526,18526,2,"{Internet,""Wireless Internet"",""Air conditionin...",1.0,Real Bed,1.0,2.0,strict,NYC,t,This is a cozy one-bedroom apartment a few blo...,2016-04-15,t,f,100%,2014-03-17,f,2017-05-08,40.788702,-73.947358,Charming 1 BR apartment east of Central Park,East Harlem,4,Apartment,95.0,Entire home/apt,https://a0.muscache.com/im/pictures/49c8e83f-d...,10029.0,


In [7]:
len(total_data.id.unique()), total_data.id.isnull().value_counts()

(55583,
 False    74111
 Name: id, dtype: int64)

In [8]:
total_data["accommodates"].unique()

array([ 6,  2,  4,  3,  8,  9,  1,  5, 16, 10,  7, 12, 11, 15, 14, 13],
      dtype=int64)

In [9]:
total_amenities = total_data["amenities"].str.split(",", expand=True)
c = total_amenities.shape[1]
for i in range(c):
    total_amenities.iloc[:,i] = total_amenities.iloc[:,i].str.replace('"','')
    total_amenities.iloc[:,i] = total_amenities.iloc[:,i].str.replace('{','')
    total_amenities.iloc[:,i] = total_amenities.iloc[:,i].str.replace('}','')
total_amenities.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85
0,TV,Wireless Internet,Kitchen,Free parking on premises,Washer,Dryer,Smoke detector,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,TV,Cable TV,Internet,Wireless Internet,Air conditioning,Kitchen,Free parking on premises,Heating,Washer,Dryer,Smoke detector,Carbon monoxide detector,First aid kit,Essentials,Shampoo,Lock on bedroom door,24-hour check-in,Hangers,Iron,Laptop friendly workspace,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,TV,Internet,Wireless Internet,Kitchen,Indoor fireplace,Buzzer/wireless intercom,Heating,Washer,Dryer,Smoke detector,Carbon monoxide detector,First aid kit,Safety card,Fire extinguisher,Essentials,Shampoo,Lock on bedroom door,24-hour check-in,Hangers,Hair dryer,Iron,Laptop friendly workspace,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50,Self Check-In,Keypad,Lockbox,Bathtub,Room-darkening shades,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,TV,Cable TV,Internet,Wireless Internet,Air conditioning,Kitchen,Buzzer/wireless intercom,Heating,Washer,Dryer,Smoke detector,Carbon monoxide detector,First aid kit,Safety card,Essentials,Shampoo,Self Check-In,Keypad,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,TV,Internet,Wireless Internet,Air conditioning,Kitchen,Elevator,Buzzer/wireless intercom,Heating,Washer,Dryer,Smoke detector,Carbon monoxide detector,Safety card,Fire extinguisher,Essentials,Shampoo,Hangers,Hair dryer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
total_amenities_unique = set()
for i in range(c):
    tmp = list(total_amenities.iloc[:,i].unique())
    for j in tmp:
        total_amenities_unique.add(j)
total_amenities_unique = list(total_amenities_unique)
total_amenities_unique, len(total_amenities_unique)

(['',
  'Kitchen',
  'Free parking on street',
  'First aid kit',
  'Other pet(s)',
  'Cable TV',
  'Washer / Dryer',
  'Stair gates',
  'Elevator',
  'Beach essentials',
  'BBQ grill',
  'Luggage dropoff allowed',
  'Bed linens',
  'Body soap',
  'Doorman',
  'Babysitter recommendations',
  'Fireplace guards',
  'Wide hallway clearance',
  'Lake access',
  'Well-lit path to entrance',
  'Pool',
  'Keypad',
  'Dog(s)',
  ' smooth pathway to front door',
  'Wide entryway',
  'Pack ’n Play/travel crib',
  'Extra pillows and blankets',
  'Outlet covers',
  'Long term stays allowed',
  'Fixed grab bars for shower & toilet',
  'Indoor fireplace',
  'Other',
  'Stove',
  'Hangers',
  'Lockbox',
  'Essentials',
  'Host greets you',
  'Laptop friendly workspace',
  'Shampoo',
  'Private living room',
  'Washer',
  'Suitable for events',
  'Path to entrance lit at night',
  'High chair',
  'Bathtub with shower chair',
  'Table corner guards',
  'Buzzer/wireless intercom',
  'Pets live on this p

In [11]:
temp = pd.concat([pd.get_dummies(total_amenities[col]) for col in total_amenities], axis=1)
total_amenities = temp.groupby(level=0, axis=1).sum()
total_amenities.drop("", axis=1, inplace=True)
total_amenities

Unnamed: 0,smooth pathway to front door,24-hour check-in,Accessible-height bed,Accessible-height toilet,Air conditioning,Air purifier,BBQ grill,Baby bath,Baby monitor,Babysitter recommendations,Bath towel,Bathtub,Bathtub with shower chair,Beach essentials,Beachfront,Bed linens,Body soap,Breakfast,Buzzer/wireless intercom,Cable TV,Carbon monoxide detector,Cat(s),Changing table,Children’s books and toys,Children’s dinnerware,Cleaning before checkout,Coffee maker,Cooking basics,Crib,Disabled parking spot,Dishes and silverware,Dishwasher,Dog(s),Doorman,Doorman Entry,Dryer,EV charger,Elevator,Elevator in building,Essentials,Ethernet connection,Extra pillows and blankets,Family/kid friendly,Fire extinguisher,Fireplace guards,Firm matress,Firm mattress,First aid kit,Fixed grab bars for shower & toilet,Flat,Flat smooth pathway to front door,Free parking on premises,Free parking on street,Game console,Garden or backyard,Grab-rails for shower and toilet,Ground floor access,Gym,Hair dryer,Hand or paper towel,Hand soap,Handheld shower head,Hangers,Heating,High chair,Host greets you,Hot tub,Hot water,Hot water kettle,Indoor fireplace,Internet,Iron,Keypad,Kitchen,Lake access,Laptop friendly workspace,Lock on bedroom door,Lockbox,Long term stays allowed,Luggage dropoff allowed,Microwave,Other,Other pet(s),Outlet covers,Oven,Pack ’n Play/travel crib,Paid parking off premises,Path to entrance lit at night,Patio or balcony,Pets allowed,Pets live on this property,Pocket wifi,Pool,Private bathroom,Private entrance,Private living room,Refrigerator,Roll-in shower with chair,Room-darkening shades,Safety card,Self Check-In,Shampoo,Single level home,Ski in/Ski out,Smart lock,Smartlock,Smoke detector,Smoking allowed,Stair gates,Step-free access,Stove,Suitable for events,TV,Table corner guards,Toilet paper,Washer,Washer / Dryer,Waterfront,Well-lit path to entrance,Wheelchair accessible,Wide clearance to bed,Wide clearance to shower & toilet,Wide clearance to shower and toilet,Wide doorway,Wide entryway,Wide hallway clearance,Window guards,Wireless Internet,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1
2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18523,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
18524,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1
18525,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
18526,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [12]:
total_data["bed_type"].unique()

array(['Real Bed', 'Pull-out Sofa', 'Airbed', 'Futon', 'Couch'],
      dtype=object)

In [13]:
total_bed_type = total_data["bed_type"]
total_bed_type = pd.get_dummies(total_bed_type)
total_bed_type

Unnamed: 0,Airbed,Couch,Futon,Pull-out Sofa,Real Bed
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1
...,...,...,...,...,...
18523,0,0,0,0,1
18524,0,0,0,0,1
18525,0,0,0,0,1
18526,0,0,0,0,1


In [14]:
total_data.cancellation_policy.unique()

array(['flexible', 'strict', 'moderate', 'super_strict_30',
       'super_strict_60'], dtype=object)

In [15]:
total_cancellation_policy = pd.get_dummies(total_data.cancellation_policy)
total_cancellation_policy

Unnamed: 0,flexible,moderate,strict,super_strict_30,super_strict_60
0,1,0,0,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,1,0,0
...,...,...,...,...,...
18523,0,0,1,0,0
18524,1,0,0,0,0
18525,1,0,0,0,0
18526,0,0,1,0,0


In [16]:
total_data.city.unique()

array(['LA', 'DC', 'NYC', 'SF', 'Chicago', 'Boston'], dtype=object)

In [17]:
total_city = pd.get_dummies(total_data.city)
total_city

Unnamed: 0,Boston,Chicago,DC,LA,NYC,SF
0,0,0,0,1,0,0
1,0,0,1,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,0,1,0
...,...,...,...,...,...,...
18523,0,0,0,0,1,0
18524,0,1,0,0,0,0
18525,0,1,0,0,0,0
18526,0,0,0,0,1,0


In [18]:
len(total_data.description.unique())

73479

In [19]:
total_data.description.isnull().value_counts()

False    74111
Name: description, dtype: int64

In [20]:
bool_mapping = {'t': 1, 'f': 0}

In [21]:
total_first_review = pd.to_datetime(total_data.first_review)
total_first_review.value_counts()

2017-01-01    293
2017-01-22    249
2016-01-02    221
2017-01-02    211
2017-09-04    193
             ... 
2012-12-09      1
2012-01-14      1
2011-01-25      1
2012-10-23      1
2012-11-29      1
Name: first_review, Length: 2554, dtype: int64

In [22]:
total_data.host_has_profile_pic.value_counts()
total_host_has_profile_pic = total_data.host_has_profile_pic.map(bool_mapping)
total_host_has_profile_pic

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
18523    1.0
18524    1.0
18525    1.0
18526    1.0
18527    1.0
Name: host_has_profile_pic, Length: 74111, dtype: float64

In [23]:
total_data.host_identity_verified.value_counts()

t    49748
f    24175
Name: host_identity_verified, dtype: int64

In [24]:
total_host_identity_verified = total_data.host_identity_verified.map(bool_mapping)
total_host_identity_verified

0        0.0
1        1.0
2        0.0
3        1.0
4        1.0
        ... 
18523    1.0
18524    0.0
18525    0.0
18526    0.0
18527    0.0
Name: host_identity_verified, Length: 74111, dtype: float64

In [27]:
total_host_response_rate = total_data.host_response_rate

total_host_response_rate = total_host_response_rate.str[:-1]
total_host_response_rate = total_host_response_rate.astype("float", errors="ignore")
total_host_response_rate = total_host_response_rate / 100
total_host_response_rate.value_counts()

1.00    43254
0.90     2277
0.80     1113
0.00      883
0.50      611
0.70      508
0.99      448
0.67      433
0.98      425
0.94      401
0.97      400
0.96      350
0.60      337
0.95      322
0.88      316
0.92      315
0.75      315
0.89      310
0.93      307
0.83      279
0.86      243
0.91      224
0.33      142
0.40      120
0.78      116
0.71      106
0.87       90
0.81       88
0.82       82
0.25       80
0.68       71
0.84       62
0.85       62
0.63       58
0.57       57
0.73       49
0.79       45
0.20       45
0.56       44
0.30       38
0.76       29
0.74       24
0.77       24
0.64       23
0.43       22
0.54       21
0.44       19
0.29       18
0.58       17
0.10       16
0.17       15
0.38       15
0.72       15
0.65       15
0.53       14
0.55       13
0.69       12
0.46        8
0.62        7
0.14        6
0.42        5
0.52        5
0.36        5
0.35        5
0.59        4
0.61        4
0.26        3
0.66        3
0.22        3
0.27        2
0.47        2
0.13  

In [28]:
total_host_since = pd.to_datetime(total_data.host_since)
total_host_since

0       2016-07-13
1       2015-12-30
2       2016-05-21
3       2012-06-19
4       2015-03-25
           ...    
18523   2009-11-16
18524   2017-01-08
18525   2014-09-02
18526   2014-03-17
18527   2015-08-19
Name: host_since, Length: 74111, dtype: datetime64[ns]

In [29]:
total_instant_bookable = total_data.instant_bookable
total_instant_bookable = total_instant_bookable.map(bool_mapping)
total_instant_bookable

0        0
1        0
2        1
3        1
4        0
        ..
18523    0
18524    0
18525    0
18526    0
18527    1
Name: instant_bookable, Length: 74111, dtype: int64

In [30]:
total_last_review = pd.to_datetime(total_data.last_review)
total_last_review

0       2016-07-27
1       2017-03-31
2       2017-08-13
3       2017-09-03
4       2017-09-10
           ...    
18523          NaT
18524   2017-04-11
18525          NaT
18526   2017-05-08
18527   2017-01-24
Name: last_review, Length: 74111, dtype: datetime64[ns]

In [33]:
total_property_type = total_data.property_type
total_property_type = pd.get_dummies(total_property_type)

In [34]:
total_room_type = total_data.room_type
total_room_type = pd.get_dummies(total_property_type)

In [35]:
total_thumbnail_url = total_data.thumbnail_url
total_thumbnail_url = total_thumbnail_url.isnull() * 1
total_thumbnail_url

0        1
1        0
2        0
3        1
4        0
        ..
18523    0
18524    1
18525    0
18526    0
18527    0
Name: thumbnail_url, Length: 74111, dtype: int32

In [36]:
total_cleaning_fee = total_data.cleaning_fee.map(bool_mapping)
total_cleaning_fee


0        1
1        1
2        1
3        1
4        1
        ..
18523    1
18524    0
18525    1
18526    1
18527    0
Name: cleaning_fee, Length: 74111, dtype: int64

In [37]:
total_since_review= (total_last_review - total_host_since).dt.days
total_since_review

0          14.0
1         457.0
2         449.0
3        1902.0
4         900.0
          ...  
18523       NaN
18524      93.0
18525       NaN
18526    1148.0
18527     524.0
Length: 74111, dtype: float64

In [38]:
total_review= (total_last_review - total_host_since).dt.days
total_review

0          14.0
1         457.0
2         449.0
3        1902.0
4         900.0
          ...  
18523       NaN
18524      93.0
18525       NaN
18526    1148.0
18527     524.0
Length: 74111, dtype: float64

In [39]:
creansing_data = total_data.accommodates.copy()
creansing_data = pd.concat([creansing_data, total_amenities], axis=1)
creansing_data = pd.concat([creansing_data, total_data.bathrooms], axis=1)
creansing_data = pd.concat([creansing_data, total_data.bedrooms], axis=1)
creansing_data = pd.concat([creansing_data, total_data.beds], axis=1)
creansing_data = pd.concat([creansing_data, total_cancellation_policy], axis=1)
creansing_data = pd.concat([creansing_data, total_city], axis=1)
creansing_data = pd.concat([creansing_data, total_cleaning_fee], axis=1)
# creansing_data = pd.concat([creansing_data, total_data_first_review], axis=1)
creansing_data = pd.concat([creansing_data, total_host_has_profile_pic], axis=1)
creansing_data = pd.concat([creansing_data, total_host_identity_verified], axis=1)
creansing_data = pd.concat([creansing_data, total_host_response_rate], axis=1)
# creansing_data = pd.concat([creansing_data, total_host_since], axis=1)
creansing_data = pd.concat([creansing_data, total_instant_bookable], axis=1)
# creansing_data = pd.concat([creansing_data, total_last_review], axis=1)
creansing_data = pd.concat([creansing_data, total_since_review], axis=1)
creansing_data = pd.concat([creansing_data, total_review], axis=1)
creansing_data = pd.concat([creansing_data, total_data.latitude], axis=1)
creansing_data = pd.concat([creansing_data, total_data.longitude], axis=1)
creansing_data = pd.concat([creansing_data, total_data.number_of_reviews], axis=1)
creansing_data = pd.concat([creansing_data, total_property_type], axis=1)
creansing_data = pd.concat([creansing_data, total_data.review_scores_rating], axis=1)
creansing_data = pd.concat([creansing_data, total_room_type], axis=1)
creansing_data = pd.concat([creansing_data, total_thumbnail_url], axis=1)
creansing_data

Unnamed: 0,accommodates,smooth pathway to front door,24-hour check-in,Accessible-height bed,Accessible-height toilet,Air conditioning,Air purifier,BBQ grill,Baby bath,Baby monitor,Babysitter recommendations,Bath towel,Bathtub,Bathtub with shower chair,Beach essentials,Beachfront,Bed linens,Body soap,Breakfast,Buzzer/wireless intercom,Cable TV,Carbon monoxide detector,Cat(s),Changing table,Children’s books and toys,Children’s dinnerware,Cleaning before checkout,Coffee maker,Cooking basics,Crib,Disabled parking spot,Dishes and silverware,Dishwasher,Dog(s),Doorman,Doorman Entry,Dryer,EV charger,Elevator,Elevator in building,Essentials,Ethernet connection,Extra pillows and blankets,Family/kid friendly,Fire extinguisher,Fireplace guards,Firm matress,Firm mattress,First aid kit,Fixed grab bars for shower & toilet,Flat,Flat smooth pathway to front door,Free parking on premises,Free parking on street,Game console,Garden or backyard,Grab-rails for shower and toilet,Ground floor access,Gym,Hair dryer,Hand or paper towel,Hand soap,Handheld shower head,Hangers,Heating,High chair,Host greets you,Hot tub,Hot water,Hot water kettle,Indoor fireplace,Internet,Iron,Keypad,Kitchen,Lake access,Laptop friendly workspace,Lock on bedroom door,Lockbox,Long term stays allowed,Luggage dropoff allowed,Microwave,Other,Other pet(s),Outlet covers,Oven,Pack ’n Play/travel crib,Paid parking off premises,Path to entrance lit at night,Patio or balcony,Pets allowed,Pets live on this property,Pocket wifi,Pool,Private bathroom,Private entrance,Private living room,Refrigerator,Roll-in shower with chair,Room-darkening shades,Safety card,Self Check-In,Shampoo,Single level home,Ski in/Ski out,Smart lock,Smartlock,Smoke detector,Smoking allowed,Stair gates,Step-free access,Stove,Suitable for events,TV,Table corner guards,Toilet paper,Washer,Washer / Dryer,Waterfront,Well-lit path to entrance,Wheelchair accessible,Wide clearance to bed,Wide clearance to shower & toilet,Wide clearance to shower and toilet,Wide doorway,Wide entryway,Wide hallway clearance,Window guards,Wireless Internet,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50,bathrooms,bedrooms,beds,flexible,moderate,strict,super_strict_30,super_strict_60,Boston,Chicago,DC,LA,NYC,SF,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,0,0.1,latitude,longitude,number_of_reviews,Apartment,Bed & Breakfast,Boat,Boutique hotel,Bungalow,Cabin,Camper/RV,Casa particular,Castle,Cave,Chalet,Condominium,Dorm,Earth House,Guest suite,Guesthouse,Hostel,House,Hut,In-law,Island,Lighthouse,Loft,Other.1,Parking Space,Serviced apartment,Tent,Timeshare,Tipi,Townhouse,Train,Treehouse,Vacation home,Villa,Yurt,review_scores_rating,Apartment.1,Bed & Breakfast.1,Boat.1,Boutique hotel.1,Bungalow.1,Cabin.1,Camper/RV.1,Casa particular.1,Castle.1,Cave.1,Chalet.1,Condominium.1,Dorm.1,Earth House.1,Guest suite.1,Guesthouse.1,Hostel.1,House.1,Hut.1,In-law.1,Island.1,Lighthouse.1,Loft.1,Other.2,Parking Space.1,Serviced apartment.1,Tent.1,Timeshare.1,Tipi.1,Townhouse.1,Train.1,Treehouse.1,Vacation home.1,Villa.1,Yurt.1,thumbnail_url
0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2.0,1.0,4.0,1,0,0,0,0,0,0,0,1,0,0,1,1.0,0.0,,0,14.0,14.0,33.788931,-118.154761,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1.0,1.0,1.0,0,0,1,0,0,0,0,1,0,0,0,1,1.0,1.0,1.0,0,457.0,457.0,38.934810,-76.978190,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,2.0,1.0,1.0,0,0,1,0,0,0,0,0,0,1,0,1,1.0,0.0,1.0,1,449.0,449.0,40.695118,-73.926240,27,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,83.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,1.0,1.0,0,0,1,0,0,0,0,0,0,0,1,1,1.0,1.0,1.0,1,1902.0,1902.0,37.796728,-122.411906,38,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,95.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,1.0,1.0,0,0,1,0,0,0,0,0,0,1,0,1,1.0,1.0,1.0,0,900.0,900.0,40.785050,-73.974691,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18523,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,2.0,2.0,0,0,1,0,0,0,0,0,0,1,0,1,1.0,1.0,1.0,0,,,40.734555,-73.954892,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18524,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1.0,2.0,1.0,1,0,0,0,0,0,1,0,0,0,0,0,1.0,0.0,1.0,0,93.0,93.0,41.945939,-87.672018,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,90.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
18525,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.5,2.0,2.0,1,0,0,0,0,0,1,0,0,0,0,1,1.0,0.0,1.0,0,,,41.933123,-87.708087,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18526,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,1.0,2.0,0,0,1,0,0,0,0,0,0,1,0,1,1.0,0.0,1.0,0,1148.0,1148.0,40.788702,-73.947358,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,95.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
y = train["y"].copy()
df_train = creansing_data.iloc[:len(y),:].copy()
df_test = creansing_data.iloc[len(y):,:].copy()
df_train.shape, df_test.shape

((55583, 227), (18528, 227))

In [41]:
sc = StandardScaler()
X = df_train.values
X_test = df_test.values

# X = sc.fit_transform(X)
# X_test = sc.fit_transform(X_test)

In [46]:
params = {
            "force_col_wise":True,
            'task': 'train', 
            'boosting_type': 'gbdt',
            "n_estimators":50000,
            'objective': 'regression',    
            'metric': {'rmse'}
            }
kn = 5
kf = KFold(n_splits=kn, shuffle=True, random_state=1)
predict_proba = np.zeros(len(X_test)).astype(np.float64)
scores = []

for train_idx, valid_idx in kf.split(X, y):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    lgb_results = {}
    gbm = lgb.train(
                    params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_valid],
                    num_boost_round=100,
                    verbose_eval=20,
                    early_stopping_rounds=10,
                    evals_result=lgb_results)
    y_gbm = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
    scores.append(mean_squared_error(y_valid, y_gbm))
    predict_proba += gbm.predict(X_test, num_iteration=gbm.best_iteration)
skf_predict = predict_proba / kn


[LightGBM] [Info] Total Bins 1792
[LightGBM] [Info] Number of data points in the train set: 44466, number of used features: 175
[LightGBM] [Info] Start training from score 159.766046
Training until validation scores don't improve for 10 rounds
[20]	training's rmse: 112.477	valid_1's rmse: 116
[40]	training's rmse: 102.732	valid_1's rmse: 109.071
[60]	training's rmse: 98.2248	valid_1's rmse: 106.803
[80]	training's rmse: 94.9655	valid_1's rmse: 105.894
[100]	training's rmse: 92.6317	valid_1's rmse: 105.549
[120]	training's rmse: 90.5292	valid_1's rmse: 105.234
[140]	training's rmse: 88.7445	valid_1's rmse: 105.109
[160]	training's rmse: 87.1568	valid_1's rmse: 104.776
Early stopping, best iteration is:
[169]	training's rmse: 86.631	valid_1's rmse: 104.731
[LightGBM] [Info] Total Bins 1795
[LightGBM] [Info] Number of data points in the train set: 44466, number of used features: 177
[LightGBM] [Info] Start training from score 160.092025
Training until validation scores don't improve for 1

In [47]:
skf_predict.shape

(18528,)

In [49]:
sample_submit.shape

(18528, 2)

In [51]:
sample_submit[1] = skf_predict
sample_submit

Unnamed: 0,0,1
0,0,237.153985
1,1,126.614596
2,2,152.706868
3,3,156.004476
4,4,109.528457
...,...,...
18523,18523,211.231030
18524,18524,112.443331
18525,18525,184.328559
18526,18526,104.667888


In [52]:
sample_submit.to_csv("submit.csv", header=None, sep=",",index=False)