# Feature Engineering

In [215]:
# import libraries
import pandas as pd
import numpy as np

In [216]:
# avoid om
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Read Dataset

In [217]:
# read the data
df = pd.read_csv("../data/latin/latin.csv", error_bad_lines=False, warn_bad_lines=False)

### Delete columns

In [218]:
df = df.drop('id', 1)
df = df.drop('listing_url', 1)
df = df.drop('scrape_id', 1)
df = df.drop('last_scraped', 1)
df = df.drop('name', 1)
df = df.drop('summary', 1)
df = df.drop('space', 1)
df = df.drop('description', 1)
df = df.drop('neighborhood_overview', 1)
df = df.drop('notes', 1)
df = df.drop('transit', 1)
df = df.drop('access', 1)
df = df.drop('interaction', 1)
df = df.drop('house_rules', 1)
df = df.drop('thumbnail_url', 1)
df = df.drop('medium_url', 1)
df = df.drop('picture_url', 1)
df = df.drop('xl_picture_url', 1)
df = df.drop('host_id', 1)
df = df.drop('host_url', 1)
df = df.drop('host_name', 1)
df = df.drop('host_since', 1)
df = df.drop('host_location', 1)
df = df.drop('host_about', 1)
df = df.drop('host_thumbnail_url', 1)
df = df.drop('host_picture_url', 1)
df = df.drop('host_neighbourhood', 1)
df = df.drop('street', 1)
df = df.drop('neighbourhood', 1)
df = df.drop('neighbourhood_cleansed', 1)
df = df.drop('neighbourhood_group_cleansed', 1)
df = df.drop('city', 1)
df = df.drop('state', 1)
df = df.drop('zipcode', 1)
df = df.drop('market', 1)
df = df.drop('smart_location', 1)
df = df.drop('country_code', 1)
df = df.drop('country', 1)
df = df.drop('latitude', 1)
df = df.drop('longitude', 1)
df = df.drop('square_feet', 1)
df = df.drop('calendar_updated', 1)
df = df.drop('first_review', 1)
df = df.drop('last_review', 1)
df = df.drop('license', 1)
df = df.drop('jurisdiction_names', 1)
df = df.drop('experiences_offered', 1)
df = df.drop('host_acceptance_rate', 1)
df = df.drop('calendar_last_scraped', 1)
df = df.drop('monthly_price', 1)
df = df.drop('weekly_price', 1)
df = df.drop('property_type', 1)
df = df.drop('room_type', 1)
df = df.drop('bed_type', 1)
df = df.drop('minimum_minimum_nights', 1)
df = df.drop('maximum_minimum_nights', 1)
df = df.drop('maximum_maximum_nights', 1)

### Transform the amenities and host_verification_count into numbers

In [219]:
df['amenities'] = df['amenities'].astype(str)

In [220]:
df['host_verifications_count'] = df.apply(lambda row: row.host_verifications.count(',') + 1, axis=1)

In [221]:
df['amenities_count'] = df.apply(lambda row: row.amenities.count(',') + 1 if not type(row) is float else 0, axis=1)

In [222]:
df = df.drop('host_verifications', 1)
df = df.drop('amenities', 1)

### One hot-encoding

In [223]:
df = df.dropna()

In [224]:
df.shape

(30814, 49)

In [225]:
# import preprocessing from sklearn
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [226]:
# encoding these columns that are 0/1
df['host_is_superhost'] = le.fit_transform(df.host_is_superhost.values)
df['host_has_profile_pic'] = le.fit_transform(df.host_has_profile_pic.values)
df['host_identity_verified'] = le.fit_transform(df.host_identity_verified.values)
df['is_location_exact'] = le.fit_transform(df.is_location_exact.values)
df['has_availability'] = le.fit_transform(df.has_availability.values)
df['requires_license'] = le.fit_transform(df.requires_license.values)
df['instant_bookable'] = le.fit_transform(df.instant_bookable.values)
df['is_business_travel_ready'] = le.fit_transform(df.is_business_travel_ready.values)
df['require_guest_profile_picture'] = le.fit_transform(df.require_guest_profile_picture.values)
df['require_guest_phone_verification'] = le.fit_transform(df.require_guest_phone_verification.values)

In [227]:
# encoding the language categorical feature
df = pd.get_dummies(df, columns = ['host_response_time', 'cancellation_policy'])

In [228]:
# convert to float 100%
df.drop(df[df['host_response_rate'] == 'host_response_rate'].index, inplace = True)
df.drop(df[df['price'] == 'price'].index, inplace = True)

In [229]:
df['host_response_rate'] = df['host_response_rate'].str.rstrip('%').astype('float') / 100.0

In [230]:
df['price']=(df['price'].replace( '[\$,)]','', regex=True )
                   .replace( '[(]','-',   regex=True ).astype(float))
df['security_deposit']=(df['security_deposit'].replace( '[\$,)]','', regex=True )
                   .replace( '[(]','-',   regex=True ).astype(float))
df['cleaning_fee']=(df['cleaning_fee'].replace( '[\$,)]','', regex=True )
                   .replace( '[(]','-',   regex=True ).astype(float))
df['extra_people']=(df['extra_people'].replace( '[\$,)]','', regex=True )
                   .replace( '[(]','-',   regex=True ).astype(float))

#### Send Y to the end

In [231]:
satisfaction = df.review_scores_rating.values
df = df.drop('review_scores_rating', 1)
df['satisfaction'] = satisfaction

In [232]:
df['satisfaction'] = df['satisfaction'].astype(int)

In [233]:
df['satisfaction'] = df['satisfaction'].apply(lambda x: 1 if x >= 90 else 0)

In [234]:
df.satisfaction.value_counts()

1    26888
0     3922
Name: satisfaction, dtype: int64

In [235]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [236]:
# save the csv
df.to_csv('../experiments/latin-processed.csv')

##### That's all folks!!!

In [237]:
df.shape

(30810, 59)