# Tutorial 1 - AIRBNB - CORE STEPS

**Our unit of analysis is an AIRBNB LISTING**

We will see how we can transform the input variables. We won't do any predictions in this notebook!

# Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random number generator so that steps drawing from it
# give the same result every time the notebook is run from top to bottom.
np.random.seed(42)


# Get the data

In [2]:
# Load the Airbnb listings (one row per listing). The potential target columns
# are "price", "price_gte_150" and "price_category"; they are separated from
# the input columns further below.

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_75
3,0,0,Roslindale,42.281106,-71.121021,House,Private room,4,1.0,1.0,...,2,25,1,1,0,100.0,moderate,75,0,lte_75
4,1,1,Roslindale,42.284512,-71.136258,House,Private room,2,1.5,1.0,...,1,0,2,29,380,99.0,flexible,79,0,btw_75-150


In [3]:
# Find the total number of rows and columns: .shape is (rows, columns)

airbnb.shape

(10272, 23)

In [4]:
# Count the missing values in each column of the full data set

airbnb.isna().sum()

host_is_superhost                       0
host_identity_verified                  0
neighbourhood_cleansed                  0
latitude                                0
longitude                               0
property_type                           9
room_type                               0
accommodates                            0
bathrooms                              36
bedrooms                               30
beds                                   24
bed_type                                0
Number of amenities                     0
guests_included                         0
price_per_extra_person                  0
minimum_nights                          0
number_of_reviews                       0
number_days_btw_first_last_review       0
review_scores_rating                 2283
cancellation_policy                     0
price                                   0
price_gte_150                           0
price_category                          0
dtype: int64

### Should we remove the rows with missing values, or keep them and fill the gaps?

In [None]:
# If we want to remove them, drop the incomplete rows from the full data set
# here, before it is split (the train/test frames do not exist yet at this point):

# airbnb = airbnb.dropna(axis=0)

# Split data (train/test)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the listings for testing. random_state is passed explicitly
# (same value as the seed used in the Setup cell) so that re-running this cell
# on its own returns the same split instead of depending on how much of the
# global NumPy random stream has already been consumed.
train, test = train_test_split(airbnb, test_size=0.3, random_state=42)

In [6]:
# Size of the training split as (rows, columns)
train.shape

(7190, 23)

In [7]:
# Size of the test split as (rows, columns)
test.shape

(3082, 23)

In [8]:
# First rows of the training split; the index keeps the original row labels,
# which are now in shuffled order
train.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
3437,0,1,Roslindale,42.287641,-71.129883,Apartment,Entire home/apt,5,1.0,2.0,...,5,30,2,12,68,97.0,moderate,150,1,btw_75-150
6622,0,0,Allston,42.354976,-71.129493,Apartment,Private room,1,2.0,1.0,...,1,0,17,1,0,100.0,strict,45,0,lte_75
2262,0,1,Fenway,42.343264,-71.090912,Apartment,Private room,1,1.0,1.0,...,1,30,2,5,310,100.0,flexible,100,0,btw_75-150
2246,0,1,Fenway,42.340861,-71.088289,Apartment,Entire home/apt,3,1.0,0.0,...,1,0,3,32,474,95.0,strict,142,0,btw_75-150
835,0,1,Roxbury,42.335659,-71.085204,Apartment,Entire home/apt,6,1.0,3.0,...,3,10,3,3,22,93.0,strict,229,1,gte_226


In [9]:
# First rows of the test split (original row labels, shuffled order)
test.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
8047,0,1,Back Bay,42.355263,-71.072793,Apartment,Entire home/apt,6,2.0,3.0,...,1,0,3,52,2314,88.0,super_strict_30,389,1,gte_226
7809,0,1,South End,42.343438,-71.076884,Apartment,Entire home/apt,2,1.0,0.0,...,1,0,7,3,239,67.0,strict,150,1,btw_75-150
3658,0,1,Jamaica Plain,42.289653,-71.114721,House,Private room,3,1.0,1.0,...,1,10,2,145,1641,92.0,moderate,70,0,lte_75
318,0,1,Jamaica Plain,42.307768,-71.116258,House,Private room,3,1.0,1.0,...,2,25,3,97,885,95.0,moderate,93,0,btw_75-150
3014,0,0,South Boston,42.336446,-71.040487,House,Private room,2,1.0,1.0,...,1,25,1,8,104,100.0,moderate,99,0,btw_75-150


# Prepare the data

In [10]:
# Descriptive statistics of the numerical variables
# (describe() leaves the text columns out by default)

train.describe()

Unnamed: 0,host_is_superhost,host_identity_verified,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,price,price_gte_150
count,7190.0,7190.0,7190.0,7190.0,7190.0,7171.0,7171.0,7174.0,7190.0,7190.0,7190.0,7190.0,7190.0,7190.0,5581.0,7190.0,7190.0
mean,0.109458,0.730737,42.339698,-71.085623,2.963282,1.190211,1.221169,1.561054,14.885257,1.411127,10.878025,3.031154,19.136161,274.028512,91.875291,158.682893,0.490821
std,0.312234,0.443608,0.02468,0.031656,1.69065,0.427613,0.717506,0.935145,4.810939,1.012971,19.002493,6.592199,35.882092,404.797307,9.614535,90.450492,0.49995
min,0.0,0.0,42.235942,-71.171789,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,20.0,10.0,0.0
25%,0.0,0.0,42.32916,-71.106295,2.0,1.0,1.0,1.0,12.0,1.0,0.0,1.0,1.0,0.0,89.0,82.0,0.0
50%,0.0,1.0,42.345171,-71.078999,2.0,1.0,1.0,1.0,15.0,1.0,0.0,2.0,5.0,88.0,95.0,145.0,0.0
75%,0.0,1.0,42.354591,-71.062637,4.0,1.0,1.0,2.0,18.0,1.0,20.0,3.0,21.0,398.0,98.0,209.0,1.0
max,1.0,1.0,42.389982,-71.0001,16.0,5.0,5.0,16.0,30.0,14.0,200.0,273.0,404.0,2680.0,100.0,499.0,1.0


In [13]:
# Total missing values in each column of the training split

train.isna().sum()

host_is_superhost                       0
host_identity_verified                  0
neighbourhood_cleansed                  0
latitude                                0
longitude                               0
property_type                           8
room_type                               0
accommodates                            0
bathrooms                              19
bedrooms                               19
beds                                   16
bed_type                                0
Number of amenities                     0
guests_included                         0
price_per_extra_person                  0
minimum_nights                          0
number_of_reviews                       0
number_days_btw_first_last_review       0
review_scores_rating                 1609
cancellation_policy                     0
price                                   0
price_gte_150                           0
price_category                          0
dtype: int64

## Separate the POTENTIAL target columns. Separate numerical and categorical inputs

In [14]:
# Column groups: the candidate targets first, then the inputs grouped by how
# they are preprocessed below (imputed + scaled, left untouched, one-hot encoded).
target_names = ['price', 'price_gte_150', 'price_category']

numeric_names = [
    'latitude', 'longitude', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'Number of amenities',
    'guests_included', 'price_per_extra_person', 'minimum_nights',
    'number_of_reviews', 'number_days_btw_first_last_review', 'review_scores_rating',
]

binary_names = ['host_is_superhost', 'host_identity_verified']

categorical_names = [
    'neighbourhood_cleansed', 'property_type', 'room_type',
    'bed_type', 'cancellation_policy',
]

train_targets = train[target_names]

train_numeric_columns = train[numeric_names]

train_binary_columns = train[binary_names]

train_categorical_columns = train[categorical_names]

In [15]:
# Peek at the numerical inputs
train_numeric_columns.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating
3437,42.287641,-71.129883,5,1.0,2.0,3.0,19,5,30,2,12,68,97.0
6622,42.354976,-71.129493,1,2.0,1.0,1.0,12,1,0,17,1,0,100.0
2262,42.343264,-71.090912,1,1.0,1.0,1.0,14,1,30,2,5,310,100.0
2246,42.340861,-71.088289,3,1.0,0.0,1.0,14,1,0,3,32,474,95.0
835,42.335659,-71.085204,6,1.0,3.0,3.0,8,3,10,3,3,22,93.0


In [16]:
# Peek at the binary (0/1) inputs
train_binary_columns.head()

Unnamed: 0,host_is_superhost,host_identity_verified
3437,0,1
6622,0,0
2262,0,1
2246,0,1
835,0,1


In [19]:
# Peek at the categorical (text) inputs
train_categorical_columns.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
3437,Roslindale,Apartment,Entire home/apt,Real Bed,moderate
6622,Allston,Apartment,Private room,Real Bed,strict
2262,Fenway,Apartment,Private room,Real Bed,flexible
2246,Fenway,Apartment,Entire home/apt,Real Bed,strict
835,Roxbury,Apartment,Entire home/apt,Real Bed,strict


## Process the numerical variables

### Imputation 

In [20]:
from sklearn.impute import SimpleImputer

# Fill each missing numeric value with the median of its column
imputer = SimpleImputer(strategy="median")

In [21]:
# Learn the column medians from the training data and fill the gaps with them.
# The result is a NumPy array, not a DataFrame.
train_numeric_columns_imputed = imputer.fit_transform(train_numeric_columns)

In [22]:
# Inspect the imputed values (plain array: no column names or row index)
train_numeric_columns_imputed

array([[ 42.28764103, -71.12988287,   5.        , ...,  12.        ,
         68.        ,  97.        ],
       [ 42.35497619, -71.12949326,   1.        , ...,   1.        ,
          0.        , 100.        ],
       [ 42.34326408, -71.09091151,   1.        , ...,   5.        ,
        310.        , 100.        ],
       ...,
       [ 42.35994471, -71.06206012,   2.        , ...,   4.        ,
        238.        , 100.        ],
       [ 42.31680579, -71.07352707,   2.        , ...,   3.        ,
         34.        ,  87.        ],
       [ 42.33128999, -71.1026941 ,   1.        , ...,   0.        ,
          0.        ,  95.        ]])

### Standardize the values


In [23]:
from sklearn.preprocessing import StandardScaler

# Step 1: learn each column's mean and standard deviation from the imputed
# training data.
scaler = StandardScaler()
scaler.fit(train_numeric_columns_imputed)

# Step 2: rescale that same training data with the learned statistics.
train_numeric_columns_std = scaler.transform(train_numeric_columns_imputed)

train_numeric_columns_std

array([[-2.10940159, -1.39824237,  1.20477863, ..., -0.19889191,
        -0.5090025 ,  0.51641975],
       [ 0.61906783, -1.38593382, -1.16133947, ..., -0.50547284,
        -0.67699949,  0.86649907],
       [ 0.14448465, -0.16705969, -1.16133947, ..., -0.39398886,
         0.08886914,  0.86649907],
       ...,
       [ 0.82039585,  0.74441303, -0.56980994, ..., -0.42185986,
        -0.08901002,  0.86649907],
       [-0.92762441,  0.3821493 , -0.56980994, ..., -0.44973085,
        -0.59300099, -0.65051133],
       [-0.34071414, -0.53929512, -1.16133947, ..., -0.53334383,
        -0.67699949,  0.28303353]])

### Convert back to Pandas

In [24]:
# Wrap the scaled array in a DataFrame again so the columns get their names
# back. A frame built from a plain array already has a fresh 0..n-1 index, so
# no reset_index() is needed here.
train_numeric_columns_std_df = pd.DataFrame(train_numeric_columns_std,
                                            columns=train_numeric_columns.columns)

# Guard against silently losing or gaining rows/columns during preprocessing.
assert train_numeric_columns_std_df.shape == train_numeric_columns.shape

train_numeric_columns_std_df.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating
0,-2.109402,-1.398242,1.204779,-0.444147,1.087662,1.541283,0.855348,3.543164,1.006358,-0.156431,-0.198892,-0.509002,0.51642
1,0.619068,-1.385934,-1.161339,1.897066,-0.307821,-0.599099,-0.59977,-0.40589,-0.572492,2.119144,-0.505473,-0.676999,0.866499
2,0.144485,-0.16706,-1.161339,-0.444147,-0.307821,-0.599099,-0.184022,-0.40589,1.006358,-0.156431,-0.393989,0.088869,0.866499
3,0.047111,-0.084198,0.02172,-0.444147,-1.703304,-0.599099,-0.184022,-0.40589,-0.572492,-0.004726,0.358528,0.494038,0.283034
4,-0.163661,0.013258,1.796308,-0.444147,2.483145,1.541283,-1.431267,1.568637,-0.046209,-0.004726,-0.449731,-0.622648,0.049647


In [25]:
# Confirm that no missing values remain after imputation and scaling
train_numeric_columns_std_df.isna().sum()

latitude                             0
longitude                            0
accommodates                         0
bathrooms                            0
bedrooms                             0
beds                                 0
Number of amenities                  0
guests_included                      0
price_per_extra_person               0
minimum_nights                       0
number_of_reviews                    0
number_days_btw_first_last_review    0
review_scores_rating                 0
dtype: int64

## Process the categorical variables

In [27]:
# Count the missing values in each categorical column
train_categorical_columns.isna().sum()

neighbourhood_cleansed    0
property_type             8
room_type                 0
bed_type                  0
cancellation_policy       0
dtype: int64

In [28]:
# Frequency of each property type (value_counts() skips missing values by default)
train_categorical_columns['property_type'].value_counts()

Apartment       5344
House           1168
Condominium      456
Townhouse        108
Loft              83
Villa             10
Entire Floor      10
Guesthouse         3
Name: property_type, dtype: int64

In [29]:
# Show the rows in which at least one categorical value is missing
has_missing_value = train_categorical_columns.isna().any(axis=1)
train_categorical_columns[has_missing_value]

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
6658,Allston,,Private room,Real Bed,moderate
8809,Downtown,,Entire home/apt,Real Bed,strict
8108,Back Bay,,Entire home/apt,Real Bed,strict
1260,Back Bay,,Entire home/apt,Real Bed,strict
1961,Downtown,,Entire home/apt,Real Bed,strict
10082,Allston,,Private room,Real Bed,moderate
3234,Allston,,Private room,Real Bed,moderate
5385,Downtown,,Entire home/apt,Real Bed,strict


In [30]:
# Replace missing categorical (text) values with the constant label "UNKNOWN"

categorical_imputer = SimpleImputer(strategy="constant", fill_value='UNKNOWN')

# fit_transform returns a new array; train_categorical_columns itself is left unchanged
train_categorical_columns_imputed = categorical_imputer.fit_transform(train_categorical_columns)

In [34]:
# NOTE: this inspects the ORIGINAL frame, which the imputer does not modify, so
# property_type still reports its missing values here. The filled-in values
# live in train_categorical_columns_imputed.
train_categorical_columns.isna().sum()

neighbourhood_cleansed    0
property_type             8
room_type                 0
bed_type                  0
cancellation_policy       0
dtype: int64

### Convert back to Pandas

In [32]:
# Wrap the imputed array in a DataFrame again so the columns get their names
# back. A frame built from a plain array already has a fresh 0..n-1 index, so
# no reset_index() is needed here.
train_categorical_columns_imputed_df = pd.DataFrame(train_categorical_columns_imputed,
                                                    columns=train_categorical_columns.columns)

# Verify on the imputed data (not the original frame) that no gaps are left.
assert not train_categorical_columns_imputed_df.isna().any().any()

train_categorical_columns_imputed_df.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
0,Roslindale,Apartment,Entire home/apt,Real Bed,moderate
1,Allston,Apartment,Private room,Real Bed,strict
2,Fenway,Apartment,Private room,Real Bed,flexible
3,Fenway,Apartment,Entire home/apt,Real Bed,strict
4,Roxbury,Apartment,Entire home/apt,Real Bed,strict


In [33]:
# The formerly missing property types now show up under the UNKNOWN label
train_categorical_columns_imputed_df['property_type'].value_counts()

Apartment       5344
House           1168
Condominium      456
Townhouse        108
Loft              83
Villa             10
Entire Floor      10
UNKNOWN            8
Guesthouse         3
Name: property_type, dtype: int64

### One-hot-encoding
Now let's preprocess the categorical variables using one-hot encoding

In [35]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown="ignore": a category that shows up only at transform time
# (for example a value that happens to be absent from the training split) is
# encoded as all zeros for that variable instead of raising an error. The
# encoding of categories seen during fit is not affected.
cat_encoder = OneHotEncoder(handle_unknown="ignore")

# Learn the categories from the training data and encode it in one step.
train_categorical_columns_1hot = cat_encoder.fit_transform(train_categorical_columns_imputed_df)

train_categorical_columns_1hot

<7190x46 sparse matrix of type '<class 'numpy.float64'>'
	with 35950 stored elements in Compressed Sparse Row format>

By default, the `OneHotEncoder` class returns a sparse matrix (as the output above shows), but we can convert it to a dense array if needed by calling the `toarray()` method:

In [36]:
# Dense view of the one-hot matrix (shown for inspection only; not stored)
train_categorical_columns_1hot.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
# Categories the encoder learned, one array per categorical input column
cat_encoder.categories_

[array(['Allston', 'Back Bay', 'Bay Village', 'Beacon Hill', 'Brighton',
        'Charlestown', 'Chinatown', 'Dorchester', 'Downtown',
        'East Boston', 'Fenway', 'Hyde Park', 'Jamaica Plain',
        'Leather District', 'Longwood Medical Area', 'Mattapan',
        'Mission Hill', 'North End', 'Roslindale', 'Roxbury',
        'South Boston', 'South Boston Waterfront', 'South End', 'West End',
        'West Roxbury'], dtype=object),
 array(['Apartment', 'Condominium', 'Entire Floor', 'Guesthouse', 'House',
        'Loft', 'Townhouse', 'UNKNOWN', 'Villa'], dtype=object),
 array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object),
 array(['Airbed', 'Couch', 'Futon', 'Pull-out Sofa', 'Real Bed'],
       dtype=object),
 array(['flexible', 'moderate', 'strict', 'super_strict_30'], dtype=object)]

In [38]:
# Flatten the per-variable category arrays into one flat list; it is used as
# the column names of the one-hot matrix.

onehot_column_names = [item for sublist in cat_encoder.categories_ for item in sublist]

# The flattened names carry no variable prefix, so the same label occurring in
# two variables (e.g. "UNKNOWN" imputed into more than one column) would yield
# duplicate column names. Fail loudly rather than build an ambiguous frame.
assert len(set(onehot_column_names)) == len(onehot_column_names), \
    "duplicate one-hot column names"

onehot_column_names

['Allston',
 'Back Bay',
 'Bay Village',
 'Beacon Hill',
 'Brighton',
 'Charlestown',
 'Chinatown',
 'Dorchester',
 'Downtown',
 'East Boston',
 'Fenway',
 'Hyde Park',
 'Jamaica Plain',
 'Leather District',
 'Longwood Medical Area',
 'Mattapan',
 'Mission Hill',
 'North End',
 'Roslindale',
 'Roxbury',
 'South Boston',
 'South Boston Waterfront',
 'South End',
 'West End',
 'West Roxbury',
 'Apartment',
 'Condominium',
 'Entire Floor',
 'Guesthouse',
 'House',
 'Loft',
 'Townhouse',
 'UNKNOWN',
 'Villa',
 'Entire home/apt',
 'Private room',
 'Shared room',
 'Airbed',
 'Couch',
 'Futon',
 'Pull-out Sofa',
 'Real Bed',
 'flexible',
 'moderate',
 'strict',
 'super_strict_30']

### Convert back to Pandas

In [39]:
# Densify the sparse one-hot matrix, then label every column with its category value
train_categorical_columns_1hot_dense = train_categorical_columns_1hot.toarray()

train_categorical_columns_1hot_df = pd.DataFrame(data=train_categorical_columns_1hot_dense,
                                                 columns=onehot_column_names)

train_categorical_columns_1hot_df.head()

Unnamed: 0,Allston,Back Bay,Bay Village,Beacon Hill,Brighton,Charlestown,Chinatown,Dorchester,Downtown,East Boston,...,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Do not process the binary variables

## Concatenate all variables

In [40]:
# Concatenate the processed pieces side by side (axis=1).
# pd.concat aligns rows on the index, and train_binary_columns still carries
# the shuffled row labels from the split, so every piece is reset to a 0..n-1
# index first; otherwise the mismatched labels would add NaN-filled rows.

train_prepared = pd.concat((train_numeric_columns_std_df.reset_index(drop=True),
                            train_categorical_columns_1hot_df.reset_index(drop=True),
                            train_binary_columns.reset_index(drop=True)), axis=1)

# Sanity checks: one output row per training listing, and no NaN introduced
# by a misaligned concatenation.
assert len(train_prepared) == len(train)
assert not train_prepared.isna().any().any()

# if you want to create a separate column for missing values, use dummy_na=True:
# pd.get_dummies(df,dummy_na=True)

train_prepared.shape

(7190, 61)

In [41]:
# Preview the final training matrix: scaled numeric, one-hot and binary columns
train_prepared.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,...,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,host_is_superhost,host_identity_verified
0,-2.109402,-1.398242,1.204779,-0.444147,1.087662,1.541283,0.855348,3.543164,1.006358,-0.156431,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,1
1,0.619068,-1.385934,-1.161339,1.897066,-0.307821,-0.599099,-0.59977,-0.40589,-0.572492,2.119144,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0
2,0.144485,-0.16706,-1.161339,-0.444147,-0.307821,-0.599099,-0.184022,-0.40589,1.006358,-0.156431,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1
3,0.047111,-0.084198,0.02172,-0.444147,-1.703304,-0.599099,-0.184022,-0.40589,-0.572492,-0.004726,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1
4,-0.163661,0.013258,1.796308,-0.444147,2.483145,1.541283,-1.431267,1.568637,-0.046209,-0.004726,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1


# Process the Test data using "Transform" only

In [42]:
# Select the same column groups as for the training data. The names are taken
# from the training frames instead of being typed out a second time, so the
# test inputs always match, in name and order, the columns the imputers,
# scaler and encoder were fitted on.
test_targets = test[train_targets.columns.tolist()]

test_numeric_columns = test[train_numeric_columns.columns.tolist()]

test_binary_columns = test[train_binary_columns.columns.tolist()]

test_categorical_columns = test[train_categorical_columns.columns.tolist()]

## Process numerical variables - test

### Imputation 

In [44]:
# Transform only: fill gaps in the test data using the imputer that was fitted
# on the training data (it is not re-fitted here)

test_numeric_columns_imputed = imputer.transform(test_numeric_columns)

In [45]:
# Inspect the imputed test values (plain array: no column names or row index)
test_numeric_columns_imputed

array([[ 4.23552632e+01, -7.10727931e+01,  6.00000000e+00, ...,
         5.20000000e+01,  2.31400000e+03,  8.80000000e+01],
       [ 4.23434381e+01, -7.10768836e+01,  2.00000000e+00, ...,
         3.00000000e+00,  2.39000000e+02,  6.70000000e+01],
       [ 4.22896525e+01, -7.11147208e+01,  3.00000000e+00, ...,
         1.45000000e+02,  1.64100000e+03,  9.20000000e+01],
       ...,
       [ 4.23436240e+01, -7.10979281e+01,  5.00000000e+00, ...,
         4.00000000e+00,  2.11000000e+02,  9.40000000e+01],
       [ 4.23426200e+01, -7.11036275e+01,  2.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  9.50000000e+01],
       [ 4.23164580e+01, -7.10795195e+01,  1.00000000e+00, ...,
         4.00000000e+00,  2.23000000e+02,  9.50000000e+01]])

### Standardize the values


In [46]:
# Transform only: rescale the test data with the scaler that was fitted on the
# training data (it is not re-fitted here)
test_numeric_columns_std = scaler.transform(test_numeric_columns_imputed)

test_numeric_columns_std

array([[ 0.63069768,  0.40533687,  1.79630816, ...,  0.91594784,
         5.03983927, -0.53381822],
       [ 0.15153485,  0.27611111, -0.56980994, ..., -0.44973085,
        -0.08653948, -2.98437347],
       [-2.02789334, -0.91924215,  0.02171958, ...,  3.50795024,
         3.37716317, -0.06704579],
       ...,
       [ 0.15906806, -0.38872897,  1.20477863, ..., -0.42185986,
        -0.15571471,  0.16634043],
       [ 0.11838687, -0.56878308, -0.56980994, ..., -0.53334383,
        -0.67699949,  0.28303353],
       [-0.94171792,  0.19283558, -1.16133947, ..., -0.42185986,
        -0.12606818,  0.28303353]])

### Convert back to Pandas

In [47]:
# Back to pandas: give the scaled test array its column names again
test_numeric_column_names = test_numeric_columns.columns

test_numeric_columns_std_df = pd.DataFrame(data=test_numeric_columns_std,
                                           columns=test_numeric_column_names)

test_numeric_columns_std_df.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating
0,0.630698,0.405337,1.796308,1.897066,2.483145,1.541283,-1.223393,-0.40589,-0.572492,-0.004726,0.915948,5.039839,-0.533818
1,0.151535,0.276111,-0.56981,-0.444147,-1.703304,-0.599099,-1.223393,-0.40589,-0.572492,0.602094,-0.449731,-0.086539,-2.984373
2,-2.027893,-0.919242,0.02172,-0.444147,-0.307821,0.471092,-0.184022,-0.40589,-0.046209,-0.156431,3.50795,3.377163,-0.067046
3,-1.293843,-0.967802,0.02172,-0.444147,-0.307821,0.471092,0.855348,0.581373,0.743216,-0.004726,2.170143,1.509432,0.283034
4,-0.131805,1.425964,-0.56981,-0.444147,-0.307821,-0.599099,0.023852,-0.40589,0.743216,-0.308136,-0.310376,-0.420063,0.866499


In [48]:
# Confirm that no missing values remain in the processed numeric test data
test_numeric_columns_std_df.isna().sum()

latitude                             0
longitude                            0
accommodates                         0
bathrooms                            0
bedrooms                             0
beds                                 0
Number of amenities                  0
guests_included                      0
price_per_extra_person               0
minimum_nights                       0
number_of_reviews                    0
number_days_btw_first_last_review    0
review_scores_rating                 0
dtype: int64

## Process the categorical variables - test

In [49]:
# Count the missing values in each categorical column of the test split
test_categorical_columns.isna().sum()

neighbourhood_cleansed    0
property_type             1
room_type                 0
bed_type                  0
cancellation_policy       0
dtype: int64

In [50]:
# Transform only: replace missing categorical (text) values using the imputer
# that was fitted on the training data

test_categorical_columns_imputed = categorical_imputer.transform(test_categorical_columns)

### Convert back to Pandas

In [51]:
# Back to pandas: give the imputed test array its column names again
test_categorical_column_names = test_categorical_columns.columns

test_categorical_columns_imputed_df = pd.DataFrame(data=test_categorical_columns_imputed,
                                                   columns=test_categorical_column_names)

test_categorical_columns_imputed_df.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
0,Back Bay,Apartment,Entire home/apt,Real Bed,super_strict_30
1,South End,Apartment,Entire home/apt,Real Bed,strict
2,Jamaica Plain,House,Private room,Real Bed,moderate
3,Jamaica Plain,House,Private room,Real Bed,moderate
4,South Boston,House,Private room,Real Bed,moderate


In [52]:
# Property types in the test split after imputation; not every category the
# encoder learned from the training split has to occur here
test_categorical_columns_imputed_df['property_type'].value_counts()

Apartment       2324
House            461
Condominium      210
Townhouse         45
Loft              31
Villa              8
Entire Floor       2
UNKNOWN            1
Name: property_type, dtype: int64

### One-hot-encoding
Now let's preprocess the categorical variables using one-hot encoding

In [53]:
# Transform only: reuse the encoder fitted on the training data so the test
# matrix gets the same one-hot columns as the training matrix
test_categorical_columns_1hot = cat_encoder.transform(test_categorical_columns_imputed_df)

test_categorical_columns_1hot

<3082x46 sparse matrix of type '<class 'numpy.float64'>'
	with 15410 stored elements in Compressed Sparse Row format>

By default, the `OneHotEncoder` class returns a sparse matrix (as the output above shows), but we can convert it to a dense array if needed by calling the `toarray()` method:

In [54]:
# Dense view of the test one-hot matrix (shown for inspection only; not stored)
test_categorical_columns_1hot.toarray()

array([[0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [55]:
# The one-hot column names are unchanged: they were derived from the encoder
# fitted on the training data

onehot_column_names

['Allston',
 'Back Bay',
 'Bay Village',
 'Beacon Hill',
 'Brighton',
 'Charlestown',
 'Chinatown',
 'Dorchester',
 'Downtown',
 'East Boston',
 'Fenway',
 'Hyde Park',
 'Jamaica Plain',
 'Leather District',
 'Longwood Medical Area',
 'Mattapan',
 'Mission Hill',
 'North End',
 'Roslindale',
 'Roxbury',
 'South Boston',
 'South Boston Waterfront',
 'South End',
 'West End',
 'West Roxbury',
 'Apartment',
 'Condominium',
 'Entire Floor',
 'Guesthouse',
 'House',
 'Loft',
 'Townhouse',
 'UNKNOWN',
 'Villa',
 'Entire home/apt',
 'Private room',
 'Shared room',
 'Airbed',
 'Couch',
 'Futon',
 'Pull-out Sofa',
 'Real Bed',
 'flexible',
 'moderate',
 'strict',
 'super_strict_30']

### Convert back to Pandas

In [56]:
# Densify the sparse one-hot matrix, then label every column with its category value
test_categorical_columns_1hot_dense = test_categorical_columns_1hot.toarray()

test_categorical_columns_1hot_df = pd.DataFrame(data=test_categorical_columns_1hot_dense,
                                                columns=onehot_column_names)

test_categorical_columns_1hot_df.head()

Unnamed: 0,Allston,Back Bay,Bay Village,Beacon Hill,Brighton,Charlestown,Chinatown,Dorchester,Downtown,East Boston,...,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


## Do not transform the binary variables - test

## Concatenate all variables - test

In [57]:
# Concatenate the processed pieces side by side (axis=1).
# pd.concat aligns rows on the index, and test_binary_columns still carries the
# shuffled row labels from the split, so every piece is reset to a 0..n-1
# index first; otherwise the mismatched labels would add NaN-filled rows.

test_prepared = pd.concat((test_numeric_columns_std_df.reset_index(drop=True),
                           test_categorical_columns_1hot_df.reset_index(drop=True),
                           test_binary_columns.reset_index(drop=True)), axis=1)

# Sanity checks: one output row per test listing, and no NaN introduced by a
# misaligned concatenation.
assert len(test_prepared) == len(test)
assert not test_prepared.isna().any().any()

# if you want to create a separate column for missing values, use dummy_na=True:
# pd.get_dummies(df,dummy_na=True)

test_prepared.shape

(3082, 61)

In [58]:
# Preview the final test matrix: scaled numeric, one-hot and binary columns
test_prepared.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,...,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,host_is_superhost,host_identity_verified
0,0.630698,0.405337,1.796308,1.897066,2.483145,1.541283,-1.223393,-0.40589,-0.572492,-0.004726,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1
1,0.151535,0.276111,-0.56981,-0.444147,-1.703304,-0.599099,-1.223393,-0.40589,-0.572492,0.602094,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1
2,-2.027893,-0.919242,0.02172,-0.444147,-0.307821,0.471092,-0.184022,-0.40589,-0.046209,-0.156431,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,1
3,-1.293843,-0.967802,0.02172,-0.444147,-0.307821,0.471092,0.855348,0.581373,0.743216,-0.004726,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,1
4,-0.131805,1.425964,-0.56981,-0.444147,-0.307821,-0.599099,0.023852,-0.40589,0.743216,-0.308136,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0


## What we didn't do:

Visualization<br>
Feature engineering<br>
Conversion from continuous to categorical<br>
Variable transformation (i.e., transform to normal distribution)<br>
Stratified sampling <br>
Pipeline <br>