<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Intro" data-toc-modified-id="Intro-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Intro</a></span></li><li><span><a href="#Missing-Values" data-toc-modified-id="Missing-Values-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Missing Values</a></span><ul class="toc-item"><li><span><a href="#Missing-Data:" data-toc-modified-id="Missing-Data:-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Missing Data:</a></span></li></ul></li><li><span><a href="#Remove-Single-Value-Columns" data-toc-modified-id="Remove-Single-Value-Columns-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Remove Single Value Columns</a></span></li><li><span><a href="#Convert-Data-Types" data-toc-modified-id="Convert-Data-Types-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Convert Data Types</a></span></li><li><span><a href="#Misc-Cleaning" data-toc-modified-id="Misc-Cleaning-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Misc Cleaning</a></span></li></ul></div>

# Intro

 **listings.cv** is a detailed dataset concerning listings in the Los Angeles area. Not all of the features are necessary in this particular project including text based fields. These may be utilized in future iterations.

In [1]:
# Import needed libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()
plt.style.use('fivethirtyeight')

# Columns to be used during the project
cols = [
    'experiences_offered', 'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
    'host_has_profile_pic', 'host_identity_verified',
    'neighbourhood_group_cleansed', 'country_code', 'country', 'property_type',
    'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',
    'amenities', 'square_feet', 'price', 'security_deposit', 'cleaning_fee',
    'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights',
    'has_availability', 'availability_30', 'availability_60',
    'availability_90', 'availability_365', 'number_of_reviews',
    'review_scores_rating', 'requires_license', 'license', 'instant_bookable',
    'cancellation_policy', 'reviews_per_month'
]

# Display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv('data/listings.csv.gz', low_memory=False, usecols=cols)

In [2]:
df.head()

Unnamed: 0,experiences_offered,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,country_code,country,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,requires_license,license,instant_bookable,cancellation_policy,reviews_per_month
0,none,a few days or more,25%,,f,1.0,t,f,,US,United States,House,Entire home/apt,10,7.0,5.0,5.0,Real Bed,"{""Wireless Internet"",""Air conditioning"",Pool,K...",,"$3,000.00","$2,000.00",$200.00,1,$0.00,1,1125,,27,53,82,352,0,,f,,t,strict,
1,none,within an hour,100%,,t,1.0,t,f,,US,United States,House,Private room,2,1.0,1.0,1.0,Real Bed,{},,$50.00,,,1,$0.00,1,1125,,29,45,75,350,33,93.0,f,,f,flexible,1.91
2,none,within an hour,100%,,t,1.0,t,t,,US,United States,House,Private room,2,1.0,1.0,1.0,Real Bed,"{""Wireless Internet"",""Air conditioning"",""Wheel...",,$55.00,,,1,$10.00,1,1125,,18,43,73,348,14,100.0,f,,f,flexible,1.72
3,none,within an hour,100%,,t,1.0,t,f,,US,United States,Other,Entire home/apt,6,1.0,1.0,3.0,Real Bed,"{TV,""Wireless Internet"",""Air conditioning"",Poo...",,$150.00,,$35.00,1,$0.00,1,2,,3,17,30,87,22,100.0,f,,t,flexible,2.12
4,none,,,,f,1.0,t,t,,US,United States,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,"{Internet,""Wireless Internet"",Kitchen,""Free pa...",,$30.00,,$5.00,1,$10.00,1,90,,0,0,0,0,3,93.0,f,,f,flexible,0.18


# Missing Values

In [3]:
# Function to calculate missing values by column# Funct
def missing_values_table(df):
    # Total missing values
    mis_val = df.isnull().sum()

    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)

    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " rows.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [4]:
missing = missing_values_table(df)
missing

Your selected dataframe has 39 columns and 31253 rows.
There are 18 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
host_acceptance_rate,31253,100.0
has_availability,31253,100.0
neighbourhood_group_cleansed,31253,100.0
license,31206,99.8
square_feet,30869,98.8
security_deposit,14397,46.1
review_scores_rating,7527,24.1
reviews_per_month,7223,23.1
cleaning_fee,6974,22.3
host_response_time,5561,17.8


## Missing Data:
* Four columns have more than 90% of values missing. I will remove these columns all together.
* Remaining columns will be dealt with later through imputation

In [5]:
# Setting condition to remove colums with 90% or more missing
remove = missing[missing['% of Total Values']>=90.0].index

def drop_multiple_col(col_names_list, df): 
    df.drop(col_names_list, axis=1, inplace=True)
    return df

df = drop_multiple_col(df[remove], df)

# Remove Single Value Columns

In [6]:
# Function to remove columns with one singular value, no value to modeling process
def remove_constants(df):
    df_new = df.loc[:,df.apply(pd.Series.nunique) != 1]
    removed = list(df.loc[:,df.apply(pd.Series.nunique) == 1].columns)
    print(f'Removed columns: {removed}')
    return df_new

In [7]:
df = remove_constants(df)

Removed columns: ['experiences_offered', 'country_code', 'country']


# Convert Data Types

In [8]:
def convert_types(df):

    # Iterate through each column
    for c in df:

        # Convert objects to category
        if (df[c].dtype == 'object') and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')

        # Booleans mapped to integers
        elif list(df[c].unique()) == [1, 0]:
            df[c] = df[c].astype(bool)

        # Float64 to float32
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)

        # Int64 to int32
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)

    return df

In [9]:
df = convert_types(df)

# Misc Cleaning

In [10]:
# Convert strings to integers for boolean values
bool_feat = [
    'host_identity_verified', 'host_has_profile_pic', 'host_is_superhost',
    'instant_bookable', 'requires_license'
]
for b in (bool_feat):
    df[b] = df[b].map({'t': 1.0, 'f': 0.0})
    df[b] = df[b].astype('bool')

# Convert $ to float
convert = ('price', 'cleaning_fee', 'extra_people')
for con in (convert):
    df[con] = (df[con].replace('[\$,)]', '',
                               regex=True).replace('[(]', '-',
                                                   regex=True).astype(float))

# Remove % from host_response_rate column
df['host_response_rate'] = (df['host_response_rate'].replace(
    '[\%,)]', '', regex=True).replace('[(]', '-', regex=True).astype(float))

In [11]:
# Final look at the cleaned data
df.to_pickle('data/listings_cleaned.pkl')
df.head()

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,requires_license,instant_bookable,cancellation_policy,reviews_per_month
0,a few days or more,25.0,False,1.0,True,False,House,Entire home/apt,10,7.0,5.0,5.0,Real Bed,"{""Wireless Internet"",""Air conditioning"",Pool,K...",3000.0,"$2,000.00",200.0,1,0.0,1,1125,27,53,82,352,0,,False,True,strict,
1,within an hour,100.0,True,1.0,True,False,House,Private room,2,1.0,1.0,1.0,Real Bed,{},50.0,,,1,0.0,1,1125,29,45,75,350,33,93.0,False,False,flexible,1.91
2,within an hour,100.0,True,1.0,True,True,House,Private room,2,1.0,1.0,1.0,Real Bed,"{""Wireless Internet"",""Air conditioning"",""Wheel...",55.0,,,1,10.0,1,1125,18,43,73,348,14,100.0,False,False,flexible,1.72
3,within an hour,100.0,True,1.0,True,False,Other,Entire home/apt,6,1.0,1.0,3.0,Real Bed,"{TV,""Wireless Internet"",""Air conditioning"",Poo...",150.0,,35.0,1,0.0,1,2,3,17,30,87,22,100.0,False,True,flexible,2.12
4,,,False,1.0,True,True,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,"{Internet,""Wireless Internet"",Kitchen,""Free pa...",30.0,,5.0,1,10.0,1,90,0,0,0,0,3,93.0,False,False,flexible,0.18
