In [2]:
# Basic Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For text:
import re

# For times:
import time

# Set a random seed for imputation
#  Source:  https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html
np.random.seed(42)

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer


# Read-in Data

In [9]:
# Load the training listings into a DataFrame
train_csv_path = '../data/listings_train.csv'
lstn = pd.read_csv(train_csv_path)

# Drop Un-needed Columns

In [10]:
# Columns that are not needed for this analysis
unneeded_cols = [
    'listing_url',
    'scrape_id',
    'last_scraped',
    'source',
    'picture_url',
    'host_url',
    'host_name',
    'host_thumbnail_url',
    'host_picture_url',
    'neighbourhood',
    'neighbourhood_group_cleansed',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'calendar_updated',
    'calendar_last_scraped',
    'bathrooms',
    'first_review',
    'last_review',
    'id',
    'host_id',
]
lstn.drop(columns=unneeded_cols, inplace=True)

# Columns dropped for now; they may be revisited if time allows for them to be processed
deferred_cols = [
    'host_location',
    'host_neighbourhood',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'license',
]
lstn.drop(columns=deferred_cols, inplace=True)

In [11]:
# Show the remaining columns with their non-null counts and dtypes
lstn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3477 entries, 0 to 3476
Data columns (total 41 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   name                                          3477 non-null   object 
 1   description                                   3464 non-null   object 
 2   neighborhood_overview                         2245 non-null   object 
 3   host_since                                    3477 non-null   object 
 4   host_about                                    2463 non-null   object 
 5   host_response_time                            3011 non-null   object 
 6   host_response_rate                            3011 non-null   object 
 7   host_acceptance_rate                          3067 non-null   object 
 8   host_is_superhost                             3476 non-null   object 
 9   host_listings_count                           3477 non-null   i

In [12]:
# List the column labels left in the frame
lstn.columns

Index(['name', 'description', 'neighborhood_overview', 'host_since',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
       'room_type', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds',
       'amenities', 'price', 'minimum_nights', 'maximum_nights',
       'has_availability', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 'instant_bookable',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'reviews_per_month'],
      dtype='object')

# Fix Datatypes

In [16]:
# FIX PRICE:  Remove the dollar signs and thousands separators, then convert to float.
# The vectorised `.str` methods pass missing values through as NaN, whereas calling
# `x.replace(...)` on each element raises AttributeError as soon as one price is missing.
lstn['price'] = (
    lstn['price']
    .str.replace('$', '', regex=False)   # literal '$', not the regex end-of-string anchor
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype(float)
)

In [17]:
# FIX HOST SINCE:  Parse the date strings, then express them as days since the epoch origin.
#
# Attribution: the epoch-time conversion below was found with the help of ChatGPT.
# Per the lead instructor, ChatGPT may be used as a search tool provided that the
# question asked is recorded:
#   Question:  'in python, I want to convert a pandas datetime object to epoch time'
#   Additional help from:  https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html

# Seconds per hour multiplied by hours per day
seconds_per_day = 3600 * 24

host_since_parsed = pd.to_datetime(lstn['host_since'])

# `.timestamp()` gives the epoch time in seconds; dividing by the number of seconds
# in a day turns it into a (fractional) number of days
lstn['host_since'] = host_since_parsed.apply(lambda ts: ts.timestamp() / seconds_per_day)

In [20]:
# FIX RESPONSE & ACCEPTANCE RATES:  Remove percentages

'''
Since the null values are not a very big percentage of the total data (though not a small percentage either),
the data will be imputed with the median value
'''

'''
The pcnt_floater function will be copied over here.  This is necessary as there are null values in these columns
which cannot be simply converted within the lambda function because there is no percentage sign.
'''

# This function will attempt to convert a string percentage value into a float
#  Source for help:  https://www.w3schools.com/python/python_try_except.asp
def pcnt_floater(x):
    """Convert a percentage string such as '95%' into the float 95.0.

    Parameters
    ----------
    x : object
        Typically a string containing a percent sign.  Values without a
        ``.replace`` method (e.g. NaN, None, numbers) are accepted as well.

    Returns
    -------
    float or object
        The numeric part of the string as a float, or ``x`` itself, unchanged,
        when it cannot be converted.
    """
    try:
        return float(x.replace('%', '').strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError: x is not a string (e.g. NaN or None has no .replace)
        # TypeError / ValueError: what remains after stripping is not a valid number
        # Only these are handled, so KeyboardInterrupt and friends still propagate.
        return x

# Run the converter over both rate columns; values it cannot convert are left as they are
for rate_col in ('host_acceptance_rate', 'host_response_rate'):
    lstn[rate_col] = lstn[rate_col].apply(pcnt_floater)

# Impute Missing Data

In [14]:
# Impute missing text information with 'no_text_entered' into the following columns
nte_cols = ['description', 'neighborhood_overview', 'host_about', 'host_response_time']

# Assign the filled column back to the frame.  Calling fillna(..., inplace=True) on the
# `lstn[col]` selection is a chained in-place update: it is deprecated in recent pandas
# and leaves `lstn` untouched once Copy-on-Write is enabled.
for col in nte_cols:
    lstn[col] = lstn[col].fillna('no_text_entered')

In [15]:
# Impute missing data with the median in the following columns
median_cols = ['host_response_rate', 'host_acceptance_rate', 'bedrooms', 'beds']

# The rate columns must already hold numbers: taking the median of the raw '100%'
# strings raises "TypeError: could not convert string to float".
# The filled column is assigned back to the frame; fillna(..., inplace=False) returns
# a new Series, so discarding that return value would leave the nulls in place.
for col in median_cols:
    lstn[col] = lstn[col].fillna(lstn[col].median())

TypeError: could not convert string to float: '100%'

# Advanced Processing

# Geocoding Data