In [87]:
# Basic Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For text:
import re

# For times:
import time

# Set a random seed for imputation
#  Source:  https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html
np.random.seed(42)

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder


# Read-in Data

In [2]:
# Import the Training Data
lstn = pd.read_csv('../data/listings_train.csv')

In [3]:
# Note, this file has been added to the gitignore file and is NOT located in the repository
geodata = pd.read_csv('../data/lat_lng_data.csv')

# Module-level coordinate lists (presumably the T-stop locations - confirm against the CSV).
#  dist_ft_adder reads these two globals directly, so this cell must run before it is called.
latitudes = list(geodata.lat)
longitudes = list(geodata.long)

# Drop Un-needed Columns

In [4]:
def col_dropper(data_frame):
    """Drop, in place, the columns that the rest of the pipeline does not use.

    The columns are removed in two batches, one after the other.  A missing
    column makes pandas raise a KeyError (the default of DataFrame.drop).

    Nothing is returned: both drops use inplace=True, so the caller's
    dataframe itself is modified.
    """
    # First batch: columns that are not needed at all
    unneeded_cols = [
        'listing_url', 'scrape_id', 'last_scraped', 'source',
        'picture_url', 'host_url', 'host_name', 'host_thumbnail_url', 'host_picture_url',
        'neighbourhood', 'neighbourhood_group_cleansed', 'minimum_minimum_nights',
        'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights',
        'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated',
        'calendar_last_scraped', 'bathrooms', 'first_review', 'last_review',
        'id', 'host_id',
    ]

    # Second batch: for now, these columns are also dropped unless time allows
    #  for them to be processed
    deferred_cols = [
        'host_location', 'host_neighbourhood', 'review_scores_rating', 'review_scores_accuracy',
        'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
        'review_scores_location', 'review_scores_value', 'license',
    ]

    for batch in (unneeded_cols, deferred_cols):
        data_frame.drop(columns=batch, inplace=True)

# Fix Datatypes

In [6]:
'''
Since the null values are not a very big percentage of the total data (though not a small percentage either),
the data will be imputed with the median value.

In order to do that, the percentages need to be converted where possible, so that the nulls can be imputed
with the median value.

The pcnt_floater function will be copied over here.  This is necessary as there are null values in these columns
which cannot be simply converted within the lambda function because there is no percentage sign.
'''

# This function will attempt to convert a string percentage value into a float
#  Source for help:  https://www.w3schools.com/python/python_try_except.asp
def pcnt_floater(x):
    """Convert a percentage string such as '95%' to a float (95.0).

    Values that cannot be converted are returned unchanged, e.g. NaN/None
    (no .replace method) or strings such as 'N/A' (not a number).

    Only the errors a failed conversion can raise are caught.  The previous
    bare `except:` also swallowed KeyboardInterrupt and SystemExit.
    """
    try:
        return float(x.replace('%', '').strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError: x has no .replace (float NaN, None, numbers)
        # TypeError:      x.replace exists but does not take string arguments
        # ValueError:     the cleaned text is not a number
        return x

In [7]:
def data_fixer(data_frame):
    """Fix the datatypes of price, host_since and the two host rate columns, in place.

    * price:  '$1,250.00' style strings become floats.  Missing values stay
      NaN and already-numeric values pass through, instead of raising
      AttributeError as the plain string lambda did.
    * host_since:  parsed with pd.to_datetime, then expressed as days since the
      epoch origin.  Unparseable/missing dates (NaT) become NaN instead of
      raising.
    * host_acceptance_rate / host_response_rate:  percentage strings become
      floats where possible (see pcnt_floater); other values are left as is.

    Nothing is returned; the columns of the input dataframe are overwritten.
    """

    def _price_to_float(value):
        # Only strings carry the dollar sign / thousands separator
        if isinstance(value, str):
            return float(value.replace('$', '').replace(',', '').strip())
        # Keep missing prices missing rather than crashing on them
        if pd.isna(value):
            return float('nan')
        return float(value)

    # FIX PRICE:  The dollar signs must be removed from the prices and numbers converted to float values
    data_frame['price'] = data_frame['price'].apply(_price_to_float)

    # FIX HOST SINCE:  Convert to datetime then to epoch time in days
    #  The method used to convert to epoch time was discovered with the help of ChatGPT
    #  (question asked: 'in python, I want to convert a pandas datetime object to epoch time').
    #  Additional help from:  https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
    #
    # The strings are converted to date time, then to epoch time with '.timestamp()'.
    # The epoch time (seconds) is divided by the number of seconds per day to get
    #  the number of days since the epoch time origin.
    seconds_per_day = 3600 * 24
    host_since = pd.to_datetime(data_frame['host_since'])
    data_frame['host_since'] = host_since.apply(
        # NaT has no usable .timestamp(), so map it to NaN explicitly
        lambda stamp: stamp.timestamp() / seconds_per_day if pd.notna(stamp) else float('nan')
    )

    # FIX RESPONSE & ACCEPTANCE RATES:  Remove percentages
    # Convert percentages where they can be converted
    data_frame['host_acceptance_rate'] = data_frame['host_acceptance_rate'].apply(pcnt_floater)
    data_frame['host_response_rate'] = data_frame['host_response_rate'].apply(pcnt_floater)

# Impute Missing Data

In [9]:
def data_imputer(data_frame):
    """Fill the missing values of the listing columns, modifying data_frame in place.

    * text columns             -> the literal 'no_text_entered'
    * rate / room count columns -> the column median
    * host_is_superhost, bathrooms_text -> the column mode
    * reviews_per_month        -> 0

    Every fill is written back with `data_frame[col] = ...`.  The previous
    chained form `data_frame[col].fillna(..., inplace=True)` acts on a
    temporary Series under pandas Copy-on-Write, so the dataframe is never
    updated there (pandas 2.x already warns about it).

    NOTE: the medians and modes are computed from the dataframe passed in, so a
    test/validation frame is imputed with its own statistics, not the
    training ones.

    Nothing is returned.
    """
    # Impute missing text information with 'no_text_entered' into the following columns
    nte_cols = ['description', 'neighborhood_overview', 'host_about', 'host_response_time']

    for col in nte_cols:
        data_frame[col] = data_frame[col].fillna('no_text_entered')

    # Impute missing data with the median in the following columns
    median_cols = ['host_response_rate', 'host_acceptance_rate', 'bedrooms', 'beds']

    for col in median_cols:
        data_frame[col] = data_frame[col].fillna(data_frame[col].median())

    # Impute missing values with the mode in the following columns
    mode_cols = ['host_is_superhost', 'bathrooms_text']

    for col in mode_cols:
        modes = data_frame[col].mode()
        # An all-null column has no mode; leave it untouched instead of failing on modes[0]
        if not modes.empty:
            data_frame[col] = data_frame[col].fillna(modes.iloc[0])

    # Impute missing data with 0 in reviews per month
    data_frame['reviews_per_month'] = data_frame['reviews_per_month'].fillna(0)

In [11]:
# Verify imputation
#  Counts the columns that still contain at least one null value; 0 means no column has nulls left.
sum(lstn.isnull().sum() != 0)

0

# Create Simple Numerical Features

In [12]:
def simp_num_ft(data_frame):
    """Add three ratio columns describing the make-up of the host's listings.

    Each new column is the matching calculated_host_listings_count_* column
    divided by calculated_host_listings_count, rounded to 3 decimals.  The
    columns are added to data_frame in place; nothing is returned.
    """
    total_col = 'calculated_host_listings_count'

    # new column name -> column holding the count for that listing type
    ratio_sources = {
        'pcnt_ent_homes': 'calculated_host_listings_count_entire_homes',
        'pcnt_private': 'calculated_host_listings_count_private_rooms',
        'pcnt_shared': 'calculated_host_listings_count_shared_rooms',
    }

    for new_col, count_col in ratio_sources.items():
        share = data_frame[count_col] / data_frame[total_col]
        data_frame[new_col] = share.round(3)

## Add T-Stop Distance Data

In [14]:
# This function was written around the following source:
# https://towardsdatascience.com/create-new-column-based-on-other-columns-pandas-5586d87de73d

def min_dist(fn_lat, fn_lng, lat_data, lng_data):
    """Return the smallest straight-line distance from one point to a set of points.

    fn_lat, fn_lng:      coordinates of the point of interest
    lat_data, lng_data:  parallel sequences with the coordinates of the
                         candidate points (the T-stops)

    The distance is the plain Euclidean distance on the raw coordinate values,
    so the result is in the same units as the inputs (not a ground distance).

    The search starts from a cap of 90, a value well beyond anything that would
    be derived; 90 is returned when no candidate is closer than that or when no
    candidates are given.

    The candidates are now taken from the lat_data/lng_data arguments.  The
    previous loop bound used the global `latitudes` list and only worked when
    that global existed and had the same length as lat_data.
    """
    # Set a minimum distance well beyond anything that would be derived
    closest = 90

    # Find the minimum (euclidean) distance to every T-stop
    for stop_lat, stop_lng in zip(lat_data, lng_data):
        dist = ((fn_lat - stop_lat)**2 + (fn_lng - stop_lng)**2)**0.5

        # Store this distance if smaller than the current minimum
        if dist < closest:
            closest = dist

    return closest

In [15]:
def dist_ft_adder(data_frame):
    """Add a 'min_distance' column holding each listing's distance to its nearest T-stop.

    For every row, min_dist is called with the row's latitude/longitude and the
    module-level `latitudes` / `longitudes` lists.  The column is added in
    place; nothing is returned.
    """

    def _nearest_stop(row):
        return min_dist(row.latitude, row.longitude, latitudes, longitudes)

    # Create a new column with the minimum distance to any T-stop
    #  The following source was used to help write this code (note axis = 1 is KEY!):
        # https://towardsdatascience.com/create-new-column-based-on-other-columns-pandas-5586d87de73d
    data_frame['min_distance'] = data_frame.apply(_nearest_stop, axis = 1)

# Create Log Features

In [17]:
lstn._get_numeric_data().describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
host_since,3477.0,17138.849871,1079.936868,14216.0,16352.0,17151.0,18090.0,19422.0
host_response_rate,3477.0,97.487489,9.035327,0.0,100.0,100.0,100.0,100.0
host_acceptance_rate,3477.0,88.469083,20.483848,0.0,92.0,97.0,100.0,100.0
host_listings_count,3477.0,478.265746,1349.140443,1.0,2.0,12.0,84.0,4807.0
host_total_listings_count,3477.0,636.246189,1517.960227,1.0,3.0,17.0,127.0,5358.0
latitude,3477.0,42.337493,0.027111,42.2353,42.32187,42.34472,42.35431,42.39228
longitude,3477.0,-71.082429,0.033393,-71.173486,-71.10049,-71.07316,-71.06081,-70.996
accommodates,3477.0,3.181478,2.20618,1.0,2.0,2.0,4.0,16.0
bedrooms,3477.0,1.659477,1.155218,1.0,1.0,1.0,2.0,13.0
beds,3477.0,1.777107,1.427401,1.0,1.0,1.0,2.0,22.0


In [18]:
'''
The col_logger function will need to be brought in from the other notebooks.

This has been modified to include a zero imputation value n such that the transformation
occurs on log(n) and not log(0), which is undefined.
'''

def col_logger(data_column, zero_imp = 1):
    """Return the natural log of every value in data_column.

    Since log(0) is undefined, values equal to 0 are mapped to log(zero_imp)
    instead (log(1) = 0 with the default).  The result comes from
    data_column.apply, so it keeps the index of the input.
    """

    def _log_value(value):
        if value == 0:
            return np.log(zero_imp)
        return np.log(value)

    return data_column.apply(_log_value)

In [19]:
def log_ft_maker(data_frame):
    """Add a natural-log version (log_<col>) of every numeric column except latitude/longitude.

    Columns without values strictly between 0 and 1 are transformed with the
    default zero imputation of col_logger (0 -> log(1) = 0).

    Columns that do contain values between 0 and 1 would produce negative logs,
    so a 0 mapped to log(1) = 0 would rank above them.  For those columns the
    zero imputation value is exp(floor(log(m))), where m is the smallest
    positive value of the column.  This ensures any zero ends up at or below
    every positive value after the transformation.

    A column of that kind which also contains negative values cannot be
    transformed: a message is printed and the column is skipped.  (Previously
    this case used `break`, which silently abandoned all remaining columns too.)

    The new columns are added in place; nothing is returned.
    """
    # Create a list of numerical columns
    num_cols = list(data_frame._get_numeric_data().columns)

    # Remove latitude and longitude data as they were used previously to create distances
    num_cols.remove('latitude')
    num_cols.remove('longitude')

    for col in num_cols:
        column = data_frame[col]

        # Values strictly between 0 and 1
        fractional = column[(column > 0) & (column < 1)]

        if fractional.empty:
            # Simply use the default zero imputation value of 1
            data_frame[f'log_{col}'] = col_logger(column)
            continue

        if column.min() < 0:
            print('CANT LOGARITHM A NEGATIVE NUMBER')
            continue

        # The smallest positive value of the column.  At least one value lies in
        #  (0, 1) here, so this equals both the "second smallest unique value" when
        #  the minimum is 0 and the plain minimum otherwise.
        min_col_val = fractional.min()

        # Calculate a zero imputation value for use in the col_logger function:
        #  take the natural log of the minimum value and round down
        z_imp = np.exp(np.floor(np.log(min_col_val)))

        # Transform the column:
        data_frame[f'log_{col}'] = col_logger(column, z_imp)

In [21]:
lstn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3477 entries, 0 to 3476
Data columns (total 72 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   name                                              3477 non-null   object 
 1   description                                       3477 non-null   object 
 2   neighborhood_overview                             3477 non-null   object 
 3   host_since                                        3477 non-null   float64
 4   host_about                                        3477 non-null   object 
 5   host_response_time                                3477 non-null   object 
 6   host_response_rate                                3477 non-null   float64
 7   host_acceptance_rate                              3477 non-null   float64
 8   host_is_superhost                                 3477 non-null   object 
 9   host_listings_count

In [22]:
# Check for null values to verify proper feature creation
sum(lstn.isnull().sum())

0

# One Hot Encoding

In [23]:
def ohe_fn(data_frame):
    """Return a copy of data_frame with its categorical columns one hot encoded.

    The categorical columns are all non-numeric columns except the five free
    text columns, which are handled separately in advanced processing.
    Encoding is done with pd.get_dummies(drop_first=True), so the resulting
    columns depend on the categories present in the dataframe passed in.
    """
    # Everything that is not numeric is a candidate for encoding
    numeric_names = set(data_frame._get_numeric_data().columns)
    to_encode = [name for name in data_frame.columns if name not in numeric_names]

    # Some of these columns must be removed to be handled separately in advanced processing
    #  (list.remove raises ValueError if one of them is missing)
    for text_col in ('amenities', 'host_about', 'name', 'neighborhood_overview', 'description'):
        to_encode.remove(text_col)

    # Merge the dataframe with the one hot encoded values
    return pd.get_dummies(data_frame, columns = to_encode, drop_first=True)

In [25]:
lstn.shape

(3477, 160)

# Advanced Processing

## Amenities

In [26]:
# Prior to merging, add a prefix to the column names since it is text data and words tokenized from
#  one column could overwrite with those of another.

# Make a function to do this for all dataframes.

def col_renamer(df, prefix):
    """Prefix every column name of df with '<prefix>_', in place.  Returns nothing."""
    prefixed = []
    for label in df.columns:
        prefixed.append(f'{prefix}_{label}')

    df.columns = prefixed

In [27]:
def amentiy_count_maker(data_frame):
    """Add the number of amenities per listing and its log, in place.

    * amen_cnt:      how many quoted amenities the regex finds in `amenities`
    * log_amen_cnt:  log of amen_cnt, with a count of 0 mapped to
                     log(e^-1) = -1 (see col_logger)

    Nothing is returned.
    """
    # Use a regular expression to extract the amenities which are between quotes.
    #  Code adapted from this source: https://stackoverflow.com/questions/1454913/regular-expression-to-find-a-string-included-between-two-characters-while-exclud
    # Also helpful:  https://regex101.com/
    # Written as a raw string: '\]' in a normal string literal is an invalid escape
    #  sequence, which newer Python versions warn about.  The pattern is unchanged.
    amenity_pattern = re.compile(r'(?<=")[^"]+(?="[,\]])')

    # Find the number of amenities for every listing and store to a new column
    data_frame['amen_cnt'] = data_frame.amenities.apply(lambda text: len(amenity_pattern.findall(text)))

    # Create a log transformed column, setting the zero imputation value to e^-1
    data_frame['log_amen_cnt'] = col_logger(data_frame['amen_cnt'], np.exp(-1))

In [29]:
def amenity_maker(data_frame, training_data, min_count=35):
    """Return data_frame with one count column per frequent amenity appended.

    data_frame:     dataframe with an `amenities` column of list-like strings
                    (amenity names between double quotes)
    training_data:  True  -> build the vocabulary from data_frame, fit a
                             CountVectorizer and store it in the global
                             `cvec_amen`
                    otherwise -> reuse the already fitted global `cvec_amen`
    min_count:      an amenity is kept only if it occurs at least this many
                    times in the training data.  The default of 35 is roughly
                    1% of the 3477 training listings.

    The new columns are named 'amen_<amenity>'.  They carry the index of
    data_frame, so rows stay aligned whatever that index is.  (The count frame
    previously got a fresh 0..n-1 index and was outer-merged, which misaligns
    rows and adds NaN rows for any other input index.)
    """
    # Use a regular expression to extract the amenities which are between quotes.
    #  Code adapted from this source: https://stackoverflow.com/questions/1454913/regular-expression-to-find-a-string-included-between-two-characters-while-exclud
    # Also helpful:  https://regex101.com/
    # Raw string: '\]' is an invalid escape sequence in a normal string literal.
    regex_string = r'(?<=")[^"]+(?="[,\]])'

    if training_data == True:
        # Collect every amenity of every listing
        amn_lst = []
        for string_lists in data_frame.amenities:
            amn_lst.extend(re.findall(regex_string, string_lists))

        # Create a pandas series of all amenities and their number of occurrences
        amn_counts = pd.Series(amn_lst).value_counts(ascending=False)

        # Keep only the amenities that occur at least min_count times
        #  THIS IS REQUIRED GIVEN THAT min_df IS IGNORED BY COUNT VECTORIZER WITH CUSTOM DICTIONARIES
        #  Create a vocab variable by using the index attribute to get the list of amenities
        amn_vocab = list(amn_counts[amn_counts >= min_count].index)

        # Use CountVectorizer to one hot encode all the amenities
        #  Use the vocab to get only the amenities encoded
        #  NOTE:  Set the 'token_pattern' to the regex string so it finds the exact same tokens as were found previously
        globals()['cvec_amen'] = CountVectorizer(lowercase=False,
                                                 vocabulary=amn_vocab,
                                                 ngram_range=(1, 1),
                                                 token_pattern=regex_string,
                                                )
        amen_counts = globals()['cvec_amen'].fit_transform(data_frame.amenities)

    else:
        # Testing / validation data: only transform with the vectorizer fitted on the training data
        amen_counts = globals()['cvec_amen'].transform(data_frame.amenities)

    # Create a new dataframe with the count vectorized data from the amenities column,
    #  indexed like data_frame so the merge below lines the rows up
    amen_df = pd.DataFrame(amen_counts.toarray(),
                           columns = globals()['cvec_amen'].get_feature_names_out(),
                           index = data_frame.index)

    # Rename the columns of amen_df
    col_renamer(amen_df, 'amen')

    # Merge amen_df into the dataframe.  Both sides share the same index, so a
    #  left merge keeps every row of data_frame in its original order.
    return pd.merge(left = data_frame,
                    right = amen_df,
                    left_index=True, right_index=True,
                    how = 'left')

## Count Vectorize and Add Columns for Name, Description, Neighborhood Overview, and Host About

In [31]:
'''
Write a function that will:
* vectorize the columns
* create a new df for those columns
* rename those columns
* merge into lstn
* Store the count vectorizer objects so that .transform can be later 
run on testing/validation datasets
'''

def text_col_cvec(data_frame, training_data):
    """Return data_frame with count vectorized columns for the four free text columns.

    For each of name, description, host_about and neighborhood_overview:
    * training_data == True: a CountVectorizer (1 to 4 word n-grams,
      min_df=0.01) is fitted and stored in the global `cvec_<column>`
    * otherwise: the already fitted global `cvec_<column>` is reused
    * the counts become columns named '<column>_<token>' and are merged in

    The count columns carry the index of data_frame, so rows stay aligned
    whatever that index is.  (The count frame previously got a fresh 0..n-1
    index and was outer-merged, which misaligns rows and adds NaN rows for any
    other input index.)
    """
    # Provide all the columns that need to be word vectorized
    text_cols = ['name', 'description', 'host_about', 'neighborhood_overview']

    for col in text_cols:
        # Name under which the vectorizer is stored globally, so that
        #  .transform can later be run on testing/validation datasets
        cvec_name = f'cvec_{col}'
        column_data = data_frame[col]

        # Do fit_transform only if training_data = True
        if training_data == True:
            # Instantiate count vectorizer
            globals()[cvec_name] = CountVectorizer(ngram_range=(1, 4), min_df=0.01)
            token_counts = globals()[cvec_name].fit_transform(column_data)

        else:
            token_counts = globals()[cvec_name].transform(column_data)

        # Create a new dataframe with the count vectorized data from the selected column,
        #  indexed like data_frame so the merge below lines the rows up
        cvec_df = pd.DataFrame(token_counts.toarray(),
                               columns = globals()[cvec_name].get_feature_names_out(),
                               index = data_frame.index)

        # Rename the columns
        col_renamer(cvec_df, col)

        # Merge into the main dataframe.  Both sides share the same index, so a
        #  left merge keeps every row of data_frame in its original order.
        data_frame = pd.merge(left = data_frame,
                              right = cvec_df,
                              left_index=True, right_index=True,
                              how = 'left')

    return data_frame

In [46]:
# Import the Training Data
does_ist_work = pd.read_csv('../data/listings_train.csv')

In [47]:
col_dropper(does_ist_work)

In [48]:
data_fixer(does_ist_work)

In [49]:
data_imputer(does_ist_work)

In [50]:
simp_num_ft(does_ist_work)

In [51]:
dist_ft_adder(does_ist_work)

In [52]:
log_ft_maker(does_ist_work)

In [53]:
does_ist_work = ohe_fn(does_ist_work)

In [54]:
amentiy_count_maker(does_ist_work)

In [55]:
does_ist_work = amenity_maker(does_ist_work, True)

In [56]:
# Redefine the dataframe with all new columns
does_ist_work = text_col_cvec(does_ist_work, True)

In [60]:
# Count the cells that are element-wise equal between the step-by-step result and lstn.
#  The next cell computes rows * columns of lstn for comparison; equal numbers mean every cell matches.
sum(sum(np.array(does_ist_work == lstn)))

44943702

In [61]:
lstn.shape[0] * lstn.shape[1]

44943702

In [84]:
def master_formatter(data_frame, training_data):
    """Run the whole formatting pipeline and return the formatted dataframe.

    The column count is printed after every step.  The first six steps modify
    the dataframe that was passed in (they work in place); from ohe_fn onwards
    new dataframes are created, so the caller must use the returned value.

    training_data is handed to amenity_maker and text_col_cvec, which fit
    their vectorizers only when it is True.
    """

    def _report(stage, frame):
        print(f'{stage}: {frame.shape[1]}')

    _report('start', data_frame)

    # Steps that change data_frame in place and return nothing
    in_place_steps = (
        ('col_dropper', col_dropper),
        ('data_fixer', data_fixer),
        ('data_imputer', data_imputer),
        ('simp_num_ft', simp_num_ft),
        ('dist_ft_adder', dist_ft_adder),
        ('log_ft_maker', log_ft_maker),
    )
    for stage, step in in_place_steps:
        step(data_frame)
        _report(stage, data_frame)

    data_frame = ohe_fn(data_frame)
    _report('ohe_fn', data_frame)

    amentiy_count_maker(data_frame)
    _report('amentiy_count_maker', data_frame)

    data_frame = amenity_maker(data_frame, training_data)
    _report('amenity_maker', data_frame)

    data_frame = text_col_cvec(data_frame, training_data)
    _report('text_col_cvec', data_frame)

    return data_frame

In [85]:
# Import the Training Data
how_now = pd.read_csv('../data/listings_train.csv')

# training_data=True:  the vectorizers are fitted here and stored as globals for later use on test data.
#  Note that how_now itself is modified by the in-place steps of master_formatter.
about_now = master_formatter(how_now, True)

start: 75
col_dropper: 41
data_fixer: 41
data_imputer: 41
simp_num_ft: 44
dist_ft_adder: 45
log_ft_maker: 72
ohe_fn: 160
amentiy_count_maker: 162
amenity_maker: 315
text_col_cvec: 12926


In [73]:
sum(sum(np.array(about_now == lstn)))

44943702

In [86]:
# training_data=False:  reuses the vectorizers fitted by the training run above.
#  NOTE: ohe_fn encodes the test frame on its own, so the printed column count after ohe_fn
#  (127) differs from the training run (160) and the final frames do not have the same columns.
testing = pd.read_csv('../data/listings_test.csv')
testing_now = master_formatter(testing, False)

start: 75
col_dropper: 41
data_fixer: 41
data_imputer: 41
simp_num_ft: 44
dist_ft_adder: 45
log_ft_maker: 72
ohe_fn: 127
amentiy_count_maker: 129
amenity_maker: 282
text_col_cvec: 12893


In [75]:
# NOTE(review): master_formatter modifies `testing` in place (col_dropper drops columns with inplace=True),
#  so calling it again on an already formatted `testing` raises a KeyError.  Re-read the CSV first.
testing_now = master_formatter(testing, False)

In [76]:
testing_now.shape

(387, 12893)

In [None]:
testing_now

---
---


In [165]:
def ohe_fn_2(data_frame, training_data):
    """Return data_frame with its categorical columns replaced by one hot encoded columns.

    Unlike pd.get_dummies, the OneHotEncoder fitted on the training data is
    kept (in the global `ohe_inst`) and reused for testing/validation data, so
    every dataset gets the same encoded columns.

    training_data == True: a new encoder is created, fitted and stored.
    Otherwise the stored encoder is used to transform only.  (A new, unfitted
    encoder was previously created on every call, so the transform-only branch
    could never work.)

    The encoded columns carry the index of data_frame so the rows stay aligned
    in the merge.  The input dataframe is no longer modified.

    NOTE(review): the encoder uses scikit-learn's default handle_unknown
    ('error'), so a category that was not seen during fitting makes transform
    raise.  Decide whether 'ignore' is wanted.

    Raises RuntimeError if asked to transform before an encoder was fitted.
    """
    # Find the remaining categorical columns: everything that is not numeric
    numeric_names = set(data_frame._get_numeric_data().columns)
    cat_cols = [name for name in data_frame.columns if name not in numeric_names]

    # Some of these columns must be removed to be handled separately in advanced processing
    remove_cols = ['amenities', 'host_about', 'name', 'neighborhood_overview', 'description']

    for col in remove_cols:
        cat_cols.remove(col)

    # Create a new temporary dataframe with just the columns that need to be one hot encoded
    #  otherwise, it will try to OHE the whole thing...
    data_frame_temp = data_frame[cat_cols]

    if training_data == True:
        # The instance of OneHotEncoder must be stored as a global variable so that it
        #  can be reused on other datasets.  Only (re)create it when fitting.
        globals()['ohe_inst'] = OneHotEncoder(sparse_output = False, drop = 'first')
        encoder = globals()['ohe_inst']

        # Create the ohe array from the temporary dataframe (fitted and transformed)
        ohe_array = encoder.fit_transform(data_frame_temp)

    else:
        encoder = globals().get('ohe_inst')
        if encoder is None:
            raise RuntimeError('ohe_fn_2 must be run on the training data (training_data=True) '
                               'before it can transform other data')

        # Create the ohe array from the temporary dataframe (transformed only)
        ohe_array = encoder.transform(data_frame_temp)

    # Make a dataframe from the array with the proper columns, indexed like data_frame
    ohe_array_df = pd.DataFrame(ohe_array,
                                columns = encoder.get_feature_names_out(),
                                index = data_frame.index)

    # Merge the OHE data with the dataframe without the original categorical columns.
    #  Both sides share the same index, so a left merge keeps every row in its original order.
    return pd.merge(left = data_frame.drop(columns=cat_cols),
                    right = ohe_array_df,
                    left_index=True, right_index=True,
                    how = 'left')

In [164]:
def master_formatter_2(data_frame, training_data):
    """Run the formatting pipeline with the OneHotEncoder based ohe_fn_2 and return the result.

    Same sequence as master_formatter, except that the categorical columns are
    encoded by ohe_fn_2.  The column count is printed after every step.

    training_data is passed on to ohe_fn_2, amenity_maker and text_col_cvec so
    that encoders/vectorizers are fitted only on the training data.  (ohe_fn_2
    was previously always called with True, which refitted the encoder on
    testing data as well.)

    The first six steps modify the dataframe that was passed in; use the
    returned dataframe for everything that follows.
    """
    print(f'start: {data_frame.shape[1]}')

    col_dropper(data_frame)
    print(f'col_dropper: {data_frame.shape[1]}')

    data_fixer(data_frame)
    print(f'data_fixer: {data_frame.shape[1]}')

    data_imputer(data_frame)
    print(f'data_imputer: {data_frame.shape[1]}')

    simp_num_ft(data_frame)
    print(f'simp_num_ft: {data_frame.shape[1]}')

    dist_ft_adder(data_frame)
    print(f'dist_ft_adder: {data_frame.shape[1]}')

    log_ft_maker(data_frame)
    print(f'log_ft_maker: {data_frame.shape[1]}')

    # Fit the encoder only on training data; otherwise reuse the fitted one
    data_frame = ohe_fn_2(data_frame, training_data)
    print(f'ohe_fn: {data_frame.shape[1]}')

    amentiy_count_maker(data_frame)
    print(f'amentiy_count_maker: {data_frame.shape[1]}')

    data_frame = amenity_maker(data_frame, training_data)
    print(f'amenity_maker: {data_frame.shape[1]}')

    data_frame = text_col_cvec(data_frame, training_data)
    print(f'text_col_cvec: {data_frame.shape[1]}')

    return data_frame

In [166]:
new_new = pd.read_csv('../data/listings_train.csv')

new_new = master_formatter_2(new_new, True)

new_new.shape

start: 75
col_dropper: 41
data_fixer: 41
data_imputer: 41
simp_num_ft: 44
dist_ft_adder: 45
log_ft_maker: 72
ohe_fn: 160
amentiy_count_maker: 162
amenity_maker: 315
text_col_cvec: 12926


(3477, 12926)

In [162]:
new_new = ohe_fn_2(new_new, True)

In [167]:
new_new.shape

(3477, 12926)

In [171]:
sum(sum(np.array(new_new == lstn)))

44943702

In [172]:
3477*12926

44943702

In [None]:
def ohe_fn_2(data_frame):
    """Return a copy of data_frame with its categorical columns one hot encoded via pd.get_dummies.

    NOTE(review): this one-argument definition has the same body as ohe_fn and
    the same name as the two-argument ohe_fn_2 defined earlier in this
    notebook.  Running this cell replaces that version, after which
    master_formatter_2 (which calls ohe_fn_2 with two arguments) fails.
    """
    # Everything that is not numeric is a candidate for encoding
    numeric_names = set(data_frame._get_numeric_data().columns)
    to_encode = [name for name in data_frame.columns if name not in numeric_names]

    # Some of these columns must be removed to be handled separately in advanced processing
    #  (list.remove raises ValueError if one of them is missing)
    for text_col in ('amenities', 'host_about', 'name', 'neighborhood_overview', 'description'):
        to_encode.remove(text_col)

    # Merge the dataframe with the one hot encoded values
    return pd.get_dummies(data_frame, columns = to_encode, drop_first=True)

In [142]:
len(lstn.columns[61:160])

99

In [146]:
# Count how many of the 99 encoded column names in lstn match the names produced by the encoder.
#  NOTE(review): `ohe_new` is not defined in any visible cell - presumably an earlier
#  OneHotEncoder instance; confirm before re-running.
sum(lstn.columns[61:160] == ohe_new.get_feature_names_out())

99

---
---
# Junk

In [33]:
lstn.shape

(3477, 12926)

In [34]:
lstn.to_csv('../data/try_hard.csv')

In [35]:
lstn2 = pd.read_csv('../data/try_hard2.csv')

In [36]:
lstn_og = pd.read_csv('../data/try_hard.csv')

In [37]:
lstn_og.shape, lstn2.shape

((3477, 12927), (3477, 12927))

In [38]:
sum(lstn_og.columns == lstn2.columns)

12927

In [39]:
len(sum(np.array(lstn_og == lstn2)))

12927

In [40]:
sum(sum(np.array(lstn2 == lstn2)))

44947179

In [41]:
3477 * 12927

44947179

In [42]:
44947179 - 44943702

3477

In [43]:
truth_df = lstn_og == lstn2

In [44]:
sum(truth_df.name)

3477

In [45]:
# Print every column in which at least one row differs between lstn_og and lstn2
#  (3477 is the number of rows, so a fully matching column sums to 3477).
for col in truth_df.columns:
    if sum(truth_df[col]) != 3477:
        print(col)