In [33]:
# import libraries 
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
from matplotlib import colorbar
import seaborn as sns
%matplotlib inline

import nltk
from nltk import WordNetLemmatizer, pos_tag 
from nltk.corpus import stopwords, wordnet 

from nltk.tokenize import word_tokenize


import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

np.random_state = 42

## Load in Dataset

In [34]:
sd_listings = pd.read_csv('data/sd_listings', index_col= 0)
sd_listings.head(3)

Unnamed: 0,id,listing_url,latitude,longitude,zipcode,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,nightly_price,price_per_stay,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,host_id,host_url,host_since,host_about,host_response_time,host_response_rate,host_is_superhost,neighbourhood_cleansed,host_total_listings_count,host_has_profile_pic,host_identity_verified,number_of_reviews,number_of_stays,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
0,33159143,https://www.airbnb.com/rooms/33159143,32.91736,-117.07635,92131.0,"PRIVATE ROOM, SAFE NEIGHBORHOOD IN SCRIPPS RANCH",80% of the house + yourCozy room ... in Scripp...,"Full bed, private bath...",80% of the house + yourCozy room ... in Scripp...,The neighborhood is a typical Southern Califor...,Have alarm system if you choose to use... I wi...,You can park in the drive way or plenty of spa...,"Kitchen, Dining room, Living room, Very Comfor...","I lived in San Diego 40+ years, love to share ...",#NAME?,House,Private room,1,1.0,1.0,1.0,Real Bed,"{Wifi,Kitchen,""Free parking on premises"",Heati...",1100.0,1100.0,1000.0,0.0,1.0,0.0,60.0,90.0,249661563.0,https://www.airbnb.com/users/show/249661563,3/18/19,Retired businessman of 30 years in this commun...,within a few hours,100.0,0.0,Scripps Ranch,1.0,1.0,0.0,0.0,0.0,98.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,1.0,0.0,strict_14_with_grace_period,0.0,0.0
1,17138468,https://www.airbnb.com/rooms/17138468,32.84067,-117.27443,92037.0,NOT AVAILABLE,AVAILABLE ONLY IN WINTER PRIME La Jolla Villag...,,AVAILABLE ONLY IN WINTER PRIME La Jolla Villag...,,,,,,,Apartment,Entire home/apt,1,2.0,2.0,3.0,Real Bed,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",1400.0,1400.0,,,1.0,0.0,30.0,1125.0,79755951.0,https://www.airbnb.com/users/show/79755951,6/24/16,Quiet and considerate.,,0.0,0.0,La Jolla,1.0,1.0,0.0,2.0,4.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,1.0,0.0,strict_14_with_grace_period,0.0,0.0
2,21898446,https://www.airbnb.com/rooms/21898446,32.79797,-117.2425,92109.0,Townhome in Pacific Beach,Hi! We are basically renting this master for a...,,Hi! We are basically renting this master for a...,,,,,,,Townhouse,Private room,1,1.0,1.0,1.0,Real Bed,"{TV,Wifi,Kitchen,""Free parking on premises"",""P...",1250.0,1250.0,,,1.0,0.0,30.0,30.0,159773487.0,https://www.airbnb.com/users/show/159773487,11/22/17,,,0.0,0.0,Pacific Beach,1.0,1.0,0.0,0.0,0.0,97.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,1.0,0.0,flexible,0.0,0.0


#### Split dataframe into text columns only

In [35]:
sd_listings.columns

Index(['id', 'listing_url', 'latitude', 'longitude', 'zipcode', 'name',
       'summary', 'space', 'description', 'neighborhood_overview', 'notes',
       'transit', 'access', 'interaction', 'house_rules', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'bed_type', 'amenities', 'nightly_price', 'price_per_stay',
       'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
       'minimum_nights', 'maximum_nights', 'host_id', 'host_url', 'host_since',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_is_superhost', 'neighbourhood_cleansed',
       'host_total_listings_count', 'host_has_profile_pic',
       'host_identity_verified', 'number_of_reviews', 'number_of_stays',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'requires_license',

In [36]:
# get text columns
sd_text = sd_listings[['id', 'listing_url', 'name','summary', 'space', 'description', 
                       'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 
                       'house_rules', 'amenities', 'host_about']]

sd_text.head(3)

Unnamed: 0,id,listing_url,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,amenities,host_about
0,33159143,https://www.airbnb.com/rooms/33159143,"PRIVATE ROOM, SAFE NEIGHBORHOOD IN SCRIPPS RANCH",80% of the house + yourCozy room ... in Scripp...,"Full bed, private bath...",80% of the house + yourCozy room ... in Scripp...,The neighborhood is a typical Southern Califor...,Have alarm system if you choose to use... I wi...,You can park in the drive way or plenty of spa...,"Kitchen, Dining room, Living room, Very Comfor...","I lived in San Diego 40+ years, love to share ...",#NAME?,"{Wifi,Kitchen,""Free parking on premises"",Heati...",Retired businessman of 30 years in this commun...
1,17138468,https://www.airbnb.com/rooms/17138468,NOT AVAILABLE,AVAILABLE ONLY IN WINTER PRIME La Jolla Villag...,,AVAILABLE ONLY IN WINTER PRIME La Jolla Villag...,,,,,,,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",Quiet and considerate.
2,21898446,https://www.airbnb.com/rooms/21898446,Townhome in Pacific Beach,Hi! We are basically renting this master for a...,,Hi! We are basically renting this master for a...,,,,,,,"{TV,Wifi,Kitchen,""Free parking on premises"",""P...",


In [37]:
sd_text.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13052 entries, 0 to 13050
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     13052 non-null  object
 1   listing_url            13052 non-null  object
 2   name                   13052 non-null  object
 3   summary                12636 non-null  object
 4   space                  10081 non-null  object
 5   description            12837 non-null  object
 6   neighborhood_overview  9139 non-null   object
 7   notes                  7104 non-null   object
 8   transit                8262 non-null   object
 9   access                 8225 non-null   object
 10  interaction            8667 non-null   object
 11  house_rules            9588 non-null   object
 12  amenities              13051 non-null  object
 13  host_about             8990 non-null   object
dtypes: object(14)
memory usage: 1.5+ MB


## Normalizing and Tokenizing

In [38]:
# import stop words
stop_words = stopwords.words('english')

### Helper function to tokenize columns

In [39]:
# takes in untokenized document and returns fully normalized token list
def process_doc(doc):

    #initialize lemmatizer
    wnl = WordNetLemmatizer()

    # helper function to change nltk's part of speech tagging to a wordnet format.
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
        
    # remove stop words and punctuations, then lower case
    doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok not in stop_words)) ]

    #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize
    
    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
    doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]
    
    return doc_norm

In [41]:
sd_text.head(5)

Unnamed: 0,id,listing_url,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,amenities,host_about
0,33159143,https://www.airbnb.com/rooms/33159143,"PRIVATE ROOM, SAFE NEIGHBORHOOD IN SCRIPPS RANCH",80% of the house + yourCozy room ... in Scripp...,"Full bed, private bath...",80% of the house + yourCozy room ... in Scripp...,The neighborhood is a typical Southern Califor...,Have alarm system if you choose to use... I wi...,You can park in the drive way or plenty of spa...,"Kitchen, Dining room, Living room, Very Comfor...","I lived in San Diego 40+ years, love to share ...",#NAME?,"{Wifi,Kitchen,""Free parking on premises"",Heati...",Retired businessman of 30 years in this commun...
1,17138468,https://www.airbnb.com/rooms/17138468,NOT AVAILABLE,AVAILABLE ONLY IN WINTER PRIME La Jolla Villag...,,AVAILABLE ONLY IN WINTER PRIME La Jolla Villag...,,,,,,,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",Quiet and considerate.
2,21898446,https://www.airbnb.com/rooms/21898446,Townhome in Pacific Beach,Hi! We are basically renting this master for a...,,Hi! We are basically renting this master for a...,,,,,,,"{TV,Wifi,Kitchen,""Free parking on premises"",""P...",
3,25948680,https://www.airbnb.com/rooms/25948680,Spacious furnished 1 BR with tons of amenaties,"All my stuff will be gone. Dishwasher, washer/...",Lobby provides free coffee. Building doors loc...,"All my stuff will be gone. Dishwasher, washer/...",College area,,Across the street from the green line trolly. ...,Everything except second bedroom in apartment....,,,"{Wifi,""Air conditioning"",Pool,Kitchen,""Free pa...",
4,1756516,https://www.airbnb.com/rooms/1756516,A Spacious luxury retreat,"NIcely furnished. Great location, 2 blocks to ...",,"NIcely furnished. Great location, 2 blocks to ...",coastal town great travel destination.,,Bus line near by.,,Call/text business hours.,No pets. Clean and healthy.,"{TV,Wifi,Kitchen,""Free parking on premises"",El...","Life is too short, enjoy it.....\r\n"


In [42]:
# fill in NaNs with empty text fields
sd_text = sd_text.fillna(' ')
sd_text.head(3)

Unnamed: 0,id,listing_url,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,amenities,host_about
0,33159143,https://www.airbnb.com/rooms/33159143,"PRIVATE ROOM, SAFE NEIGHBORHOOD IN SCRIPPS RANCH",80% of the house + yourCozy room ... in Scripp...,"Full bed, private bath...",80% of the house + yourCozy room ... in Scripp...,The neighborhood is a typical Southern Califor...,Have alarm system if you choose to use... I wi...,You can park in the drive way or plenty of spa...,"Kitchen, Dining room, Living room, Very Comfor...","I lived in San Diego 40+ years, love to share ...",#NAME?,"{Wifi,Kitchen,""Free parking on premises"",Heati...",Retired businessman of 30 years in this commun...
1,17138468,https://www.airbnb.com/rooms/17138468,NOT AVAILABLE,AVAILABLE ONLY IN WINTER PRIME La Jolla Villag...,,AVAILABLE ONLY IN WINTER PRIME La Jolla Villag...,,,,,,,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",Quiet and considerate.
2,21898446,https://www.airbnb.com/rooms/21898446,Townhome in Pacific Beach,Hi! We are basically renting this master for a...,,Hi! We are basically renting this master for a...,,,,,,,"{TV,Wifi,Kitchen,""Free parking on premises"",""P...",


In [43]:
sd_text.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13052 entries, 0 to 13050
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     13052 non-null  object
 1   listing_url            13052 non-null  object
 2   name                   13052 non-null  object
 3   summary                13052 non-null  object
 4   space                  13052 non-null  object
 5   description            13052 non-null  object
 6   neighborhood_overview  13052 non-null  object
 7   notes                  13052 non-null  object
 8   transit                13052 non-null  object
 9   access                 13052 non-null  object
 10  interaction            13052 non-null  object
 11  house_rules            13052 non-null  object
 12  amenities              13052 non-null  object
 13  host_about             13052 non-null  object
dtypes: object(14)
memory usage: 1.5+ MB


### Apply tokenization to entire dataframe

In [170]:
def tokenize_df(df):
    """
    Helper function tokenizes each column in the sd_text dataframe.
    Uses the process_doc helper function to tokenize and normalize each column into string values
    """
    # get list of column names 
    col_names = list(sd_text.columns)
    # just get the relevant text columns
    col_names = col_names[2:]
    
    # create an empty dataframe
    tokenized_df = pd.DataFrame()

    # tokenize each column 
    for col in col_names:
        
        # use helper function to get a series of tokenized text for each column, and flatten the lists into tokens
        tokenized_series = df[col].apply(process_doc).apply(" ".join)
        
        # convert each col into a df
        tokenized_col = tokenized_series.to_frame().transpose()
        
        # add to overall df
        tokenized_df = tokenized_df.append(tokenized_col)
    
    # pivot the rows and columns 
    tokenized_df = tokenized_df.transpose()
    
    # merge the id, listing_url to the tokenized columns
    tokenized_df = df.iloc[:,[0,1]].join(tokenized_df)

    return tokenized_df

In [171]:
tokenize_df(sd_text).head()

Unnamed: 0,id,listing_url,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,amenities,host_about
0,33159143,https://www.airbnb.com/rooms/33159143,private room safe neighborhood scripps ranch,house yourcozy room scripps ranch decor safe f...,full bed private bath,house yourcozy room scripps ranch decor safe f...,neighborhood typical southern california spani...,have alarm system choose use i supply butane b...,park drive way plenty space next house,kitchen din room living room very comfortable ...,i live san diego year love share neat place ci...,name,wifi kitchen free parking premise heat dryer s...,retire businessman year community be se asia i...
1,17138468,https://www.airbnb.com/rooms/17138468,not available,available only winter prime la jolla village l...,,available only winter prime la jolla village l...,,,,,,,tv wifi air condition kitchen free parking pre...,quiet considerate
2,21898446,https://www.airbnb.com/rooms/21898446,townhome pacific beach,hi basically rent master month move date room ...,,hi basically rent master month move date room ...,,,,,,,tv wifi kitchen free parking premise pet allow...,
3,25948680,https://www.airbnb.com/rooms/25948680,spacious furnish br ton amenaties,stuff go dishwasher unit small porch closet re...,lobby provide free coffee building door lock n...,stuff go dishwasher unit small porch closet re...,college area,,street green line trolly stop sdsu freeway ent...,everything second bedroom apartment vacant lock,,,wifi air condition pool kitchen free parking p...,
4,1756516,https://www.airbnb.com/rooms/1756516,spacious luxury retreat,nicely furnished great location block beach be...,,nicely furnished great location block beach be...,coastal town great travel destination,,bus line,,business hour,pet clean healthy,tv wifi kitchen free parking premise elevator ...,life short enjoy
