# Airbnb Text Mining

In [2]:
import pandas as pd
import numpy as np
import spacy
from string import punctuation

## Slice all text columns from the listing.csv file.

In [23]:
listings = pd.read_csv('listings_cleaned_20230303.csv')

In [24]:
text_features= listings[['id','name','description','neighborhood_overview','host_about','price']]
text_features

Unnamed: 0,id,name,description,neighborhood_overview,host_about,price
0,183319.0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse with ocean view from...,About as close to the beach as you can live in...,Serious yoga practice. Been studying for 25 y...,152.0
1,51307.0,Spanish Bungalow Guest House LA CA. 30 plus ni...,"PRIVATE GUEST HOUSE The space Private, Guest h...","Local LA Community , shops and restaurants in...","male , educated . Healthcare Professional\r\nC...",75.0
2,184314.0,Boho Chic Flat..Steps to Beach!,"Bright, airy, quiet 1 bdr located just steps f...",You are in one of the beach locations in Los A...,Health-Wellness Professional. Author. Passiona...,125.0
3,51498.0,Guest House With Its Own Entrance/Exit and Hot...,"Fully self-contained, separate structure, with...",We are close to Venice without the hassle of n...,Easy going hostess!! Enjoy your stay. \n\nI am...,189.0
4,109.0,Amazing bright elegant condo park front UPGRADED,"Unit upgraded with new bamboo flooring, brand...",,"Paolo Privitera, CEO Evensi\n\nPaolo, MIT MBA ...",115.0
...,...,...,...,...,...,...
45209,26665862.0,"Queen Bed, Foam Mattress",Queen bed with a soft foam mattress in a priva...,"-Near Fairview Regional Park with hiking, biki...",,60.0
45210,26898605.0,SANITIZED Modern Zen Getaway | Relax & Replenish,Are you more than ready for a Local Getaway? N...,We are conveniently located just minutes from ...,Hi! My name is Isabella and I am a first gener...,104.0
45211,26895703.0,"Simple, Clean, Modern One Bedroom Apartment",Welcome to Anaheim! This one bedroom suite ha...,We are in a safe and happy area. The street m...,It is nice finding that place where you can ju...,134.0
45212,26954426.0,Oak Park House,"Good location, new carpet, kitchen and floorin...","Close to Brookside elementary, Medea creek and...",,300.0


In [4]:
text_features.shape

(45214, 6)

In [5]:
text_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45214 entries, 0 to 45213
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     45214 non-null  float64
 1   name                   45212 non-null  object 
 2   description            44403 non-null  object 
 3   neighborhood_overview  26565 non-null  object 
 4   host_about             26256 non-null  object 
 5   price                  45214 non-null  float64
dtypes: float64(2), object(4)
memory usage: 2.1+ MB


In [6]:
text_features.isnull().sum()

id                           0
name                         2
description                811
neighborhood_overview    18649
host_about               18958
price                        0
dtype: int64

In [7]:
text_features.head(10)

Unnamed: 0,id,name,description,neighborhood_overview,host_about,price
0,183319.0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse with ocean view from...,About as close to the beach as you can live in...,Serious yoga practice. Been studying for 25 y...,152.0
1,51307.0,Spanish Bungalow Guest House LA CA. 30 plus ni...,"PRIVATE GUEST HOUSE The space Private, Guest h...","Local LA Community , shops and restaurants in...","male , educated . Healthcare Professional\r\nC...",75.0
2,184314.0,Boho Chic Flat..Steps to Beach!,"Bright, airy, quiet 1 bdr located just steps f...",You are in one of the beach locations in Los A...,Health-Wellness Professional. Author. Passiona...,125.0
3,51498.0,Guest House With Its Own Entrance/Exit and Hot...,"Fully self-contained, separate structure, with...",We are close to Venice without the hassle of n...,Easy going hostess!! Enjoy your stay. \n\nI am...,189.0
4,109.0,Amazing bright elegant condo park front UPGRADED,"Unit upgraded with new bamboo flooring, brand...",,"Paolo Privitera, CEO Evensi\n\nPaolo, MIT MBA ...",115.0
5,185536.0,Lovely Private rm in Home to FEMALE Red Room,"The space Hello,, Ladies!!! I offer you Privac...",,Laura..seamstress in L.A. Ca,85.0
6,2708.0,Runyon Canyon | Beau Furn Mirror Mini-Suite Fi...,"Run Runyon Canyon, Our Gym & Sauna Open Beaut...","Walk and run to Runyon Canyon, it is open! We ...",Writer.\r\nLiterary Manager.\r\nPhotographer.\...,93.0
7,51546.0,Cool Pad Under the Hollywood Sign,"Please note, after January 2, 2020 we can onl...","Beachwood Canyon is peaceful, natural, and lov...","I'm a pretty happy person, I love music, garde...",100.0
8,185557.0,"Private Room in Home to FEMALE, pls Purple Room","The space Hello, Ladies! I am a female homeown...","Quiet environment..Drama- Free, Privacy and Re...",Laura..seamstress in L.A. Ca,85.0
9,2732.0,Zen Life at the Beach,An oasis of tranquility awaits you. The space ...,"This is the best part of Santa Monica. Quiet, ...",I have been teaching yoga and meditation for 3...,179.0


In [8]:
#save text columns to csv file
text_features.to_csv('text_features.csv')

## Understanding text columns and dealing with null values

In [5]:
text_columns= pd.read_csv('text_features.csv')
text_columns.head()

Unnamed: 0,id,name,description,neighborhood_overview,host_about,price
0,183319.0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse with ocean view from...,About as close to the beach as you can live in...,Serious yoga practice. Been studying for 25 y...,152.0
1,51307.0,Spanish Bungalow Guest House LA CA. 30 plus ni...,"PRIVATE GUEST HOUSE The space Private, Guest h...","Local LA Community , shops and restaurants in...","male , educated . Healthcare Professional\r\nC...",75.0
2,184314.0,Boho Chic Flat..Steps to Beach!,"Bright, airy, quiet 1 bdr located just steps f...",You are in one of the beach locations in Los A...,Health-Wellness Professional. Author. Passiona...,125.0
3,51498.0,Guest House With Its Own Entrance/Exit and Hot...,"Fully self-contained, separate structure, with...",We are close to Venice without the hassle of n...,Easy going hostess!! Enjoy your stay. \n\nI am...,189.0
4,109.0,Amazing bright elegant condo park front UPGRADED,"Unit upgraded with new bamboo flooring, brand...",,"Paolo Privitera, CEO Evensi\n\nPaolo, MIT MBA ...",115.0


In [6]:
text_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45214 entries, 0 to 45213
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     45214 non-null  float64
 1   name                   45212 non-null  object 
 2   description            44403 non-null  object 
 3   neighborhood_overview  26565 non-null  object 
 4   host_about             26256 non-null  object 
 5   price                  45214 non-null  float64
dtypes: float64(2), object(4)
memory usage: 2.1+ MB


In [7]:
text_columns.isnull().sum()

id                           0
name                         2
description                811
neighborhood_overview    18649
host_about               18958
price                        0
dtype: int64

In [23]:
## I replaced all null values with 'NULL'
text_columns= text_columns.fillna('NULL')

In [24]:
text_columns.isnull().sum()

Unnamed: 0               0
id                       0
name                     0
description              0
neighborhood_overview    0
host_about               0
price                    0
dtype: int64

## Count tokens and sentences using Spacy 
Note: This part of the code counts tokens and sentences in each text records

Each count is recorded to dataset as:
  
(column name)_TC for Token Count    
(column name)_SC for Sentence Count     

In [26]:
nlp = spacy.load("en_core_web_sm")

def countToken(text):
  doc = nlp(text) # Create a Doc object
  tokens = [token.text for token in doc]
  return len(tokens)

def countSentence(text):
  doc = nlp(text) # Create a Doc object
  sentences = [sentence for sentence in doc.sents]
  #sentences = list(doc.sents) #alternative
  return len(sentences)

In [37]:
text_columns['name_TC']= text_columns['name'].map(countToken)
text_columns['name_SC']= text_columns['name'].map(countSentence)

In [41]:
text_columns['description_TC']= text_columns['description'].map(countToken)
text_columns['description_SC']= text_columns['description'].map(countSentence)

In [45]:
text_columns['neighborhood_overview_TC']= text_columns['neighborhood_overview'].map(countToken)
text_columns['neighborhood_overview_SC']= text_columns['neighborhood_overview'].map(countSentence)

In [48]:
text_columns['host_about_TC']= text_columns['host_about'].map(countToken)
text_columns['host_about_SC']= text_columns['host_about'].map(countSentence)

In [49]:
text_columns.head()

Unnamed: 0.1,Unnamed: 0,id,name,description,neighborhood_overview,host_about,price,name_TC,name_SC,description_TC,description_SC,neighborhood_overview_TC,neighborhood_overview_SC,host_about_TC,host_about_SC
0,0,183319.0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse with ocean view from...,About as close to the beach as you can live in...,Serious yoga practice. Been studying for 25 y...,152.0,5,1,163,13,19,1,63,5
1,1,51307.0,Spanish Bungalow Guest House LA CA. 30 plus ni...,"PRIVATE GUEST HOUSE The space Private, Guest h...","Local LA Community , shops and restaurants in...","male , educated . Healthcare Professional\r\nC...",75.0,10,2,203,17,24,2,65,4
2,2,184314.0,Boho Chic Flat..Steps to Beach!,"Bright, airy, quiet 1 bdr located just steps f...",You are in one of the beach locations in Los A...,Health-Wellness Professional. Author. Passiona...,125.0,8,1,185,15,51,3,14,3
3,3,51498.0,Guest House With Its Own Entrance/Exit and Hot...,"Fully self-contained, separate structure, with...",We are close to Venice without the hassle of n...,Easy going hostess!! Enjoy your stay. \n\nI am...,189.0,11,1,195,9,13,1,102,5
4,4,109.0,Amazing bright elegant condo park front UPGRADED,"Unit upgraded with new bamboo flooring, brand...",,"Paolo Privitera, CEO Evensi\n\nPaolo, MIT MBA ...",115.0,7,1,178,11,1,1,301,10


## Preprocessing  

In [50]:
#Peprocessing
import spacy
nlp = spacy.load("en_core_web_sm")
import re
from string import punctuation
stop_words = nlp.Defaults.stop_words

For Preprocessing, these were were done:

transform all text into lower case  
replace all HTML tags  
remove all hyperlinks  
replace non-alphanumerics with space  
remove single characters  
replace multiple spaces with one space  
remove any repeated characters like hellooo  

In [57]:
def preprocess_text(text):
  text = text.lower() # Lowercase text
  text = re.sub("<[^>]*>", " ", text) # Replace HTML tags with space - this needs to be before removing special characters
  text = re.sub(r"https?://\S+", "", text) # Remove hyperlinks
  text = re.sub(r"\W"," ",text) #replace non-alphanumerics with space
  text = re.sub(r"\s+[a-z]\s+"," ", text) #remove single character
  text = re.sub(r"\s+"," ",text)  #replace multiple spaces with one space
  text = re.sub(r'(.)\1{3,}',r'\1', text) # Remove repeated characters like hellooo

  # text = re.sub(r"[!#$+-@\']/g","", text) # remove special characters and punctuations
  # text = re.sub(f"[{re.escape(punctuation)}]", "", text)  # Remove punctuation
  # text = re.sub(r"[^A-Za-z0-9\s]+", " ", text) # Replace special characters and punctuation with space
  # text = " ".join(text.split())  # Remove extra spaces, tabs, and new lines

  # In regular expressions, \b anchors the regex at a word boundary or the position between a word and a non-word character, or vice versa.
  # text = re.sub(r"\b\d+\b","", text) # remove digits

  text = remove_stopwords(text)
  text = lemmatize(text)
  return text

# Remove stop words using spaCy
def remove_stopwords(text):
    text_tokens = nlp.tokenizer(text)
    #print (type(text_tokens))
    tokens_without_sw = [t.text for t in text_tokens if not t.text in stop_words]
    #print (type(tokens_without_sw))
    text = " ".join(tokens_without_sw)
    return text

In [58]:
text_columns['cleaned_name']= text_columns['name'].map(preprocess_text)

In [61]:
text_columns['cleaned_description']= text_columns['description'].map(preprocess_text)

In [62]:
text_columns['cleaned_neighborhood_overview']= text_columns['neighborhood_overview'].map(preprocess_text)

In [63]:
text_columns['cleaned_host_about']= text_columns['host_about'].map(preprocess_text)

In [64]:
text_columns.head()

Unnamed: 0.1,Unnamed: 0,id,name,description,neighborhood_overview,host_about,price,name_TC,name_SC,description_TC,description_SC,neighborhood_overview_TC,neighborhood_overview_SC,host_about_TC,host_about_SC,cleaned_name,cleaned_description,cleaned_neighborhood_overview,cleaned_host_about
0,0,183319.0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse with ocean view from...,About as close to the beach as you can live in...,Serious yoga practice. Been studying for 25 y...,152.0,5,1,163,13,19,1,63,5,panoramic ocean view venice beach,craftsman style penthouse ocean view room park...,close beach live venice boardwalk,yoga practice study 25 year spend month thaila...
1,1,51307.0,Spanish Bungalow Guest House LA CA. 30 plus ni...,"PRIVATE GUEST HOUSE The space Private, Guest h...","Local LA Community , shops and restaurants in...","male , educated . Healthcare Professional\r\nC...",75.0,10,2,203,17,24,2,65,4,spanish bungalow guest house la 30 plus night,private guest house space private guest house ...,local la community shop restaurant walk distan...,male educate healthcare professional cycle run...
2,2,184314.0,Boho Chic Flat..Steps to Beach!,"Bright, airy, quiet 1 bdr located just steps f...",You are in one of the beach locations in Los A...,Health-Wellness Professional. Author. Passiona...,125.0,8,1,185,15,51,3,14,3,boho chic flat step beach,bright airy quiet 1 bdr locate step seclude cl...,beach location los angeles peninsula marina de...,health wellness professional author passionate...
3,3,51498.0,Guest House With Its Own Entrance/Exit and Hot...,"Fully self-contained, separate structure, with...",We are close to Venice without the hassle of n...,Easy going hostess!! Enjoy your stay. \n\nI am...,189.0,11,1,195,9,13,1,102,5,guest house entrance exit hot tub,fully self contain separate structure entrance...,close venice hassle parking space,easy go hostess enjoy stay experienced travele...
4,4,109.0,Amazing bright elegant condo park front UPGRADED,"Unit upgraded with new bamboo flooring, brand...",,"Paolo Privitera, CEO Evensi\n\nPaolo, MIT MBA ...",115.0,7,1,178,11,1,1,301,10,amazing bright elegant condo park upgrade,unit upgrade new bamboo floor brand new ult...,,paolo privitera ceo evensi paolo mit mba phone...


In [66]:
# saves df into csv for personal record
text_columns.to_csv('cleaned_text_features')

In [67]:
# Recounts tokens and sentences for comparison
text_columns['name_TC_afterProcess']= text_columns['cleaned_name'].map(countToken)
text_columns['name_SC_afterProcess']= text_columns['cleaned_name'].map(countSentence)

In [71]:
#Compares intial count to afterProcess token and sentence count
text_columns[['name_TC', 'name_SC','name_TC_afterProcess','name_SC_afterProcess']]

Unnamed: 0,name_TC,name_SC,name_TC_afterProcess,name_SC_afterProcess
0,5,1,5,1
1,10,2,8,1
2,8,1,5,1
3,11,1,6,1
4,7,1,6,1
...,...,...,...,...
45209,5,1,4,1
45210,8,1,6,1
45211,8,1,5,1
45212,3,1,3,1


In [72]:
text_columns['description_TC_afterProcess']= text_columns['cleaned_description'].map(countToken)
text_columns['description_SC_afterProcess']= text_columns['cleaned_description'].map(countSentence)

In [73]:
text_columns['neighborhood_overview_TC_afterProcess']= text_columns['cleaned_neighborhood_overview'].map(countToken)
text_columns['neighborhood_overview_SC_afterProcess']= text_columns['cleaned_neighborhood_overview'].map(countSentence)

In [74]:
text_columns['host_about_TC_afterProcess']= text_columns['cleaned_host_about'].map(countToken)
text_columns['host_about_SC_afterProcess']= text_columns['cleaned_host_about'].map(countSentence)

In [75]:
text_columns.head()

Unnamed: 0.1,Unnamed: 0,id,name,description,neighborhood_overview,host_about,price,name_TC,name_SC,description_TC,...,cleaned_neighborhood_overview,cleaned_host_about,name_TC_afterProcess,name_SC_afterProcess,description_TC_afterProcess,description_SC_afterProcess,neighborhood_overview_TC_afterProcess,neighborhood_overview_SC_afterProcess,host_about_TC_afterProcess,host_about_SC_afterProcess
0,0,183319.0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse with ocean view from...,About as close to the beach as you can live in...,Serious yoga practice. Been studying for 25 y...,152.0,5,1,163,...,close beach live venice boardwalk,yoga practice study 25 year spend month thaila...,5,1,93,1,5,1,25,1
1,1,51307.0,Spanish Bungalow Guest House LA CA. 30 plus ni...,"PRIVATE GUEST HOUSE The space Private, Guest h...","Local LA Community , shops and restaurants in...","male , educated . Healthcare Professional\r\nC...",75.0,10,2,203,...,local la community shop restaurant walk distan...,male educate healthcare professional cycle run...,8,1,109,2,15,1,37,1
2,2,184314.0,Boho Chic Flat..Steps to Beach!,"Bright, airy, quiet 1 bdr located just steps f...",You are in one of the beach locations in Los A...,Health-Wellness Professional. Author. Passiona...,125.0,8,1,185,...,beach location los angeles peninsula marina de...,health wellness professional author passionate...,5,1,108,1,22,1,8,1
3,3,51498.0,Guest House With Its Own Entrance/Exit and Hot...,"Fully self-contained, separate structure, with...",We are close to Venice without the hassle of n...,Easy going hostess!! Enjoy your stay. \n\nI am...,189.0,11,1,195,...,close venice hassle parking space,easy go hostess enjoy stay experienced travele...,6,1,91,1,5,1,48,1
4,4,109.0,Amazing bright elegant condo park front UPGRADED,"Unit upgraded with new bamboo flooring, brand...",,"Paolo Privitera, CEO Evensi\n\nPaolo, MIT MBA ...",115.0,7,1,178,...,,paolo privitera ceo evensi paolo mit mba phone...,6,1,119,1,1,1,143,1


In [76]:
# saves df into csv for personal record
text_columns.to_csv('my_cleaned_text_features2.csv')

In [8]:
cleaned_text_features= pd.read_csv('my_cleaned_text_features2.csv')

In [9]:
cleaned_text_features.head()

Unnamed: 0,id,name,description,neighborhood_overview,host_about,price,name_TC,name_SC,description_TC,description_SC,...,cleaned_neighborhood_overview,cleaned_host_about,name_TC_afterProcess,name_SC_afterProcess,description_TC_afterProcess,description_SC_afterProcess,neighborhood_overview_TC_afterProcess,neighborhood_overview_SC_afterProcess,host_about_TC_afterProcess,host_about_SC_afterProcess
0,183319.0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse with ocean view from...,About as close to the beach as you can live in...,Serious yoga practice. Been studying for 25 y...,152.0,5,1,163,13,...,close beach live venice boardwalk,yoga practice study 25 year spend month thaila...,5,1,93,1,5,1,25,1
1,51307.0,Spanish Bungalow Guest House LA CA. 30 plus ni...,"PRIVATE GUEST HOUSE The space Private, Guest h...","Local LA Community , shops and restaurants in...","male , educated . Healthcare Professional\r\nC...",75.0,10,2,203,17,...,local la community shop restaurant walk distan...,male educate healthcare professional cycle run...,8,1,109,2,15,1,37,1
2,184314.0,Boho Chic Flat..Steps to Beach!,"Bright, airy, quiet 1 bdr located just steps f...",You are in one of the beach locations in Los A...,Health-Wellness Professional. Author. Passiona...,125.0,8,1,185,15,...,beach location los angeles peninsula marina de...,health wellness professional author passionate...,5,1,108,1,22,1,8,1
3,51498.0,Guest House With Its Own Entrance/Exit and Hot...,"Fully self-contained, separate structure, with...",We are close to Venice without the hassle of n...,Easy going hostess!! Enjoy your stay. \n\nI am...,189.0,11,1,195,9,...,close venice hassle parking space,easy go hostess enjoy stay experienced travele...,6,1,91,1,5,1,48,1
4,109.0,Amazing bright elegant condo park front UPGRADED,"Unit upgraded with new bamboo flooring, brand...",,"Paolo Privitera, CEO Evensi\n\nPaolo, MIT MBA ...",115.0,7,1,178,11,...,,paolo privitera ceo evensi paolo mit mba phone...,6,1,119,1,1,1,143,1


In [10]:
cleaned_text_features.columns

Index(['id', 'name', 'description', 'neighborhood_overview', 'host_about',
       'price', 'name_TC', 'name_SC', 'description_TC', 'description_SC',
       'neighborhood_overview_TC', 'neighborhood_overview_SC', 'host_about_TC',
       'host_about_SC', 'cleaned_name', 'cleaned_description',
       'cleaned_neighborhood_overview', 'cleaned_host_about',
       'name_TC_afterProcess', 'name_SC_afterProcess',
       'description_TC_afterProcess', 'description_SC_afterProcess',
       'neighborhood_overview_TC_afterProcess',
       'neighborhood_overview_SC_afterProcess', 'host_about_TC_afterProcess',
       'host_about_SC_afterProcess'],
      dtype='object')

In [12]:
## take processed text columns as its own df and save to a csv (add to OneDrive)

df= cleaned_text_features[['name_TC_afterProcess', 'name_SC_afterProcess',
       'description_TC_afterProcess', 'description_SC_afterProcess']]

df.to_csv('cleaned_text_features_christineVersion.csv')
