### Preprocessing

1. Create dataframe of host descriptions
2. Add column of host descriptions without stopwords
3. Add column of host descriptions with only lemmatized words
4. Pickle dataframe

In [1]:
import pandas as pd

import string

import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

import pickle

#### Create dataframe of host descriptions

In [2]:
# Load the Seattle listings and keep a single row for every distinct,
# non-missing host description, renumbering the index from 0.
s_listings = (
    pd.read_csv('data/seattle/listings.csv')
    .drop_duplicates('host_about')
    .dropna(subset=['host_about'], axis=0)
    .reset_index(drop=True)
)
host_abouts = s_listings['host_about']
# Row count and description count, displayed as the cell output.
len(s_listings), len(host_abouts)

(2011, 2011)

In [3]:
# Normalise every host description to lowercase text.
#
# No entry is skipped: `abouts` is later assigned as a column of `s_listings`,
# so it must stay aligned one-to-one with `host_abouts`. A silent type filter
# here would shift or shorten the list.
abouts = []
for host in host_abouts:
    # Only byte strings need decoding (`bytes` is `str` on Python 2);
    # values that are already text are used as they are.
    if isinstance(host, bytes):
        host = host.decode('utf-8')
    abouts.append(host.lower())
print(abouts[:4])

[u"i am an artist, interior designer, and run a small landscape business. my life revolves around beauty wherever i find it or create it. i grew up in seattle and love this city for the people, it's natural beauty and don't know anywhere else i would rather call home. i love to travel and try to do so as much as possible and am excited to be a part of the airbnb community!", u"living east coast/left coast/overseas.  time is short & it's a big world. ", u'i love living in seattle.  i grew up in the mid-west but the pacific north west has always felt like home.  i am a mom to 3 beautiful kids, love playing tennis, cooking, reading and being with friends and family.  i manage a few long-term rental properties as well as consult and manage airbnb properties for those living in the seattle area. ', u"hi, i live in seattle, washington but i'm originally from southern california. i am an industrial designer. we (my husband daniel and our little boy oliver) love to travel and host people at ou

In [4]:
# Number of normalised descriptions collected (displayed as the cell output).
len(abouts)

2011

In [5]:
# Store the lowercased descriptions in a new 'abouts' column.
s_listings['abouts'] = abouts

In [6]:
# Preview the first rows of the dataframe, now including the 'abouts' column.
s_listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,abouts
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,f,,WASHINGTON,f,moderate,f,f,2,4.07,"i am an artist, interior designer, and run a s..."
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",...,f,,WASHINGTON,f,strict,t,t,6,1.48,living east coast/left coast/overseas. time i...
2,3308979,https://www.airbnb.com/rooms/3308979,20160104002432,2016-01-04,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,...,f,,WASHINGTON,f,strict,f,f,2,1.15,i love living in seattle. i grew up in the mi...
3,278830,https://www.airbnb.com/rooms/278830,20160104002432,2016-01-04,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,...,f,,WASHINGTON,f,strict,f,f,1,0.89,"hi, i live in seattle, washington but i'm orig..."
4,5956968,https://www.airbnb.com/rooms/5956968,20160104002432,2016-01-04,Private unit in a 1920s mansion,We're renting out a small private unit of one ...,If you include a bit of your background in you...,We're renting out a small private unit of one ...,none,This part of Queen Anne has wonderful views an...,...,f,,WASHINGTON,f,strict,f,f,1,2.45,i've been fortunate to have had many life expe...


#### Remove stopwords

In [7]:
# Base list: NLTK's English stopwords.
stopwords = nltk.corpus.stopwords.words('english')
# Extending a list with a string adds each character separately, so every
# punctuation character becomes its own stopword entry.
stopwords.extend(string.punctuation)
# Extra words excluded from the host descriptions (place names, Airbnb terms
# and other generic vocabulary).
stopwords.extend([
    'seattle', 'wa', 'washington', 'website', 'hidden', 'good',
    'love', 'live', 'living', 'life', 'travel', 'enjoy', 'like',
    'home', 'year', 'work', 'people', 'new', 'city', 'time',
    'place', 'host', 'stay', 'world', 'great', 'meet', 'also',
    'share', 'go', 'airbnb', 'bnb', 'guest', 'lot', 'year', 'years',
    'old', 'hi',
])

In [8]:
# Tokenise every description and drop stopwords and punctuation.
#
# Produces two parallel lists, one entry per description in `abouts`:
#   nonstops        -- the surviving tokens as a list
#   nonstop_abouts  -- the same tokens joined with single spaces
word_punct = WordPunctTokenizer()
stopword_set = set(stopwords)                # constant-time membership tests
punctuation_chars = set(string.punctuation)
nonstops = []
nonstop_abouts = []
for about in abouts:
    host = []
    for sent in sent_tokenize(about):
        for w in word_punct.tokenize(sent):
            if w in stopword_set:
                continue
            # WordPunctTokenizer returns a run of punctuation ("...", "!!",
            # ").") as ONE token, while the stopword list only contains single
            # punctuation characters. Drop all-punctuation tokens explicitly
            # so multi-character runs do not leak into the output.
            if all(ch in punctuation_chars for ch in w):
                continue
            host.append(w)
    nonstops.append(host)
    nonstop_abouts.append(' '.join(host))

In [9]:
# Show the first four stopword-free descriptions, one per line.
for idx in range(4):
    print(nonstop_abouts[idx])

artist interior designer run small landscape business revolves around beauty wherever find create grew natural beauty know anywhere else would rather call try much possible excited part community
east coast left coast overseas short big
grew mid west pacific north west always felt mom 3 beautiful kids playing tennis cooking reading friends family manage long term rental properties well consult manage properties area
originally southern california industrial designer husband daniel little boy oliver house appreciate design vintage finds culture collect experiences stuff hope reflected house lives


In [10]:
# Store the stopword-free descriptions in a new 'nonstop_abouts' column.
s_listings['nonstop_abouts'] = nonstop_abouts

#### Lemmatize words

In [11]:
# Single lemmatizer instance reused for every word in the lemmatization loop.
lemmatizer = WordNetLemmatizer()

In [12]:
# Maps the first two characters of a POS tag to the single-letter `pos`
# argument handed to the lemmatizer. Tags whose prefix is not listed here
# have no entry.
lemma_pos = dict(JJ='a', NN='n', RB='r', VB='v')

In [13]:
# Lemmatize the stopword-free tokens of every description.
#
# For each token list in `nonstops`:
#   * tokens tagged NNP / NNPS are dropped,
#   * tokens whose tag prefix has no entry in `lemma_pos` are dropped,
#   * the rest are lemmatized with the mapped part of speech, and lemmas that
#     are themselves stopwords are discarded.
# `host_lemmas` receives one space-joined string per description.
#
# NOTE(review): tagging runs on text that already had stopwords removed, which
# presumably hurts tagger accuracy (the sample output turns "left coast" into
# "leave coast") -- confirm whether tagging the full sentences is preferable.
stopword_set = set(stopwords)   # constant-time membership tests
host_lemmas = []
for about in nonstops:
    lemmas = []
    for word, pos in pos_tag(about):
        if pos in ('NNP', 'NNPS'):
            continue
        # Explicit lookup instead of a bare `except`: an unmapped tag is the
        # only condition meant to be skipped. Any other failure (for example a
        # missing NLTK resource) must surface rather than silently producing
        # empty results.
        wordnet_pos = lemma_pos.get(pos[:2])
        if wordnet_pos is None:
            continue
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        if lemma not in stopword_set:
            lemmas.append(lemma)
    host_lemmas.append(' '.join(lemmas))

In [14]:
# Show the lemmatized text of the first four descriptions.
print(host_lemmas[:4])

[u'artist interior designer run small landscape business revolves beauty find create grow natural beauty know anywhere else rather call try much possible excited part community', u'east coast leave coast overseas short big', u'grow mid west pacific north west always felt mom beautiful kid play tennis cook read friend family manage long term rental property well consult manage property area', u'originally southern california industrial designer husband daniel little boy oliver house appreciate design vintage find culture collect experience stuff hope reflect house']


In [15]:
# Store the lemmatized descriptions in a new 'host_lemmas' column.
s_listings['host_lemmas'] = host_lemmas

#### Pickle dataframe

In [16]:
# Row count of the dataframe just before it is pickled (displayed as the cell output).
len(s_listings)

2011

In [17]:
# Serialize the dataframe, including the added text columns, to disk.
with open('data/s_listings.pkl', 'wb') as out_file:
    pickle.dump(s_listings, out_file)