In [23]:
import numpy as np
import pandas as pd
import pickle
import numbers
import os
from nltk import stem, WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/ckrasnia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ckrasnia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ckrasnia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ckrasnia/nltk_data...


True

In [43]:

def remove_md(string):
    """
    Removes angled brackets (< & >) and the markdown formatting inside them from strings. Should 
    remove anything between two angled brackets, anything before a close bracket if there is only a
    close bracket, and anything after an open bracket if there is only an open bracket
    
    Input : str
    output : str
    """

    # if both an open and close angle brackets are in the string
    if ('<' in string) and ('>' in string):
        open_brac = string.find('<')
        close_brac = string.find('>')
        # if there is something inside the brackets, drop that including the brackets
        if open_brac < close_brac:
            string = string.replace(string[open_brac : close_brac + 1]," ")
        # if there isn't anything in between them, drop everything before and after
        else:
            string = string.replace(string[:close_brac+1]," ")
            string = string.replace(string[open_brac:]," ")
        # use recursion to fix any instances where there are multiple opens and closes
        return remove_md(string)
    elif ('<' in string):
        open_brac = string.find('<')
        string = string.replace(string[open_brac:]," ")
        return remove_md(string)
    elif ('>' in string):
        close_brac = string.find('>')
        string = string.replace(string[:close_brac+1]," ")
        return remove_md(string)
    else:
        return string


def drop_words(word_list):
    """
    Removes stop-words or words that don't really add any meaning so we want to remove to limit
    the number of features in our final data set.

    Input :
        word_list (list of str)
    Output :
        list of str
    """
    stop_words = set(stopwords.words('english'))
    wl = list(word_list)
    add_stop_words = ['w/', '&', 'will','-',  '•', '+','eg', '|', '/']
    for item in add_stop_words:
        stop_words.add(item)
    
    for sw in stop_words:
        if sw in wl:
            wl.remove(sw)
    return wl


def ordered_unique(seq):
    """
    Takes the unique values of a list, but unlike np.unique, it returns those unique values in the
    same order in which the first occurence appears so ordered_unique([1,1,2,4,3,4]) = [1,2,4,3]
    
    Input : 
        seq (iterable) 

    Output : 
        list
    """
    
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]


def get_dictionary(dict_name, load=False, text=None, n_words=4000, overWrite=False, 
                   fileDir='/Users/ckrasnia/Documents/application_materials/rental_data/'):
    """
    Retrieves a dictionary of unnique words, where each word is a key and each key is assigned a unique integer. Default
    is to load a previously saved file, but to generate a new dictionary, set load=False and provide a text
    
    Input :
        dict_name (str) : filename for the dictionary, ending in .pkl, should be same as the column name
        load (bool) defaults to True, if the dictionary should be loade from fileDir
        text (list of strings) a list of strings from which to generate the dictionary
        n_words (int) number of words to include in the dictionary
        overWrite (bool) only applies if text is provided, determines if the dictionary should be saved
        fileDir (str) the file that contains the dictionary / where the dictionary will be saved if overWrite=True
        
    Returns : 
        a dictionary where there are n_words keys which are strings and the values are a unique integer assigned
        according to the frequency of the words with 1 = most frequent. 0 is reserved for words not in the dictionary
    """
    
    if text != None:
        print('creating new dictionary...')
        u,cts = np.unique(text, return_counts=True)
        words = list(u[np.argsort(cts)[-n_words:]])
        words.reverse()# reverse so that 1 is the most frequent word
        vals = np.arange(1,n_words+1) # reserve 0 for words that are not in the top 4000
        dictionary = {word:val for word, val in zip(words,vals)}
        
        if overWrite:
            f = open(fileDir+dict_name,"wb")
            pickle.dump(dictionary,f)
            print('new dictionary saved to {}'.format(fileDir))
            f.close()
    elif load: # load file if thats what we want
        print('loading pregenerated dictionary')
        f = open(fileDir+dict_name,"rb")
        dictionary = pickle.load(f)
        f.close()

    else:
        raise Exception('dictionary must either be loaded with load=True or created by providing \
            a list of strings to the text argument')
        
    return dictionary


def clean_text(column : pd.Series):
    """
    written to clean up the text data to translate it into numerical values for input to a model.
    steps: 
    1. changes nan values to empty strings
    2. lowercases all text
    3. removes markdown formatting
    4. removes special characters
    5. splits into a list of words
    6. drops a list of stopwords to pare down to a set of only meaningful words
    7. lemmatizes words to reduce the number of unique words
    8. only keeps the unique words of the list, while preserving order of appearance
    
    Input:
        column (pd.Series) with each row containing a string containing with multiple words
        
    Output:
        (pd.Series) the cleaned column with a list of unique words in each row
    """
    column[column.isna()] = ''
    #look at the original distribution of how many words there are per description
    words = column.str.lower()

    # first start by getting rid of ugly formatting 
    words = words.apply(remove_md)
    #also going to want to remove some punctuation
    words = words.str.replace("[():!.,?/]"," ")
    words = words.apply(word_tokenize)

    # get rid of stop words
    words = words.apply(drop_words)

    # lematize words so that to reduce unique words 
    # this is pretty time intensive due to the loop over the list within each of the rows, plus using WordNet is a bit slow
    # if its way too slow, could try just stemming with PorterStemmer
    lemmatizer = WordNetLemmatizer()
    words = words.map(lambda x: [lemmatizer.lemmatize(y) for y in x])

    # now I want unique words within each description
    words = words.apply(ordered_unique)

    # sometimes stop-words reappear after lematization, so try dropping them again
    words = words.apply(drop_words)
    return words


def translate(word_list, dictionary):
    """
    A lossy conversion of words to numbers or numbers back to words. if word_list contains strings,
    it will translate to using the dictionary, if it is ints, it will translate to strings using
    the reverse translation. For words that are not in the dictionary, it will be mapped to None.
    for the reverse translation, 0 gets mapped to None
    
    Input : 
        word_list (list of int or str) list that you want translated
        dictionary (dict with str as keys mapped to a unique int) used to translate between words
            and ints
        
    Returns :
        a translated version of the list, the same length as the input word_list
    """

    # catch if there is no description
    try:
        if len(word_list) == 0:
            translated = np.array([0])

        
        # translate to integers
        elif type(word_list[0]) == str:
            translated = np.array([*map(dictionary.get, word_list)])
            translated[translated==None] = 0

        # translate to strings
        elif np.any([isinstance(x, numbers.Number) for x in word_list]):
            inverse_dict = {v: k for k, v in dictionary.items()}
            translated = np.array([*map(inverse_dict.get, word_list)])
            
        # if a list of Nones, return self as thats the best translation we can do
        elif (np.all([x == None for x in word_list])):
            translated = np.zeros(len(word_list))
            
    except TypeError: # catches the case that there is only a None
        translated = np.array([0])
        
    return translated

def preprocess_text(column, dictionary=None, return_dict = False):
    """
    Applies both the clean_column function and translate function to yield a numerical series
    
    Input : 
        column (pd.Series) a column with rows containing strings, most should have multiple words
        dictionary (None or dict) if None, a dictionary is created from the words in the column and
            the column is translated with that new dictionary, else a dictionary with words as keys
            and unique integers as values
        return_dict (bool) if true, the dictionary will also be returned
        
    Returns : 
        (pd.Series) the same length as the column input with a 1d array of integers in each row who's values are mapped
            to words using the dictionary either provided or generated here. 
        [optional] 
        (dict) a dictionary assigning words to unique integers based on their frequency, see get_dictionary for details
    """
    
    print('cleaning the text...')
    
    cleaned_column = clean_text(column)
    if dictionary == None:
        n = 50000
        idx = np.random.choice(np.arange(len(cleaned_column)),n)
        word_list = [w for j in cleaned_column.iloc[idx] for w in j]
        dictionary = get_dictionary(column.name+'.pkl', text=word_list,overWrite=True)
        
    print('applying the dictionary')
    
    if return_dict:
        return cleaned_column.apply(translate,args=(dictionary,)), dictionary
    
    return cleaned_column.apply(translate,args=(dictionary,))


In [6]:
final_dir = r'/Users/ckrasnia/Documents/application_materials/rental_data'
data = pd.read_csv(os.path.join(final_dir,'raw_US_listings.csv'),index_col='Unnamed: 0',dtype = {'id':str, 'host_id':str, })


  exec(code_obj, self.user_global_ns, self.user_ns)


In [40]:
X=data.head(100)
cleaned=clean_text(X['description'])
for i,c in cleaned.iteritems():
    print(c)
    print('\n')


['custom', 'built', 'studio', 'exquisite', 'design', 'real', 'hardwood', 'floor', 'perfect', 'professional', 'road', 'desiring', 'nice', 'quiet', 'place', 'call', 'home', 'fully', 'furnished', 'custom-built', 'corian', 'kitchen', 'built-in', 'curio', 'cabinet', 'display', 'desk', 'nook', 'marble', 'bath', 'large', 'mirror', 'closet', 'tv', 'ac', 'heat', 'on-site', 'laundry', 'linen', 'utensil', 'iron', 'ironing', 'board', 'dedicated', 'parking', 'pool', 'near', 'tesla', 'google', 'shuttle', 'stop', '5', 'min', 'kaiser', 'easy', 'access', '280', '20', 'stanford', 'etc', 'space', 'description', 'favorite', 'international', 'corporate', 'assignee', 'silicon', 'valley', 'intern', 'absolutely', 'one', 'kind', 'designed', 'discriminating', 'taste', 'newly', 'renovated', 'unit', 'deluxe', 'amenity', 'ideal', 'short', 'term', 'away', 'contract', 'assignment', 'truly', 'turnkey', 'bring', 'luggage', ';', 'superior', 'price', 'performance', 'exte']


['room', 'gracious', 'home', 'beautiful', 'ga

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [49]:
# now create a sklearn transformer that will take word columns in and return the translated version, 
# question: should it be 1 hot encoded or some other representation? like some frequency count based version?
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
class StringPreprocess(BaseEstimator, TransformerMixin):
    """
    transformer to preprocess long string text columns, like the name or description of the listing
    
    """
      
    def fit(self, X, y=None, n_words=4000):
        _, dictionary = preprocess_text(X, dictionary=None, return_dict = True)
        self.dictionary_ = dictionary
        return self
    
    def transform(self, X):
        return preprocess_text(X, dictionary=self.dictionary_)
StringPreprocess().fit_transform(data['name'])

cleaning the text...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


creating new dictionary...
applying the dictionary
cleaning the text...
applying the dictionary


0        [181, 5, 58, 40, 2299, 1831, 508, 21]
1                              [0, 55, 329, 0]
2                         [1, 2, 33, 30, 3039]
3                              [7, 2, 44, 425]
4                    [346, 343, 728, 443, 435]
                         ...                  
38272                     [129, 1, 2, 406, 27]
38273                [43, 1415, 2, 44, 586, 0]
38274                         [76, 2, 140, 65]
38275                        [76, 2, 2328, 21]
38276        [3565, 0, 904, 374, 1325, 115, 0]
Name: name, Length: 498225, dtype: object