# Natural Language Processing (NLP)

In [1]:
import pandas as pd

# Load the Zillow listings CSV (path is relative to the notebook's working directory).
ZILLOW_CSV_PATH = "zillow-data.csv"
housing_data = pd.read_csv(ZILLOW_CSV_PATH)

# Preview the first five listings.
housing_data.head(n=5)

Unnamed: 0,city,county,state,property_type,price,description
0,Burbank,Los Angeles County,CA,TOWNHOUSE,685000,Cabrini Villas.Top of Hill.Private/Quiet/Grass...
1,Sylmar,Los Angeles County,CA,SINGLE_FAMILY,688000,SOLD BEFORE PROCESSING - Welcome home to this ...
2,Los Angeles,Los Angeles County,CA,TOWNHOUSE,700000,This tri-level end unit is one of Just 55 unit...
3,Reseda,Los Angeles County,CA,SINGLE_FAMILY,700000,3 Bedroom 2 bath home in the desirable area of...
4,Panorama City,Los Angeles County,CA,SINGLE_FAMILY,690000,Single family home first time back on the mark...


In [4]:
import textstat
from tqdm import tqdm, tqdm_pandas
tqdm.pandas()

# Map each new column name to the textstat scorer that produces it.
# Insertion order matters: it fixes both the column order and the order
# of the progress bars.
readability_metrics = {
    "reading_ease": textstat.flesch_reading_ease,
    "grade_level": lambda text: textstat.text_standard(text, float_output=True),
    "reading_time": lambda text: textstat.reading_time(text, ms_per_char=14.69),
    "sentence_count": textstat.sentence_count,
}

# Score every raw description once per metric, showing a tqdm bar for each pass.
for metric_column, scorer in readability_metrics.items():
    housing_data[metric_column] = housing_data["description"].progress_apply(scorer)

100%|██████████| 300/300 [00:00<00:00, 1699.27it/s]
100%|██████████| 300/300 [00:00<00:00, 851.71it/s]
100%|██████████| 300/300 [00:00<00:00, 34200.13it/s]
100%|██████████| 300/300 [00:00<00:00, 17649.08it/s]


In [6]:
import spacy
nlp = spacy.load("en_core_web_sm")
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# The `from nltk... import ...` lines above bind only the imported names, not
# `nltk` itself, so without this import the calls below raise NameError on a
# fresh kernel.
import nltk

# Fetch the NLTK resources (a no-op when they are already up to date locally).
# NOTE(review): the visible cells filter stop words via spaCy's `token.is_stop`
# and tokenize with RegexpTokenizer, so these two resources do not appear to be
# used here — confirm no later cell needs them before removing the downloads.
nltk.download('stopwords')
nltk.download('punkt')

def text_clean(string):
    """Reduce a description to a space-separated string of lemmas.

    The text is lower-cased and split into runs of word characters, which
    drops punctuation. The surviving words are re-joined with single spaces
    and run through the module-level spaCy pipeline ``nlp``. Each token that
    spaCy does not flag as a stop word contributes its lemma to the result.

    Args:
        string: The raw text to clean; must support ``.lower()``.

    Returns:
        The lemmas of the non-stop-word tokens, joined by single spaces.
    """
    # Regex tokenizer: every maximal run of word characters becomes one token.
    word_splitter = RegexpTokenizer(r'\w+')
    lowered_words = word_splitter.tokenize(string.lower())

    # Re-join so spaCy sees punctuation-free text, then keep non-stop lemmas.
    doc = nlp(" ".join(lowered_words))
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    return " ".join(lemmas)

# Clean every raw description (with a progress bar) and store the result alongside it.
housing_data["clean_description"] = housing_data["description"].progress_apply(text_clean)

100%|██████████| 300/300 [00:05<00:00, 51.73it/s]


In [8]:
import fasttext
from sklearn.decomposition import PCA

# Train unsupervised fastText embeddings on the corpus file.
# NOTE(review): no visible cell writes "cleaned_text_corpus.txt" — confirm how
# it is produced (presumably from `clean_description`) so the notebook
# survives Restart & Run All.
model = fasttext.train_unsupervised("cleaned_text_corpus.txt")

# One embedding vector per cleaned description.
housing_data["vec"] = housing_data["clean_description"].progress_apply(model.get_sentence_vector)

# Project the embeddings onto their first three principal components.
# Fit once and slice the result: the original refit the same PCA three times
# just to pull one column out of each identical projection.
pca = PCA(n_components=3)
pca_coords = pca.fit_transform(housing_data["vec"].values.tolist())
housing_data['x'] = pca_coords[:, 0]
housing_data['y'] = pca_coords[:, 1]
housing_data['z'] = pca_coords[:, 2]

100%|██████████| 300/300 [00:00<00:00, 1541.74it/s]


In [9]:
# Preview the first five rows, now including the derived columns added above.
housing_data.head(n=5)

Unnamed: 0,city,county,state,property_type,price,description,reading_ease,grade_level,reading_time,sentence_count,clean_description,vec,x,y,z
0,Burbank,Los Angeles County,CA,TOWNHOUSE,685000,Cabrini Villas.Top of Hill.Private/Quiet/Grass...,57.43,12.0,7.84,15,cabrini villas hill private quiet grassy gleno...,"[-0.02512608, 0.028402371, -0.12185228, -0.015...",-0.005382,-0.042914,-0.013894
1,Sylmar,Los Angeles County,CA,SINGLE_FAMILY,688000,SOLD BEFORE PROCESSING - Welcome home to this ...,70.84,8.0,4.61,4,sell process welcome home lovely 3 bedroom 2 b...,"[-0.03434704, -0.009852296, -0.13140254, -0.01...",0.010132,-0.09305,0.053181
2,Los Angeles,Los Angeles County,CA,TOWNHOUSE,700000,This tri-level end unit is one of Just 55 unit...,68.97,11.0,10.86,13,tri level end unit 55 unit nestle new communit...,"[-0.036746506, 0.010594783, -0.13332342, -0.04...",-0.12853,-0.052824,-0.047796
3,Reseda,Los Angeles County,CA,SINGLE_FAMILY,700000,3 Bedroom 2 bath home in the desirable area of...,57.61,11.0,3.22,2,3 bedroom 2 bath home desirable area reseda co...,"[-0.021491604, 0.013568662, -0.17267467, -0.03...",-0.023203,-0.010952,0.116257
4,Panorama City,Los Angeles County,CA,SINGLE_FAMILY,690000,Single family home first time back on the mark...,71.85,11.0,4.91,4,single family home time market 1975 great oppo...,"[-0.02191974, -0.019852286, -0.13412026, -0.02...",0.076868,0.014223,0.085426
