# Pull California housing data

In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the dataset, then build a single frame holding the feature columns
# plus the target as `median_house_value`.
raw = fetch_california_housing()
df = (
    pd.DataFrame(raw.data, columns=raw.feature_names)
    .assign(median_house_value=raw.target)
)

In [2]:
# Preview the first rows of the frame (feature columns plus median_house_value).
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,median_house_value
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(20640, 9)

# Add county information

In [4]:
# map coordinates to zip code/county to give us categorical variables to work with
import json
import os
import time

import geopy
from tqdm import tqdm_notebook


import json


# Reverse-geocode each (Latitude, Longitude) row and cache the raw responses in
# geo.json; when the cache file already exists it is loaded instead.
if not os.path.exists('geo.json'):
    # Reuse a `results` dict already present in the kernel (e.g. from an
    # interrupted run) so rows geocoded earlier are not requested again.
    try:
        results
    except NameError:
        results = {}
    # NOTE(review): newer geopy releases replace `country_bias` with
    # `country_codes` -- confirm against the installed version.
    geolocator = geopy.geocoders.Nominatim(country_bias='USA', user_agent='housing')
    coordinates = list(df[['Latitude', 'Longitude']].itertuples(name='Coordinates'))
    failed = []
    for c in tqdm_notebook(coordinates):
        if c.Index in results:
            continue
        try:
            location = geolocator.reverse((c.Latitude, c.Longitude))
        except Exception:
            # Best effort: a failed lookup must not abort the whole run, but
            # remember it so the gap is visible afterwards.
            failed.append(c.Index)
        else:
            if location is None:
                # The geocoder found nothing for this point; storing None
                # would break the `.raw` conversion below.
                failed.append(c.Index)
            else:
                results[c.Index] = location
        finally:
            # Pause after every request, failed ones included.
            time.sleep(1)
    if failed:
        print('geocoding failed for %d of %d coordinates' % (len(failed), len(coordinates)))
    # Convert before opening the file so an error here cannot leave behind an
    # empty geo.json that later runs would mistake for a valid cache. Entries
    # may already be plain dicts, hence the getattr fallback.
    results = {k: getattr(v, 'raw', v) for k, v in results.items() if v is not None}
    with open('geo.json', 'w') as f:
        json.dump(results, f)
else:
    with open('geo.json') as f:
        results = json.load(f)

In [5]:
# Show only the first two cached responses; rendering the whole dict floods
# the notebook output.
dict(list(results.items())[:2])

{'0': {'address': {'city': 'Oakland',
   'country': 'USA',
   'country_code': 'us',
   'county': 'Contra Costa County',
   'house_number': '2981',
   'postcode': '94611',
   'road': 'Grizzly Peak Boulevard',
   'state': 'California'},
  'boundingbox': ['37.880400840239',
   '37.880600840239',
   '-122.22990352754',
   '-122.22970352754'],
  'display_name': '2981, Grizzly Peak Boulevard, Oakland, Contra Costa County, California, 94611, USA',
  'lat': '37.8805008402394',
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
  'lon': '-122.229803527544',
  'osm_id': '5149902',
  'osm_type': 'way',
  'place_id': '204090201'},
 '1': {'address': {'city': 'Oakland',
   'country': 'USA',
   'country_code': 'us',
   'county': 'Alameda County',
   'house_number': '2004',
   'postcode': '94611',
   'road': 'Tunnel Road',
   'state': 'California'},
  'boundingbox': ['37.85931709178',
   '37.85951709178',
   '-122.21958253407',
   '-122.21938253407'],
  'display_nam

In [6]:
# One (zip_code, county) row per geocoding result, indexed by the integer form
# of the result's key. A missing postcode becomes None; a missing county raises.
geo_rows = []
geo_index = []
for key, record in results.items():
    address = record['address']
    geo_rows.append((address.get('postcode'), address['county']))
    geo_index.append(int(key))
df_geo = pd.DataFrame(geo_rows, columns=['zip_code', 'county'], index=geo_index)

In [7]:
# Count missing values per column of the geocoded frame.
df_geo.isnull().sum()

zip_code    877
county        0
dtype: int64

In [8]:
# Join the geocoded columns onto the housing rows by index. merge defaults to
# an inner join, so rows without a geocoding result are dropped.
df = pd.merge(df, df_geo, left_index=True, right_index=True)
df.shape

(20374, 11)

In [9]:
# Preview the frame again, now including the zip_code and county columns.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,median_house_value,zip_code,county
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526,94611,Contra Costa County
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585,94611,Alameda County
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521,94618,Alameda County
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,94618,Alameda County
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,94618,Alameda County


# Add text data from Wikipedia

In [10]:
import string
import re
import requests as rq
import bs4

# API endpoint used by both the search and the page requests.
endpoint = 'https://en.wikipedia.org/w/api.php'

# Template for a search request; 'srsearch' is filled in per query.
search_params = dict(action='query', list='search', format='json', srsearch=None)

# Template for a page-text request; 'pageid' is filled in per page.
page_params = dict(action='parse', format='json', prop='text', pageid=None)


def query_wiki(query):
    """Return the cleaned article text of the first search hit for `query`.

    Chains the private helpers: search, page id extraction, page content
    fetch, markup stripping, and text cleanup.
    """
    first_hit_id = _extract_pageid(_search_wiki(query))
    page_markup = _fetch_page_content(first_hit_id)
    return _clean_article(_parse_xml(page_markup))

def _search_wiki(query, timeout=30):
    """Run a search request for `query` and return the HTTP response.

    Parameters:
        query: search string sent as the 'srsearch' parameter.
        timeout: seconds to wait for the server before giving up, so one
            stalled request cannot hang a loop over many queries.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Build a fresh dict so the shared template is never mutated.
    params = dict(search_params, srsearch=query)
    response = rq.get(endpoint, params=params, timeout=timeout)
    # Fail here with a clear HTTP error rather than later on a missing key.
    response.raise_for_status()
    return response

def _extract_pageid(response):
    json = response.json()
    # just take first result
    pageid = json[u'query'][u'search'][0][u'pageid']
    return pageid

def _fetch_page_content(pageid, timeout=30):
    """Fetch the text payload of the page with the given id.

    Parameters:
        pageid: page identifier; it is sent as a string.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The value found under ['parse']['text']['*'] in the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Build a fresh dict so the shared template is never mutated.
    params = dict(page_params, pageid=str(pageid))
    response = rq.get(endpoint, params=params, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    return payload[u'parse'][u'text'][u'*']

def _parse_xml(page_xml):
    """Parse the page markup and return its text content.

    script, '[document]', head and title nodes are removed from the tree
    before the text is extracted.
    """
    soup = bs4.BeautifulSoup(page_xml, 'lxml')
    for unwanted in soup(['script', '[document]', 'head', 'title']):
        unwanted.extract()
    return soup.getText()

def _clean_word(word):
    # remove references
    word = re.sub('\[[0-9]*\]', '', word)
    # remove punctuation
    word = word.translate({s: None for s in string.punctuation})
    return word

def _clean_article(article):
    """Reduce extracted article text to its prose lines and clean each word.

    Lines are kept only when they look like a paragraph or sentence and are
    not reference entries or article meta-comments. Every remaining word goes
    through `_clean_word`, and the result is returned as one string with
    single spaces between words.
    """
    def take_line(L):
        """Return True when line `L` should be kept."""
        take = (
            # take text if it looks like a paragraph or full sentence
            (L.count('.') > 2 or L.count(' ') > 12)
            # remove references at bottom of article
            and not L.startswith('^')
            # remove article meta-comments
            and not L.startswith('‹')
            and not ('•' in L)
            and not L.startswith('Images, from')
            and not L.startswith(u'This article needs to be updated')
            and not L.endswith(u'(Learn how and when to remove this template message)')
            and not L.startswith(u'This article is about')
            and not L.endswith(u'This article has multiple issues')
            # raw strings: "\." is an invalid escape in a normal string literal
            and not re.match(r'^For the[^\.].*, see.*\.$', L)
            and not re.match(r'".*" redirects here.', L))
        return take
    text = '\n'.join(L for L in article.split('\n') if take_line(L))
    text = ' '.join(_clean_word(word) for word in text.split())
    # Words that cleaned down to '' leave doubled spaces; collapse them.
    return ' '.join(text.split())

In [11]:
# Distinct values of the county column, in order of first appearance.
counties = df['county'].unique()
counties

array(['Contra Costa County', 'Alameda County', 'Alpine County',
       'Amador County', 'Butte County', 'Calaveras County',
       'Colusa County', 'Marin County', 'Del Norte County',
       'Douglas County', 'El Dorado County', 'Fresno County',
       'Tulare County', 'Glenn County', 'Humboldt County',
       'Imperial County', 'Inyo County', 'Kern County', 'Kings County',
       'Lake County', 'Lassen County', 'Los Angeles County',
       'San Bernardino County', 'Orange County', 'Ventura County',
       'Madera County', 'Mariposa County', 'Mendocino County',
       'Merced County', 'Modoc County', 'Mono County', 'Monterey County',
       'Napa County', 'Nevada County', 'San Diego County',
       'Placer County', 'Sacramento County', 'Plumas County',
       'Riverside County', 'La Paz County', 'San Benito County',
       'Municipio de Tijuana', 'SF', 'San Joaquin County',
       'San Luis Obispo County', 'San Mateo County', 'Santa Clara County',
       'Santa Barbara County', 'Santa

In [12]:
# Look up each distinct county once and keep the cleaned article text.
county_descriptions = {}
for c in counties:
    county_descriptions[c] = query_wiki(c)

In [13]:
# Attach each row's county article text as a new column.
df['county_description'] = df['county'].map(county_descriptions)

In [14]:
# Show one row per distinct (county, description) pair; only the first few are displayed.
df[['county', 'county_description']].drop_duplicates().head()

Unnamed: 0,county,county_description
0,Contra Costa County,Contra Costa County is a county in the state o...
1,Alameda County,Alameda County (/ˌæləˈmiːdə/ AL-ə-MEE-də) is a...
1022,Alpine County,Alpine County is a county in the U.S. state of...
1025,Amador County,"Amador County, is a county in the U.S. state o..."
1053,Butte County,Butte County is a county in the U.S. state of ...


# Create train/test split

In [17]:
import numpy as np

# Fixed seed so the same rows land in train/test on every fresh run.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
# Each row is drawn independently (True with p=0.8), so the split is roughly,
# not exactly, 80/20.
df['is_train'] = split_rng.choice([True, False], p=[0.8, 0.2], size=len(df))

# Encode categorical values

In [20]:
# roughly clean zip codes first
import numpy as np
# Work on string values so the cell can be re-run after zip_code has already
# been converted to integers (the .str accessor fails on an int column).
zip_codes_raw = (
    df['zip_code']
    .astype(str)
    .str.replace('CA ', '')
    # keep only the part before the first '-'
    .str.split('-')
    .str[0]
)
# Anything that is not purely digits (including missing values) becomes the
# 0 placeholder; na=False covers entries that are still missing.
is_plain_number = zip_codes_raw.str.match(r'\d+\Z', na=False)
df['zip_code'] = zip_codes_raw.where(is_plain_number, '0').astype(int)

In [34]:
# Integer codes are assigned in order of first appearance, using training rows
# only; values that occur solely outside the training rows get no code here.
train_rows = df[df.is_train]
train_zip_codes = train_rows.zip_code.unique()
train_counties = train_rows.county.unique()
zip_code_encodings = dict(zip(train_zip_codes, range(len(train_zip_codes))))
county_encodings = dict(zip(train_counties, range(len(train_counties))))
len(zip_code_encodings), len(county_encodings)

(1650, 61)

In [35]:
# Translate raw values into their codes; values absent from a mapping become NaN.
df['zip_code_encoding'] = df['zip_code'].map(zip_code_encodings)
df['county_encoding'] = df['county'].map(county_encodings)

In [36]:
# Add "unknown" encodings: rows whose value has no code get one extra code
# equal to the size of the mapping. The result is assigned back to the column
# because fillna(inplace=True) on `df.<column>` acts on an intermediate Series
# and does not reliably write through to `df`.
df['zip_code_encoding'] = df['zip_code_encoding'].fillna(len(zip_code_encodings))
df['county_encoding'] = df['county_encoding'].fillna(len(county_encodings))

In [39]:
# Store both encoding columns as integers.
for encoding_column in ('zip_code_encoding', 'county_encoding'):
    df[encoding_column] = df[encoding_column].astype(int)

# Dump data frame to disk

In [40]:
# Write the enriched frame as a gzip-compressed CSV; index=False leaves the
# row index out of the file.
df.to_csv('california_housing_enriched.csv.gz', index=False, compression='gzip')