In [1]:
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
import pandas as pd
import numpy as np
import pickle

In [2]:
# Record the library versions this notebook was executed with.
print("Pandas version: {}".format(pd.__version__))
print("Numpy version: {}".format(np.__version__))

Pandas version: 0.20.3
Numpy version: 1.13.1


1. Get links, **`links = get_links()`**
2. Create dictionary  
Begin loop  
3. Create soup object, **`soup = create_soup_object(url)`**
4. Find city and put in dictionary, **`find_city_put_in_dict(input_dict)`**
5. Get items and put in list, **`items = get_items()`**
6. Get prices and put in list, **`prices = get_prices()`**
7. Put items and prices in dictionary, **`items_prices_to_dict(cities_dict)`**
8. Get price index
9. Put price index in dictionary
10. Make data frame **`cities_df = make_df(input_dict)`** 

In [60]:
def get_links():
    '''
    Collect the city links found on the currently loaded page.

    Reads the module-level ``soup`` object (nothing is passed in), so ``soup``
    must already be bound to a parsed page before this is called.

    output: a list with the ``href`` of the ``<a>`` found inside each
            ``<td class="city-name">`` cell, in the order find_all returns them
    '''
    city_cells = soup.find_all('td', class_='city-name')
    return [cell.find('a')['href'] for cell in city_cells]

In [61]:
def create_soup_object(url):
    '''
    Download ``url`` and parse the response body.

    input: url
    output: soup object of url text, parsed with the 'html5lib' parser
    raises: requests.HTTPError when the response status code is not 200
    '''
    response = requests.get(url)
    # Without this explicit check a non-200 reply would leave no page text to
    # parse and the failure would show up as an unrelated UnboundLocalError.
    # Raise an error that names the URL and the status instead.
    if response.status_code != 200:
        raise requests.HTTPError(
            'GET {} returned status {}'.format(url, response.status_code),
            response=response)
    return BeautifulSoup(response.text, 'html5lib')

In [62]:
def find_city_put_in_dict(input_dict):
    '''
    Append the city name shown on the current page to input_dict['City'].

    Reads the module-level ``soup`` object; the name is the text of the
    ``<span class="city-2">`` element returned by ``soup.find``.
    Returns nothing; input_dict is modified in place.
    '''
    city_tag = soup.find('span', class_='city-2')
    # Resolve the text before touching input_dict so a failed lookup leaves
    # the dictionary unchanged.
    city_name = city_tag.text
    input_dict['City'].append(city_name)

In [63]:
def get_items():
    '''
    Collect the item names listed on the current page.

    Reads the module-level ``soup`` object.
    returns a list with the text of the ``<a>`` found inside each
    ``<td class="item-name">`` cell, in the order find_all returns them
    '''
    name_cells = soup.find_all('td', class_='item-name')
    return [cell.find('a').text for cell in name_cells]

In [64]:
def get_price_indices():
    '''
    Collect the price-index values from the current page.

    Reads the module-level ``soup`` object, which must be initialized on the
    https://www.expatistan.com/cost-of-living/index page.
    output: list with the raw text of every ``<td class="price-index">`` cell
            (the values are returned as strings, not converted to numbers)
    '''
    index_cells = soup.find_all('td', class_ ='price-index')
    return [cell.text for cell in index_cells]

In [65]:
def get_prices():
    '''
    Collect the prices shown in the ``<td class="price city-1">`` cells.

    Reads the module-level ``soup`` object. Each cell's stripped text is
    searched with the price pattern and capture group 1 (the run of digits,
    commas and dots after an optional "(" and "$") is kept as a string.

    The first alternative of the pattern can match an empty string, so the
    search always succeeds at the start of the text: a cell that does not
    begin with a price (for example "-") contributes '' rather than being
    skipped, and the "([-])" alternative is never the one that matches.

    returns a list of price strings; when exactly 104 values are found, only
    every second value (positions 1, 3, 5, ...) is returned
    '''
    price_regex = re.compile(r'\(?\$?([0-9,.]*)|([-])')
    price_cells = soup.find_all('td', class_='price city-1')
    found = (price_regex.search(cell.text.strip()) for cell in price_cells)
    prices = [match.group(1) for match in found if match]

    # NOTE(review): 104 presumably means two price cells per item on a page
    # with 52 items -- confirm against the site markup.
    return prices[1::2] if len(prices) == 104 else prices

In [66]:
def items_prices_to_dict(input_dict):
    '''
    Append each scraped price to input_dict under its item name.

    Reads the module-level ``items`` and ``prices`` sequences (they are not
    passed in) and pairs them by position with zip, so any surplus entries in
    the longer of the two are ignored. input_dict is modified in place.
    '''
    paired = zip(items, prices)
    for name, value in paired:
        input_dict[name].append(value)

In [67]:
# Fetch the cost-of-living index page.
url = 'https://www.expatistan.com/cost-of-living/index'
response = requests.get(url)
# Bare last expression: the cell displays the HTTP status code of the reply.
response.status_code

200

In [68]:
# Keep the body of the index-page response as text for parsing.
page = response.text

In [69]:
# Parse the index page. `soup` is the module-level name that get_links(),
# get_items(), get_prices(), get_price_indices() and find_city_put_in_dict()
# read instead of taking a parameter.
soup = BeautifulSoup(page,"html5lib")

In [70]:
# Column name -> list of values; every scraped city appends one entry to
# 'City' and one entry per item name found on its page.
cities_dict = defaultdict(list) 
url = 'https://www.expatistan.com/cost-of-living/index'
# The helper functions read the module-level `soup`, so it has to be rebound
# to the right page before each group of helper calls.
soup = create_soup_object(url)
links = get_links()
price_indices = get_price_indices()
for i,url in enumerate(links):
    # Append the currency=USD query parameter (presumably so that prices are
    # reported in US dollars -- confirm against the site).
    url=url+'?currency=USD'
    print(i, url)
    soup = create_soup_object(url)
    find_city_put_in_dict(cities_dict)
    # These two names must stay exactly `items` and `prices`:
    # items_prices_to_dict reads them as module-level variables.
    items = get_items()
    prices = get_prices()
    items_prices_to_dict(cities_dict)

0 https://www.expatistan.com/cost-of-living/zurich?currency=USD
1 https://www.expatistan.com/cost-of-living/geneva?currency=USD
2 https://www.expatistan.com/cost-of-living/grand-cayman?currency=USD
3 https://www.expatistan.com/cost-of-living/luanda?currency=USD
4 https://www.expatistan.com/cost-of-living/new-york-city?currency=USD
5 https://www.expatistan.com/cost-of-living/reykjavik?currency=USD
6 https://www.expatistan.com/cost-of-living/san-francisco?currency=USD
7 https://www.expatistan.com/cost-of-living/washington-d-c?currency=USD
8 https://www.expatistan.com/cost-of-living/oslo?currency=USD
9 https://www.expatistan.com/cost-of-living/mountain-view-california?currency=USD
10 https://www.expatistan.com/cost-of-living/lausanne?currency=USD
11 https://www.expatistan.com/cost-of-living/basel?currency=USD
12 https://www.expatistan.com/cost-of-living/bern?currency=USD
13 https://www.expatistan.com/cost-of-living/london?currency=USD
14 https://www.expatistan.com/cost-of-living/hong-kong

120 https://www.expatistan.com/cost-of-living/orlando?currency=USD
121 https://www.expatistan.com/cost-of-living/st-louis?currency=USD
122 https://www.expatistan.com/cost-of-living/montreal?currency=USD
123 https://www.expatistan.com/cost-of-living/virginia-beach?currency=USD
124 https://www.expatistan.com/cost-of-living/cincinnati?currency=USD
125 https://www.expatistan.com/cost-of-living/las-vegas?currency=USD
126 https://www.expatistan.com/cost-of-living/halifax?currency=USD
127 https://www.expatistan.com/cost-of-living/asheville?currency=USD
128 https://www.expatistan.com/cost-of-living/winnipeg?currency=USD
129 https://www.expatistan.com/cost-of-living/glasgow?currency=USD
130 https://www.expatistan.com/cost-of-living/cleveland?currency=USD
131 https://www.expatistan.com/cost-of-living/beirut?currency=USD
132 https://www.expatistan.com/cost-of-living/jacksonville?currency=USD
133 https://www.expatistan.com/cost-of-living/kelowna?currency=USD
134 https://www.expatistan.com/cost-of-

238 https://www.expatistan.com/cost-of-living/guayaquil?currency=USD
239 https://www.expatistan.com/cost-of-living/windhoek?currency=USD
240 https://www.expatistan.com/cost-of-living/san-salvador?currency=USD
241 https://www.expatistan.com/cost-of-living/cape-town?currency=USD
242 https://www.expatistan.com/cost-of-living/natal?currency=USD
243 https://www.expatistan.com/cost-of-living/saint-petersburg?currency=USD
244 https://www.expatistan.com/cost-of-living/budapest?currency=USD
245 https://www.expatistan.com/cost-of-living/dar-es-salaam?currency=USD
246 https://www.expatistan.com/cost-of-living/warsaw?currency=USD
247 https://www.expatistan.com/cost-of-living/quito?currency=USD
248 https://www.expatistan.com/cost-of-living/tegucigalpa?currency=USD
249 https://www.expatistan.com/cost-of-living/lima?currency=USD
250 https://www.expatistan.com/cost-of-living/kuala-lumpur?currency=USD
251 https://www.expatistan.com/cost-of-living/bogota?currency=USD
252 https://www.expatistan.com/cost-

### Create df with scraped data
#### Data for project was scraped on Sept 28, 2017

In [71]:
# One row per scraped city: a 'City' column plus one column per item name.
cities_df = pd.DataFrame.from_dict(cities_dict)
# Put the index-page values straight into the first column; this avoids the
# append / copy / drop / re-insert round trip and its leftover temporary.
cities_df.insert(0, 'Price_Index', price_indices)
# Rebind instead of mutating in place so the cell reads as a single lineage.
cities_df = cities_df.set_index('City')
cities_df.head()

Unnamed: 0_level_0,Price_Index,0.5 l (16 oz) domestic beer in the supermarket,1 beer in neighbourhood pub (500ml or 1pt.),"1 bottle of red table wine, good quality","1 box of 32 tampons (Tampax, OB, ...)",1 box of antibiotics (12 doses),1 cocktail drink in downtown club,1 kg (2 lb.) of apples,1 kg (2 lb.) of potatoes,1 kg (2 lb.) of tomatoes,...,Monthly rent for a 45 m2 (480 Sqft) furnished studio in NORMAL area,Monthly ticket public transport,Short visit to private Doctor (15 minutes),Standard men's haircut in expat area of the city,"Taxi trip on a business day, basic tariff, 8 km. (5 miles)",Tube of toothpaste,"Utilities 1 month (heating, electricity, gas ...) for 1 person in 45 m2 (480 Sqft) studio","Utilities 1 month (heating, electricity, gas ...) for 2 people in 85m2 flat","Volkswagen Golf 1.4 TSI 150 CV (or equivalent), with no extras, new","iPad Air 2, 64GB"
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Zurich, Switzerland",255,2.21,8.0,15,9.0,30,17,3.66,2.77,4.54,...,1654,104,143,47,38,4.14,91,153,27936,628
"Geneva, Switzerland",254,2.04,7.0,20,8.0,31,20,4.24,3.1,4.1,...,1762,73,114,47,37,4.0,113,196,26483,539
"Grand Cayman, Cayman Islands",254,4.11,6.0,28,10.0,25,13,8.0,4.26,5.84,...,1322,106,149,31,28,4.51,292,353,32956,759
"Luanda, Angola",254,2.46,2.2,29,4.72,10,10,8.0,3.51,6.0,...,2267,66,71,16,19,2.8,109,226,31887,1257
"New York City, United States",245,2.88,7.0,17,7.0,27,15,3.68,1.9,4.85,...,2012,119,171,26,16,2.3,126,202,22312,386


In [72]:
# Empty strings mark values that were not scraped; store them as NaN instead.
cities_df = cities_df.replace(to_replace='', value=np.nan)

Save df into a pickle file.

In [None]:
# Use a context manager so the file handle is flushed and closed as soon as
# the frame has been written, instead of being left open.
with open("cities_df.p", "wb") as pickle_file:
    pickle.dump(cities_df, pickle_file)