In [1]:
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
import pandas as pd
import numpy as np
import pickle

In [2]:
# Record the library versions this notebook was executed with.
for label, module in (("Pandas version:", pd), ("Numpy version:", np)):
    print(label, module.__version__)

Pandas version: 1.2.4
Numpy version: 1.20.1


1. Get links, **`links=get_links()`**
2. Create dictionary  
Begin Loop  
3. Create soup object, **`soup=create_soup_object(url)`**
4. Find city and put in dictionary, **`find_city_put_in_dict(input_dict)`** 
5. Get items and put in list **`items=get_items()`** 
6. Get prices and put in list **`prices=get_prices()`** 
7. Put items and prices in dictionary  **`items_prices_to_dict(cities_dict)`** 
8. Get price index
9. Put price index in dictionary
10. Make data frame **`cities_df = pd.DataFrame.from_dict(cities_dict)`** 

In [3]:
def get_links():
    '''
    Collects the link target of every city listed in the parsed page.

    Reads the module-level `soup` object, which must already be bound
    (the notebook builds it from the cost-of-living index page).

    output: list with the href of the first <a> inside each
            <td class="city-name"> cell, in document order
    '''
    city_cells = soup.find_all('td', class_='city-name')
    return [cell.find('a')['href'] for cell in city_cells]

In [4]:
def create_soup_object(url, timeout=None):
    '''
    Downloads a page and parses it with the html5lib parser.

    input: url     - address of the page to fetch
           timeout - optional number of seconds passed to requests.get;
                     None (the default) keeps the previous behaviour of
                     waiting indefinitely
    output: soup object of url text
    raises: requests.HTTPError when the response status is not 200.
            Previously such a response left `page` unassigned and the
            function failed with an unrelated UnboundLocalError.
    '''
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Fail loudly with the offending URL and status instead of parsing nothing.
        raise requests.HTTPError(
            f'GET {url} returned status {response.status_code}',
            response=response,
        )
    return BeautifulSoup(response.text, 'html5lib')

In [5]:
def find_city_put_in_dict(input_dict):
    '''
    Appends the city name of the currently parsed page to input_dict['City'].

    The name is the text of the first <span class="city-2"> found in the
    module-level `soup` object.

    input: input_dict - mapping whose 'City' entry supports .append
                        (the notebook passes a defaultdict(list))
    output: None; input_dict is modified in place
    '''
    city_span = soup.find('span', class_='city-2')
    input_dict['City'].append(city_span.text)

In [6]:
def get_items():
    '''
    Lists the item names shown on the currently parsed page.

    Reads the module-level `soup` object and takes the text of the first
    <a> inside every <td class="item-name"> cell.

    output: list of item names, in document order
    '''
    name_cells = soup.find_all('td', class_='item-name')
    return [cell.find('a').text for cell in name_cells]

In [7]:
def get_price_indices():
    '''
    soup object must be initialized on https://www.expatistan.com/cost-of-living/index page

    Reads the module-level `soup` object and returns the text of every
    <td class="price-index"> cell.

    output: list of price-index strings, in document order
    '''
    index_cells = soup.find_all('td', class_='price-index')
    return [cell.text for cell in index_cells]

In [8]:
def get_prices():
    '''
    Extracts one price string per <td class="price city-1"> cell of the
    module-level `soup` object.

    Each cell's stripped text is searched with the price pattern and capture
    group 1 is kept: the run of digits, commas and dots that follows an
    optional "(" and an optional "$". Group 1 may match zero characters, so
    the first alternative always succeeds at the start of the text and a cell
    that does not begin with a price (for example "-") contributes ''.

    output: list of price strings; when exactly 104 values were collected
            only the odd-indexed ones (1, 3, 5, ...) are returned
    '''
    price_regex = re.compile(r'\(?\$?([0-9,.]*)|([-])')
    price_cells = soup.find_all('td', class_='price city-1')
    matches = (price_regex.search(cell.text.strip()) for cell in price_cells)
    prices = [match.group(1) for match in matches if match]

    # NOTE(review): 104 presumably means every item was listed with two
    # prices and the second of each pair is the wanted one - confirm against the page.
    return prices[1::2] if len(prices) == 104 else prices

In [9]:
def items_prices_to_dict(input_dict):
    '''
    Appends each scraped price to the list stored under its item name.

    Reads the module-level `items` and `prices` sequences and pairs them
    positionally; pairing stops at the end of the shorter one.

    input: input_dict - mapping from item name to a list
                        (the notebook passes a defaultdict(list))
    output: None; input_dict is modified in place
    '''
    paired = zip(items, prices)
    for name, cost in paired:
        input_dict[name].append(cost)

In [10]:
# Manual fetch of the cost-of-living index page. The bare expression on the
# last line makes the cell display the HTTP status code of the response.
url = 'https://www.expatistan.com/cost-of-living/index'
response = requests.get(url)
response.status_code

200

In [11]:
# Keep the response body as text for parsing.
page = response.text

In [12]:
# Parse the index page with the html5lib parser and bind it to the
# module-level name `soup`, which the helper functions above read.
# NOTE(review): the scraping cell below rebinds `soup` through
# create_soup_object(url) for the same URL, so this manual parse looks exploratory.
soup = BeautifulSoup(page,"html5lib")

In [13]:
# Column store for the scrape: 'City' plus one list per item name.
cities_dict = defaultdict(list)

# The index page supplies both the per-city links and the price indices.
url = 'https://www.expatistan.com/cost-of-living/index'
soup = create_soup_object(url)
links = get_links()
price_indices = get_price_indices()

# Visit every city page with prices shown in USD. The helpers read the
# module-level `soup`, `items` and `prices`, so those names are rebound on
# each iteration before the helpers are called.
for i, link in enumerate(links):
    url = link + '?currency=USD'
    print(i, url)
    soup = create_soup_object(url)
    find_city_put_in_dict(cities_dict)
    items, prices = get_items(), get_prices()
    items_prices_to_dict(cities_dict)

0 https://www.expatistan.com/cost-of-living/zurich?currency=USD
1 https://www.expatistan.com/cost-of-living/grand-cayman?currency=USD
2 https://www.expatistan.com/cost-of-living/san-francisco?currency=USD
3 https://www.expatistan.com/cost-of-living/new-york-city?currency=USD
4 https://www.expatistan.com/cost-of-living/london?currency=USD
5 https://www.expatistan.com/cost-of-living/geneva?currency=USD
6 https://www.expatistan.com/cost-of-living/hong-kong?currency=USD
7 https://www.expatistan.com/cost-of-living/lausanne?currency=USD
8 https://www.expatistan.com/cost-of-living/dublin?currency=USD
9 https://www.expatistan.com/cost-of-living/los-angeles?currency=USD
10 https://www.expatistan.com/cost-of-living/washington-d-c?currency=USD
11 https://www.expatistan.com/cost-of-living/reykjavik?currency=USD
12 https://www.expatistan.com/cost-of-living/sydney?currency=USD
13 https://www.expatistan.com/cost-of-living/jersey-city?currency=USD
14 https://www.expatistan.com/cost-of-living/san-jose-

121 https://www.expatistan.com/cost-of-living/santo-domingo?currency=USD
122 https://www.expatistan.com/cost-of-living/quito?currency=USD
123 https://www.expatistan.com/cost-of-living/krakow?currency=USD
124 https://www.expatistan.com/cost-of-living/sao-paulo?currency=USD
125 https://www.expatistan.com/cost-of-living/puebla?currency=USD
126 https://www.expatistan.com/cost-of-living/queretaro?currency=USD
127 https://www.expatistan.com/cost-of-living/jakarta?currency=USD
128 https://www.expatistan.com/cost-of-living/rio-de-janeiro?currency=USD
129 https://www.expatistan.com/cost-of-living/nairobi?currency=USD
130 https://www.expatistan.com/cost-of-living/bucharest?currency=USD
131 https://www.expatistan.com/cost-of-living/guadalajara?currency=USD
132 https://www.expatistan.com/cost-of-living/brasilia?currency=USD
133 https://www.expatistan.com/cost-of-living/ho-chi-minh-city?currency=USD
134 https://www.expatistan.com/cost-of-living/lima?currency=USD
135 https://www.expatistan.com/cost-

### Create df with scraped data
#### Data for project was scraped on Jan 12, 2022

In [14]:
# One row per city, one column per scraped item (plus 'City').
cities_df = pd.DataFrame.from_dict(cities_dict)

# Put the price index in front directly; this replaces the earlier
# assign -> copy -> drop -> re-insert round trip and yields the same column order.
cities_df.insert(0, 'Price_Index', price_indices)
pi = cities_df['Price_Index']  # name kept bound in case other cells refer to it

# Index rows by city name (rebinding instead of inplace=True).
cities_df = cities_df.set_index(['City'])
cities_df.head()

Unnamed: 0_level_0,Price_Index,Basic lunchtime menu (including a drink) in the business district,Combo meal in fast food restaurant (big mac meal or similar),500 gr (1 lb.) of boneless chicken breast,1 liter (1 qt.) of whole fat milk,"12 eggs, large",1 kg (2 lb.) of tomatoes,500 gr (16 oz.) of local cheese,1 kg (2 lb.) of apples,1 kg (2 lb.) of potatoes,...,2 tickets to the movies,2 tickets to the theater (best available seats),"Dinner for two at an italian restaurant in the expat area including appetisers, main course, wine and dessert",1 cocktail drink in downtown club,Cappuccino in expat area of the city,1 beer in neighbourhood pub (500ml or 1pt.),Ipad wi-fi 128gb,1 min. of prepaid mobile tariff (no discounts or plans),1 month of gym membership in business district,1 package of marlboro cigarettes
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Zurich, Switzerland",255,29,15,12.0,1.98,8.0,3.88,10,3.8,2.58,...,36,283,105,18,5.63,8,536,0.44,131,9
"Grand Cayman, Cayman Islands",253,19,10,6.0,3.21,5.63,8.0,8,7.0,3.67,...,33,56,106,13,6.0,7,635,0.41,79,12
"San Francisco, California, United States",239,20,10,7.0,1.23,4.46,7.0,7,6.0,1.63,...,30,263,94,14,4.79,7,426,0.14,83,12
"New York City, United States",238,19,9,6.0,1.11,3.63,4.62,7,3.73,1.59,...,31,397,113,17,5.12,7,423,0.48,74,14
"London, United Kingdom",238,17,8,5.18,1.31,3.76,2.69,7,3.07,1.12,...,33,293,101,15,4.53,8,683,0.47,66,17


In [15]:
# Blank strings (scraped cells that held no price) become NaN so the
# columns can be converted to float.
cities_df = cities_df.replace('', np.nan)

# For every column still stored as strings: remove the commas
# (presumably thousands separators, e.g. '1,234' -> '1234') and cast to float.
obj_cols = cities_df.select_dtypes(include=['object']).columns
for col in obj_cols:
    # regex=False treats ',' as a literal regardless of the pandas default for `regex`.
    cities_df[col] = cities_df[col].str.replace(',', '', regex=False).astype(float)

# Save df as csv

In [12]:
# Write the cleaned frame (indexed by City) to a CSV file in the current
# working directory.
# NOTE(review): the file name is spelled "indicies"; it is left unchanged
# because other code may read this exact path.
cities_df.to_csv('price_indicies.csv')