# Scraping notebook

In [93]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
from time import sleep

In [74]:
def grab_country_codes(timeout=30):
    """Fetch the country codes listed by the access-to-markets country endpoint.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        list: The ``"code"`` value of every entry in the JSON response, in
        response order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    country_url = "https://trade.ec.europa.eu/access-to-markets/country/get/ALL?lang=en"
    response = requests.get(country_url, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error body.
    response.raise_for_status()
    return [country["code"] for country in response.json()]
    
    
    

In [75]:
ccodes = grab_country_codes()

In [None]:
# Example tariff endpoint. The three path segments after /get/ are in the same
# order as get_url's arguments: product, origin_country, destination_country.
url = "https://trade.ec.europa.eu/access-to-markets/api/tariffs/get/8507908090/KR/SE?lang=EN"

In [49]:
def get_url(product, origin_country, destination_country):
    """Build the tariff-lookup API URL for one product and trade route.

    Args:
        product: Product code placed in the first path segment.
        origin_country: Country code placed in the second path segment.
        destination_country: Country code placed in the third path segment.

    Returns:
        str: The full endpoint URL, always requesting ``lang=EN``.
    """
    base = "https://trade.ec.europa.eu/access-to-markets/api/tariffs/get"
    return f"{base}/{product}/{origin_country}/{destination_country}?lang=EN"

In [88]:
get_url("8507908090","KR","SE")

'https://trade.ec.europa.eu/access-to-markets/api/tariffs/get/8507908090/KR/SE?lang=EN'

In [86]:
def get_page(url, timeout=30):
    """Download ``url`` and return its decoded JSON body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON payload (whatever ``Response.json()`` yields).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than handing an error body to callers.
    response.raise_for_status()
    return response.json()
    

In [89]:
# Fetch the raw response for the example route (the same URL that
# get_url("8507908090","KR","SE") builds) to explore the payload by hand.
result_test = requests.get("https://trade.ec.europa.eu/access-to-markets/api/tariffs/get/8507908090/KR/SE?lang=EN")

In [90]:
# Peek at the tariff formula of the first measure in the first response entry.
result_test.json()[0]["measures"][0]["tariffFormula"]

'2.70%'

In [91]:
json_obj = get_page(get_url("8507908090","KR","SE"))

In [62]:
def grab_info(json_obj):
    """Pull two tariff formulas out of a decoded tariff response.

    Reads ``json_obj[0]["measures"]`` and returns the ``"tariffFormula"`` of
    the measures at positions 0 and 2.

    NOTE(review): position 0 is treated as the third-country tariff and
    position 2 as the preferential tariff; this presumes the API always
    returns measures in that order -- TODO confirm.

    Args:
        json_obj: Decoded JSON payload of a tariff lookup.

    Returns:
        tuple: ``(tariff_third, tariff_pref)``.
    """
    measures = json_obj[0]["measures"]
    # Positions are looked up in order, so a missing entry fails exactly where
    # a direct lookup of each position would.
    return tuple(measures[position]["tariffFormula"] for position in (0, 2))

In [63]:
grab_info(json_obj)

('2.70%', '0%')

In [103]:
def get_info_from_country(ccodes, product="8507908090", destination_country="SE"):
    """Collect tariff formulas for one product from many origin countries.

    For every code in ``ccodes`` the tariff page for ``product`` imported into
    ``destination_country`` is fetched with ``get_page`` and reduced with
    ``grab_info``. Origins whose lookup fails (request error, unexpected
    payload shape, ...) are skipped so that one bad country does not abort the
    whole scrape; the skipped codes are reported instead of silently dropped.

    Args:
        ccodes: Iterable of origin country codes.
        product: Product code to look up. Defaults to the code this notebook
            was written around.
        destination_country: Importing country code. Defaults to ``"SE"``.

    Returns:
        pandas.DataFrame: Columns ``origin_country``, ``tariff_third`` and
        ``tariff_pref``, one row per origin whose lookup succeeded.
    """
    columns = ["origin_country", "tariff_third", "tariff_pref"]
    rows = []
    skipped = []

    for ccode in tqdm(ccodes):
        try:
            json_obj = get_page(get_url(product, ccode, destination_country))
            tariff_third, tariff_pref = grab_info(json_obj)
        except Exception:
            # Deliberately best-effort, but catch Exception rather than using a
            # bare ``except`` so Ctrl-C (KeyboardInterrupt) and SystemExit
            # still stop a long-running scrape.
            skipped.append(ccode)
            continue
        rows.append((ccode, tariff_third, tariff_pref))

    if skipped:
        print("Skipped {} origin(s) without usable tariff data: {}".format(
            len(skipped), ", ".join(str(code) for code in skipped)))

    df = pd.DataFrame(rows, columns=columns)
    print(df)
    return df
        
        

In [104]:
result_df = get_info_from_country(ccodes)

100%|██████████| 233/233 [00:44<00:00,  5.19it/s]

   origin_country tariff_third tariff_pref
0              AF        2.70%          0%
1              AL        2.70%          0%
2              DZ        2.70%          0%
3              AD        2.70%          0%
4              AO        2.70%          0%
..            ...          ...         ...
77             HK        2.70%          0%
78             IS        2.70%          0%
79             IN        2.70%          0%
80             ID        2.70%          0%
81             IR        2.70%          0%

[82 rows x 3 columns]





In [105]:
result_df

Unnamed: 0,origin_country,tariff_third,tariff_pref
0,AF,2.70%,0%
1,AL,2.70%,0%
2,DZ,2.70%,0%
3,AD,2.70%,0%
4,AO,2.70%,0%
...,...,...,...
77,HK,2.70%,0%
78,IS,2.70%,0%
79,IN,2.70%,0%
80,ID,2.70%,0%


# Get products

In [106]:
# Nomenclature endpoint queried without a `parent` parameter; its response is
# what the following cells treat as the list of sections.
section_url = "https://trade.ec.europa.eu/access-to-markets/api/v2/nomenclature/products?country=SE&lang=EN"

In [107]:
# Download and decode the section list (a JSON array of entries with an "id" field,
# as consumed by get_ids).
sections = requests.get(section_url).json()

In [110]:
def get_ids(sections):
    """Return the ``"id"`` of every entry in ``sections``, in input order.

    Args:
        sections: Iterable of mappings that each carry an ``"id"`` key.

    Returns:
        list: The collected ids.
    """
    return [entry["id"] for entry in sections]
    

In [111]:
ids = get_ids(sections)

In [112]:
ids

[-1,
 -2,
 -3,
 -4,
 -5,
 -6,
 -7,
 -8,
 -9,
 -10,
 -11,
 -12,
 -13,
 -14,
 -15,
 -16,
 -17,
 -18,
 -19,
 -20,
 -21]

In [117]:
class ProductTraverse:
    """Depth-first walker over the product nomenclature tree.

    Starting from a node id, the children of each node are requested from the
    nomenclature endpoint. Entries whose ``hasChildren`` is false contribute
    their ``code``; every other entry is descended into.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, start_id, timeout=30):
        """Remember the root node and prepare an empty result list.

        Args:
            start_id: Id of the node at which ``traverse()`` starts by default.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.start_id = start_id
        self.timeout = timeout
        # Leaf codes found so far. The list is never cleared, so repeated
        # traverse() calls on one instance keep appending to it.
        self.product_codes = []

    def traverse(self, start_id=None):
        """Collect the codes of all leaf products below ``start_id``.

        Args:
            start_id: Node to expand; falls back to ``self.start_id`` when
                omitted.

        Returns:
            list: ``self.product_codes`` (the same list object that is stored
            on the instance), extended with the leaf codes found.

        Raises:
            requests.HTTPError: If any request returns a 4xx/5xx status.
            requests.Timeout: If a request exceeds ``self.timeout`` seconds.
        """
        if start_id is None:
            start_id = self.start_id

        url = "https://trade.ec.europa.eu/access-to-markets/api/v2/nomenclature/products?country=SE&lang=EN&parent={}".format(start_id)
        response = requests.get(url, timeout=self.timeout)
        # Stop on an error status instead of iterating over an error body.
        response.raise_for_status()

        for node in response.json():
            # Equality test kept on purpose: only an explicit false value marks
            # a leaf; anything else (including a null) is descended into.
            if node["hasChildren"] == False:
                self.product_codes.append(node["code"])
            else:
                self.traverse(start_id=node["id"])

        return self.product_codes
    
        

In [118]:
# Walk the nomenclature tree from node "301" and gather the codes of every
# entry below it that has no children.
pt = ProductTraverse("301")
result = pt.traverse()

In [119]:
result

['010121',
 '01012910',
 '01012990',
 '010130',
 '010190',
 '01022110',
 '01022130',
 '01022190',
 '01022905',
 '0102291010',
 '0102291020',
 '0102291030',
 '0102291040',
 '0102291050',
 '0102291090',
 '01022921',
 '0102292910',
 '0102292920',
 '0102292930',
 '0102292940',
 '0102292950',
 '0102292990',
 '01022941',
 '0102294910',
 '0102294920',
 '0102294930',
 '0102294940',
 '0102294950',
 '0102294990',
 '0102295110',
 '0102295190',
 '0102295911',
 '0102295919',
 '0102295921',
 '0102295929',
 '0102295931',
 '0102295939',
 '0102295991',
 '0102295999',
 '01022961',
 '0102296910',
 '0102296920',
 '0102296930',
 '0102296990',
 '0102299110',
 '0102299190',
 '0102299921',
 '0102299929',
 '0102299991',
 '0102299999',
 '010231',
 '0102391010',
 '0102391090',
 '01023990',
 '01029020',
 '0102909110',
 '0102909190',
 '01029099',
 '010310',
 '01039110',
 '01039190',
 '01039211',
 '01039219',
 '01039290',
 '01041010',
 '01041030',
 '01041080',
 '01042010',
 '01042090',
 '01051111',
 '01051119',
 '0