# In [1]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup as bs
import os
import time
import sys
import csv


# Getting country names and codes from the website
def get_countrycode():
    """Download the UN Comtrade reporter list and cache it locally.

    Fetches ``reporterAreas.json`` and writes two files into the current
    working directory:

    * ``reporter.csv`` -- one ``id,name`` row per reporter, written with the
      ``csv`` module so that a name containing a comma stays in one field;
    * ``reporter_code.csv`` -- one reporter id per line.

    The module-level global ``country_code`` is set to the list of reporter
    ids with the aggregate ``'all'`` entry left out; ``auto_downloader`` reads
    that global. Purely numeric ids are stored as ``int``.

    Returns:
        list: the same list that is stored in ``country_code``.

    Raises:
        requests.exceptions.RequestException: the request failed, timed out,
            or returned an HTTP error status.
        ValueError: the response body is not valid JSON.
        KeyError: the JSON has no ``'results'`` list, or an entry lacks
            ``'id'`` / ``'text'``.
    """
    global country_code

    res = requests.get(
        'https://comtrade.un.org/Data/cache/reporterAreas.json',
        timeout=60,
    )
    res.raise_for_status()

    # Decode the raw bytes ourselves instead of routing JSON through an HTML
    # parser. 'utf-8-sig' reads plain UTF-8 and also drops a leading
    # byte-order mark if one is present, which json.loads would reject.
    data = json.loads(res.content.decode('utf-8-sig'))
    results = data['results']

    # Country codes together with their names.
    with open('reporter.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for entry in results:
            writer.writerow([entry['id'], entry['text']])

    # Country codes only, one per line.
    with open('reporter_code.csv', 'w', encoding='utf-8') as f:
        f.writelines('{}\n'.format(entry['id']) for entry in results)

    # Build the code list in memory rather than re-reading the file and
    # relying on its first line ('all') being interpreted as a column header.
    codes = []
    for entry in results:
        code = str(entry['id'])
        if code == 'all':
            continue
        codes.append(int(code) if code.isdigit() else code)

    country_code = codes
    return country_code


# Auto downloader with API request form
def auto_downloader(years):
    """Download UN Comtrade annual trade data and merge it per year.

    For every year in ``years`` and every reporter code published by
    ``get_countrycode()``, one CSV is requested from the Comtrade API and
    saved as ``data/<year>/<year>_<code>.csv``. Once a year is finished, the
    CSV files in that folder are concatenated into
    ``data/total/<year>_total.csv``.

    Args:
        years: iterable of years; each value is formatted into the request
            URL and into the file names with ``str()`` / ``format()``.
    """
    print('################################')
    print('Auto Downloader Initiated')
    print('################################')
    print()

    # get_countrycode() publishes its result through the module-level global
    # `country_code`, so call it first and then read the global.
    get_countrycode()
    country = country_code

    for year in years:
        print('Now Start to Download {} Data'.format(year))

        fl = 'data/' + str(year) + '/'  # Folder holding this year's files.
        os.makedirs(fl, exist_ok=True)

        for item in country:
            file = str(year) + '_' + str(item) + ".csv"
            _download_report(year, item, fl + file, file)

            sys.stdout.flush()
            time.sleep(1)  # Short pause between consecutive requests.

        # Merge the downloaded data into a '<year>_total.csv' file.
        _merge_year_files(fl, 'data/total/', year)

    print('All Requests Done! Good-bye')


def _download_report(year, item, file_path, file):
    """Fetch one reporter/year CSV from the Comtrade API into ``file_path``.

    Connection-level failures are retried every 5 seconds and a 409 response
    is retried after an hour, both without an upper bound. Any other non-200
    status is reported and the file is skipped, so that an error body is never
    stored as if it were CSV data.

    Returns:
        bool: True if the file was written, False if it was skipped.
    """
    url = 'https://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=HS&ps={}&r={}&p=all&rg=all&cc=3915,391510,391520,391530,391590&fmt=csv'.format(year, item)

    while True:
        try:
            res = requests.get(url)  # Request the bulk data with the API.
        except requests.exceptions.RequestException as e:
            # Despite the wording of the message, the request is retried.
            print(e)
            print('Error Occured...Break the Downloader...')
            sys.stdout.flush()
            time.sleep(5)
            continue

        if res.status_code == 409:
            # Original author's note: basic users are limited to 100
            # downloads per hour, and hitting the limit yields a 409.
            print('Usage Limit Hit!')
            print('Waiting!')
            sys.stdout.flush()
            time.sleep(3600)  # Wait for an hour.
            print('Times up! Reinitiate the Downloader!')
            # Loop again so the fresh response is status-checked as well.
            continue

        if res.status_code != 200:
            print('Skipping {}: unexpected HTTP status {}'.format(
                file, res.status_code))
            return False

        print('Downloading ' + file + ' Start!')
        with open(file_path, 'wb') as f:
            f.write(res.content)

        print('{} Making Success'.format(file))
        print('Done')
        print('--------------------------------')
        print()
        return True


def _merge_year_files(folder, total_path, year):
    """Concatenate the CSV files in ``folder`` into ``<year>_total.csv``.

    ``folder`` and ``total_path`` are expected to end with a path separator,
    because file names are appended to them directly. Files are processed in
    sorted name order so the merged output is reproducible.
    """
    os.makedirs(total_path, exist_ok=True)

    frames = []
    for name in sorted(os.listdir(folder)):
        if not name.endswith('.csv'):
            continue
        try:
            data = pd.read_csv(folder + name)
        except pd.errors.EmptyDataError:
            # A zero-byte download has no header row to parse.
            print('Skipping empty file ' + name)
            continue
        # Single-row files are left out, as in the original logic.
        # NOTE(review): presumably these are the API's "no data" replies --
        # confirm, since a genuine one-record result is dropped too.
        if len(data.index) == 1:
            continue
        frames.append(data)

    # pd.concat replaces DataFrame.append (removed in pandas 2.0) and avoids
    # re-copying the accumulated frame for every file.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame()
    df.to_csv(total_path + '{}_total.csv'.format(year))