### How it works:
* import required libraries;
* get data from Forbes by requesting the list of the top 2000 companies;
* filter the data fields for each company;
* create a data frame;
* save it as a Parquet file

In [1]:
import pandas as pd
import json
import requests
import os

In [2]:
# Endpoint that serves the Forbes Global 2000 list (2020 edition)
URL = "https://www.forbes.com/forbesapi/org/global2000/2020/position/true.json"

# Query-string parameters sent with the request
params = {"limit": 2000}

In [3]:
# Get data from Forbes website.
# The timeout (seconds) keeps the cell from hanging forever if the server
# never answers; requests applies no timeout unless one is given.
response = requests.get(URL, params=params, timeout=30)

# Stop here on an HTTP error (4xx/5xx) instead of failing later while
# parsing an error page as the expected JSON payload.
response.raise_for_status()
print(response.status_code)

200


In [4]:
# Accumulates one filtered dictionary per organization
list_of_organization_data = []

# Decode the response body and drill down to the list of organizations
original_json = response.json()
organizations_json = original_json["organizationList"]["organizationsLists"]

# Output column -> source key, for fields copied as-is
text_fields = {
    "Company": "organizationName",
    "Country": "country",
    "Industry": "industry",
}

# Output column -> source key, for numeric fields that are multiplied by 1000
scaled_fields = {
    "Sales": "revenue",
    "Profits": "profits",
    "Assets": "assets",
    "Market Value": "marketValue",
}

# Keep only the selected fields of each organization
for organization in organizations_json:
    filtered_organization = {
        column: organization[key] for column, key in text_fields.items()
    }
    for column, key in scaled_fields.items():
        filtered_organization[column] = organization[key] * 1000

    # Add filtered data to list
    list_of_organization_data.append(filtered_organization)


In [5]:
# Create DataFrame from the list of organization dictionaries
df = pd.DataFrame(list_of_organization_data)

# Clean: drop columns, then rows, that contain only missing values.
# dropna() returns a new DataFrame, so the result must be assigned back;
# otherwise `df` (and the file saved from it later) stays uncleaned.
df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')

# Display the cleaned frame as the cell output
df

Unnamed: 0,Company,Country,Industry,Sales,Profits,Assets,Market Value
0,ICBC,China,Banking,177230000.0,45283600.0,4.322528e+09,242283000.0
1,China Construction Bank,China,Banking,162147000.0,38914800.0,3.822048e+09,203818000.0
2,JPMorgan Chase,United States,Diversified Financials,142927000.0,29954000.0,3.139431e+09,291737000.0
3,Berkshire Hathaway,United States,Diversified Financials,254616000.0,81417000.0,8.177290e+08,455444000.0
4,Agricultural Bank of China,China,Banking,148692000.0,30911600.0,3.697451e+09,147174000.0
...,...,...,...,...,...,...,...
1995,Founder Securities,China,Diversified Financials,1391000.0,114700.0,1.862300e+07,8300000.0
1996,Merlin Properties SOCIMI S.A,Spain,Diversified Financials,594000.0,630900.0,1.493600e+07,4306000.0
1997,Heico,United States,Aerospace & Defense,2096000.0,370500.0,3.097000e+06,11796000.0
1998,W.P. Carey,United States,Diversified Financials,1246000.0,305200.0,1.406100e+07,11341000.0


In [6]:
# File save settings
save_dir = "../data"
file_name = "companies_rank.parquet"

# Create the target directory if it does not exist yet.
# exist_ok=True makes the call a no-op when the directory is already there,
# which avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

# Save the DataFrame to a parquet file inside the target directory
df.to_parquet(os.path.join(save_dir, file_name))