In [1]:
import pandas as pd
import numpy as np

import unidecode
from models.companies_refactor import COMPANIES

In [2]:
# Header layouts of the yearly Forbes Global 2000 files. Each list gives the
# normalised column names in the order they are assigned to that period's CSV.
columns_2008 = [
    'company', 'industry', 'country',
    'market_value', 'profits', 'assets', 'sales',
    'rank',
]
# The 2009-2010 layout is the 2008 layout plus the two ratio columns.
columns_til_2011 = columns_2008 + ['profits_%_assets', 'profits_%_sales']
# The 2011-2013 layout additionally carries the Forbes profile link before the ratios.
columns_til_2014 = columns_2008 + ['forbes_webpage', 'profits_%_assets', 'profits_%_sales']
columns_2014 = [
    'company', 'sector', 'industry', 'continent', 'country',
    'market_value', 'sales', 'profits', 'assets',
    'rank', 'forbes_webpage',
    'profits_%_assets', 'profits_%_sales',
]
columns_til_2020 = [
    'company',
    'market_value', 'sales', 'profits', 'assets',
    'rank', 'sector', 'industry', 'continent', 'country',
    'headquarters', 'state', 'ceo', 'forbes_webpage',
    'profits_%_assets', 'profits_%_sales',
]
columns_2020 = [
    'rank', 'company', 'country',
    'sales', 'profits', 'assets', 'market_value',
    'sector', 'industry',
]

In [3]:
def load_forbes_df(year: int, columns: list, sep: str = ',', data_dir: str = '../data'):
    """Read one yearly Forbes Global 2000 CSV and give it normalised column names.

    Args:
        year: Edition to load; selects the file ``Forbes Global 2000 - {year}.csv``
            and is also stored in a new ``year`` column.
        columns: Column names to assign, in file order. Must have exactly as many
            entries as the file has columns.
        sep: Field delimiter of the CSV file.
        data_dir: Directory that contains the yearly CSV files.

    Returns:
        A DataFrame with ``columns`` as header plus a constant ``year`` column.
        Rows whose field count does not match the header are skipped.

    Raises:
        ValueError: If the number of columns in the file differs from ``len(columns)``.
    """
    path = f'{data_dir}/Forbes Global 2000 - {year}.csv'
    try:
        # pandas >= 1.3 spelling; 'warn' skips malformed rows but still reports them,
        # which is what error_bad_lines=False did with its default warning setting.
        df = pd.read_csv(path, sep=sep, on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know on_bad_lines; fall back to the legacy flag
        # (removed in pandas 2.0, so it cannot be used unconditionally).
        df = pd.read_csv(path, sep=sep, error_bad_lines=False)

    if len(df.columns) != len(columns):
        # Same exception type pandas would raise on assignment, with a message
        # that names the offending file.
        raise ValueError(
            f'{path}: expected {len(columns)} columns, found {len(df.columns)}'
        )

    df.columns = columns
    df['year'] = year
    return df


def load_all_forbes_df():
    """Load the 2008-2020 yearly files and stack them into one DataFrame.

    Each year is read with the column layout that applies to its period;
    only the 2020 file is read with ';' as delimiter. The per-year frames are
    concatenated row-wise in chronological order, keeping their own indexes.
    """
    # Year -> header layout. Any year not listed uses the 2008 layout.
    schema_by_year = {2020: columns_2020, 2014: columns_2014}
    schema_by_year.update(dict.fromkeys(range(2015, 2020), columns_til_2020))
    schema_by_year.update(dict.fromkeys(range(2011, 2014), columns_til_2014))
    schema_by_year.update(dict.fromkeys(range(2009, 2011), columns_til_2011))

    yearly_frames = []
    for year in range(2008, 2021):
        # The delimiter is only passed explicitly for 2020; other years rely on
        # the loader's default.
        reader_options = {'sep': ';'} if year == 2020 else {}
        yearly_frames.append(
            load_forbes_df(
                year=year,
                columns=schema_by_year.get(year, columns_2008),
                **reader_options
            )
        )

    return pd.concat(yearly_frames, axis=0)

In [4]:
# Build the combined frame for all years; rows with an unexpected field count
# are skipped by the loader.
df = load_all_forbes_df()

b'Skipping line 74: expected 16 fields, saw 17\nSkipping line 108: expected 16 fields, saw 17\nSkipping line 1896: expected 16 fields, saw 17\nSkipping line 1901: expected 16 fields, saw 17\n'


In [5]:
# Non-null count per column of the combined frame.
df.count()

company             25998
industry            25512
country             25998
market_value        25993
profits             25989
assets              25983
sales               25987
rank                25998
year                25998
profits_%_assets    21980
profits_%_sales     21975
forbes_webpage      17998
sector              13491
continent           11991
headquarters         9982
state                2841
ceo                  9968
dtype: int64

In [6]:
# Inspect the raw market_value entries: the column mixes plain numbers with
# '$… M'-style strings, hence the object dtype.
df.market_value.unique()

array([180.81, 330.93, 176.53, ..., '$68 M', '$171 M', '$729 M'],
      dtype=object)

In [7]:
def forbes_value_to_float(some_value):
    """Convert a Forbes money value to a float in the unit of the 'B' suffix.

    Accepted inputs are plain numbers (returned as ``float`` unchanged, NaN
    included) and strings such as ``'$1,234.5 B'`` or ``'$68 M'``. Values
    with a ``B`` suffix are returned as-is; values with an ``M`` suffix are
    divided by 1000 so both share one scale (presumably billions of dollars).

    Args:
        some_value: Number or string, optionally with ``$``, thousands
            separators, spaces, a sign and a ``B``/``M`` suffix.

    Returns:
        The numeric value as ``float``.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    # Strip everything that is pure formatting before looking at the suffix.
    text = (
        str(some_value)
        .replace('$', '')
        .replace(',', '')
        .replace(' ', '')
    )

    if 'B' in text:
        return float(text.replace('B', ''))

    if 'M' in text:
        # Scale arithmetically. Prefixing the digits with "0." is only right for
        # three-digit integers ('68' would become 0.68 instead of 0.068) and
        # breaks on decimals ('1.5' would become 0.15).
        return float(text.replace('M', '')) / 1000

    return float(text)

In [8]:
# Normalise market_value element-wise to plain floats.
df['market_value'] = df['market_value'].map(forbes_value_to_float)

In [9]:
# Normalise profits element-wise to plain floats.
df['profits'] = df['profits'].map(forbes_value_to_float)

In [10]:
# Normalise assets element-wise to plain floats.
df['assets'] = df['assets'].map(forbes_value_to_float)

In [11]:
# Normalise sales element-wise to plain floats.
df['sales'] = df['sales'].map(forbes_value_to_float)

In [12]:
# Entries equal to the strings '∞' or '-∞' are replaced by NaN; every other
# value is kept as-is (the column is cast to float in a later cell).
df['profits_%_sales'] = df['profits_%_sales'].map(lambda x: np.nan if x in ['∞','-∞']  else x)

In [13]:
# Target dtype of every column, applied one column at a time in this order.
# 'string' and 'Int64' are the pandas nullable dtypes, so missing entries stay
# missing instead of forcing object/float columns.
column_dtypes = {
    'company': 'string',
    'industry': 'string',
    'country': 'string',
    'market_value': 'float',
    'profits': 'float',
    'assets': 'float',
    'sales': 'float',
    'rank': 'Int64',
    'year': 'Int64',
    'profits_%_assets': 'float',
    'profits_%_sales': 'float',
    'forbes_webpage': 'string',
    'sector': 'string',
    'continent': 'string',
    'headquarters': 'string',
    'state': 'string',
    'ceo': 'string',
}
for column_name, dtype_name in column_dtypes.items():
    df[column_name] = df[column_name].astype(dtype_name)

df.dtypes

company              string
industry             string
country              string
market_value        float64
profits             float64
assets              float64
sales               float64
rank                  Int64
year                  Int64
profits_%_assets    float64
profits_%_sales     float64
forbes_webpage       string
sector               string
continent            string
headquarters         string
state                string
ceo                  string
dtype: object

In [14]:
def fill_empty_values_from_company_tuples(col: str, col_type: str, frame=None):
    """Fill missing values of ``col`` with a value known for the same company.

    For every company the last non-missing value of ``col`` (in row order) is
    remembered. Rows where ``col`` is missing receive that value; rows that
    already have a value keep it. Companies with no known value stay missing.
    The column is modified in place and cast to ``col_type``.

    Args:
        col: Name of the column to complete.
        col_type: dtype the column is cast to afterwards (e.g. ``'string'``).
        frame: DataFrame to modify. Defaults to the notebook-level ``df``.

    Returns:
        None. The target frame is updated in place.
    """
    target = df if frame is None else frame

    # Later rows overwrite earlier ones, so the most recent known value wins.
    known = target[['company', col]].dropna(subset=[col])
    key_map = dict(zip(known['company'], known[col]))

    # Work on positional arrays: the combined frame can carry repeated index
    # labels, so label-based alignment is avoided on purpose.
    is_missing = target[col].isna().to_numpy()
    candidates = (
        target['company']
        .map(lambda name: key_map.get(name, pd.NA))
        .to_numpy(dtype=object)
    )
    values = target[col].to_numpy(dtype=object).copy()
    # Only the gaps are filled; existing entries are left untouched.
    values[is_missing] = candidates[is_missing]

    target[col] = values
    target[col] = target[col].astype(col_type)

In [15]:
# Propagate the classification columns across each company's rows.
for column in ('sector', 'industry'):
    fill_empty_values_from_company_tuples(col=column, col_type='string')

In [16]:
# Propagate the CEO and Forbes profile link across each company's rows.
for column in ('ceo', 'forbes_webpage'):
    fill_empty_values_from_company_tuples(col=column, col_type='string')

In [17]:
# Propagate the location columns across each company's rows.
for column in ('continent', 'state', 'headquarters'):
    fill_empty_values_from_company_tuples(col=column, col_type='string')

In [18]:
# Derive the continent from the country: remember the last continent seen for
# each country, then assign it to every row of that country. Countries without
# any known continent end up as <NA>.
continent_map = {}
for country, continent in zip(df['country'], df['continent']):
    if pd.isna(continent):
        continue
    continent_map[country] = continent

df['continent'] = (
    df['country']
    .map(lambda x: continent_map.get(x, pd.NA))
    .astype('string')
)


In [19]:
# Recompute both ratios from the cleaned numeric columns so that years whose
# source file had no ratio columns get a value too.
# A zero denominator produces +/-inf; store NaN instead, in line with the
# earlier cell that turns the '∞' / '-∞' markers into NaN.
# NOTE(review): despite the '%' in the column names these are plain ratios
# (not multiplied by 100) — confirm this is the intended scale.
df['profits_%_assets'] = (
    (df['profits'] / df['assets'])
    .round(6)
    .replace([np.inf, -np.inf], np.nan)
)
df['profits_%_sales'] = (
    (df['profits'] / df['sales'])
    .round(6)
    .replace([np.inf, -np.inf], np.nan)
)


In [20]:
# Show the column dtypes again after the backfill and ratio steps.
df.dtypes

company              string
industry             string
country              string
market_value        float64
profits             float64
assets              float64
sales               float64
rank                  Int64
year                  Int64
profits_%_assets    float64
profits_%_sales     float64
forbes_webpage       string
sector               string
continent            string
headquarters         string
state                string
ceo                  string
dtype: object

In [21]:
# Non-null counts after backfilling, for comparison with the counts taken
# right after loading.
df.count()

company             25998
industry            25665
country             25998
market_value        25993
profits             25989
assets              25983
sales               25987
rank                25998
year                25998
profits_%_assets    25974
profits_%_sales     25975
forbes_webpage      24928
sector              22648
continent           25978
headquarters        22174
state                6517
ceo                 22162
dtype: int64

In [22]:
# Apply the alias table: every company name that has a truthy entry in
# COMPANIES is rewritten to that entry. Names are processed in sorted order
# and each rewrite is applied immediately, so a later name sees earlier rewrites.
for listed_name in sorted(df['company'].unique()):
    lookup_key = str(listed_name)
    canonical_name = COMPANIES.get(lookup_key, None)
    if not canonical_name:
        continue
    df.loc[df['company'] == lookup_key, 'company'] = canonical_name

In [23]:
# Normalise company names: lower-case, trim surrounding whitespace, then
# transliterate to ASCII with unidecode. Names are handled one distinct value
# at a time, in sorted order.
for raw_name in sorted(df['company'].unique()):
    cleaned = str(raw_name).lower().strip()
    ascii_name = unidecode.unidecode(cleaned)
    same_company = df['company'] == raw_name
    df.loc[same_company, 'company'] = ascii_name

In [24]:
# Flag rows whose rank is 500 or better. A missing rank counts as "not top 500",
# and the result is stored as a plain bool column.
df['is_top_500'] = (df['rank'] <= 500).fillna(False).astype(bool)

In [25]:
# Write the combined, cleaned frame to disk with a header row and without the index.
df.to_csv('../data/forbes_2000_all_in_one_original.csv', index=False, header=True)