# Beeswarm graphic data formatting

In [1]:
import feather
import pandas as pd

## Load data

In [2]:
# Read the processed company-level revenue table from its feather file.
df = feather.read_dataframe(
    'data/processed/company-revenues.feather'
)

# Report how many rows were loaded.
print(df.shape[0])

# Preview the first rows of the frame.
df.head()

2891


Unnamed: 0,Symbol,Name,Proper Name,china_rev_18,china_rev_17,china_rev_16,confidence_18,confidence_17,country_hq,curated_sector,growth_18,growth_17,faster_18,growth_18_z,growth_18_p
0,BLDY62,Rhipe Limited,Rhipe Limited,15.458523,0.095392,0.48021,0.992843,0.999522,Australia,"Heavy industry, machinery",16105.1969,-80.1353,True,28.634923,100
1,661448,Webzen Inc.,Webzen Inc.,45.966671,0.32833,0.202569,0.993882,0.999543,South Korea,"IT, software, internet",13900.1495,62.0827,True,24.69286,100
2,B03MXR,Turkiye Sise ve Cam Fabrikalari A.S.,Turkiye Sise ve Cam Fabrikalari A.S.,90.761591,0.702733,0.657526,0.996689,0.999964,Turkey,"Light manufacturing, processing",12815.5167,6.8753,True,22.753813,100
3,BF1GQB,P.C.S. Machine Group Holding Public Company Ltd,P.C.S. Machine Group Holding Public Company Ltd,22.611381,0.323594,0.091828,0.980272,0.999418,Thailand,"Heavy industry, machinery",6887.5744,252.3908,True,12.156162,100
4,VBIV,"VBI Vaccines, Inc.","VBI Vaccines, Inc.",2.590077,0.043343,0.043849,0.979546,0.994639,United States,Health services and technology,5875.7547,-1.1541,True,10.347287,100


In [3]:
# Show the dtype of every column in the loaded frame.
df.dtypes

Symbol              object
Name                object
Proper Name         object
china_rev_18       float64
china_rev_17       float64
china_rev_16       float64
confidence_18      float64
confidence_17      float64
country_hq          object
curated_sector      object
growth_18          float64
growth_17          float64
faster_18             bool
growth_18_z        float64
growth_18_p       category
dtype: object

## Sort sectors by aggregate revenue growth (inclusive of outliers) and format for D3 viz

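_The sector order is computed from all companies (outliers included); companies with growth > 100 per cent are then filtered out of the company-level beeswarm data_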

In [4]:
# Rank sectors by aggregate China revenue growth from 2017 to 2018.
# Revenues are summed over every company in `df` (no outlier filter is applied
# here), then growth is (rev_18 - rev_17) / rev_17, rounded to 3 decimals and
# expressed as a percentage. The result is a list of sector names ordered from
# highest to lowest aggregate growth.
sort_order = (
    df
    # Select the columns with a list: selecting several columns from a groupby
    # with a bare tuple is deprecated and raises in recent pandas versions.
    .groupby('curated_sector', as_index=False)[['china_rev_18', 'china_rev_17']]
    .sum()
    .assign(aggregate_growth_18=lambda x:
        round((x['china_rev_18'] - x['china_rev_17']) / x['china_rev_17'], 3) * 100
    )
    .sort_values('aggregate_growth_18', ascending=False)['curated_sector']
    .tolist()
)

sort_order

['Energy and mining',
 'IT, software, internet',
 'Health services and technology',
 'Electronic equipment, semiconductors, aerospace, defense',
 'Heavy industry, machinery',
 'Light manufacturing, processing',
 'Others',
 'Consumer and autos',
 'Retail, wholesale, logistics, shipping']

In [5]:
# Build the company-level beeswarm table:
#   * keep only companies whose growth_18 is at most 100,
#   * turn the sector into a Categorical whose category order is `sort_order`,
#   * add a constant `colour` column,
#   * rename columns to the field names used in the output file,
#   * sort by sector (in `sort_order` order), then by value descending.
df_beeswarm = (
    df.copy()
    .query('growth_18 <= 100')
    .assign(
        curated_sector=lambda x: pd.Categorical(x['curated_sector'], sort_order),
        colour='Positive',
    )
    .rename(columns={
        'Name': 'name',
        'china_rev_18': 'radius',
        'curated_sector': 'category',
        'confidence_18': 'opacity',
        'growth_18': 'value',
    })
    .drop(columns='china_rev_17')
    .sort_values(['category', 'value'], ascending=[True, False])
    .reset_index(drop=True)
)

# Report how many companies remain after the filter.
print(df_beeswarm.shape[0])

df_beeswarm.head()

2454


Unnamed: 0,Symbol,name,Proper Name,radius,china_rev_16,opacity,confidence_17,country_hq,category,value,growth_17,faster_18,growth_18_z,growth_18_p,colour
0,633258,Fauji Cement Co. Ltd.,Fauji Cement Co. Ltd.,1.500558,2.414669,0.998712,0.999187,Pakistan,Energy and mining,97.9512,-68.6067,True,0.018046,85,Positive
1,B59FPC,Novolipetsk Steel,Novolipetsk Steel,100.719979,61.976106,0.998952,0.999154,Russian Federation,Energy and mining,88.0741,-13.5903,True,0.000388,84,Positive
2,BWSW5D,South32 Ltd.,South32 Ltd.,362.241709,242.838379,0.998684,0.999062,Australia,Energy and mining,76.5257,-15.4968,True,-0.020257,82,Positive
3,B03MS9,Eregli Demir ye celik Fabrikalari T.A.S.,Eregli Demir ye celik Fabrikalari T.A.S.,187.716484,75.201763,0.99464,0.995609,Turkey,Energy and mining,73.5279,43.8484,True,-0.025617,82,Positive
4,ARCH,Arch Coal Inc Class A,Arch Coal Inc Class A,133.815749,24.286804,0.99524,0.996563,United States,Energy and mining,72.9279,218.619,False,-0.026689,82,Positive


In [6]:
# Show the last rows of the sorted beeswarm frame.
df_beeswarm.tail()

Unnamed: 0,Symbol,name,Proper Name,radius,china_rev_16,opacity,confidence_17,country_hq,category,value,growth_17,faster_18,growth_18_z,growth_18_p,colour
2449,608498,Salalah Port Services Co.,Salalah Port Services Co.,0.162115,0.355388,0.999861,0.999521,Oman,"Retail, wholesale, logistics, shipping",-75.5932,86.901,False,-0.292207,3,Positive
2450,MATX,"Matson, Inc.","Matson, Inc.",108.015166,707.444987,0.974276,0.979675,United States,"Retail, wholesale, logistics, shipping",-77.1694,-33.1234,False,-0.295025,3,Positive
2451,656175,Thoresen Thai Agencies Public Co. Ltd.,Thoresen Thai Agencies Public Co. Ltd.,19.203519,13.397906,0.980549,0.975655,Thailand,"Retail, wholesale, logistics, shipping",-78.6565,571.549,False,-0.297684,2,Positive
2452,GWR,"Genesee & Wyoming, Inc. Class A","Genesee & Wyoming, Inc. Class A",2.614688,13.313657,0.999878,0.998995,United States,"Retail, wholesale, logistics, shipping",-83.4216,18.4619,False,-0.306203,2,Positive
2453,659266,Sinwa Ltd.,Sinwa Ltd.,0.00086,0.763432,0.99703,0.99703,Singapore,"Retail, wholesale, logistics, shipping",-99.9526,137.5254,False,-0.335756,1,Positive


In [7]:
# Write the company-level beeswarm table to CSV, omitting the row index.
df_beeswarm.to_csv('data/processed/beeswarm-companies.csv', index=False)

## Aggregate companies with growth > 100 per cent

In [8]:
# Aggregate the companies excluded from the beeswarm (growth_18 > 100) into one
# row per sector: summed 2018 China revenue (`radius`) and number of companies
# (`companies_count`). Sectors are ordered by `sort_order` via the Categorical.
df_gt_100 = (
    df
    .query('growth_18 > 100')
    .assign(
        curated_sector=lambda x: pd.Categorical(x['curated_sector'], sort_order)
    )
    # observed=False is stated explicitly so every sector in `sort_order` gets a
    # row regardless of the pandas default for categorical groupers.
    # Columns are selected with a list: a bare tuple is deprecated and raises
    # in recent pandas versions.
    .groupby('curated_sector', as_index=False, observed=False)[['china_rev_18', 'Name']]
    .agg({
        'china_rev_18': 'sum',
        'Name': 'count'
    })
    .rename({
        'china_rev_18': 'radius',
        'Name': 'companies_count'
    }, axis=1)
)

# Number of sector rows produced.
print(len(df_gt_100))

df_gt_100

9


Unnamed: 0,curated_sector,radius,companies_count
0,Energy and mining,92198.184267,29
1,"IT, software, internet",980.224084,43
2,Health services and technology,14196.636486,39
3,"Electronic equipment, semiconductors, aerospac...",4484.293553,48
4,"Heavy industry, machinery",10178.508213,69
5,"Light manufacturing, processing",11287.499673,47
6,Others,6994.207543,64
7,Consumer and autos,14948.664646,71
8,"Retail, wholesale, logistics, shipping",3961.454867,27


## Format sectoral data for small- and medium-sized graphics

In [9]:
# Load the sector-level revenue table and shape it for the sector graphics:
# order sectors by `sort_order`, rename the 2017/2018 measures to
# radius/opacity/value fields, drop `slower_18`, and sort by sector.
df_beeswarm_s = (
    feather.read_dataframe('data/processed/sector-revenues.feather')
    .assign(
        curated_sector=lambda x: pd.Categorical(x['curated_sector'], sort_order)
    )
    .rename(columns={
        'curated_sector': 'name',
        'china_rev_18': 'radius_18',
        'china_rev_17': 'radius_17',
        'confidence_18': 'opacity_18',
        'confidence_17': 'opacity_17',
        'sector_growth_18': 'value_18',
        'sector_growth_17': 'value_17',
    })
    .drop(columns='slower_18')
    .sort_values('name')
    .reset_index(drop=True)
)

df_beeswarm_s

Unnamed: 0,name,radius_18,radius_17,opacity_18,opacity_17,value_18,value_17
0,Energy and mining,242016.741407,163185.375811,0.995139,0.994462,48.3079,31.4231
1,"IT, software, internet",47642.046425,37938.649907,0.995452,0.994811,25.5765,15.6723
2,Health services and technology,59225.919681,48955.624952,0.995464,0.994873,20.9788,11.4335
3,"Electronic equipment, semiconductors, aerospac...",274578.018589,233920.840281,0.994018,0.993183,17.3807,8.7726
4,"Heavy industry, machinery",123882.054476,106652.015991,0.995311,0.994859,16.1554,9.7131
5,"Light manufacturing, processing",102447.873262,88986.807666,0.994462,0.993809,15.127,13.4077
6,Others,40823.692907,37357.881197,0.996099,0.995603,9.2773,31.9221
7,Consumer and autos,170877.860873,157625.254803,0.995335,0.994918,8.4077,9.8821
8,"Retail, wholesale, logistics, shipping",69476.606155,69695.619982,0.995452,0.995277,-0.3142,6.138


In [10]:
# Extract the 2018 sector measures, strip the year suffix from the column
# names, and tag every row with category '2018'.
df_beeswarm_s_18 = (
    df_beeswarm_s
    .loc[:, ['name', 'radius_18', 'opacity_18', 'value_18']]
    .copy()
    .rename(columns={
        'radius_18': 'radius',
        'opacity_18': 'opacity',
        'value_18': 'value',
    })
    .assign(category='2018')
)

df_beeswarm_s_18

Unnamed: 0,name,radius,opacity,value,category
0,Energy and mining,242016.741407,0.995139,48.3079,2018
1,"IT, software, internet",47642.046425,0.995452,25.5765,2018
2,Health services and technology,59225.919681,0.995464,20.9788,2018
3,"Electronic equipment, semiconductors, aerospac...",274578.018589,0.994018,17.3807,2018
4,"Heavy industry, machinery",123882.054476,0.995311,16.1554,2018
5,"Light manufacturing, processing",102447.873262,0.994462,15.127,2018
6,Others,40823.692907,0.996099,9.2773,2018
7,Consumer and autos,170877.860873,0.995335,8.4077,2018
8,"Retail, wholesale, logistics, shipping",69476.606155,0.995452,-0.3142,2018


In [11]:
# Extract the 2017 sector measures, strip the year suffix from the column
# names, and tag every row with category '2017'.
df_beeswarm_s_17 = (
    df_beeswarm_s
    .loc[:, ['name', 'radius_17', 'opacity_17', 'value_17']]
    .copy()
    .rename(columns={
        'radius_17': 'radius',
        'opacity_17': 'opacity',
        'value_17': 'value',
    })
    .assign(category='2017')
)

df_beeswarm_s_17

Unnamed: 0,name,radius,opacity,value,category
0,Energy and mining,163185.375811,0.994462,31.4231,2017
1,"IT, software, internet",37938.649907,0.994811,15.6723,2017
2,Health services and technology,48955.624952,0.994873,11.4335,2017
3,"Electronic equipment, semiconductors, aerospac...",233920.840281,0.993183,8.7726,2017
4,"Heavy industry, machinery",106652.015991,0.994859,9.7131,2017
5,"Light manufacturing, processing",88986.807666,0.993809,13.4077,2017
6,Others,37357.881197,0.995603,31.9221,2017
7,Consumer and autos,157625.254803,0.994918,9.8821,2017
8,"Retail, wholesale, logistics, shipping",69695.619982,0.995277,6.138,2017


In [12]:
# Stack the 2017 rows on top of the 2018 rows (original row labels are kept)
# and add a constant `colour` column.
df_beeswarm_s_joined = (
    pd.concat(
        [df_beeswarm_s_17, df_beeswarm_s_18],
        sort=False,
    )
    .assign(colour='Positive')
)

df_beeswarm_s_joined

Unnamed: 0,name,radius,opacity,value,category,colour
0,Energy and mining,163185.375811,0.994462,31.4231,2017,Positive
1,"IT, software, internet",37938.649907,0.994811,15.6723,2017,Positive
2,Health services and technology,48955.624952,0.994873,11.4335,2017,Positive
3,"Electronic equipment, semiconductors, aerospac...",233920.840281,0.993183,8.7726,2017,Positive
4,"Heavy industry, machinery",106652.015991,0.994859,9.7131,2017,Positive
5,"Light manufacturing, processing",88986.807666,0.993809,13.4077,2017,Positive
6,Others,37357.881197,0.995603,31.9221,2017,Positive
7,Consumer and autos,157625.254803,0.994918,9.8821,2017,Positive
8,"Retail, wholesale, logistics, shipping",69695.619982,0.995277,6.138,2017,Positive
0,Energy and mining,242016.741407,0.995139,48.3079,2018,Positive


In [13]:
# Write the combined 2017/2018 sector table to CSV, omitting the row index.
df_beeswarm_s_joined.to_csv('data/processed/beeswarm-sectors.csv', index=False)