# **EXPLORATORY ANALYSIS**

Let's see what is in `data/db_bsm_categorical.csv`.

In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
import folium
import geopy
import vincent
from folium import plugins
from geopy.geocoders import Nominatim
from vincent import AxisProperties, PropertySet, ValueRef

In [2]:
# Relative path to the CSV file explored in this notebook.
path_ = '../data/db_bsm_categorical.csv'

In [3]:
# Load the data set to explore.
db_bsm = pd.read_csv(path_)
# Preview a few random rows; the fixed seed makes the preview identical on
# every "Restart Kernel -> Run All", so the prose can refer to the output.
db_bsm.sample(5, random_state=42)

Unnamed: 0,ticker,company,sector_gics,sector_icb,stock_index,country
594,CENX,Century Aluminum Company,,Basic materials,,United States
1990,AKTX,"Akari Therapeutics, Plc",,,NASDAQ Composite,United States
725,STX,Seagate Technology plc,Information technology,Technology,,Ireland
927,SVMK,SVMK Inc.,Information technology,Technology,,United States
1712,HVT.L,The Heavitree Brewery PLC,Consumer staples,Consumer goods,,Kingdom


What types of data are there in *path_*?

In [4]:
# Show the dtype pandas inferred for every column.
db_bsm.dtypes

ticker         object
company        object
sector_gics    object
sector_icb     object
stock_index    object
country        object
dtype: object

In [5]:
# Summary statistics per column (count / unique / top / freq).
db_bsm.describe()

Unnamed: 0,ticker,company,sector_gics,sector_icb,stock_index,country
count,2023,2023,1517,1834,328,1762
unique,1983,1868,11,11,12,35
top,AMZN,"Amazon.com, Inc.",Information technology,Technology,CAC 40,United States
freq,3,4,523,523,34,1005


In [6]:
# Number of missing values in each column.
db_bsm.isnull().sum()

ticker            0
company           0
sector_gics     506
sector_icb      189
stock_index    1695
country         261
dtype: int64

##### The first analysis counts how many **companies** there are per **country**, so each **company** must appear only once per country.

In [7]:
# Keep one row per distinct (company, country) pair.
company_country_cols = ['company', 'country']
df_company_by_countr = db_bsm.loc[:, company_country_cols].drop_duplicates()

In [8]:
# Rows whose company name still occurs more than once after de-duplication ...
is_repeated_company = df_company_by_countr['company'].duplicated(keep=False)
# ... and that carry no country information.
has_no_country = df_company_by_countr['country'].isna()
# Index labels of the repeated rows without a country (dropped in the next cell).
mask = df_company_by_countr[is_repeated_company & has_no_country].index

In [9]:
# Remove the country-less duplicates, then count the companies of each country.
df_company_by_countr = (
    df_company_by_countr
    .drop(index=mask)
    .groupby(['country'])['company']
    .count()
    .to_frame()
)

In [10]:
# Bar appearance: light green fill with a darker green outline.
bar_marker = dict(
    color='rgb(141,224,94)',
    line=dict(color='rgb(69,129,34)', width=1.5),
)
# One bar per country; the bar height is the number of companies.
companies_bar = go.Bar(
    x=df_company_by_countr.index,
    y=df_company_by_countr['company'].tolist(),
    marker=bar_marker,
    opacity=0.4,
)
data = [companies_bar]
# Logarithmic y-axis so that countries with few companies remain visible.
layout = go.Layout(
    title='Companies by country',
    yaxis=dict(type='log', autorange=True),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='companies-by-country')


Consider using IPython.display.IFrame instead



A logarithmic scale is used on the y-axis so that the differences between countries with fewer companies remain visible.

##### Next, we are going to look at **sectors** by **country**, so there must be a single record per **company** and **sector** for each **country**.

In [11]:
# One row per distinct (company, country, sector) combination, built
# separately for each of the two sector classifications.
gics_cols = ['company', 'country', 'sector_gics']
icb_cols = ['company', 'country', 'sector_icb']
df_company_by_gics = db_bsm.loc[:, gics_cols].drop_duplicates()
df_company_by_icb = db_bsm.loc[:, icb_cols].drop_duplicates()

In [12]:
# Summary of the de-duplicated (company, country, GICS sector) records.
df_company_by_gics.describe()

Unnamed: 0,company,country,sector_gics
count,1931,1670,1441
unique,1868,35,11
top,"Amazon.com, Inc.",United States,Information technology
freq,3,958,500


In [13]:
# Summary of the de-duplicated (company, country, ICB sector) records.
df_company_by_icb.describe()

Unnamed: 0,company,country,sector_icb
count,1924,1663,1735
unique,1868,35,11
top,Petróleo Brasileiro S.A. - Petrobras,United States,Technology
freq,3,953,500


In [14]:
# Inspect the rows of the most frequent company in the GICS frame.
df_company_by_gics[df_company_by_gics['company'] == 'Amazon.com, Inc.']

Unnamed: 0,company,country,sector_gics
1155,"Amazon.com, Inc.",United States,Information technology
1156,"Amazon.com, Inc.",United States,Industrials
1157,"Amazon.com, Inc.",United States,Consumer discretionary


In [15]:
# Inspect the rows of the most frequent company in the ICB frame.
df_company_by_icb[df_company_by_icb['company'] == 'Petróleo Brasileiro S.A. - Petrobras']

Unnamed: 0,company,country,sector_icb
633,Petróleo Brasileiro S.A. - Petrobras,Brazil,Basic materials
1110,Petróleo Brasileiro S.A. - Petrobras,Brazil,Utilities
1963,Petróleo Brasileiro S.A. - Petrobras,Brazil,


A company can appear in several sectors because of the way the data was downloaded from Yahoo Finance for different countries. This suggests that the company is large enough to be classified in more than one sector.

In [16]:
# Number of companies per (country, GICS sector), largest groups first.
df_company_by_gics = (
    df_company_by_gics
    .groupby(['country', 'sector_gics'], as_index=False)['company']
    .count()
    .sort_values(by='company', ascending=False)
)

# Same aggregation for the ICB classification.
df_company_by_icb = (
    df_company_by_icb
    .groupby(['country', 'sector_icb'], as_index=False)['company']
    .count()
    .sort_values(by='company', ascending=False)
)

In [17]:
# Build one bar trace per GICS sector: countries on x, company counts on y.
trace = []
for sector in df_company_by_gics['sector_gics'].unique():
    sector_rows = df_company_by_gics[df_company_by_gics['sector_gics'] == sector]
    trace.append(dict(
        x=sector_rows['country'].tolist(),
        y=sector_rows['company'].tolist(),
        name=sector,
        type='bar',
    ))
# Stack the sector bars per country on a logarithmic y-axis.
layout = dict(
    yaxis=dict(title='Companies', type='log', autorange=True),
    barmode='relative',
    title='Sectors GICS by country',
)
py.iplot(dict(data=trace, layout=layout), filename='sectors-by-country-gics')

In [18]:
# Build one bar trace per ICB sector: countries on x, company counts on y.
trace = []
for sector in df_company_by_icb['sector_icb'].unique():
    sector_rows = df_company_by_icb[df_company_by_icb['sector_icb'] == sector]
    trace.append(dict(
        x=sector_rows['country'].tolist(),
        y=sector_rows['company'].tolist(),
        name=sector,
        type='bar',
    ))
# Stack the sector bars per country on a logarithmic y-axis.
layout = dict(
    yaxis=dict(title='Companies', type='log', autorange=True),
    barmode='relative',
    title='Sectors ICB by country',
)
py.iplot(dict(data=trace, layout=layout), filename='sectors-by-country-icb')

In both charts we can see that *Information technology* for **GICS** and *Technology* for **ICB** are the sectors with the most companies in most countries. <span style="color:red">**CAUTION**:</span> Take care when reading the logarithmic scale on the y-axis. For example, although *Technology* in *Sectors ICB by country* for the United States looks like the largest segment, that visual difference is not "real" because of the logarithmic scale; the actual difference has to be read from the y-axis values.

##### To show the real differences between sectors, let's plot them on a map. First, some preprocessing is needed.

In [19]:
path_country_capital = 'https://en.wikipedia.org/wiki/List_of_national_capitals'
# Parse every HTML table on the page, using the first row as the header.
wiki_tables = pd.read_html(path_country_capital, header=0)
# NOTE(review): assumes the table at position 1 is the capitals list and that
# it has 'City' and 'Country' columns -- the page layout may change; verify.
capitals_table = wiki_tables[1]
df_country_cap = (
    capitals_table[['City', 'Country']]
    .rename(columns={'City': 'city', 'Country': 'country'})
)

# Distinct country values present in the data set.
df_country = db_bsm['country'].drop_duplicates().to_frame()

In [20]:
# Attach the capital city to each country of the data set; countries with no
# match in the scraped table keep a missing city.
df_country_cap = df_country.merge(df_country_cap, how='left', on='country')

In [21]:
# Manually chosen place to geocode for specific country values of the data set.
capital_overrides = {
    'Africa': 'South Africa',
    'Netherland': 'Amsterdam',
    'Kingdom': 'London',
    'Hong Kong': 'Victoria City',
    'Israel': 'Jerusalem',
    'Korea': 'Seoul',
    'Chile': 'Santiago de Chile',
}
for country_name, capital in capital_overrides.items():
    df_country_cap.loc[df_country_cap['country'] == country_name, 'city'] = capital

# Discard the rows that still have a missing value.
df_country_cap = df_country_cap.dropna()

In [22]:
def get_lat_long(df_, col_city, geolocator=None):
    """
    Add the latitude and longitude of every city to the given dataframe.

    The columns ``lat`` and ``lon`` are added to ``df_`` in place (any
    existing columns with those names are overwritten) and the same
    dataframe is returned. Each distinct city is geocoded exactly once.
    Cities that are missing or that the geocoder cannot resolve keep
    ``NaN`` coordinates instead of raising an error.

    :param pd.DataFrame df_: Dataframe with information about cities.
    :param str col_city: Name of the column that has data about cities.
    :param geolocator: Optional object exposing ``geocode(query)`` that
        returns ``None`` or an object with ``latitude`` and ``longitude``
        attributes. Defaults to a Nominatim geocoder.
    :return: ``df_`` with the ``lat`` and ``lon`` columns filled in.
    :rtype: pd.DataFrame
    """
    if geolocator is None:
        geolocator = Nominatim(user_agent='user-agent-app')

    df_['lat'] = np.nan
    df_['lon'] = np.nan
    # dropna(): a missing city name cannot be geocoded.
    for city in df_[col_city].dropna().unique():
        # A single request per city. The geocoder's default timeout is used,
        # which is what passing geopy's DEFAULT_SENTINEL explicitly means.
        location = geolocator.geocode(city)
        if location is None:
            # geocode() returns None when nothing is found; leave NaN.
            continue
        city_rows = df_[col_city] == city
        df_.loc[city_rows, 'lat'] = location.latitude
        df_.loc[city_rows, 'lon'] = location.longitude
    return df_

# Geocode every capital (one network lookup per distinct city).
df_country_cap = get_lat_long(df_country_cap, 'city')
# Preview a few random rows; the fixed seed keeps the preview reproducible.
df_country_cap.sample(5, random_state=42)

Unnamed: 0,country,city,lat,lon
16,China,Beijing,39.906217,116.391276
20,Chile,Santiago de Chile,-33.437797,-70.650445
7,Japan,Tokyo,35.682839,139.759455
32,Sweden,Stockholm,59.325117,18.071093
35,Italy,Rome,41.894802,12.485338


In [23]:
# Attach city and coordinates to the per-country sector counts, once for each
# sector classification.
df_sector_capital_gics = df_company_by_gics.merge(df_country_cap, how='left', on='country')
df_sector_capital_icb = df_company_by_icb.merge(df_country_cap, how='left', on='country')

And finally let's create the maps.

**GICS**

In [24]:
map_ = folium.Map([40, 0], zoom_start=2)
# Total number of companies per country, computed once here instead of
# re-running the same groupby on every loop iteration.
companies_per_country = df_sector_capital_gics.groupby('country')['company'].sum()
# sort=False keeps the countries in order of first appearance in the frame.
for country, df_country in df_sector_capital_gics.groupby('country', sort=False):
    lat = df_country['lat'].iloc[0]
    lon = df_country['lon'].iloc[0]
    if pd.isna(lat) or pd.isna(lon):
        # The left merge leaves NaN coordinates for countries without a
        # geocoded city; they cannot be placed on the map, so skip them.
        continue
    # Company count per GICS sector for this country, drawn as the popup chart.
    df_axu = df_country[['sector_gics', 'company']].set_index(['sector_gics'])
    bar = vincent.Bar(df_axu)
    bar.axis_titles(y='Companies', x='')
    bar.x_axis_properties(title_offset=200)
    # Rotate the x-axis labels so that the sector names fit below the bars.
    ax = AxisProperties(
         labels = PropertySet(angle=ValueRef(value=90), dx=ValueRef(value=55)))
    bar.axes[0].properties = ax
    bar_dict = bar.grammar()
    bar_dict['width'] = 200
    bar_dict['height'] = 150
    popup = folium.Popup(max_width=650)
    folium.Vega(bar_dict, height=280, width=250).add_to(popup)
    # Circle whose radius grows with the country's total number of companies.
    bubble = folium.Circle(
        location=[lat, lon],
        tooltip=country,
        popup=popup,
        radius=int(companies_per_country.loc[country]) * 2500,
        color='#458122',
        fill=True,
        fill_color='#8de05e'
    )
    bubble.add_to(map_)
map_

**ICB**

In [25]:
map_ = folium.Map([40, 0], zoom_start=2)
# Total number of companies per country, computed once here instead of
# re-running the same groupby on every loop iteration.
companies_per_country = df_sector_capital_icb.groupby('country')['company'].sum()
# sort=False keeps the countries in order of first appearance in the frame.
for country, df_country in df_sector_capital_icb.groupby('country', sort=False):
    lat = df_country['lat'].iloc[0]
    lon = df_country['lon'].iloc[0]
    if pd.isna(lat) or pd.isna(lon):
        # The left merge leaves NaN coordinates for countries without a
        # geocoded city; they cannot be placed on the map, so skip them.
        continue
    # Company count per ICB sector for this country, drawn as the popup chart.
    df_axu = df_country[['sector_icb', 'company']].set_index(['sector_icb'])
    bar = vincent.Bar(df_axu)
    bar.axis_titles(y='Companies', x='')
    bar.x_axis_properties(title_offset=200)
    # Rotate the x-axis labels so that the sector names fit below the bars.
    ax = AxisProperties(
         labels = PropertySet(angle=ValueRef(value=90), dx=ValueRef(value=55)))
    bar.axes[0].properties = ax
    bar_dict = bar.grammar()
    bar_dict['width'] = 200
    bar_dict['height'] = 150
    popup = folium.Popup(max_width=650)
    folium.Vega(bar_dict, height=280, width=250).add_to(popup)
    # Circle whose radius grows with the country's total number of companies.
    bubble = folium.Circle(
        location=[lat, lon],
        tooltip=country,
        popup=popup,
        radius=int(companies_per_country.loc[country]) * 2500,
        color='#102b56',
        fill=True,
        fill_color='#6499ef'
    )
    bubble.add_to(map_)
map_