In [1]:
import pandas as pd
import numpy as np

import math

In [2]:
df = pd.read_csv(r"C:\Users\ITN1\OneDrive - UPV\Escritorio\Data Science\Blogging\data\patent_data\dataset_frac_inpadoc_family_by_year_by_cpc4dig_by_nuts2.csv")

In [3]:
df.head()

Unnamed: 0,Year,nuts2,cpc4dig,Frac_fam
0,1999,FI19,G06K,1.345238
1,1990,UKH1,B01J,0.160606
2,1990,BE22,B01J,0.174923
3,1990,UKJ1,B01J,1.307167
4,1990,NL42,B01J,0.00101


In [4]:
#filter for the years used
df = df[(df['Year'] >= 2010) & (df['Year'] <= 2015)]

#Remove the Turkey data
df = df[df['nuts2'].str[0:2] != "TR"]

In [5]:
#Create the cpc-3dig code
df['cpc3dig'] = df['cpc4dig'].str[0:4]

df = df.groupby(['cpc3dig', 'nuts2']).sum().reset_index()
df.head()

Unnamed: 0,cpc3dig,nuts2,Year,Frac_fam
0,A01B,AT12,4028,1.833333
1,A01B,AT13,2015,0.1
2,A01B,AT21,6042,0.72619
3,A01B,AT22,8054,2.119048
4,A01B,AT31,12075,11.404365


##### Hirschman-Herfindahl-Index

In [6]:
df_full = df.groupby('nuts2')['Frac_fam'].sum().reset_index().rename(columns = {'Frac_fam': 'total_of_region'})
df = pd.merge(df, df_full, on = 'nuts2')

In [7]:
df['share'] = df['Frac_fam']/df['total_of_region']
df['share_alpha'] = (df['share'])**2

hh_index = df.groupby('nuts2')['share_alpha'].sum().reset_index().rename(
    columns = {'share_alpha': 'hh_index'}).sort_values(
    by = 'hh_index', ascending = False)

hh_index.head()

Unnamed: 0,nuts2,hh_index
222,PT20,1.0
23,BG34,0.36
87,EL42,0.232995
96,EL65,0.207101
223,PT30,0.188147


##### Shannon Entropy Index

In [8]:
df['share_lnshare'] = df['share'] * np.log2(df['share'])

In [9]:
se_index = df.groupby('nuts2')['share_lnshare'].sum().reset_index().rename(
    columns = {'share_lnshare': 'se_index'}).sort_values(
    by = 'se_index', ascending = False)

se_index.head()

Unnamed: 0,nuts2,se_index
222,PT20,0.0
23,BG34,-1.521928
96,EL65,-2.295466
87,EL42,-2.336102
223,PT30,-2.972469


##### Ogive Index

In [10]:
num_industries = len(pd.unique(df['cpc3dig']))
df['o_sum'] = num_industries * ((df['share'] - (1/num_industries))**2)

In [11]:
o_index = df.groupby('nuts2')['o_sum'].sum().reset_index().rename(
    columns = {'o_sum':'o_index'}).sort_values(
    by = 'o_index', ascending = False)

o_index.head()

Unnamed: 0,nuts2,o_index
222,PT20,658.001515
23,BG34,235.604545
87,EL42,151.787139
96,EL65,134.693966
223,PT30,122.199572


##### Krugman Specialization Index

In [12]:
num_regions = len(pd.unique(df['nuts2']))
df_industries = df.groupby(['cpc3dig'])['share'].sum().reset_index()
df_industries['share'] = df_industries['share']/num_regions

df_industries = df_industries.rename(columns = {'share': 'total_reference_group'})

In [18]:
df = pd.merge(df, df_industries, on = 'cpc3dig')

In [19]:
df['ks_sum'] = np.abs(df['share'] - df['total_reference_group'])

In [21]:
ks_index = df.groupby('nuts2')['ks_sum'].sum().reset_index().rename(
    columns = {'ks_sum':'ks_index'}).sort_values(
    'ks_index', ascending = False)

##### Index of Inequality

In [22]:
df['ineq_sum'] = (df['share'] - df['total_reference_group'])**2
ineq_index = df.groupby('nuts2')['ineq_sum'].sum().reset_index().rename(
    columns = {'ineq_sum':'ineq_index'}).sort_values(
    'ineq_index', ascending = False)

#### Theil Index

In [23]:
df['theil_sum'] = (1/num_industries)*(df['share']/df['total_reference_group']) * np.log2(df['share']/df['total_reference_group'])

In [24]:
theil_index = df.groupby('nuts2')['theil_sum'].sum().reset_index().rename(
    columns = {'theil_sum':'theil_index'}).sort_values(
    'theil_index', ascending = False)