In [3]:
import pandas as pd 

In [57]:
# read in the oecd data
df = pd.read_csv('data/gdpph_oecd.csv')

df = df[['Country','LOCATION', 'Measure','TIME_PERIOD', 'OBS_VALUE']]

df.columns = ['country', 'id', 'measure', 'year', 'value']

# sort by country and year
df = df.sort_values(by=['country', 'year'])

df

Unnamed: 0,country,id,measure,year,value
3,Australia,AUS,US dollars,1990,35.589084
1435,Australia,AUS,2015=100,1990,65.728880
1367,Australia,AUS,US dollars,1991,36.406827
1371,Australia,AUS,2015=100,1991,67.239157
472,Australia,AUS,2015=100,1992,69.306729
...,...,...,...,...,...
1239,United States,USA,2015=100,2020,107.111176
231,United States,USA,US dollars,2021,74.838217
236,United States,USA,2015=100,2021,108.386615
698,United States,USA,2015=100,2022,107.025198


In [58]:
# Pivot the dataframe to create separate columns for each measure
df_pivot = df.pivot_table(index=['country', 'id', 'year'], columns='measure', values='value').reset_index()

# Rename the columns for clarity
df_pivot.columns.name = None
df_pivot.rename(columns={'US dollars': 'value', '2015=100': 'index'}, inplace=True)

df_pivot

Unnamed: 0,country,id,year,index,value
0,Australia,AUS,1990,65.728880,35.589084
1,Australia,AUS,1991,67.239157,36.406827
2,Australia,AUS,1992,69.306729,37.526320
3,Australia,AUS,1993,69.456332,37.607323
4,Australia,AUS,1994,69.609837,37.690439
...,...,...,...,...,...
1184,United States,USA,2018,102.342200,70.664702
1185,United States,USA,2019,103.581713,71.520554
1186,United States,USA,2020,107.111176,73.957559
1187,United States,USA,2021,108.386615,74.838217


In [59]:
df_pivot.dtypes

country     object
id          object
year         int64
index      float64
value      float64
dtype: object

In [60]:
df_pivot[df_pivot['id'] == 'USA']

Unnamed: 0,country,id,year,index,value
1156,United States,USA,1990,66.017332,45.583299
1157,United States,USA,1991,66.882229,46.180488
1158,United States,USA,1992,69.179788,47.766894
1159,United States,USA,1993,69.44318,47.94876
1160,United States,USA,1994,70.03144,48.354938
1161,United States,USA,1995,70.185526,48.461331
1162,United States,USA,1996,71.930383,49.666111
1163,United States,USA,1997,72.97303,50.386032
1164,United States,USA,1998,74.614716,51.519575
1165,United States,USA,1999,76.818796,53.041436


In [61]:
# Change the base year for the index to 1990
base_year = 1990

# Get the base values for the year 1990
base_values_1990 = df_pivot[df_pivot['year'] == base_year].set_index('country')['value']

# Update the index column to reflect the new base year
df_pivot['index'] = df_pivot.apply(lambda row: row['value'] / base_values_1990[row['country']] * 100, axis=1)

df_pivot

KeyError: 'Austria'

In [62]:
# create a version with just g7 countries
g7_countries = ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom', 'United States']
g7 = df_pivot[df_pivot['country'].isin(g7_countries)]

g7

Unnamed: 0,country,id,year,index,value
94,Canada,CAN,1990,72.860442,37.329579
95,Canada,CAN,1991,73.501045,37.657788
96,Canada,CAN,1992,75.004661,38.428156
97,Canada,CAN,1993,76.522221,39.205668
98,Canada,CAN,1994,78.043471,39.985071
...,...,...,...,...,...
1184,United States,USA,2018,102.342200,70.664702
1185,United States,USA,2019,103.581713,71.520554
1186,United States,USA,2020,107.111176,73.957559
1187,United States,USA,2021,108.386615,74.838217


In [40]:
# change g7 year to datetime
g7['year'] = pd.to_datetime(g7['year'], format='%Y')

g7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  g7['year'] = pd.to_datetime(g7['year'], format='%Y')


Unnamed: 0,country,id,year,index,value
94,Canada,CAN,1990-01-01,72.860442,37.329579
95,Canada,CAN,1991-01-01,73.501045,37.657788
96,Canada,CAN,1992-01-01,75.004661,38.428156
97,Canada,CAN,1993-01-01,76.522221,39.205668
98,Canada,CAN,1994-01-01,78.043471,39.985071
...,...,...,...,...,...
1184,United States,USA,2018-01-01,102.342200,70.664702
1185,United States,USA,2019-01-01,103.581713,71.520554
1186,United States,USA,2020-01-01,107.111176,73.957559
1187,United States,USA,2021-01-01,108.386615,74.838217


In [64]:
# change g7 year to datetime
g7['year'] = pd.to_datetime(g7['year'], format='%Y')

g7.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  g7['year'] = pd.to_datetime(g7['year'], format='%Y')


country            object
id                 object
year       datetime64[ns]
index             float64
value             float64
dtype: object

In [65]:
# change g7 index base year to 1990
base_year = 1990

# Get the base values for the year 1990
base_values_1990 = g7[g7['year'] == pd.Timestamp(str(base_year))].set_index('country')['value']

# Update the index column to reflect the new base year
g7['index'] = g7.apply(lambda row: row['value'] / base_values_1990[row['country']] * 100, axis=1)

g7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  g7['index'] = g7.apply(lambda row: row['value'] / base_values_1990[row['country']] * 100, axis=1)


Unnamed: 0,country,id,year,index,value
94,Canada,CAN,1990-01-01,100.000000,37.329579
95,Canada,CAN,1991-01-01,100.879220,37.657788
96,Canada,CAN,1992-01-01,102.942913,38.428156
97,Canada,CAN,1993-01-01,105.025744,39.205668
98,Canada,CAN,1994-01-01,107.113640,39.985071
...,...,...,...,...,...
1184,United States,USA,2018-01-01,155.023229,70.664702
1185,United States,USA,2019-01-01,156.900785,71.520554
1186,United States,USA,2020-01-01,162.247052,73.957559
1187,United States,USA,2021-01-01,164.179027,74.838217


In [67]:
g7[g7['year'] == pd.Timestamp('1990-01-01')]

Unnamed: 0,country,id,year,index,value
94,Canada,CAN,1990-01-01,100.0,37.329579
344,France,FRA,1990-01-01,100.0,47.626606
377,Germany,DEU,1990-01-01,100.0,45.617388
574,Italy,ITA,1990-01-01,100.0,45.001814
607,Japan,JPN,1990-01-01,100.0,31.293218
1123,United Kingdom,GBR,1990-01-01,100.0,38.416611
1156,United States,USA,1990-01-01,100.0,45.583299


In [69]:
g7.to_csv('data/g7_productivity.csv', index=False)

In [36]:
df.to_csv('data/gdpph_clean.csv', index=False)

In [8]:
import requests

# Define the URL for the OECD API for the UK
url = "https://stats.oecd.org/SDMX-JSON/data/PDB_LV/GBR.GDPHRS/all?startTime=1990&endTime=2022"

# Fetch the data from the API
response = requests.get(url)
data = response.json()

# Inspect the structure of the data dictionary
print(data.keys())

# Extract the relevant data
if 'dataSets' in data:
	uk_data = data['dataSets'][0]['series']['0:0:0:0:0']['observations']
else:
	print("Key 'dataSets' not found in the data dictionary")
	uk_data = {}

# Convert the data to a pandas DataFrame
uk_gdp_per_hour = pd.DataFrame.from_dict(uk_data, orient='index', columns=['value'])
uk_gdp_per_hour['year'] = pd.to_datetime([f"{1990 + int(i)}-01-01" for i in uk_gdp_per_hour.index])
uk_gdp_per_hour.set_index('year', inplace=True)

uk_gdp_per_hour

dict_keys(['meta', 'data', 'errors'])
Key 'dataSets' not found in the data dictionary


Unnamed: 0_level_0,value
year,Unnamed: 1_level_1


## Manufacturing Value Added

In [8]:
# import world bank data library

import wbgapi as wb

In [86]:
manufacturing_gdp = wb.data.DataFrame('NV.IND.MANF.ZS', time=range(1990, 2024), labels=True)

manufacturing_gdp

Unnamed: 0_level_0,Country,YR1990,YR1991,YR1992,YR1993,YR1994,YR1995,YR1996,YR1997,YR1998,...,YR2014,YR2015,YR2016,YR2017,YR2018,YR2019,YR2020,YR2021,YR2022,YR2023
economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZWE,Zimbabwe,20.484785,24.096142,26.898699,21.021162,19.054827,19.264222,16.685760,15.878677,14.416095,...,12.590765,11.888599,11.596020,14.026908,13.678137,14.222360,15.696566,12.442934,20.533466,16.429474
ZMB,Zambia,31.856891,33.345886,33.191634,24.976080,9.088726,9.177594,10.915614,10.794944,10.773698,...,6.819983,7.522311,7.686125,8.127238,6.847766,6.788930,7.703831,8.668315,7.996353,8.502959
YEM,"Yemen, Rep.",,,,,,,,,,...,,,,,,,,,,
PSE,West Bank and Gaza,,,,,18.844301,17.463750,14.186415,12.322464,12.210531,...,10.750767,9.321233,10.026354,11.655506,11.547252,11.234716,11.047084,11.193329,11.195638,
VIR,Virgin Islands (U.S.),,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CEB,Central Europe and the Baltics,,,,,,19.448570,19.473041,19.364006,18.812993,...,18.451946,18.716251,18.788412,18.088496,17.659202,17.323624,16.613401,16.776185,16.504799,16.224830
CSS,Caribbean small states,6.271485,6.355659,5.934071,5.731104,5.606207,5.634440,5.702603,4.597619,4.388638,...,6.103557,5.302271,5.176896,6.003089,5.660029,5.154541,5.463746,4.847000,4.604154,3.994339
ARB,Arab World,9.702253,11.064014,11.228357,11.117435,11.364042,12.198662,12.569925,12.705414,13.316956,...,9.891700,10.691272,10.895203,10.626059,10.311506,10.240065,11.148078,11.544159,11.832024,11.649006
AFW,Africa Western and Central,15.076753,15.908889,14.543474,15.755576,18.704392,19.199453,18.454293,18.924640,17.545688,...,10.482729,10.242908,9.980801,10.099211,10.654730,11.590153,12.269603,13.502256,13.220448,13.807881


In [87]:
# get a time series of manufacturing value added as a percentage of GDP for the UK, OECD Countries, World, and EU from 1990 to 2023
manufacturing_gdp = wb.data.DataFrame('NV.IND.MANF.ZS', economy=['GBR', 'OED', 'WLD', 'EUU'], time=range(1990, 2024), labels=True)

# reset the index
manufacturing_gdp.reset_index(inplace=True)


manufacturing_gdp

Unnamed: 0,economy,Country,YR1990,YR1991,YR1992,YR1993,YR1994,YR1995,YR1996,YR1997,...,YR2014,YR2015,YR2016,YR2017,YR2018,YR2019,YR2020,YR2021,YR2022,YR2023
0,EUU,European Union,,19.689237,18.97144,18.087389,17.937427,17.949232,17.575646,17.649205,...,14.604544,15.196184,15.264572,15.213411,15.092138,14.880862,14.567861,14.786595,14.863806,14.670236
1,WLD,World,,,,,,,,18.991646,...,15.902311,16.372608,16.187258,16.252184,16.364702,15.957527,15.945706,16.461847,15.762448,15.236553
2,OED,OECD members,,,,,,,,18.024019,...,13.686878,13.895618,13.789732,13.787907,13.799605,13.454898,13.170119,13.265817,,
3,GBR,United Kingdom,16.486036,16.086395,15.870871,15.510671,16.224741,15.337884,15.135391,14.948422,...,9.410668,9.294733,9.109806,9.054196,8.95236,8.79682,8.997255,8.64632,8.108404,8.275426


In [88]:
# melt the dataframe. currently the years are cols. 'economy' and 'Country' are the id_vars
manufacturing_gdp_melt = manufacturing_gdp.melt(id_vars=['economy', 'Country'], var_name='year', value_name='value')

manufacturing_gdp_melt

Unnamed: 0,economy,Country,year,value
0,EUU,European Union,YR1990,
1,WLD,World,YR1990,
2,OED,OECD members,YR1990,
3,GBR,United Kingdom,YR1990,16.486036
4,EUU,European Union,YR1991,19.689237
...,...,...,...,...
131,GBR,United Kingdom,YR2022,8.108404
132,EUU,European Union,YR2023,14.670236
133,WLD,World,YR2023,15.236553
134,OED,OECD members,YR2023,


In [89]:
# turn the year column into a datetime. It's currently a string in the format 'YRyyyy'
manufacturing_gdp_melt['year'] = pd.to_datetime(manufacturing_gdp_melt['year'].str.replace('YR', ''), format='%Y')

manufacturing_gdp_melt

Unnamed: 0,economy,Country,year,value
0,EUU,European Union,1990-01-01,
1,WLD,World,1990-01-01,
2,OED,OECD members,1990-01-01,
3,GBR,United Kingdom,1990-01-01,16.486036
4,EUU,European Union,1991-01-01,19.689237
...,...,...,...,...
131,GBR,United Kingdom,2022-01-01,8.108404
132,EUU,European Union,2023-01-01,14.670236
133,WLD,World,2023-01-01,15.236553
134,OED,OECD members,2023-01-01,


In [90]:
manufacturing_gdp_melt.columns = ['id', 'country', 'date', 'value']

In [92]:
manufacturing_gdp_melt['value'] = manufacturing_gdp_melt['value'] / 100

In [93]:
# save the data to a csv
manufacturing_gdp_melt.to_csv('data/mva.csv', index=False)

In [4]:
g20_countries = ['ARG', 'AUS', 'BRA', 'CAN', 'CHN', 'FRA', 'DEU', 'IND', 'IDN', 'ITA', 'JPN', 'MEX', 'RUS', 'SAU', 'ZAF', 'KOR', 'TUR', 'GBR', 'USA', 'EUU']

In [12]:
# Get R&D spending as a percentage of GDP for all G7 countries since 1990
# Define G7 countries
g7_countries = ['CAN', 'FRA', 'DEU', 'ITA', 'JPN', 'GBR', 'USA']


rd_spending = wb.data.DataFrame('GB.XPD.RSDV.GD.ZS', economy=g7_countries, time=range(1990, 2024), labels=True)

# Reset the index
rd_spending.reset_index(inplace=True)

rd_spending

Unnamed: 0,economy,Country,YR1990,YR1991,YR1992,YR1993,YR1994,YR1995,YR1996,YR1997,...,YR2014,YR2015,YR2016,YR2017,YR2018,YR2019,YR2020,YR2021,YR2022,YR2023
0,USA,United States,,,,,,,2.45013,2.47714,...,2.71786,2.787,2.8535,2.90432,3.0101,3.17049,3.46777,3.45705,,
1,GBR,United Kingdom,,,,,,,1.57321,1.53641,...,2.2645,2.27027,2.31145,2.3232,2.70482,2.66552,2.93144,2.91476,,
2,JPN,Japan,,,,,,,2.64303,2.72177,...,3.36788,3.24071,3.10666,3.16636,3.2192,3.21824,3.26897,3.29581,,
3,ITA,Italy,,,,,,,0.9459,0.98769,...,1.3384,1.3385,1.36642,1.37013,1.42443,1.46159,1.5068,1.45392,,
4,DEU,Germany,,,,,,,2.14461,2.18832,...,2.87784,2.93379,2.94039,3.0471,3.11011,3.16779,3.12979,3.14246,,
5,FRA,France,,,,,,,2.2228,2.14699,...,2.27592,2.22702,2.22238,2.19888,2.19666,2.19179,2.28189,2.21918,,
6,CAN,Canada,,,,,,,1.60682,1.61386,...,1.71417,1.69324,1.72903,1.68702,1.7372,1.75573,1.89484,1.69727,1.55182,


In [13]:
# melt
rd_spending_melt = rd_spending.melt(id_vars=['economy', 'Country'], var_name='year', value_name='value')

# turn the year column into a datetime
rd_spending_melt['year'] = pd.to_datetime(rd_spending_melt['year'].str.replace('YR', ''), format='%Y')


rd_spending_melt.columns = ['id', 'country', 'date', 'value']

rd_spending_melt['value'] = rd_spending_melt['value'] / 100

rd_spending_melt

Unnamed: 0,id,country,date,value
0,USA,United States,1990-01-01,
1,GBR,United Kingdom,1990-01-01,
2,JPN,Japan,1990-01-01,
3,ITA,Italy,1990-01-01,
4,DEU,Germany,1990-01-01,
...,...,...,...,...
233,JPN,Japan,2023-01-01,
234,ITA,Italy,2023-01-01,
235,DEU,Germany,2023-01-01,
236,FRA,France,2023-01-01,


In [14]:
rd_spending_melt.to_csv('data/rd_spending.csv', index=False)