In [26]:
import pandas as pd

In [27]:
# import COFER data
df = pd.read_excel('data/cofer.xlsx', 
                   skiprows=4,
                   nrows=15,
                   usecols=lambda x: x != 'A'
                   )

# delete 	Unnamed: 0 column
del df['Unnamed: 0']

# rename Unnamed: 1 to 'series'
df = df.rename(columns={'Unnamed: 1': 'series'})

# melt
df = df.melt(id_vars='series', var_name='date', value_name='share')

# change quarter to from strings 'YYYYQ1' to datetime
def convert_quarter_to_date(quarter_str):
    year, quarter = quarter_str.split('Q')
    month = (int(quarter) - 1) * 3 + 1
    return pd.Timestamp(f'{year}-{month:02d}-01')

df['date'] = df['date'].apply(convert_quarter_to_date)

# only rows where series contains "Shares"
df = df[df['series'].str.contains('Shares')]

# remove 'Shares of ' from `series` values
df['series'] = df['series'].str.replace('Shares of ', '')

df

Unnamed: 0,series,date,share
3,Allocated Reserves,2000-01-01,77.402382
4,U.S. dollars,2000-01-01,71.459785
5,euro,2000-01-01,17.504981
6,Chinese renminbi,2000-01-01,
7,Japanese yen,2000-01-01,6.337289
...,...,...,...
1381,Australian dollars,2024-07-01,2.268987
1382,Canadian dollars,2024-07-01,2.739457
1383,Swiss francs,2024-07-01,0.167371
1384,other currencies,2024-07-01,4.456427


In [30]:
df = df[(df['series'] != 'other currencies') & (df['series'] != 'Unallocated Reserves') & (df['series'] != 'Allocated Reserves')]
df

Unnamed: 0,series,date,share
4,U.S. dollars,2000-01-01,71.459785
5,euro,2000-01-01,17.504981
6,Chinese renminbi,2000-01-01,
7,Japanese yen,2000-01-01,6.337289
8,pounds sterling,2000-01-01,2.921832
...,...,...,...
1379,Japanese yen,2024-07-01,5.819352
1380,pounds sterling,2024-07-01,4.969310
1381,Australian dollars,2024-07-01,2.268987
1382,Canadian dollars,2024-07-01,2.739457


In [32]:
df['label'] = df['series'].apply(lambda x: 'US Dollar' if x == 'U.S. dollars' else ('Chinese Renminbi' if x == 'Chinese renminbi' else 'Other'))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['series'].apply(lambda x: 'US Dollar' if x == 'U.S. dollars' else ('Chinese Renminbi' if x == 'Chinese renminbi' else 'Other'))


Unnamed: 0,series,date,share,label
4,U.S. dollars,2000-01-01,71.459785,US Dollar
5,euro,2000-01-01,17.504981,Other
6,Chinese renminbi,2000-01-01,,Chinese Renminbi
7,Japanese yen,2000-01-01,6.337289,Other
8,pounds sterling,2000-01-01,2.921832,Other
...,...,...,...,...
1379,Japanese yen,2024-07-01,5.819352,Other
1380,pounds sterling,2024-07-01,4.969310,Other
1381,Australian dollars,2024-07-01,2.268987,Other
1382,Canadian dollars,2024-07-01,2.739457,Other


In [35]:
df.series.unique()

array(['U.S. dollars', 'euro', 'Chinese renminbi', 'Japanese yen',
       'pounds sterling', 'Australian dollars', 'Canadian dollars',
       'Swiss francs'], dtype=object)

In [33]:
df.to_csv('data/cofer_clean.csv', index=False)