# Getting Raw Data into Dataframes


In [21]:
# basic imports
from IPython.display import display, Markdown, HTML
import numpy as np



## Import Spot Yield Data 

In [22]:
import pandas as pd

# Both workbooks share the same layout, so the load-and-label steps live in one
# helper instead of being copy-pasted per file.
SPOT_CURVE_SHEET = "4. spot curve"


def load_spot_curve(path):
    """Read the spot-curve sheet of one workbook.

    Parameters
    ----------
    path : str
        Path of the .xlsx workbook to read.

    Returns
    -------
    (data, header) : tuple of DataFrame
        ``data`` holds the sheet contents from the sixth sheet row onwards, with
        its columns labelled from ``header``. ``header`` is the one-row frame
        read from the fourth sheet row, with its first cell replaced by "Date".
    """
    # the data proper: skip the first five sheet rows, no header row
    data = pd.read_excel(path, sheet_name=SPOT_CURVE_SHEET, engine="openpyxl", skiprows=5, header=None)
    # the single row used as column labels: skip three rows, read one
    header = pd.read_excel(path, sheet_name=SPOT_CURVE_SHEET, engine="openpyxl", skiprows=3, nrows=1, header=None)
    # label the first column explicitly
    header[0] = "Date"
    data.columns = header.iloc[0]
    return data, header


# load in first spreadsheet to df1 (headers kept in col_names)
df1, col_names = load_spot_curve("GLC Nominal month end data_1970 to 2015.xlsx")
# load in second spreadsheet to df2 (headers kept in col_names2)
df2, col_names2 = load_spot_curve("GLC Nominal month end data_2016 to present.xlsx")

In [23]:
# stack df1 on top of df2 into a single frame `df`, renumbering the rows from 0
df = pd.concat((df1, df2), ignore_index=True)
print(f"The length of combined dataframe is {len(df)} rows")

The length of combined dataframe is 660 rows


## Perform Basic Reasonableness Checks
### A Check on the Number of Rows

In [24]:
# producing some sense checks on the rows of `df` that came from df1
display(Markdown("**Checking Dataframe 1 -  1970 to 2015**"))
# df1 occupies rows 0 .. len(df1)-1 of the combined frame; derive the last row
# from len(df1) rather than hard-coding its position.
df1_first_date = df.iloc[0, 0]
df1_last_date = df.iloc[len(df1) - 1, 0]
print(f"the first dates is {df1_first_date.strftime('%Y-%m-%d')} and the last is {df1_last_date.strftime('%Y-%m-%d')}")
print("one would therefore expect 12 x 46yrs = 552 entries")
print("and indeed we see the number of rows in df is " + str(len(df1)))

**Checking Dataframe 1 -  1970 to 2015**

the first dates is 1970-01-31 and the last is 2015-12-31
one would therefore expect 12 x 46yrs = 552 entries
and indeed we see the number of rows in df is 552


In [25]:
# Sense check on the rows of `df` that came from df2.
# The heading previously said "2015 to present", but this workbook starts in 2016.
display(Markdown("**Checking Dataframe 2 -  2016 to present**"))
# df2 starts right after df1 in the combined frame and runs to its final row.
# Positions are derived rather than hard-coded because the "to present" workbook
# gains rows over time.
df2_first_date = df.iloc[len(df1), 0]
df2_last_date = df.iloc[-1, 0]
print(f"the first dates is {df2_first_date.strftime('%Y-%m-%d')} and the last is {df2_last_date.strftime('%Y-%m-%d')}")
# one row per month between the first and last date, inclusive
df2_expected_rows = (
    (df2_last_date.year - df2_first_date.year) * 12
    + (df2_last_date.month - df2_first_date.month)
    + 1
)
df2_whole_years, df2_extra_months = divmod(df2_expected_rows, 12)
if df2_extra_months == 0:
    print(f"one would therefore expect 12 x {df2_whole_years}yrs = {df2_expected_rows} entries")
else:
    print(f"one would therefore expect 12 x {df2_whole_years}yrs + {df2_extra_months} = {df2_expected_rows} entries")
print("and indeed we see the number of rows in df is " + str(len(df2)))

**Checking Dataframe 2 -  2015 to present**

the first dates is 2016-01-31 and the last is 2024-12-31
one would therefore expect 12 x 9yrs = 108 entries
and indeed we see the number of rows in df is 108


### A Check on the Sum of Values

In [26]:
display(Markdown("**sum of values check**"))
# Totals read off the spreadsheets by hand. Named once so the printed text and
# the arithmetic below cannot drift apart.
MANUAL_SUM_SHEET_1 = 191503.172322029
MANUAL_SUM_SHEET_2 = 17844.9993308767

# --- first spreadsheet ---
print(f"manual inspection of the sum of all values in first spreadsheet is {MANUAL_SUM_SHEET_1}")
# column 0 is the date, so only columns 1.. are summed
print("the sume of 1st dataframe is also " + str(df1.iloc[:, 1:].sum().sum()))
# TODO(review): the explanation below is still a placeholder -- presumably
# floating-point rounding; confirm and complete the sentence.
display(HTML("<span style='color:red;'>the very minor differences are because ....</span>"))

# --- second spreadsheet ---
print(f"the sum of all values in second spreadsheet is {MANUAL_SUM_SHEET_2}")
# this line reports df2, so it is labelled "2nd" (it previously said "1st")
print("the sume of 2nd dataframe is also " + str(df2.iloc[:, 1:].sum().sum()))
display(HTML("<span style='color:red;'>the very minor differences are because ....</span>"))

# --- combined frame against the sum of the two manual totals ---
print("the sum of combined dataframe is  " + str(df.iloc[:, 1:].sum().sum()))
combined_total = MANUAL_SUM_SHEET_1 + MANUAL_SUM_SHEET_2
print(f"and the sum of the manually observed {MANUAL_SUM_SHEET_1} + {MANUAL_SUM_SHEET_2} = {combined_total}")

**sum of values check**

manual inspection of the sum of all values in first spreadsheet is 191503.172322029
the sume of 1st dataframe is also 191503.17232202887


the sum of all values in second spreadsheet is 17844.9993308767
the sume of 1st dataframe is also 17844.999330876683


the sum of combined dataframe is  209348.17165290558
and the sum of the manually observed 191503.172322029 + 17844.9993308767 = 209348.1716529057


## Calculate Spot Yield Differences
### Purpose
We want to calculate the natural log differences of spot yields.

### .apply() function
We want to use the np.log() function. It can be applied directly to an individual value;
to apply it element-wise to a pandas Series or DataFrame we use .apply(np.log).

### Further complications with dtype:object
Sometimes pandas treats values as generic Python objects rather than efficient numeric types, even if they look like floats.
This seems to happen when slicing rows.
A fix is to use .astype(float) before applying functions like np.log.




### Combined Dataframe

In [27]:
# preview the first five rows of the combined frame
df.head(n=5)

Unnamed: 0,Date,0.5,1,1.5,2,2.5,3,3.5,4,4.5,...,35.5,36,36.5,37,37.5,38,38.5,39,39.5,40
0,1970-01-31,,8.635354,8.70743,8.700727,8.664049,8.618702,8.572477,8.528372,8.487617,...,,,,,,,,,,
1,1970-02-28,,8.413131,8.397269,8.370748,8.337633,8.30159,8.265403,8.230804,8.198713,...,,,,,,,,,,
2,1970-03-31,,7.744187,7.782761,7.795017,7.793104,7.784963,7.775288,7.766459,7.759564,...,,,,,,,,,,
3,1970-04-30,,7.606512,7.864352,7.973522,8.002442,7.992813,7.967524,7.938335,7.911422,...,,,,,,,,,,
4,1970-05-31,,7.391107,7.735838,7.862182,7.87751,7.840673,7.782249,7.718053,7.656856,...,,,,,,,,,,


### Applying Natural Log to the first row          







In [28]:
# First row of the combined frame without its date column, shown three ways:
#   a -- exactly as sliced
#   b -- cast to float
#   c -- cast to float, then natural log
# Each is turned back into a one-row frame (.to_frame().T) so it renders as a wide table.
_first_row = df.iloc[0, 1:]
a = _first_row.to_frame().T
b = _first_row.astype(float).to_frame().T
c = _first_row.astype(float).apply(np.log).to_frame().T

for _caption, _frame in (
    ("__first row no adjustments:__", a),
    ("__converted all to floats:__ ", b),
    ("__applied np.log:__", c),
):
    display(Markdown(_caption))
    display(HTML(_frame.to_html()))




__first row no adjustments:__

Unnamed: 0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10,10.5,11,11.5,12,12.5,13,13.5,14,14.5,15,15.5,16,16.5,17,17.5,18,18.5,19,19.5,20,20.5,21,21.5,22,22.5,23,23.5,24,24.5,25,25.5,26,26.5,27,27.5,28,28.5,29,29.5,30,30.5,31,31.5,32,32.5,33,33.5,34,34.5,35,35.5,36,36.5,37,37.5,38,38.5,39,39.5,40
0,,8.635354,8.70743,8.700727,8.664049,8.618702,8.572477,8.528372,8.487617,8.450611,8.417442,8.388098,8.362503,8.340549,8.322116,8.307105,8.295429,8.287013,8.281788,8.279691,8.280665,8.284653,8.291604,8.301467,8.314193,8.329735,8.348046,8.369082,8.392797,8.419147,8.448088,8.479578,8.513574,8.550034,8.588914,8.630173,8.67377,8.719662,8.767809,8.818169,8.8707,8.925363,8.982115,9.040916,9.101725,9.164502,9.229205,9.295793,9.364219,9.434411,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


__converted all to floats:__ 

Unnamed: 0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10,10.5,11,11.5,12,12.5,13,13.5,14,14.5,15,15.5,16,16.5,17,17.5,18,18.5,19,19.5,20,20.5,21,21.5,22,22.5,23,23.5,24,24.5,25,25.5,26,26.5,27,27.5,28,28.5,29,29.5,30,30.5,31,31.5,32,32.5,33,33.5,34,34.5,35,35.5,36,36.5,37,37.5,38,38.5,39,39.5,40
0,,8.635354,8.70743,8.700727,8.664049,8.618702,8.572477,8.528372,8.487617,8.450611,8.417442,8.388098,8.362503,8.340549,8.322116,8.307105,8.295429,8.287013,8.281788,8.279691,8.280665,8.284653,8.291604,8.301467,8.314193,8.329735,8.348046,8.369082,8.392797,8.419147,8.448088,8.479578,8.513574,8.550034,8.588914,8.630173,8.67377,8.719662,8.767809,8.818169,8.8707,8.925363,8.982115,9.040916,9.101725,9.164502,9.229205,9.295793,9.364219,9.434411,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


__applied np.log:__

Unnamed: 0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10,10.5,11,11.5,12,12.5,13,13.5,14,14.5,15,15.5,16,16.5,17,17.5,18,18.5,19,19.5,20,20.5,21,21.5,22,22.5,23,23.5,24,24.5,25,25.5,26,26.5,27,27.5,28,28.5,29,29.5,30,30.5,31,31.5,32,32.5,33,33.5,34,34.5,35,35.5,36,36.5,37,37.5,38,38.5,39,39.5,40
0,,2.155865,2.164177,2.163407,2.159182,2.153934,2.148557,2.143398,2.138608,2.134239,2.130306,2.126814,2.123758,2.121129,2.118917,2.117111,2.115705,2.11469,2.114059,2.113806,2.113923,2.114405,2.115243,2.116432,2.117964,2.119832,2.122028,2.124544,2.127374,2.130508,2.13394,2.137661,2.141662,2.145935,2.150472,2.155265,2.160303,2.16558,2.171087,2.176814,2.182754,2.188897,2.195235,2.201761,2.208464,2.215338,2.222373,2.229562,2.236896,2.244364,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Applying Natural Log to the Whole DataFrame






In [29]:
# Natural log of every value in the combined frame (date column excluded).
# The cast to float comes first so np.log receives numeric data, not objects.
df_logged = df.iloc[:, 1:].astype(float).apply(np.log)

# A bare `df_logged` used to sit here; mid-cell it displayed nothing, so it was
# removed. Use df_logged.head() in its own cell to inspect the frame.
display(Markdown("The sum of all the individual 'logged' values"))
df_logged.sum().sum()

The sum of all the individual 'logged' values

np.float64(51133.746445601464)

### Checking the Log Calculation
given that:
$
\sum_i \log(x_i) = \log\left( \prod_i x_i \right)
$
we can perform a check on the log calculation.
However, taking the product of all the values at once doesn't work:
there are so many values that the product overflows.
For the product side of the equation we can instead break the calculation into chunks
to make it more manageable.
We therefore calculate the product for each row,
then take the log of each row product,
then sum the logs of the row products.
#### The Product of All Entries




In [30]:
# Compute each quantity once and reuse it; the same row product was previously
# recomputed for every print and again for every column of `logcheck`.
row_products = df.iloc[:, 1:].product(axis=1)      # one product per row (date column excluded)
log_row_products = row_products.apply(np.log)      # log of each row's product
sum_log_row_products = log_row_products.sum()      # single grand total

# NOTE(review): the total recorded for this cell (51053.79) does not equal the
# sum of the individual logs recorded earlier (51133.75), so the identity check
# does not currently pass. Possibly non-positive yields, which np.log cannot
# handle but the product still includes -- unconfirmed, worth investigating.

display(Markdown("the product of each row"))
print(row_products)
display(Markdown("the log of each row product"))
print(log_row_products)
display(Markdown("the sum of log of row products"))
print(sum_log_row_products)


# Side-by-side view; the scalar grand total is repeated on every row.
logcheck = pd.DataFrame({
    "the product of each row": row_products,
    "the log of each row product": log_row_products,
    "the sum of log of row products": sum_log_row_products,
})


logcheck















the product of each row

0      6.097911e+45
1      1.220385e+45
2      4.668193e+44
3      8.629185e+45
4      4.803156e+45
           ...     
655    3.951460e+50
656    8.433770e+50
657    2.465667e+53
658    2.347661e+52
659    6.731017e+54
Length: 660, dtype: float64


the log of each row product

0      105.424275
1      103.815495
2      102.854516
3      105.771479
4      105.185602
          ...    
655    116.503340
656    117.261498
657    122.939472
658    120.587844
659    126.246321
Length: 660, dtype: float64


the sum of log of row products

51053.785440947686


Unnamed: 0,the product of each row,the log of each row product,the sum of log of row products
0,6.097911e+45,105.424275,51053.785441
1,1.220385e+45,103.815495,51053.785441
2,4.668193e+44,102.854516,51053.785441
3,8.629185e+45,105.771479,51053.785441
4,4.803156e+45,105.185602,51053.785441
...,...,...,...
655,3.951460e+50,116.503340,51053.785441
656,8.433770e+50,117.261498,51053.785441
657,2.465667e+53,122.939472,51053.785441
658,2.347661e+52,120.587844,51053.785441


## Looking at Dataframes in Spreadsheet

import os

import subprocess

# Frames to dump, one sheet each, named after the variable.
# They are looked up by name because several of them (df_numeric, df_converted,
# mask_problem, filtered) are not created anywhere in this notebook. A direct
# reference raises NameError on a fresh kernel; here a missing frame is reported
# and skipped instead.
EXPORT_SHEETS = [
    "df1",
    "df2",
    "df",
    "df_numeric",
    "df_converted",
    "mask_problem",
    "filtered",
    "col_names",
    "col_names2",
]

with pd.ExcelWriter("output.xlsx", engine="openpyxl") as writer:
    for sheet_name in EXPORT_SHEETS:
        frame = globals().get(sheet_name)
        if frame is None:
            print(f"skipping sheet '{sheet_name}': no variable of that name in this session")
            continue
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Open the workbook in LibreOffice without blocking the kernel.
# Popen with an argument list avoids the shell (and the `disown` builtin);
# start_new_session detaches the viewer from the kernel's session.
try:
    subprocess.Popen(["libreoffice", "--calc", "output.xlsx"], start_new_session=True)
except FileNotFoundError:
    print("libreoffice was not found on PATH; open output.xlsx manually")


## Unsorted

In [31]:
# second row of the combined frame (date column dropped) as a plain numpy array
row = df.iloc[1, 1:].to_numpy()

_row_min = np.nanmin(row)  # Ignores NaN
print("Min:", _row_min)

_has_non_positive = np.any(row <= 0)  # Will trigger log error
print("Any <= 0:", _has_non_positive)

Min: 8.048354459961853
Any <= 0: False


  print("Any <= 0:", np.any(row <= 0))  # Will trigger log error


## a check on the log calculation
$
\sum_i \log(x_i) = \log\left( \prod_i x_i \right)
$

In [32]:
# second row of the combined frame, date column dropped
row = df.iloc[1, 1:]

# collect every entry that is neither a Python float nor a numpy float64
bad_values = []
for _value in row:
    if isinstance(_value, (float, np.float64)):
        continue
    bad_values.append(_value)

print(bad_values)

[]


```python
for i, val in enumerate(row):
    try:
        np.log(val)
    except Exception as e:
        print(f"Column {row.index[i]}: value={val!r} -> {type(val)} caused error: {e}")
```


In [33]:
# Demonstration of the failure: np.log applied straight to a sliced row raises
# TypeError (the recorded message complains about a numpy.float64 with no
# callable log method). The error is caught and printed so the point is still
# made without halting Restart & Run All at this cell.
try:
    display(np.log(df.iloc[1, 1:].reset_index(drop=True)))
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: loop of ufunc does not support argument 0 of type numpy.float64 which has no callable log method

In [None]:
# coerce the second row to numbers (errors='coerce'), then log the underlying array
_second_row_numeric = pd.to_numeric(df.iloc[1, 1:], errors='coerce')
np.log(_second_row_numeric.values)

In [None]:
# whole-frame version: stack into one long Series, coerce to numbers,
# unstack back into a table, then take the log
_stacked_values = df.iloc[:, 1:].stack()
_numeric_table = pd.to_numeric(_stacked_values, errors='coerce').unstack()
np.log(_numeric_table)

pd.to_numeric(..., errors='coerce') converts anything non-numeric to NaN

.values or .unstack() strips weird index metadata

np.log(...) then applies safely

In [None]:
# show the first five rows of the combined frame again
df.head(n=5)