In [None]:
# Import relevant libraries

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# pearsonr is imported from the public scipy.stats namespace; the former
# `scipy.stats.stats` path is a deprecated private module.
from scipy.stats import norm, pearsonr, skew

# Apply seaborn's default plot theme. The call parentheses are required:
# a bare `sns.set` only references the function and changes nothing.
sns.set()

# Switch the working directory to ../utils before importing the project
# helper modules (presumably so they resolve from the working directory --
# confirm). Relative paths used after this point are resolved from there.
os.chdir('../utils')
import utils_correlations
import utils_correlation_activations
import utils_evaluation
import utils_eda

In [None]:
# Load the scenario dataset and the series-type mapping from the raw input folder

data_dir = '../data/raw_in/'
file_name = 'Risques/dataset_final_scenario_4.csv'
mapping_name = 'Risques 2/final_mapping_candidat.csv'

dataset_path = os.path.join(data_dir, file_name)
mapping_path = os.path.join(data_dir, mapping_name)

# The first CSV column of the dataset is used as the row index.
df = pd.read_csv(dataset_path, index_col=0)
mapping = pd.read_csv(mapping_path)

In [None]:
# Report the dataset dimensions, then how many series fall under each mapping Type
n_entries, n_series = df.shape
print(f'the dataframe consists of {n_entries} entries over {n_series} series')
print('-' * 55)
print(mapping['Type'].value_counts())

In [None]:
# --- step 1: identify the different types of series
# Every column is renamed to '<Type>_<original name>', pairing the columns
# with mapping.Type by position.
# NOTE: the new names are built from the current df.columns, so running this
# cell a second time in the same kernel prefixes the names again.
df.columns = ['_'.join((str(typ), str(col))) for col, typ in zip(df.columns, mapping.Type)]

# Each sub-frame keeps the columns whose (prefixed) name contains the tag.
# --- Share prices & Stock indexes
df_stock = df.filter(like='STOCK')
# --- OAT bond (obligations assimilables du Trésor) prices
df_bond = df.filter(like='BOND')
# --- Exchange rate
df_xchang = df.filter(like='FXRATE')
# --- Interest rates
df_yieldc = df.filter(like='YIELD_CURVE')
# --- Commodity price
df_commod = df.filter(like='COMMO_CURVE_FO')
# --- CDS Spread
df_cdsb = df.filter(like='CDS_BASKET_ZC')

In [None]:
# First glance at the missing values: heatmap of the null mask of the
# exchange-rate frame (one cell per entry/series pair)

missing_mask = df_xchang.isnull()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(missing_mask, ax=ax, cbar=False)

In [None]:
# First look at the correlations

# Restrict the heatmap to the first N_BONDS bond series.
# (The previous drop loop kept column positions 0..10, i.e. 11 series,
# which contradicted the "10 bonds" title.)
N_BONDS = 10
df_bondd = df_bond.iloc[:, :N_BONDS].copy()

corr = df_bondd.corr()

# Tick labels: the part of each column name that precedes '~'.
tick_labels = [name.split('~')[0] for name in corr.columns]

fig, ax = plt.subplots(figsize=(6, 5))
# The title reports the number of series actually plotted.
ax.set_title(f'correlation plot for {df_bondd.shape[1]} bonds')
sns.heatmap(corr,
            xticklabels=tick_labels,
            yticklabels=tick_labels,
            ax=ax,
            center=0, annot=True)

In [None]:
# Analyse how the correlation coefficient evolves for two highly correlated series

WINDOW = 10  # number of consecutive rows in each correlation window
bond_pair = df[['BOND_OAT41J4.5~PRICE', 'BOND_OAT4J55~PRICE']]

# Row position of the first non-missing value of the first bond:
# reset_index() yields a 0..n-1 index, so first_valid_index() is positional.
start = df.reset_index()['BOND_OAT41J4.5~PRICE'].first_valid_index()

# One correlation coefficient per non-overlapping window of WINDOW rows.
# The stop bound is `len - WINDOW + 1` so that a final complete window ending
# exactly on the last row is included (the former `i < len - 10` test skipped
# it); incomplete trailing windows are still left out.
correlations = [
    bond_pair.iloc[i : i + WINDOW].corr().iloc[0, 1]
    for i in range(start, len(bond_pair) - WINDOW + 1, WINDOW)
]

plt.title('Evolution of correlation coefficient through time')
plt.plot(correlations, 'r')  # one point per window, in chronological order
plt.ylabel('coef')
plt.xlabel(f'{WINDOW} day windows')
plt.show()

In [None]:
# Plot the two bond price series on the same axes
bond_names = ('BOND_OAT41J4.5~PRICE', 'BOND_OAT4J55~PRICE')

t = df.index  # kept for reference; it is not passed to plt.plot below
a, b = (df[name].values for name in bond_names)

plt.title('Behaviour of two correlated bonds')

# Only the y values are given, so the x axis is the row position.
for prices, colour in ((a, 'r'), (b, 'b')):
    plt.plot(prices, colour)

plt.legend(bond_names)
plt.ylabel('Price')
plt.xlabel('time')
plt.show()

Study of the data set

In [None]:
# Run study_dataset on each per-type frame, then on the full dataset.
# NOTE(review): study_dataset is neither defined nor imported by name in the
# visible cells (only `import utils_eda` etc. appear) -- presumably it lives in
# one of the utils_* modules; confirm, otherwise this cell raises NameError on
# a fresh kernel.
study_dataset(df_stock)
study_dataset(df_bond)
study_dataset(df_xchang)
study_dataset(df_yieldc)
study_dataset(df_commod)
study_dataset(df_cdsb)
study_dataset(df)

In [None]:
# Run study_dataset_crop on each per-type frame (the full dataset is not
# included here).
# NOTE(review): study_dataset_crop is neither defined nor imported by name in
# the visible cells -- presumably provided by one of the utils_* modules;
# confirm where it comes from and what "crop" removes.
study_dataset_crop(df_stock)
study_dataset_crop(df_bond)
study_dataset_crop(df_xchang)
study_dataset_crop(df_yieldc)
study_dataset_crop(df_commod)
study_dataset_crop(df_cdsb)

In [None]:
# Run missing_values on each per-type frame, then on the full dataset.
# NOTE(review): missing_values is neither defined nor imported by name in the
# visible cells -- presumably provided by one of the utils_* modules; confirm,
# otherwise this cell raises NameError on a fresh kernel.
missing_values(df_stock)
missing_values(df_bond)
missing_values(df_xchang)
missing_values(df_yieldc)
missing_values(df_commod)
missing_values(df_cdsb)
missing_values(df)

In [None]:
# Run missing_values_crop on each per-type frame (the full dataset is not
# included here).
# NOTE(review): missing_values_crop is neither defined nor imported by name in
# the visible cells -- presumably provided by one of the utils_* modules;
# confirm where it comes from and what "crop" removes.
missing_values_crop(df_stock)
missing_values_crop(df_bond)
missing_values_crop(df_xchang)
missing_values_crop(df_yieldc)
missing_values_crop(df_commod)
missing_values_crop(df_cdsb)