# Data manipulation

Goal: Create dataframes to that are clean and ready for the tasks at hand.

In [124]:
### Installing the required packages if not already installed
packages = ['numpy', 'pandas', 'warnings', 'sqlite3']

for package in packages:
    try:
        __import__(package)
    except ImportError:
        %pip install {package}

import numpy as np
import pandas as pd
import warnings
import sqlite3

### Ignoring the warnings
warnings.filterwarnings('ignore')

### Connecting to the database
conn = sqlite3.connect('data.db')

### Reading all the data we have to clean

In [125]:
### Reading the datasets
annual_80_20 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/annual_stock_returns_ose.csv",
                   sep=';', encoding='latin1')

monthly_80_20 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/monthly_stock_returns_ose.csv",
                      sep=';', encoding='latin1')

daily_80_90 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/daily_stock_returns_ose_csv/daily_stock_returns_ose_1980_1989.csv",
                        sep=';', encoding='latin1')

daily_90_00 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/daily_stock_returns_ose_csv/daily_stock_returns_ose_1990_1999.csv",
                        sep=';', encoding='latin1')

daily_00_10 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/daily_stock_returns_ose_csv/daily_stock_returns_ose_2000_2009.csv",
                        sep=';', encoding='latin1')

daily_10_20 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/daily_stock_returns_ose_csv/daily_stock_returns_ose_2010_2020.csv",
                        sep=';', encoding='latin1')

daily_20_24 = pd.read_csv('/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/ose_equity_euronext_data/daily_ose_stocks_nov_2020_aug_2024.csv',
                         sep=';', encoding='latin1')

monthly_20_24 = pd.read_csv('/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/ose_equity_euronext_data/monthly_ose_stocks_nov_2020_aug_2024.csv',
                         sep=',', encoding='latin1')

#### Getting all the tickers and names

In [126]:
### Saving all unique ticker and Last_Sec_Name in a new dataframe
all_tickers = pd.concat(
    [
    annual_80_20[['ticker', 'Last_Sec_Name']], monthly_80_20[['ticker', 'Last_Sec_Name']],
    daily_80_90[['ticker', 'Last_Sec_Name']], daily_90_00[['ticker', 'Last_Sec_Name']],
    daily_00_10[['ticker', 'Last_Sec_Name']], daily_10_20[['ticker', 'Last_Sec_Name']],
    daily_20_24[['ticker', 'Name']].rename(columns={'Name': 'Last_Sec_Name'}), 
    monthly_20_24[['ticker', 'Name']].rename(columns={'Name': 'Last_Sec_Name'})
    ]
                        ).drop_duplicates(subset='ticker').sort_values(by='ticker').reset_index(drop=True).dropna(subset=['ticker'])

all_tickers

Unnamed: 0,ticker,Last_Sec_Name
0,2020,2020 Bulkers
1,5PG,5th Planet Games
2,AASB,Aasen Sparebank
3,AAT,Aust-Agder Trafikkselskap
4,AAV,Adresseavisen
...,...,...
1290,ZAP,Zaptec
1291,ZENA,Zenith Energy
1292,ZENT,Zenitel
1293,ZONC,Zoncolan


In [127]:
### Looking at the values in monthly_80_20 where the ticker is NaN
monthly_80_20[monthly_80_20['ticker'].isnull()]["Last_Sec_Name"].value_counts()

Last_Sec_Name
Kongsberg Automotive    180
Northern Offshore        96
EVRY                     30
BW Energy Limited         4
Northern Ocean Ltd.       4
Atlantic Sapphire         1
Pexip Holding             1
Name: count, dtype: int64

Only the first 3 have enough trading days, the rest will be removed. 

In [128]:
### If the ticker is NaN and the last_sec_name is 'Kongsberg Automotive' then the ticker is 'KOA'
monthly_80_20.loc[monthly_80_20['Last_Sec_Name'] == 'Kongsberg Automotive', 'ticker'] = 'KOA'

### If the ticker is NaN and the last_sec_name is 'Northern Offshore ' then the ticker is 'NOL'
monthly_80_20.loc[monthly_80_20['Last_Sec_Name'] == 'Northern Offshore', 'ticker'] = 'NOL'

### If the ticker is NaN and the last_sec_name is 'EVRY' then the ticker is 'EVRY'
monthly_80_20.loc[monthly_80_20['Last_Sec_Name'] == 'EVRY', 'ticker'] = 'EVRY'

### If the ticker is NaN, we remove the row
monthly_80_20 = monthly_80_20.dropna(subset=['ticker'])

### Looking at the values in monthly_80_20 where the ticker is NaN
monthly_80_20[monthly_80_20['ticker'].isnull()]["Last_Sec_Name"].value_counts()

Series([], Name: count, dtype: int64)

In [129]:
### Looking at the values in monthly_20_24 where the ticker is NaN
monthly_20_24[monthly_20_24['ticker'].isnull()]["Name"].value_counts()

Series([], Name: count, dtype: int64)

In [130]:
### Looking at the values in daily_80_90 where the ticker is NaN
daily_80_90[daily_80_90['ticker'].isnull()]["Last_Sec_Name"].value_counts()

Series([], Name: count, dtype: int64)

In [131]:
### Looking at the values in daily_90_00 where the ticker is NaN
daily_90_00[daily_90_00['ticker'].isnull()]["Last_Sec_Name"].value_counts()

Series([], Name: count, dtype: int64)

In [132]:
### Looking at the values in daily_00_10 where the ticker is NaN
daily_00_10[daily_00_10['ticker'].isnull()]["Last_Sec_Name"].value_counts()

Last_Sec_Name
Kongsberg Automotive    1138
Northern Offshore        584
Name: count, dtype: int64

In [133]:
### If the ticker is NaN and the last_sec_name is 'Kongsberg Automotive' then the ticker is 'KOA'
daily_00_10.loc[daily_00_10['Last_Sec_Name'] == 'Kongsberg Automotive', 'ticker'] = 'KOA'

### If the ticker is NaN and the last_sec_name is 'Northern Offshore ' then the ticker is 'NOL'
daily_00_10.loc[daily_00_10['Last_Sec_Name'] == 'Northern Offshore', 'ticker'] = 'NOL'

### Looking at the values in daily_00_10 where the ticker is NaN
daily_00_10[daily_00_10['ticker'].isnull()]["Last_Sec_Name"].value_counts()

Series([], Name: count, dtype: int64)

In [134]:
### Looking at the values in daily_10_20 where the ticker is NaN
daily_10_20[daily_10_20['ticker'].isnull()]["Last_Sec_Name"].value_counts()

Last_Sec_Name
Kongsberg Automotive    2631
Northern Offshore       1403
EVRY                     618
BW Energy Limited         88
Northern Ocean Ltd.       82
Atlantic Sapphire         38
Pexip Holding             31
Name: count, dtype: int64

Only the first 3 have enough trading days, the rest will be removed. 

In [135]:
### If the ticker is NaN and the last_sec_name is 'Kongsberg Automotive' then the ticker is 'KOA'
daily_10_20.loc[daily_10_20['Last_Sec_Name'] == 'Kongsberg Automotive', 'ticker'] = 'KOA'

### If the ticker is NaN and the last_sec_name is 'Northern Offshore ' then the ticker is 'NOL'
daily_10_20.loc[daily_10_20['Last_Sec_Name'] == 'Northern Offshore', 'ticker'] = 'NOL'

### If the ticker is NaN and the last_sec_name is 'EVRY' then the ticker is 'EVRY'
daily_10_20.loc[daily_10_20['Last_Sec_Name'] == 'EVRY', 'ticker'] = 'EVRY'

### If the ticker is NaN, we remove the row
daily_10_20 = daily_10_20.dropna(subset=['ticker'])

### Looking at the values in daily_10_20 where the ticker is NaN
daily_10_20[daily_10_20['ticker'].isnull()]["Last_Sec_Name"].value_counts()

Series([], Name: count, dtype: int64)

In [136]:
### Looking at the values in daily_20_24 where the ticker is NaN
daily_20_24[daily_20_24['ticker'].isnull()]["Name"].value_counts()

Series([], Name: count, dtype: int64)

### Monthly data 

We will start with monthly data, and filter out the tickers we need going forward from this. Since we will base the analysis with monthly rebalancing, this will be our main data.

The monthly data for the stocks are missing data between 2020-06 - 2020-12, and unfortuently they are missing from the daily data as well. We will try our best downloading some of them from yahoo finance and CIQ. 

In [137]:
### Removing all columns expect ticker, Date and Return
monthly_80_20 = monthly_80_20[['ticker', 'Date', 'MonthlyReturn', 'NoShares', 'LastPrice']]
monthly_20_24 = monthly_20_24[['ticker', 'Date', 'Return', 'SharesOutstanding', 'Price']]

### Combining the datasets but MonthlyReturn is renamed to Return and LastPrice is renamed to Price
monthly = pd.concat([monthly_80_20.rename(columns={'MonthlyReturn': 'Return', 'LastPrice': 'Price', 'NoShares': 'SharesOutstanding'}), monthly_20_24])

### Fixing the dates
monthly['Date'] = pd.to_datetime(monthly['Date'], format='%Y%m%d')

### Gathering all the dates from the datasets
dates = monthly[['Date']].drop_duplicates().sort_values(by='Date').reset_index(drop=True)

### Adding dates of 2020-07-31, 2020-08-31, 2020-09-30, 2020-10-31, 2020-11-30 to the dates
dates = pd.concat([dates, pd.DataFrame({'Date': pd.to_datetime(['2020-07-31', '2020-08-31', '2020-09-30', '2020-10-31', '2020-11-30'])})]).drop_duplicates().sort_values(by='Date').reset_index(drop=True)

### Creating a dataframe with all the dates and tickers
returns_monthly = pd.DataFrame(np.zeros((len(dates), len(all_tickers))), columns=all_tickers['ticker'], index=dates['Date'])
noshares_monthly = returns_monthly.copy()
mcap_monthly    = returns_monthly.copy()
prices_monthly  = returns_monthly.copy()

In [138]:
### Filling the dataframe with the returns
for i, row in monthly.iterrows():
    returns_monthly.loc[row['Date'], row['ticker']] = row['Return']
    noshares_monthly.loc[row['Date'], row['ticker']] = row['SharesOutstanding']
    prices_monthly.loc[row['Date'], row['ticker']] = row['Price']
    mcap_monthly.loc[row['Date'], row['ticker']] = row['SharesOutstanding'] * row['Price']

### Replacing all 0 with NaN
returns_monthly.replace(0, np.nan, inplace=True)
noshares_monthly.replace(0, np.nan, inplace=True)
mcap_monthly.replace(0, np.nan, inplace=True)
prices_monthly.replace(0, np.nan, inplace=True)

### Looking at our data
returns_monthly

ticker,2020,5PG,AASB,AAT,AAV,ABG,ABL,ABS,ABT,ABTEC,...,WWL,XPLRA,XXL,YAR,ZAL,ZAP,ZENA,ZENT,ZONC,ZWIPE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980-01-31,,,,,,,,,,,...,,,,,,,,,,
1980-02-29,,,,,,,,,,,...,,,,,,,,,,
1980-03-31,,,,,,,,,,,...,,,,,,,,,,
1980-04-30,,,,0.0417,,,,,,,...,,,,,,,,,,
1980-05-31,,,,0.4600,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-30,0.4181,0.1030,-0.0273,,,0.0362,-0.0288,-0.0180,,0.1491,...,,,-0.1819,-0.0741,0.0294,-0.2237,-0.1455,,,-0.4175
2024-05-31,-0.0439,-0.1209,-0.0569,,,0.1034,0.0932,0.0510,,-0.1851,...,,-0.0588,-0.0897,0.0391,0.1086,0.0112,-0.0468,,,-0.0067
2024-06-30,-0.0377,-0.0937,0.0631,,,-0.0391,-0.0280,0.0194,,0.0313,...,,-0.0820,0.1943,-0.0532,0.0180,-0.0685,-0.0035,,,-0.1779
2024-07-31,-0.0267,-0.0690,-0.0319,,,0.0699,0.0370,-0.0095,,-0.1595,...,,0.1872,-0.0024,0.0111,-0.1013,-0.0909,-0.0211,,,0.0776


We are missing some data in 2020, lets get this, but first we need to fill in the numbers of shares for 2020. We will here interpolate the noshares between June 2020 and December 2020.

So what we do here, is that if we have a number from 2020-06 and 2020-12, we interpolate to get the numbers in between. If we do not have a number for both, we simply will the missing values with the number we have.

In [139]:
dummy = noshares_monthly.loc['2020-06-30':'2020-12-31'].copy()

for i in range(dummy.shape[1]):
    if not np.isnan(dummy.iloc[6, i]) or not np.isnan(dummy.iloc[0, i]):
        if not np.isnan(dummy.iloc[6, i]) and not np.isnan(dummy.iloc[0, i]):
            dummy.iloc[:, i] = dummy.iloc[:, i].interpolate()
        elif not np.isnan(dummy.iloc[6, i]):
            dummy.iloc[1:6, i] = dummy.iloc[6, i]
        elif not np.isnan(dummy.iloc[0, i]):
            dummy.iloc[1:6, i] = dummy.iloc[0, i]
            
noshares_monthly.loc['2020-06-30':'2020-12-31'] = dummy

Now we can start to download the missing data.

In [140]:
### Getting the tickers 2020
data_2020 = returns_monthly.loc['2020']
data_2020 = data_2020.dropna(axis=1, how='all')
tickers_2020 = data_2020.columns

### Getting the companies names for the tickers
companies_2020 = all_tickers[all_tickers['ticker'].isin(tickers_2020)][['ticker', 'Last_Sec_Name']].set_index('ticker')
companies_2020

Unnamed: 0_level_0,Last_Sec_Name
ticker,Unnamed: 1_level_1
2020,2020 Bulkers
5PG,5th Planet Games
AASB,Aasen Sparebank
ABG,ABG Sundal Collier
ABT,Aqua Bio Technology
...,...
YAR,Yara International
ZAL,Zalaris
ZAP,Zaptec
ZENA,Zenith Energy


In [141]:
### Lets see how many companies yahoo finance has data for
### Adding .OL at the end of the stock names
tickers_to_download = [stock + '.OL' for stock in companies_2020.index]

### Downloading daily returns for the stocks that had returns between 2020-2021 from yahoo finance
import yfinance as yf
data_2020_yf = yf.download(tickers_to_download, start="2020-01-01", end="2020-12-31")["Close"]

### Removing .OL from the stock names
data_2020_yf.columns = [col.replace('.OL', '') for col in data_2020_yf.columns]

### Remove the columns with only NaN values
data_2020_yf.dropna(axis=1, how='all', inplace=True)

### Making sure index is datetime
data_2020_yf.index = pd.to_datetime(data_2020_yf.index)

### Looking at the number of companies
print(data_2020_yf.shape)

### Looking at the data
data_2020_yf

[*********************100%***********************]  325 of 325 completed

120 Failed downloads:
['NOFI.OL', 'TEAM.OL', 'SBANK.OL', 'ELOP.OL', 'NANOV.OL', 'INFRO.OL', 'OCY.OL', 'NORBIT.OL', 'SBLK.OL', 'SADG.OL', 'KOMP.OL', 'SBTE.OL', 'AOW.OL', 'SRBANK.OL', 'DOF.OL', 'EPIC.OL', 'THIN.OL', 'LSTSB.OL', 'SRBNK.OL', 'MRCEL.OL', 'ASETEK.OL', 'VISTIN.OL', 'FJORD.OL', 'FUNCOM.OL', 'SACAM.OL', 'SALMON.OL', 'CSAM.OL', 'AXA.OL', 'ITE.OL', 'TRVX.OL', 'BOUVET.OL', 'NRS.OL', 'OTS.OL', 'BDRILL.OL', 'PROTCT.OL', 'STORM.OL', 'INC.OL', 'STRONG.OL', 'AVANCE.OL', 'OBSERV.OL', 'HYARD.OL', 'FKRFT.OL', 'JPK.OL', 'KAHOT.OL', 'NANO.OL', 'INSR.OL', 'ICE.OL', 'OTELLO.OL', 'ADE.OL', 'PLCS.OL', 'AKERBP.OL', 'AGS.OL', 'ICEGR.OL', 'COV.OL', 'VACC.OL', 'HIDDN.OL', 'BON.OL', 'SSC.OL', 'INFRNT.OL', 'PSKY.OL', 'HOC.OL', 'GIG.OL', 'CRAYON.OL', 'HAFNIA.OL', 'PMG.OL', 'NODL.OL', 'VOLUE.OL', 'AQUA.OL', 'QFR.OL', 'BEL.OL', 'SOLON.OL', 'FIVEPG.OL', 'BIOTEC.OL', 'SIOFF.OL', 'WILS.OL', 'RISH.OL', 'NOR.OL', 'SIKRI.OL', 'DAT.OL', 

(252, 205)


Unnamed: 0_level_0,2020,5PG,AASB,ABG,ACC,ADS,AEGA,AFG,AFK,AGAS,...,WSTEP,WWI,WWIB,XPLRA,XXL,YAR,ZAL,ZAP,ZENA,ZWIPE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,81.099236,0.293681,94.646873,2.383401,,20.670980,-1.072074,134.820602,85.350777,43.144279,...,18.073875,145.023682,150.238083,,808.447449,229.328506,26.309610,,2.180,5.275051
2020-01-03,77.766365,0.302388,94.646873,2.305940,,21.616714,-1.072074,135.992950,83.121468,46.631500,...,18.073875,146.307068,149.807632,,677.683167,225.964447,24.653080,,2.380,5.321733
2020-01-06,77.581238,0.314262,94.646873,2.371484,,21.616714,-1.187974,135.602158,83.121468,44.360752,...,18.073875,145.879272,145.072311,,588.438965,225.964447,25.335182,,2.770,5.228369
2020-01-07,78.136665,0.300805,94.646873,2.389360,,21.076294,-1.178315,133.648254,83.439934,45.415028,...,18.375109,142.029083,145.502777,,650.480408,223.044678,24.360752,,2.720,4.994960
2020-01-08,78.136665,0.300805,94.646873,2.392339,,21.616714,-1.110707,134.820602,84.395363,44.117455,...,18.073875,141.601303,142.489441,,610.869324,226.789612,24.945410,,2.730,4.668187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-22,53.933010,1.650000,107.893791,3.964034,16.850000,16.627333,-3.345936,139.625717,149.405365,37.342751,...,17.761789,142.279739,140.948288,22.995001,1035.020386,245.175293,48.916389,31.250000,0.586,16.511723
2020-12-23,54.676910,1.380000,107.893791,4.035634,17.496000,16.671671,-3.423299,140.783752,147.763550,37.089199,...,17.761789,142.279739,142.704681,23.190001,1041.708618,245.732193,50.865250,35.349998,1.120,16.399399
2020-12-28,53.747040,1.300000,107.893791,4.055161,18.340000,16.804689,-4.100222,143.099823,150.226273,36.636452,...,18.084732,143.589020,143.143753,23.400000,1066.789917,249.908966,50.670364,42.099998,1.840,17.578808
2020-12-29,54.676910,1.230000,111.831528,4.159307,17.639999,16.671671,-4.738464,144.754150,150.226273,36.600239,...,18.246202,144.025497,144.900131,22.495001,1055.085327,248.516693,51.839680,41.099998,1.550,17.204391


In [115]:
### Lets not look at what we can get from S&P CIQ
data_2020_CIQ = pd.read_excel("Data/SPGlobalCIQ_data_2020.xlsx", index_col=0)

### Making sure index is in datetime format
data_2020_CIQ.index = pd.to_datetime(data_2020_CIQ.index)

### Removing -OB drom the tickers
data_2020_CIQ.columns = [col.replace('-OB', '') for col in data_2020_CIQ.columns]

### Putting the columns in alphabetical order
data_2020_CIQ = data_2020_CIQ.reindex(sorted(data_2020_CIQ.columns), axis=1)

### Removing collums with only NaN values
data_2020_CIQ.dropna(axis=1, how='all', inplace=True)

### Looking at the number of companies
print(data_2020_CIQ.shape)

### Looking at the data
data_2020_CIQ

(252, 197)


Unnamed: 0,2020,5PG,AASB,ABG,ABTEC,ACC,ADS,AFG,AFK,AGAS,...,WAWI,WSTEP,WWI,WWIB,XPLRA,XXL,YAR,ZAL,ZAP,ZWIPE
2020-01-02,87.6,0.371,129,4.000,2.36,,30.60,172.5,107.2,53.20,...,24.10,24.0,169.5,174.5,,1694.0,361.3,27.0,,5.65
2020-01-03,84.0,0.382,129,3.870,2.58,,32.00,174.0,104.4,57.50,...,27.20,24.0,171.0,174.0,,1420.0,356.0,25.3,,5.70
2020-01-06,83.8,0.397,129,3.980,2.42,,32.00,173.5,104.4,54.70,...,27.00,24.0,170.5,168.5,,1233.0,356.0,26.0,,5.60
2020-01-07,84.4,0.380,129,4.010,2.60,,31.20,171.0,104.8,56.00,...,27.34,24.4,166.0,169.0,,1363.0,351.4,25.0,,5.35
2020-01-08,84.4,0.380,129,4.015,2.54,,32.00,172.5,106.0,54.40,...,27.30,24.0,165.5,165.5,,1280.0,357.3,25.6,,5.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-22,58.0,1.650,137,6.090,5.90,16.850,18.75,168.8,182.0,41.24,...,22.15,22.0,163.0,160.5,22.995,1857.0,352.2,50.2,31.25,17.64
2020-12-23,58.8,1.380,137,6.200,6.05,17.496,18.80,170.2,180.0,40.96,...,22.40,22.0,163.0,162.5,23.190,1869.0,353.0,52.2,35.35,17.52
2020-12-28,57.8,1.300,137,6.230,6.15,18.340,18.95,173.0,183.0,40.46,...,22.50,22.4,164.5,163.0,23.400,1914.0,359.0,52.0,42.10,18.78
2020-12-29,58.8,1.230,142,6.390,5.90,17.640,18.80,175.0,183.0,40.42,...,22.05,22.6,165.0,165.0,22.495,1893.0,357.0,53.2,41.10,18.38


In [116]:
### Checking how many tickers are in yfinance and not in S&P CIQ
tickers_yf = data_2020_yf.columns
tickers_CIQ = data_2020_CIQ.columns

### Getting the tickers that are in yfinance but not in S&P CIQ
tickers_not_in_CIQ = [ticker for ticker in tickers_yf if ticker not in tickers_CIQ]
print(tickers_not_in_CIQ)

['AEGA', 'ASA', 'AWDR', 'BORR', 'CARA', 'DNB', 'EFUEL', 'FLNG', 'FRO', 'GEOS', 'GOGL', 'HLNG', 'IFISH', 'INSTA', 'JIN', 'MELG', 'ODF', 'ODFB', 'PGS', 'PPG', 'QEC', 'SUSB', 'TECO', 'TIETO', 'ZENA']


In [117]:
### Getting the tickers that are in S&P CIQ but not in yfinance
tickers_not_in_yf = [ticker for ticker in tickers_CIQ if ticker not in tickers_yf]
print(tickers_not_in_yf)

['ABTEC', 'AIX', 'AQUIL', 'BALT', 'BNOR', 'ELMRA', 'ENSU', 'EQVA', 'HUGO', 'MELG-OSL', 'NOFIN', 'PNOR', 'ROGS', 'ROMER', 'SB1NO', 'SOR.1', 'STRO']


We combine both dataframs to get overall a better dataframe.

In [118]:
### Combining the data from yfinance and S&P CIQ 
SP_data = data_2020_CIQ[tickers_not_in_yf]

### Dropping 'MELG-OSL', 'SB1NO', 'SOR.1' as they are not in the original data 
SP_data.drop(['MELG-OSL', 'SB1NO', 'SOR.1'], axis=1, inplace=True)

### Combining the data from yfinance and S&P CIQ
data_2020_daily = pd.concat([data_2020_yf, SP_data], axis=1)

### Getting returns
data_2020_daily_returns = data_2020_daily.pct_change()

### removing the first row
data_2020_daily_returns = data_2020_daily_returns.iloc[1:]

### Getting the monthly prices
data_2020_monthly_prices = data_2020_daily.resample('M').last()

### Getting the market cap and prices
mcap_monthly.loc['2020-07-31':'2020-11-30'] = noshares_monthly.loc['2020-07-31':'2020-11-30'] * data_2020_monthly_prices.loc['2020-07-31':'2020-11-30']
prices_monthly.loc['2020-07-31':'2020-11-30'] = data_2020_monthly_prices.loc['2020-07-31':'2020-11-30']

### Getting monthly returns
data_2020_monthly_returns = data_2020_daily_returns.resample('M').last()

### Getting the dates that are in data_2020_monthly_returns but not in returns_monthly
dates_to_add = data_2020_monthly_returns.index.difference(returns_monthly.index)

### Adding the data from data_2020_monthly_returns to returns_monthly
returns_monthly = pd.concat([returns_monthly, data_2020_monthly_returns.loc[dates_to_add]], axis=0)

### Sorting the index
returns_monthly.sort_index(inplace=True)

### Looking at the data
returns_monthly

Unnamed: 0,2020,5PG,AASB,AAT,AAV,ABG,ABL,ABS,ABT,ABTEC,...,WWL,XPLRA,XXL,YAR,ZAL,ZAP,ZENA,ZENT,ZONC,ZWIPE
1980-01-31,,,,,,,,,,,...,,,,,,,,,,
1980-02-29,,,,,,,,,,,...,,,,,,,,,,
1980-03-31,,,,,,,,,,,...,,,,,,,,,,
1980-04-30,,,,0.0417,,,,,,,...,,,,,,,,,,
1980-05-31,,,,0.4600,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-30,0.4181,0.1030,-0.0273,,,0.0362,-0.0288,-0.0180,,0.1491,...,,,-0.1819,-0.0741,0.0294,-0.2237,-0.1455,,,-0.4175
2024-05-31,-0.0439,-0.1209,-0.0569,,,0.1034,0.0932,0.0510,,-0.1851,...,,-0.0588,-0.0897,0.0391,0.1086,0.0112,-0.0468,,,-0.0067
2024-06-30,-0.0377,-0.0937,0.0631,,,-0.0391,-0.0280,0.0194,,0.0313,...,,-0.0820,0.1943,-0.0532,0.0180,-0.0685,-0.0035,,,-0.1779
2024-07-31,-0.0267,-0.0690,-0.0319,,,0.0699,0.0370,-0.0095,,-0.1595,...,,0.1872,-0.0024,0.0111,-0.1013,-0.0909,-0.0211,,,0.0776


So we added 5 months of returns, but no new companies.

Are data gathering process is now done.

In [119]:
### Mean return of a stock in the universe
print((returns_monthly.mean(axis=None)*12))

### SD of a stock in the universe
print(returns_monthly.std().mean()*np.sqrt(12))

### Max return of a stock in the universe
print(returns_monthly.max().max())

### Min return of a stock in the universe
print(returns_monthly.min().min())

0.46049508033403347
5.633500841214948
2499.0
-0.9994


#### Filter
We put the return($t+1$) of stocks with a marketcap in $t$ of under NOK 1 Million to NaN so this stock for the next period is not taken into considiration.

In [120]:
### Saving the unfiltered data
returns_monthly.to_sql('unfiltered_returns_monthly', conn, if_exists='replace')
mcap_monthly.to_sql('unfiltered_mcap_monthly', conn, if_exists='replace')
prices_monthly.to_sql('unfiltered_prices_monthly', conn, if_exists='replace')

### Checking our starting value
print(f"We start with {returns_monthly.shape}")

### If marketcap < 1000000 or price < 10, we set the return to NaN
returns_monthly[(prices_monthly < 10)] = np.nan
returns_monthly[(mcap_monthly < 1000000)] = np.nan

### Removing columns with only NaN
returns_monthly.dropna(axis=1, how='all', inplace=True)
mcap_monthly = mcap_monthly[returns_monthly.columns]
prices_monthly = prices_monthly[returns_monthly.columns]

### Winsorize the returns
#returns_monthly = returns_monthly.clip(lower=returns_monthly.quantile(0.01), upper=returns_monthly.quantile(0.97), axis=1)

### Removing columns where we only have one observation
#returns_monthly.dropna(axis=1, thresh=2, inplace=True)

### Printing the tickers that are left
print(f"We are left with {returns_monthly.shape}")

### Getting the tickers that are left
filtered_tickers = returns_monthly.columns

We start with (536, 1295)
We are left with (536, 1181)


In [121]:
### Mean return of a stock in the universe
print((returns_monthly.mean(axis=None)*12))

### SD of a stock in the universe
print(returns_monthly.std().mean()*np.sqrt(12))

### Max return of a stock in the universe
print(returns_monthly.max().max())

### Min return of a stock in the universe
print(returns_monthly.min().min())

0.6286365367660023
0.6563133816250811
2499.0
-0.9994


In [122]:
### Filtered tickers
tickers = all_tickers[all_tickers['ticker'].isin(returns_monthly.columns)].reset_index(drop=True)

In [123]:
### Adding it into the database
returns_monthly.to_sql('filtered_returns_monthly', conn, if_exists='replace')
mcap_monthly.to_sql('filtered_mcap_monthly', conn, if_exists='replace')
prices_monthly.to_sql('filtered_prices_monthly', conn, if_exists='replace')
tickers.to_sql('filtered_tickers', conn, if_exists='replace')

1181

#### Daily returns

In [84]:
### Removing all columns expect ticker, Date and Return
daily_80_90 = daily_80_90[['ticker', 'Date', 'Return']]
daily_90_00 = daily_90_00[['ticker', 'Date', 'Return']]
daily_00_10 = daily_00_10[['ticker', 'Date', 'Return']]
daily_10_20 = daily_10_20[['ticker', 'Date', 'Return']]
daily_20_24 = daily_20_24[['ticker', 'Date', 'Return']]

### Getting all the returns in the same dataframe
daily = pd.concat([daily_80_90, daily_90_00, daily_00_10, daily_10_20, daily_20_24])

### Fixing the dates
daily['Date'] = pd.to_datetime(daily['Date'], format='%Y%m%d')

### Gathering all the dates from the datasets
dates = daily[['Date']].drop_duplicates().sort_values(by='Date').reset_index(drop=True)

### Creating a dataframe with all the dates and tickers
returns_daily = pd.DataFrame(np.zeros((len(dates), len(tickers))), columns=tickers['ticker'], index=dates['Date'])

### Filling the dataframe with the returns
for i, row in daily.iterrows():
    returns_daily.loc[row['Date'], row['ticker']] = row['Return']

### Replacing all 0 with NaN
returns_daily.replace(0, np.nan, inplace=True)

### Looking at our data
returns_daily

ticker,2020,AASB,AAT,AAV,ABL,ABS,ABT,ACC,ACL,ACR,...,HYON,KNOX,STSU,SPIR,ENERG,AIX,HERMA,SASNO,AKOBO,EPICT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980-01-03,,,,,,,,,,,...,,,,,,,,,,
1980-01-04,,,,,,,,,,,...,,,,,,,,,,
1980-01-07,,,,,,,,,,,...,,,,,,,,,,
1980-01-08,,,,,,,,,,,...,,,,,,,,,,
1980-01-09,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-08,-0.0104,0.0487,,,,0.0459,,-0.0167,,-0.0025,...,,-0.1294,0.0144,,0.0556,-0.0055,0.0394,0.0281,0.0269,
2024-08-09,0.0197,,,,0.0131,-0.0263,,0.0146,,0.0025,...,,0.0010,-0.0071,-0.0074,-0.0526,0.0250,-0.0341,-0.0195,-0.0262,
2024-08-12,0.0028,,,,0.0345,,,0.0192,,,...,,0.1475,-0.0048,0.0224,-0.0222,-0.0461,-0.0353,-0.6056,-0.0135,
2024-08-13,-0.0076,,,,-0.0042,0.0090,,0.0227,,-0.0025,...,,0.0871,-0.0024,-0.0146,0.0057,0.0511,0.0163,-0.6667,-0.0727,


In [85]:
### Extracting the dates that are in downloaded data but not in the returns_daily dataframe
dates = data_2020_daily_returns.index
dates = dates[~dates.isin(returns_daily.index)]

### Restricting the downloaded data to only the dates that are not in the returns_daily dataframe
data_2020_daily_returns = data_2020_daily_returns.loc[dates]

### Combining the data
returns_daily = pd.concat([returns_daily, data_2020_daily_returns], axis=0)

### Sorting the dataframe
returns_daily = returns_daily.sort_index()

### Looking at the data
returns_daily

Unnamed: 0,2020,AASB,AAT,AAV,ABL,ABS,ABT,ACC,ACL,ACR,...,HYON,KNOX,STSU,SPIR,ENERG,AIX,HERMA,SASNO,AKOBO,EPICT
1980-01-03,,,,,,,,,,,...,,,,,,,,,,
1980-01-04,,,,,,,,,,,...,,,,,,,,,,
1980-01-07,,,,,,,,,,,...,,,,,,,,,,
1980-01-08,,,,,,,,,,,...,,,,,,,,,,
1980-01-09,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-08,-0.0104,0.0487,,,,0.0459,,-0.0167,,-0.0025,...,,-0.1294,0.0144,,0.0556,-0.0055,0.0394,0.0281,0.0269,
2024-08-09,0.0197,,,,0.0131,-0.0263,,0.0146,,0.0025,...,,0.0010,-0.0071,-0.0074,-0.0526,0.0250,-0.0341,-0.0195,-0.0262,
2024-08-12,0.0028,,,,0.0345,,,0.0192,,,...,,0.1475,-0.0048,0.0224,-0.0222,-0.0461,-0.0353,-0.6056,-0.0135,
2024-08-13,-0.0076,,,,-0.0042,0.0090,,0.0227,,-0.0025,...,,0.0871,-0.0024,-0.0146,0.0057,0.0511,0.0163,-0.6667,-0.0727,


We added 109 trading days!

In [86]:
### Storing the unfiltered data
returns_daily.to_sql('unfiltered_returns_daily', conn, if_exists='replace')

### Filtering the tickers
returns_daily = returns_daily.loc[:, returns_monthly.columns]

### Storing the returns in the database
returns_daily.to_sql('filtered_returns_daily', conn, if_exists='replace')

11196

### Indicies

In [87]:
### Reading the index daily data
index_83_99 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/daily_equity_index_ose/stock_index_1983_1999.csv",
                     sep=';', encoding='latin1')

index_00_09 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/daily_equity_index_ose/stock_index_2000_2009.csv",
                        sep=';', encoding='latin1')

index_10_20 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/daily_equity_index_ose/stock_index_2010_2020.csv",
                        sep=';', encoding='latin1')

index_20_24 = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/daily_equity_index_ose/norway_index_observations_2020_2024.csv",
                        sep=';', encoding='latin1')

In [88]:
### Renaming column symbol to ticker and Date to date in the index_20_24 dataset
index_20_24.rename(columns={'symbol': 'ticker', 'Date': 'date'}, inplace=True)

### Combining the ticker and name from the datasets
index_tickers = pd.concat([index_83_99[['ticker', 'name']], index_00_09[['ticker', 'name']],
                           index_10_20[['ticker', 'name']], index_20_24[['ticker', 'name']]]).drop_duplicates(subset='ticker').sort_values(by='ticker').reset_index(drop=True)

### Storing the index_tickers dataframe in the database
index_tickers.to_sql('index_tickers', conn, if_exists='replace')

### Removing all columns expect ticker, Date and close
index_83_99 = index_83_99[['ticker', 'date', 'close']]
index_00_09 = index_00_09[['ticker', 'date', 'close']]
index_10_20 = index_10_20[['ticker', 'date', 'close']]
index_20_24 = index_20_24[['ticker', 'date', 'close']]

### Combining the datasets
index = pd.concat([index_83_99, index_00_09, index_10_20, index_20_24])

### Fixing the dates
index['date'] = pd.to_datetime(index['date'], format='%Y%m%d')

### Gathering all the dates from the datasets
dates = index[['date']].drop_duplicates().sort_values(by='date').reset_index(drop=True)

### Creating a dataframe with all the dates and tickers
index_daily = pd.DataFrame(np.zeros((len(dates), len(index_tickers)), dtype=np.float64), columns=index_tickers['ticker'], index=dates['date'])

### Filling the dataframe with the returns
for i, row in index.iterrows():
    index_daily.loc[row['date'], row['ticker']] = row['close']

### Replacing all 0 with NaN
index_daily.replace(0, np.nan, inplace=True)

### Turning the prices into returns
index_returns_daily = index_daily.pct_change()

### Storing the returns in the database
index_returns_daily.to_sql('unfiltered_index_daily', conn, if_exists='replace')

### Saving the filtered data
index_returns_daily.to_sql('index_daily', conn, if_exists='replace')

### Looking at our data
index_returns_daily

ticker,AKAKS,AKSKVA,BAEX,BANX,FINX,FRSX,GFBX,GNHYAR,INDX,ITSX,...,SSENP,SSENX,SSSFP,SSSFX,SSSHP,SSSHX,STLNHY,TOTX,UOBX,XOBX
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1983-01-03,,,,,,,,,,,...,,,,,,,,,,
1983-01-04,,,,,,-0.027700,,,-0.009900,,...,,,,,,,,-0.010200,,
1983-01-05,,,,-0.004512,,-0.034660,,,0.005858,,...,,,,,,,,0.002425,,
1983-01-06,,,,-0.005338,,0.013957,,,0.004519,,...,,,,,,,,0.005342,,
1983-01-07,,,,0.003342,,0.000000,,,0.031787,,...,,,,,,,,0.021955,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-06,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
2024-08-08,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.002144,0.0,0.000000,0.0,0.0
2024-08-09,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.042187,0.017035,0.0,0.000000,0.0,0.0
2024-08-13,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.013247,0.000000,0.0,0.000000,0.0,0.0


### Using the daily index returns to make monthly index returns

In [89]:
### Turning the daily returns into monthly returns
index_returns_monthly = index_daily.resample('M').ffill().pct_change()

### Storing the returns in the database
index_returns_monthly.to_sql('unfiltered_index_monthly', conn, if_exists='replace')

### Saving the filtered data
index_returns_monthly.to_sql('index_monthly', conn, if_exists='replace')

### Looking at our data
index_returns_monthly

ticker,AKAKS,AKSKVA,BAEX,BANX,FINX,FRSX,GFBX,GNHYAR,INDX,ITSX,...,SSENP,SSENX,SSSFP,SSSFX,SSSHP,SSSHX,STLNHY,TOTX,UOBX,XOBX
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1983-01-31,,,,,,,,,,,...,,,,,,,,,,
1983-02-28,,,,0.022876,,0.026656,,,0.132471,,...,,,,,,,,0.087137,,
1983-03-31,,,,0.050500,,-0.027659,,,0.063993,,...,,,,,,,,0.072375,,
1983-04-30,,,,0.048330,,0.146449,,,0.202760,,...,,,,,,,,0.158662,,
1983-05-31,,,,-0.023092,,0.026893,,,-0.001840,,...,,,,,,,,-0.007345,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-30,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.23098,0.000000,0.203916,0.0,0.000000,0.0,0.0
2024-05-31,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.0,0.000000,0.0,0.0
2024-06-30,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,0.251061,0.000000,0.0,0.000000,0.0,0.0
2024-07-31,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,-0.058117,0.000000,0.0,0.000000,0.0,0.0


#### Factors and portfolios

In [90]:
### Getting all the factor data
liq_monthly = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/liq_measures_ose_monthly.txt",
                        sep=',', encoding='latin1')

mkt_daily = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/market_portfolios_daily.txt",
                        sep=',', encoding='latin1')

mkt_monthly = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/market_portfolios_monthly.txt",
                        sep=',', encoding='latin1')

factors_daily = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/pricing_factors_daily.txt",
                        sep=',', encoding='latin1')

factors_monthly = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/pricing_factors_monthly.txt",
                        sep=',', encoding='latin1')

rf_daily = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/rf_daily.txt",
                        sep=',', encoding='latin1', skiprows=1)

rf_monthly = pd.read_csv("/Users/emilwilliamhansen/Library/Mobile Documents/com~apple~CloudDocs/School/Master Thesis/Data/rf_monthly.txt",
                        sep=',', encoding='latin1', skiprows=1)

In [91]:
### Fixing the dates and putting them as the index
mkt_daily['date'] = pd.to_datetime(mkt_daily['date'], format='%Y%m%d')
mkt_daily.set_index('date', inplace=True)

mkt_monthly['date'] = pd.to_datetime(mkt_monthly['date'], format='%Y%m%d')
mkt_monthly.set_index('date', inplace=True)

factors_daily['date'] = pd.to_datetime(factors_daily['date'], format='%Y%m%d')
factors_daily.set_index('date', inplace=True)

factors_monthly['date'] = pd.to_datetime(factors_monthly['date'], format='%Y%m%d')
factors_monthly.set_index('date', inplace=True)

rf_daily['date'] = pd.to_datetime(rf_daily['date'], format='%Y%m%d')
rf_daily.set_index('date', inplace=True)

rf_monthly['date'] = pd.to_datetime(rf_monthly['date'], format='%Y%m%d')
rf_monthly.set_index('date', inplace=True)

In [95]:
### Putting factors_daily and rf_daily together
factors_daily = pd.concat([factors_daily, rf_daily, mkt_daily[["EW", "VW"]]], axis=1)

### Changing the column name of Rf(1d) to rf
factors_daily.rename(columns={'Rf(1d)': 'rf'}, inplace=True)

### Interpolating the risk free rate
factors_daily['rf'] = factors_daily['rf'].interpolate()

### Doing the same for the monthly data
factors_monthly = pd.concat([factors_monthly, rf_monthly, mkt_monthly[["EW", "VW"]]], axis=1)
factors_monthly.rename(columns={'Rf(1m)': 'rf'}, inplace=True)
factors_monthly['rf'] = factors_monthly['rf'].interpolate()

In [93]:
### Adding the factors to the database
factors_daily.to_sql('factors_daily', conn, if_exists='replace')
factors_monthly.to_sql('factors_monthly', conn, if_exists='replace')

530

In [94]:
### END OF DATA PREPARATION

### Close the connection
conn.close()