## Master's Thesis - Machine Learning in Asset Pricing

### Thomas Theodor Kjølbye 

The following script handles all data used in the paper. On my computer, the entire script takes approximately 5 minutes to run. The data consist of individual firm characteristics as well as macroeconomic variables and are generously made available by Professors Gu, Kelly, Xiu, and Goyal. 

The script is the 2nd of two data processing scripts. The first one does not compute interaction terms or reduce dimensionality by PCA. However, it does perform the rank normalization leveraged in Gu, Kelly, and Xiu (2020). 

In [1]:
# Load the usual suspects
import csv
import pandas as pd
import numpy as np
import os
import Toolbox as tb
import time as time
from sklearn.decomposition import PCA

%load_ext autoreload
%autoreload 2

In [None]:
# Empty container for the warning log
log = []

In [None]:
os.getcwd()

In [2]:
# All raw input files sit in the parent of the working directory, i.e. outside the repo.
# os.path.join builds the paths with the separator of the host OS instead of a
# hard-coded Windows backslash.
data_dir = os.path.dirname(os.getcwd())

# Load monthly returns data from CRSP and process data
returns = pd.read_csv(os.path.join(data_dir, 'RET.txt'))
returns.columns = returns.columns.str.strip().str.lower()
returns.rename(columns = {"col1":"date"}, inplace = True)
returns["date"] = returns["date"].floordiv(100) # Drop the last two digits of the integer date
# Explicit copy: the frame is modified again further down (downcast), which should not
# operate on a slice of `returns`
returns_raw = returns[["date", "permno", "ret"]].copy()

# Load macroeconomic predictors and clean it
macro_data_raw = pd.read_csv(os.path.join(data_dir, 'macro_data.csv'))
# Keep dates strictly between 195612 and 201701; copy for the same reason as above
macro_data_raw = macro_data_raw[(macro_data_raw["date"] > 195612) & (macro_data_raw["date"] < 201701)].copy()

# Load firm characteristics
firm_data_raw = pd.read_csv(os.path.join(data_dir, 'datashare.zip'))
firm_data_raw.columns = firm_data_raw.columns.str.lower()
firm_data_raw["date"] = firm_data_raw["date"].floordiv(100) # https://stackoverflow.com/questions/33034559/how-to-remove-last-the-two-digits-in-a-column-that-is-of-integer-type
firm_data_raw.set_index(["permno", "date"], inplace = True)
print("The firm characteristics dataset is {:1.3f} GB".format(firm_data_raw.memory_usage().sum()/(1024 ** 3)))

# I have refrained from saving the firm char. in my wd (repo) because I am unable to push 3 GB worth of data to the github.

The firm characteristics dataset is 2.939 GB


In [6]:
# Downcast from 64-bit floats and ints to 32-bit
# tb.downcast prints the memory footprint and dtype counts before and after each call
tb.downcast(firm_data_raw)
tb.downcast(returns_raw)
tb.downcast(macro_data_raw)

Before downcast: 1.481 GB and float32    95
dtype: int64
After downcast: 1.481 GB and float32    95
dtype: int64
Before downcast: 0.045 GB and int32      2
float32    1
dtype: int64
After downcast: 0.045 GB and int32      2
float32    1
dtype: int64
Before downcast: 0.000 GB and float32    8
int32      1
dtype: int64
After downcast: 0.000 GB and float32    8
int32      1
dtype: int64


In [4]:
# Build the full panel in one chain:
#   1. inner-join the firm characteristics with the returns on (permno, date), which
#      filters out rows missing in the returns data
#   2. attach the macro predictors by date
#   3. restore the (permno, date) index
data = (
    firm_data_raw.reset_index()
    .merge(returns_raw, how = "inner", on = ["permno", "date"])
    .merge(macro_data_raw, on = "date")
    .set_index(["permno", "date"])
) # 3760315 x 104 (94 char, industry dummy, returns, 8 macro)

In [5]:
# Separate the merged panel into three pieces: the predictors (firm characteristics and
# macro variables), the returns, and the industry codes. The industry codes are not part
# of the interaction data, since sic2 is dropped below.

# Predictors: every column except the return and the industry code
non_interaction_to_drop = ["ret", "sic2"]
interaction_data = data.drop(columns = non_interaction_to_drop).reset_index()

# Return column
returns = data["ret"]

# Industry code column
industry_code = data["sic2"]

In [12]:
# Save interaction data in the parent of the working directory.
# os.path.join uses the separator of the host OS instead of a hard-coded Windows backslash.
interaction_data.to_csv(os.path.join(os.path.dirname(os.getcwd()), 'interaction_data.csv'), header = True, index = False)

In [18]:
# Load data for interaction terms from the parent of the working directory.
# os.path.join uses the separator of the host OS instead of a hard-coded Windows backslash.
interaction_data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'interaction_data.csv'))
# read_csv yields 64-bit columns, so downcast again
tb.downcast(interaction_data)

Before downcast: 2.914 GB and float64    102
int64        2
dtype: int64
After downcast: 1.457 GB and float32    102
int32        2
dtype: int64


In [20]:
# Split and clean data
# NOTE(review): the six results are presumably the training / validation / test parts of the
# firm characteristics (tfirm, vfirm, ttfirm) and of the macro predictors (tmacro, vmacro,
# ttmacro) for the window given by start and end - confirm against Toolbox.data_processing
tfirm, vfirm, ttfirm, tmacro, vmacro, ttmacro = tb.data_processing(data = interaction_data, start = 197501, end = 198612)

  data["constant"] = 1


In [22]:
# Compute interaction terms for training and validation
# The returned mean and std are passed on when the test interactions are computed
tinteraction, mean, std = tb.interaction(tfirm, tmacro)
# Validation interactions are currently switched off
#vinteraction, _, _ = tb.interaction(vfirm, vmacro)

In [32]:
# Compute interaction terms for testing 
# mean and std are the values returned by tb.interaction for the training data.
# NOTE(review): judging by its name and the filename argument, interaction_noRAM presumably
# writes the result to 'test_interaction.csv' instead of holding it in memory - confirm in Toolbox
tb.interaction_noRAM(ttfirm, ttmacro, mean = mean, std = std, filename = 'test_interaction.csv')

In [41]:
# PCA for training data
# A fractional n_components tells sklearn to keep as many components as are needed to
# explain that share (here 95%) of the variance
pca = PCA(n_components = 0.95)
pca.fit(tinteraction)
# Project the training interactions onto the fitted components
tdata = pca.transform(tinteraction) 


In [51]:
# PCA for test data: the saved test interactions are combined with the loadings fitted on
# the training data (pca.components_.T) and written to 'test_data.csv'.
# os.path.join uses the separator of the host OS instead of a hard-coded Windows backslash.
tb.save_txt(name = os.path.join(os.path.dirname(os.getcwd()), 'test_interaction.csv'), newfilename = 'test_data.csv',
                    pc = pca.components_.T)

In [56]:
tdata.nbytes / 1024 ** 3

0.5249649211764336

In [62]:
tfirm.shape

(479317, 85)

In [None]:
# Compute interaction terms for training data 

# Multiply every firm characteristic by each macro predictor in turn. The per-predictor
# blocks are collected in a list and concatenated once at the end: calling pd.concat inside
# the loop re-copies the whole, ever-growing frame on every iteration.
firm_values = firm_training.values # .values returns np.array not pd.DataFrame
interaction_blocks = []

for value in macro_training.columns:

    macro_ite = macro_training[value].values # .values returns np.array not pd.series
    product_ite = macro_ite.reshape(-1,1) * firm_values # 2D to make compatible for element-wise multiplication
    column_ite = [str(col) + f'X{value}' for col in firm_training.columns]
    interaction_blocks.append(pd.DataFrame(product_ite, columns = column_ite))

if interaction_blocks:
    interaction_training = pd.concat(interaction_blocks, axis = 1)
else:
    # No macro predictors at all: same result as before, an empty frame with one row per observation
    interaction_training = pd.DataFrame(columns = range(0), index = range(firm_training.shape[0]))

# Drop the temporary references so the individual blocks can be garbage collected
del firm_values, interaction_blocks

In [None]:
# Put the firm characteristics and the interaction terms side by side, then index the
# result by (permno, date); this frame is what gets standardized and fed to PCA
training_data = pd.concat(
    [firm_training.reset_index(), interaction_training],
    axis = 1,
).set_index(["permno", "date"])

In [None]:
# Standardize data
def _zscore(column):
    # Center one column on its mean and scale it by its np.std
    centered = column - np.mean(column)
    return centered / np.std(column)

# Column by column; missing values become 0 afterwards
training_data = training_data.apply(_zscore, axis = 0).fillna(0)

In [None]:
# PCA 
# A fractional n_components tells sklearn to keep as many components as are needed to
# explain that share (here 95%) of the variance
pca = PCA(n_components = 0.95)
pca.fit(training_data)
reduced = pca.transform(training_data) 


# pca.explained_variance_ gives the eigenvalue of each principal component
# reduced.shape gives the number of covariates in the reduced data set
# reduced.nbytes / 1024 ** 3 gives the size in GB of the reduced data set, which is a numpy array. It is 0.262482 GB
# The string below holds a disabled snippet for computing the eigenvalues by hand
# (its Danish first line reads "Manual way to get the eigenvalues")
'''
Manuel måde at få eigenvalues på:
training_data_tester = training_data.copy()
n_samples = reduced.shape[0]
# We center the data and compute the sample covariance matrix.
training_data_tester -= np.mean(training_data_tester, axis=0)
cov_matrix = np.dot(training_data_tester.T, training_data_tester) / n_samples
for eigenvector in pca.components_:
    print(np.dot(eigenvector.T, np.dot(cov_matrix, eigenvector)))
'''

In [None]:
# Compute interaction terms for the test data - done on the training data here, but the
# method will be the same in the function
macro_training_test = macro_training.copy()
macro_training_test.insert(0, "constant", 1)

# Width of one flattened row; it does not change inside the loop
n_features = training_data.shape[1]

# Open the output file once instead of reopening it for every single row
with open("test.csv", "a", newline = "") as t:

    writer = csv.writer(t)

    for i in range(firm_training.shape[0]):

        firm_row = firm_training.iloc[i, :].values.reshape(-1,1).T # 1x88
        macro_row = macro_training_test.iloc[i, :].values.reshape(-1,1) # 9x1 

        interaction = macro_row @ firm_row # 9x1 @ 1x88 = 9x88
        interaction_flat = interaction.reshape((1, n_features))
        interaction_std = (interaction_flat - training_mean) / training_std # Elementwise
        interaction_std = np.nan_to_num(interaction_std)

        # interaction_std is 2D with a single row; [0] picks that row
        # https://stackoverflow.com/questions/39694318/difference-between-single-and-double-bracket-numpy-array
        to_append = (interaction_std[0]).tolist()

        writer.writerow(to_append)

# Remember to save somewhere other than in the repo! 

# There will be a small change relative to the code in the function. At the top I pick
# macro_training_test; it will become part of the combined training set that I pass into the
# function. In other words, I will not have separate data sets that I pass into the function.

In [None]:
# This one has to run before training_data is standardized - it must come earlier when I
# write the function (after the interaction terms have been built for all 3 sets)
training_mean = training_data.mean().values.reshape(-1,1).T
# ddof = 0 (population std) matches np.std, which is what the training data are standardized
# with; the pandas default of ddof = 1 would scale the test rows slightly differently
training_std = training_data.std(ddof = 0).values

In [None]:
# Print the first five lines of test.csv as a quick sanity check
with open("test.csv", "r", newline = "") as t:
    for _ in range(5):
        print(t.readline())

In [None]:
def loadtxt(name):
    """Lazily read a comma-separated numeric text file, one row at a time.

    Parameters
    ----------
    name : str
        Path of the file to read.

    Yields
    ------
    numpy.ndarray
        The values of one line as a 2D float array of shape (1, K).
        Blank lines are skipped.
    """
    with open(name, "r") as file:
        for line in file:
            if not line.strip():
                # Nothing to parse (e.g. a trailing empty line); an empty row would
                # only break the matrix product applied to each row afterwards
                continue
            yield np.fromstring(line, sep = ",").reshape(1, -1) # 1xK 2D array


In [None]:
def pca_each_line(name, pc, mean = None):
    """Project the rows of a text file onto principal components, one row at a time.

    Parameters
    ----------
    name : str
        Path of the comma-separated file; it is read lazily through loadtxt.
    pc : numpy.ndarray
        Matrix of shape (K, M) that every 1xK row is multiplied by
        (e.g. pca.components_.T).
    mean : array-like of length K, optional
        Values subtracted from every row before the projection. sklearn's
        PCA.transform centers with pca.mean_ first, so pass pca.mean_ here to
        reproduce it. The default, None, applies no centering (the original
        behaviour).

    Yields
    ------
    list of float
        The M projected values of one row.
    """
    # Author's note: name is not strictly necessary as the code stands now

    for line in loadtxt(name):
        if mean is not None:
            line = line - mean # broadcasts over the single row
        pca_line = np.dot(line, pc) # 1xK @ KxM = 1xM

        # pca_line is 2D with one row; [0] picks that row so a flat list is yielded
        yield (pca_line[0]).tolist()
        

In [None]:
def save_txt(name, newfilename, pc):
    """Project the rows of `name` with `pc` and append them to `newfilename` as CSV.

    Parameters
    ----------
    name : str
        Path of the input file, handed to pca_each_line.
    newfilename : str
        Path of the output file. It is opened in append mode, so existing
        content is kept and new rows are added at the end.
    pc : numpy.ndarray
        Projection matrix, handed to pca_each_line.
    """
    with open(newfilename, "a", newline = "") as newfile:
        # pca_each_line is a generator, so rows are written as they are produced
        csv.writer(newfile).writerows(pca_each_line(name, pc))

In [None]:
# I have realized that I need to run PCA on every training set, and on every validation and test set!

# I actually have to redo all the data processing done here for each fold, because training and validation will be
# different for each fold -- what is a smart way to do that? Write a function that handles all the data processing and
# simply returns the training and validation sets?

# Run PCA on the training set, then I can write the function

In [None]:
# Industry dummies
# Merge
# DONE with the training set