In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

This dataset contains the fundamentals of a set of tickers over 16 years. Each row corresponds to one `Ticker` and the `date` of the fundamental. The column `freq` is `Q` or `A`, depending on whether that fundamental is for a quarter or for a year, respectively.

In [2]:
# Load the fundamentals table from its Feather file and preview the first rows.
source_path = "data/financials_against_sp500.feather"
data = pd.read_feather(source_path)
data.head(5)

Unnamed: 0,Ticker,date,freq,Asset Turnover,Basic EPS,Basic Shares Outstanding,Book Value Per Share,Cash Flow From Financial Activities,Cash Flow From Investing Activities,Cash Flow From Operating Activities,...,dividend_cumsum_in_period_-730,stock_change_div_-730,sp500_change_-730,improve_sp500,std_365,std_730,std_-120,std_-365,std_-730,sharpe_ratio_365
0,A,2005-01-31,Q,0.1695,0.21,491.0,7.6741,81.0,-57.0,137.0,...,0.0,-0.1351,-0.175443,1.0,0.017758,0.017009,0.014371,0.020641,0.021253,20.890487
1,CNR,2005-01-31,Q,0.2614,2.65,4.0672,101.9772,155.401,-26.165,14.376,...,0.0,-0.479573,-0.175443,1.0,0.018951,0.018238,0.016412,0.022401,0.02124,32.770451
2,DY,2005-01-31,Q,0.336,0.15,48.689,11.1993,0.982,-39.076,58.545,...,0.0,-0.141848,-0.175443,1.0,0.024645,0.022685,0.030047,0.026593,0.026688,3.031387
3,GYRO,2005-01-31,Q,0.0482,-0.7743,0.1072,57.1457,0.6171,0.0986,-1.0356,...,0.0,-0.531536,-0.175443,1.0,0.020841,0.020143,0.026398,0.021356,0.020044,10.953862
4,CULP,2005-01-31,Q,0.3925,-0.42,11.55,8.0901,-1.886,-8.216,8.554,...,0.0,0.554545,-0.175443,1.0,0.021073,0.025946,0.021159,0.021098,0.028727,5.284556


Create a column called `period` with the quarter or the year, depending on `freq`:

In [3]:
# Build the period label from the report date: "YYYY_Q" for quarterly rows and
# "YYYY" for annual rows. Rows with any other `freq` value are left without a period.
year_label = data["date"].dt.year.astype(str)
quarter_label = data["date"].dt.quarter.astype(str)
is_quarterly = data["freq"] == "Q"
is_annual = data["freq"] == "A"
data.loc[is_quarterly, "period"] = year_label + "_" + quarter_label
data.loc[is_annual, "period"] = year_label
# `date` is fully encoded in `period` from here on, so drop it.
data = data.drop("date", axis=1)
# Report how many (Ticker, freq, period) keys have 1, 2, ... rows; any count
# above 1 means duplicated keys that still have to be removed.
rows_per_key = data.groupby(["Ticker", "freq", "period"]).size()
print(rows_per_key.value_counts())

1    275593
2       731
3         1
dtype: int64


In [4]:
# Keep a single row per (Ticker, freq, period): sort by the key and retain the
# first row of every duplicated key. The shape is printed before and after so
# the number of removed rows is visible.
key_columns = ["Ticker", "freq", "period"]
print(data.shape)
data_sorted = data.sort_values(key_columns)
data = data_sorted.drop_duplicates(subset=key_columns, keep="first")
print(data.shape)

(277058, 140)
(276325, 140)


Ensure that we have 1 and only 1 row for each combination of `period`, `freq` and `Ticker`:

In [5]:
# Distinct period labels observed for each reporting frequency, in sorted order.
is_quarterly = data["freq"] == "Q"
is_annual = data["freq"] == "A"
all_period_quarter = data.loc[is_quarterly, "period"].sort_values().unique()
all_period_anual = data.loc[is_annual, "period"].sort_values().unique()

# Stack both lists into one (period, freq) table and cross it with every
# distinct ticker to obtain the complete grid of expected rows.
quarter_grid = pd.DataFrame({"period": all_period_quarter, "freq": "Q"})
annual_grid = pd.DataFrame({"period": all_period_anual, "freq": "A"})
all_periods = pd.concat([quarter_grid, annual_grid])
unique_tickers = data.drop_duplicates("Ticker")["Ticker"]
all_periods = all_periods.merge(unique_tickers, how="cross")

# Left-join the data onto the grid: combinations absent from the data become
# rows whose remaining columns are NaN.
data = pd.merge(all_periods, data, how="left", on=["freq", "period", "Ticker"])

# Sanity check: within each frequency, every ticker has the same number of rows.
for freq_code in ("Q", "A"):
    rows_per_ticker = data[data["freq"] == freq_code].groupby(["Ticker", "freq"]).size()
    assert rows_per_ticker.nunique() == 1


Compute the Market Cap:

Replace non-positive values (zero or negative) of `Shares Outstanding` with NaN:

In [6]:
# A share count of zero or below is not usable; blank it out so it is treated
# as missing.
shares_column = 'Shares Outstanding'
non_positive_shares = data[shares_column].le(0)
data.loc[non_positive_shares, shares_column] = np.nan

In [7]:
# Order rows by ticker and period label. Within one reporting frequency the
# labels ("YYYY" for annual, "YYYY_Q" for quarterly) sort chronologically.
data = data.sort_values(["Ticker", "period"], ascending=True)

# Forward-fill missing share counts separately for each ticker AND reporting
# frequency. Filling per ticker only would interleave the annual and quarterly
# series: the annual label "YYYY" sorts before "YYYY_1", so an annual figure
# would be carried into the quarters of that same year (a potential look-ahead
# leak). `.ffill()` also replaces the deprecated `fillna(method='ffill')`.
data['Filled Shares Outstanding'] = (
    data.groupby(['Ticker', 'freq'])['Shares Outstanding'].ffill()
)

# Market capitalisation = `close_0` price times the (filled) share count.
# It stays NaN where either factor is missing.
data['Market_cap'] = data['close_0'] * data['Filled Shares Outstanding']

In [17]:
# Persist the transformed table. `reset_index()` (without `drop`) replaces the
# current index with a fresh default one and keeps the old index as a column in
# the written file.
output_path = "data/trns_financials_against_sp500.feather"
data_to_save = data.reset_index()
data_to_save.to_feather(output_path)