In [43]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

# Date stamp (YYYY_MM_DD) used later to name the dated output file.
today = datetime.today().strftime('%Y_%m_%d')

# Load the ROCE table and keep only the first row seen for each ticker.
roce_df = (
    pd.read_csv("stock_financials/ROCE_Ratios_recent.csv")
    .drop_duplicates(subset='ticker')
)

## Percentile Ranking

In [44]:
# Column whose values drive the ROCE-level ranking.
rank_variable = '2024'

# Rank every ticker against the whole table, largest value first (rank 1 is
# the highest value); ties share the average rank and missing values stay NaN.
# Alternative kept for reference - a dense rank within each 'Industry' group:
#roce_df['roce_rank'] = roce_df.groupby('Industry')[rank_variable].rank(method='dense', ascending=False)
roce_df['roce_rank'] = roce_df.loc[:, rank_variable].rank(method='average', ascending=False)

## Regression

In [45]:
# Year columns used for the trend regression. The original list repeated
# '2014' as its last entry and left out '2024'; it now matches the list used
# inside the per-ticker regression loop.
df = roce_df[[ '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022', '2023', '2024']]

In [46]:
def cal_coef(X,y):
    """Estimate ordinary-least-squares coefficients for ``y ~ X @ coeffs``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Design matrix. Include a column of ones if an intercept is wanted
        (the callers in this notebook pass ``[1, year]`` rows, so the result
        is ``[b0, b1]`` = intercept and slope).
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        Least-squares coefficients.

    Notes
    -----
    The textbook normal equation ``inv(X.T @ X) @ X.T @ y`` squares the
    condition number of ``X``. With an intercept column next to raw calendar
    years (~2000) that leaves only a few reliable digits, and it raises
    ``LinAlgError`` when ``X.T @ X`` is singular. ``np.linalg.lstsq`` solves
    the same minimisation with an SVD-based solver, which is numerically
    stable and returns the minimum-norm solution for rank-deficient input.
    """
    coeffs, _residuals, _rank, _singular_values = np.linalg.lstsq(
        np.asarray(X, dtype=float), np.asarray(y, dtype=float), rcond=None
    )
    return coeffs
  
def predict_normeq(new_x,coeffs):
    """Return linear-model predictions: the dot product of ``new_x`` and ``coeffs``.

    ``new_x`` is the design matrix (one row per observation) and ``coeffs``
    the coefficient vector, e.g. as produced by the normal-equation fit.
    """
    return np.dot(new_x, coeffs)
  

In [47]:
# Fit a per-ticker linear trend of ROCE against calendar year and collect,
# for every ticker, the slope ("coef"), the R-squared of the fit and the
# standard deviation of the ROCE values (np.std, i.e. ddof=0).
year_columns = ['2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022', '2023', '2024']

reg_results = {"ticker":[],
              "coef":[],
              "r_squared":[],
              "std":[]}

for ticker in roce_df['ticker']:
    df = roce_df[roce_df['ticker'] == ticker]
    df = df[year_columns]

    # Reshape the single wide row into long form: one row per year.
    regression_df = df.transpose()
    regression_df.reset_index(inplace=True)
    regression_df.columns = ['year', 'roce']
    regression_df['year'] = regression_df['year'].astype(int)

    # Drop rows where ROCE is NA
    regression_df = regression_df.dropna(subset=['roce'])

    # A line needs at least 2 points. Previously this case only printed a
    # message and then fell through, silently reusing the previous ticker's
    # X / y (or raising NameError on the first ticker). Record NaN instead so
    # the ticker keeps its row through the later merge without borrowed stats.
    if len(regression_df) < 2:
        print(f"Insufficient data points for ticker {ticker} after removing NA values")
        reg_results['ticker'].append(ticker)
        reg_results['coef'].append(np.nan)
        reg_results['r_squared'].append(np.nan)
        reg_results['std'].append(np.nan)
        continue

    # Design matrix: intercept column of ones plus the year.
    X = np.column_stack([np.ones(len(regression_df), dtype=np.float32), regression_df['year'].values])
    y = regression_df['roce']

    coeffs = cal_coef(X,y) # b0-intercept, b1-slope
    slope = coeffs[1]
    normeq_preds = predict_normeq(X,coeffs)

    ## Model Evaluation
    SSE = sum((y-normeq_preds)**2) # Sum of squared error
    SST = sum((y-np.mean(y))**2) # Sum of squared total
    # R-squared is undefined for a constant series (SST == 0); store NaN
    # rather than letting the division produce nan/-inf.
    R_squared = 1-(SSE/SST) if SST > 0 else np.nan

    std = np.std(y)

    reg_results['ticker'].append(ticker)
    reg_results['coef'].append(slope)
    reg_results['r_squared'].append(R_squared)
    reg_results['std'].append(std)

reg_results = pd.DataFrame(reg_results)

In [48]:
# Weight each slope by how well the line fits: slope times R-squared.
# Alternative kept for reference - zero out slopes from weak fits instead:
#reg_results['coef'] = np.where(reg_results['r_squared'] < 0.15, 0, reg_results['coef'])
reg_results['coef_w'] = reg_results['r_squared'] * reg_results['coef']

In [49]:
# Rank the fit-weighted slope from largest to smallest (rank 1 = largest)
# and the standard deviation from smallest to largest (rank 1 = smallest).
reg_results = reg_results.assign(
    coef_rank=lambda frame: frame['coef_w'].rank(ascending=False),
    std_rank=lambda frame: frame['std'].rank(ascending=True),
)

In [50]:
# Attach the per-ticker regression statistics and ranks to the ROCE table
# (inner join on the ticker column).
roce_df = pd.merge(
    roce_df,
    reg_results,
    how='inner',
    left_on='ticker',
    right_on='ticker',
)

In [51]:
# Hand-picked weights for the composite score: ROCE-level rank, trend
# (growth) rank and stability (std) rank, in that order.
# Long-term, a regression could be built to find the best weights.
roce_wt, growth_wt, std_wt = 0.35, 0.65, 0.0

In [52]:
# Composite score: weighted sum of the three rank columns, divided by 3.
# A lower score is better, so the final rank is ascending on the score.
weighted_rank_terms = (
    (growth_wt, 'coef_rank'),
    (roce_wt, 'roce_rank'),
    (std_wt, 'std_rank'),
)
roce_df['average_score'] = sum(
    weight * roce_df[column] for weight, column in weighted_rank_terms
) / 3
roce_df['final_rank'] = roce_df['average_score'].rank()

In [55]:
# Write the scored table twice: a dated snapshot and a "recent" copy that is
# overwritten on every run.
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if downstream readers do not rely on it - confirm before changing.
for out_path in (f'stock_scores/stock_score_data_{today}.csv',
                 f'stock_scores/stock_score_data_recent.csv'):
    roce_df.to_csv(out_path)

In [53]:
# Display the scored table (the notebook shows a truncated preview of rows and columns).
roce_df

Unnamed: 0.1,Unnamed: 0,ticker,2014,2015,2016,2017,2018,2019,2020,2021,...,2024,roce_rank,coef,r_squared,std,coef_w,coef_rank,std_rank,average_score,final_rank
0,0,A,0.091653,0.080499,0.093327,0.120946,0.126015,0.125188,0.106420,0.156629,...,0.122353,253.0,0.006903,0.532734,0.029907,0.003677,431.0,613.0,122.900000,172.0
1,0,AAPL,0.310000,0.356523,0.264343,0.237142,0.260334,0.260153,0.291111,0.484646,...,0.441290,17.0,0.025365,0.493695,0.114159,0.012523,207.0,1695.0,46.833333,28.0
2,0,ABBV,0.156615,0.242364,0.177018,0.173030,0.121957,0.270983,0.110972,0.153200,...,0.078632,481.0,-0.008089,0.235964,0.052661,-0.001909,1812.0,1035.0,448.716667,1401.0
3,0,ABNB,,,,,,-0.106600,-0.539270,0.045522,...,0.057083,697.0,0.087672,0.357452,0.250437,0.031339,115.0,2115.0,106.233333,129.0
4,0,ABT,0.088426,0.085910,0.092338,0.026348,0.061590,0.077944,0.090715,0.137997,...,0.053886,742.0,0.003132,0.092414,0.032576,0.000289,824.0,668.0,265.100000,577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,0,ZURA,,,,,,,,0.698240,...,-0.177943,2191.0,-0.276317,0.273057,0.591201,-0.075450,2404.0,2295.0,776.483333,2377.0
2473,0,ZVRA,-1.007231,-0.675935,-0.416832,-0.617408,-45.365815,1.976577,-0.841119,0.085930,...,-0.377587,2371.0,0.462732,0.012680,12.994823,0.005867,346.0,2439.0,351.583333,971.0
2474,0,ZWS,0.083706,0.065220,0.047430,0.078254,0.065488,0.112045,0.090000,0.100552,...,0.054171,733.0,0.001683,0.063733,0.021079,0.000107,913.0,445.0,283.333333,672.5
2475,0,ZYME,,-0.432725,-0.262588,-0.273006,-0.296454,-0.502186,-0.377339,-0.589631,...,-0.131620,2119.0,0.017780,0.113137,0.151830,0.002012,546.0,1886.0,365.516667,1043.0


In [54]:
# Summarise column dtypes and non-null counts of the scored table.
roce_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2477 entries, 0 to 2476
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     2477 non-null   int64  
 1   ticker         2477 non-null   object 
 2   2014           1780 non-null   float64
 3   2015           1828 non-null   float64
 4   2016           1890 non-null   float64
 5   2017           1972 non-null   float64
 6   2018           2042 non-null   float64
 7   2019           2157 non-null   float64
 8   2020           2363 non-null   float64
 9   2021           2433 non-null   float64
 10  2022           2464 non-null   float64
 11  2023           2476 non-null   float64
 12  2024           2453 non-null   float64
 13  roce_rank      2453 non-null   float64
 14  coef           2477 non-null   float64
 15  r_squared      2477 non-null   float64
 16  std            2477 non-null   float64
 17  coef_w         2477 non-null   float64
 18  coef_ran