# Importing Data
This is a project about Data Analysis for the objective stock indicators from ShangHai and Vietnam, the data is collected from the range 2020 to 2024. The source for ShangHai stock dataset is S&P Global, and VCI(VietCap Stock)

In [2]:
pip install shap

Collecting shap
  Downloading shap-0.46.0-cp312-cp312-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp312-cp312-win_amd64.whl (456 kB)
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8
Note: you may need to restart the kernel to use updated packages.


## Load the dataset

In [1]:
import shap
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from linearmodels.panel import PanelOLS, RandomEffects
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error, mean_absolute_percentage_error


The raw dataset is provided in folder, and load into DataFrame

In [2]:
vn_df = pd.read_csv("vietnam_data.csv")
vn_df

Unnamed: 0,Comp_data,Year,Quarter,EPS,BVPS,ROA,ROE,P/E Ratio,DAR,MB,DY,DIV,Market Cap,Total Assets,Stock Price
0,AAV,2024,1,-48.079754,10726.64399,-0.018067,-0.022292,-25.300774,0.146169,0.568677,0.000000,0.000000,4.210000e+11,9.090000e+11,3.70
1,AAV,2024,2,-27.627976,10699.01602,-0.015186,-0.018721,-28.782145,0.146289,0.542106,0.000000,0.000000,4.000000e+11,9.070000e+11,5.50
2,AAV,2024,3,-27.095742,10671.92028,-0.012993,-0.015996,-41.990328,0.147597,0.674668,0.000000,0.000000,4.970000e+11,9.070000e+11,6.40
3,AAV,2023,1,-48.477520,10971.74966,-0.001734,-0.002448,-221.925697,0.241432,0.546859,0.000000,0.000000,4.140000e+11,1.060000e+12,3.90
4,AAV,2023,2,-67.213499,10904.53616,-0.011205,-0.015320,-27.290519,0.149924,0.421843,0.000000,0.000000,3.170000e+11,9.280000e+11,5.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17883,XMP,2023,4,245.606801,11476.31876,0.011925,0.028992,58.936098,0.574130,1.690438,0.010309,0.191959,2.910000e+11,4.042190e+11,18.62
17884,XMP,2022,1,453.834519,11481.76614,0.033454,0.090928,16.670194,0.611227,1.463190,0.053571,0.798750,2.520000e+11,4.430010e+11,14.91
17885,XMP,2022,2,225.706812,10679.54133,0.024045,0.064674,24.167644,0.632687,1.610556,0.052326,0.849244,2.580000e+11,4.361210e+11,16.23
17886,XMP,2022,3,224.676527,10904.21786,0.037683,0.100155,16.303713,0.621295,1.650737,0.050000,0.754500,2.700000e+11,4.319010e+11,15.09


In [3]:
sh_df = pd.read_csv("shanghai.csv")
sh_df

Unnamed: 0,Comp_code,Time,Total Assets,Total Debt,EPS,BVPS,ROA,ROE,DY,Price,DIV,Market Cap
0,600000,9/30/2024,9416535,8685534.00,0.23,24.62,0.00,0.05,0.00,9.87,0.00,297338.0
1,600000,6/30/2024,9253778,8500022.00,0.36,21.65,0.01,0.10,0.00,7.94,0.00,241568.0
2,600000,3/31/2024,9053468,8302126.00,0.57,25.31,0.00,0.05,0.00,6.88,0.00,209281.0
3,600000,12/31/2023,9007247,8274363.00,0.15,20.94,0.00,0.03,0.05,6.38,0.32,194311.0
4,600000,9/30/2023,8808320,8084504.00,0.12,24.38,0.00,0.04,0.00,6.85,0.00,208400.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16597,603920,3/31/2021,5025,2175.00,0.05,5.35,0.07,0.13,Na,11.04,Na,6843.0
16598,603920,12/31/2020,3803,1103.00,0.17,5.07,0.08,0.11,0.03,17.56,0.46,10712.0
16599,603920,9/30/2020,3821,1217.00,0.14,4.89,0.07,0.14,Na,16.79,Na,10074.0
16600,603920,6/30/2020,3751,1215.00,0.17,4.76,0.03,0.08,Na,15.93,Na,10151.0


## Pre-processing Data

### ShangHai Dataset

Replace Na to NaN in the dataset

In [4]:
sh_df.replace("Na",np.nan,inplace= True)
sh_df

Unnamed: 0,Comp_code,Time,Total Assets,Total Debt,EPS,BVPS,ROA,ROE,DY,Price,DIV,Market Cap
0,600000,9/30/2024,9416535,8685534.00,0.23,24.62,0.00,0.05,0.00,9.87,0.00,297338.0
1,600000,6/30/2024,9253778,8500022.00,0.36,21.65,0.01,0.10,0.00,7.94,0.00,241568.0
2,600000,3/31/2024,9053468,8302126.00,0.57,25.31,0.00,0.05,0.00,6.88,0.00,209281.0
3,600000,12/31/2023,9007247,8274363.00,0.15,20.94,0.00,0.03,0.05,6.38,0.32,194311.0
4,600000,9/30/2023,8808320,8084504.00,0.12,24.38,0.00,0.04,0.00,6.85,0.00,208400.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16597,603920,3/31/2021,5025,2175.00,0.05,5.35,0.07,0.13,,11.04,,6843.0
16598,603920,12/31/2020,3803,1103.00,0.17,5.07,0.08,0.11,0.03,17.56,0.46,10712.0
16599,603920,9/30/2020,3821,1217.00,0.14,4.89,0.07,0.14,,16.79,,10074.0
16600,603920,6/30/2020,3751,1215.00,0.17,4.76,0.03,0.08,,15.93,,10151.0


#### Observe the dataset to measure the null value of each columns.

In [5]:
missing_data = sh_df.isnull()
for column in missing_data.columns.tolist():
    print(column)
    print(f"Missing value: {missing_data[column].value_counts()}")

Comp_code
Missing value: Comp_code
False    16602
Name: count, dtype: int64
Time
Missing value: Time
False    16602
Name: count, dtype: int64
Total Assets
Missing value: Total Assets
False    16442
True       160
Name: count, dtype: int64
Total Debt
Missing value: Total Debt
False    16442
True       160
Name: count, dtype: int64
EPS
Missing value: EPS
False    16577
True        25
Name: count, dtype: int64
BVPS
Missing value: BVPS
False    16456
True       146
Name: count, dtype: int64
ROA
Missing value: ROA
False    16153
True       449
Name: count, dtype: int64
ROE
Missing value: ROE
False    16145
True       457
Name: count, dtype: int64
DY
Missing value: DY
False    11138
True      5464
Name: count, dtype: int64
Price
Missing value: Price
False    16602
Name: count, dtype: int64
DIV
Missing value: DIV
False    11138
True      5464
Name: count, dtype: int64
Market Cap
Missing value: Market Cap
False    16583
True        19
Name: count, dtype: int64


#### Replace DY null values with 0


In [6]:
sh_df["DY"].fillna(0, inplace = True)
sh_df["DIV"].fillna(0, inplace = True)
sh_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sh_df["DY"].fillna(0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sh_df["DIV"].fillna(0, inplace = True)


Unnamed: 0,Comp_code,Time,Total Assets,Total Debt,EPS,BVPS,ROA,ROE,DY,Price,DIV,Market Cap
0,600000,9/30/2024,9416535,8685534.00,0.23,24.62,0.00,0.05,0.00,9.87,0.00,297338.0
1,600000,6/30/2024,9253778,8500022.00,0.36,21.65,0.01,0.10,0.00,7.94,0.00,241568.0
2,600000,3/31/2024,9053468,8302126.00,0.57,25.31,0.00,0.05,0.00,6.88,0.00,209281.0
3,600000,12/31/2023,9007247,8274363.00,0.15,20.94,0.00,0.03,0.05,6.38,0.32,194311.0
4,600000,9/30/2023,8808320,8084504.00,0.12,24.38,0.00,0.04,0.00,6.85,0.00,208400.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16597,603920,3/31/2021,5025,2175.00,0.05,5.35,0.07,0.13,0,11.04,0,6843.0
16598,603920,12/31/2020,3803,1103.00,0.17,5.07,0.08,0.11,0.03,17.56,0.46,10712.0
16599,603920,9/30/2020,3821,1217.00,0.14,4.89,0.07,0.14,0,16.79,0,10074.0
16600,603920,6/30/2020,3751,1215.00,0.17,4.76,0.03,0.08,0,15.93,0,10151.0


#### Drop missing value apply for other columns

In [7]:
sh_df.dropna(axis = 0, inplace = True)
sh_df

Unnamed: 0,Comp_code,Time,Total Assets,Total Debt,EPS,BVPS,ROA,ROE,DY,Price,DIV,Market Cap
0,600000,9/30/2024,9416535,8685534.00,0.23,24.62,0.00,0.05,0.00,9.87,0.00,297338.0
1,600000,6/30/2024,9253778,8500022.00,0.36,21.65,0.01,0.10,0.00,7.94,0.00,241568.0
2,600000,3/31/2024,9053468,8302126.00,0.57,25.31,0.00,0.05,0.00,6.88,0.00,209281.0
3,600000,12/31/2023,9007247,8274363.00,0.15,20.94,0.00,0.03,0.05,6.38,0.32,194311.0
4,600000,9/30/2023,8808320,8084504.00,0.12,24.38,0.00,0.04,0.00,6.85,0.00,208400.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16597,603920,3/31/2021,5025,2175.00,0.05,5.35,0.07,0.13,0,11.04,0,6843.0
16598,603920,12/31/2020,3803,1103.00,0.17,5.07,0.08,0.11,0.03,17.56,0.46,10712.0
16599,603920,9/30/2020,3821,1217.00,0.14,4.89,0.07,0.14,0,16.79,0,10074.0
16600,603920,6/30/2020,3751,1215.00,0.17,4.76,0.03,0.08,0,15.93,0,10151.0


#### Correcting data types

In [8]:
sh_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15900 entries, 0 to 16601
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Comp_code     15900 non-null  int64  
 1   Time          15900 non-null  object 
 2   Total Assets  15900 non-null  object 
 3   Total Debt    15900 non-null  object 
 4   EPS           15900 non-null  object 
 5   BVPS          15900 non-null  object 
 6   ROA           15900 non-null  object 
 7   ROE           15900 non-null  object 
 8   DY            15900 non-null  object 
 9   Price         15900 non-null  float64
 10  DIV           15900 non-null  object 
 11  Market Cap    15900 non-null  float64
dtypes: float64(2), int64(1), object(9)
memory usage: 1.6+ MB


In [9]:
ignore = ['Time']
sh_df = (sh_df.set_index(ignore, append=True)
        .astype(float)
        .reset_index(ignore)
       )
sh_df["Time"].astype("datetime64[ns]")
sh_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15900 entries, 0 to 16601
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Time          15900 non-null  object 
 1   Comp_code     15900 non-null  float64
 2   Total Assets  15900 non-null  float64
 3   Total Debt    15900 non-null  float64
 4   EPS           15900 non-null  float64
 5   BVPS          15900 non-null  float64
 6   ROA           15900 non-null  float64
 7   ROE           15900 non-null  float64
 8   DY            15900 non-null  float64
 9   Price         15900 non-null  float64
 10  DIV           15900 non-null  float64
 11  Market Cap    15900 non-null  float64
dtypes: float64(11), object(1)
memory usage: 1.6+ MB


#### Split data

In [41]:
shl_df= sh_df[sh_df["Market Cap"]>70000]
shm_df= sh_df[(sh_df["Market Cap"]<70000) & (sh_df["Market Cap"]>14000)]
shs_df= sh_df[sh_df["Market Cap"]<14000]
shm_df

Unnamed: 0,Time,Comp_code,Total Assets,Total Debt,EPS,BVPS,ROA,ROE,DY,Price,DIV,Market Cap,normalized_total_assets,normalized_total_debt,normalized_market_cap,DAR,SIZE
19,2024-09-30,600004.0,26982.0,8665.00,0.10,7.64,0.03,0.06,0.00,9.78,0.00,24756.0,-0.110606,-0.120044,0.031226,0.321140,10.202925
20,2024-06-30,600004.0,26955.0,8870.00,0.10,7.54,0.02,0.04,0.00,9.41,0.00,22436.0,-0.110642,-0.119742,-0.003089,0.329067,10.201924
21,2024-03-31,600004.0,26597.0,8606.00,0.08,7.52,0.02,0.04,0.00,10.00,0.00,23857.0,-0.111120,-0.120131,0.017929,0.323570,10.188554
22,2023-12-31,600004.0,26190.0,8384.00,0.07,7.43,0.02,0.03,0.01,9.70,0.08,23147.0,-0.111664,-0.120459,0.007427,0.320122,10.173133
23,2023-09-30,600004.0,26765.0,9079.00,0.05,7.38,0.02,0.03,0.00,11.11,0.00,26507.0,-0.110896,-0.119433,0.057125,0.339212,10.194850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16576,2021-09-30,603919.0,3559.0,659.33,0.11,5.72,0.07,0.10,0.00,33.32,0.00,17485.0,-0.141890,-0.131857,-0.076320,0.185257,8.177235
16577,2021-06-30,603919.0,3479.0,635.28,0.14,5.61,0.10,0.16,0.00,38.95,0.00,20859.0,-0.141996,-0.131893,-0.026415,0.182604,8.154500
16578,2021-03-31,603919.0,3441.0,547.34,0.23,5.70,0.16,0.26,0.00,33.66,0.00,17450.0,-0.142047,-0.132022,-0.076838,0.159064,8.143517
16579,2020-12-31,603919.0,3532.0,753.69,0.34,5.48,0.05,0.06,0.01,37.71,0.24,20539.0,-0.141926,-0.131718,-0.031148,0.213389,8.169620


#### Normalize values


In [75]:
shs_df["normalized_total_assets"] = (shs_df["Total Assets"]-shs_df["Total Assets"].mean())/shs_df["Total Assets"].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shs_df["normalized_total_assets"] = (shs_df["Total Assets"]-shs_df["Total Assets"].mean())/shs_df["Total Assets"].std()


In [76]:
shs_df["normalized_total_debt"] = (shs_df["Total Debt"]-shs_df["Total Debt"].mean())/shs_df["Total Debt"].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shs_df["normalized_total_debt"] = (shs_df["Total Debt"]-shs_df["Total Debt"].mean())/shs_df["Total Debt"].std()


In [77]:
shs_df["normalized_market_cap"] = (shs_df["Market Cap"]-shs_df["Market Cap"].mean())/shs_df["Market Cap"].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shs_df["normalized_market_cap"] = (shs_df["Market Cap"]-shs_df["Market Cap"].mean())/shs_df["Market Cap"].std()


In [78]:
shs_df["DAR"] = shs_df["Total Debt"]/shs_df["Total Assets"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shs_df["DAR"] = shs_df["Total Debt"]/shs_df["Total Assets"]


In [79]:
shs_df["SIZE"] = np.log(shs_df["Total Assets"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shs_df["SIZE"] = np.log(shs_df["Total Assets"])


In [80]:
shm_df

Unnamed: 0,Time,Comp_code,Total Assets,Total Debt,EPS,BVPS,ROA,ROE,DY,Price,DIV,Market Cap,normalized_total_assets,normalized_total_debt,normalized_market_cap,DAR,SIZE
19,2024-09-30,600004.0,26982.0,8665.00,0.10,7.64,0.03,0.06,0.00,9.78,0.00,24756.0,-0.265214,-0.296622,-0.290282,0.321140,10.202925
20,2024-06-30,600004.0,26955.0,8870.00,0.10,7.54,0.02,0.04,0.00,9.41,0.00,22436.0,-0.265435,-0.294690,-0.463409,0.329067,10.201924
21,2024-03-31,600004.0,26597.0,8606.00,0.08,7.52,0.02,0.04,0.00,10.00,0.00,23857.0,-0.268355,-0.297179,-0.357369,0.323570,10.188554
22,2023-12-31,600004.0,26190.0,8384.00,0.07,7.43,0.02,0.03,0.01,9.70,0.08,23147.0,-0.271675,-0.299272,-0.410352,0.320122,10.173133
23,2023-09-30,600004.0,26765.0,9079.00,0.05,7.38,0.02,0.03,0.00,11.11,0.00,26507.0,-0.266985,-0.292719,-0.159616,0.339212,10.194850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16576,2021-09-30,603919.0,3559.0,659.33,0.11,5.72,0.07,0.10,0.00,33.32,0.00,17485.0,-0.456297,-0.372098,-0.832872,0.185257,8.177235
16577,2021-06-30,603919.0,3479.0,635.28,0.14,5.61,0.10,0.16,0.00,38.95,0.00,20859.0,-0.456950,-0.372325,-0.581091,0.182604,8.154500
16578,2021-03-31,603919.0,3441.0,547.34,0.23,5.70,0.16,0.26,0.00,33.66,0.00,17450.0,-0.457260,-0.373154,-0.835484,0.159064,8.143517
16579,2020-12-31,603919.0,3532.0,753.69,0.34,5.48,0.05,0.06,0.01,37.71,0.24,20539.0,-0.456517,-0.371209,-0.604971,0.213389,8.169620


## Data Exploring

#### Data Correlation

In [81]:
numeric_df = shs_df.select_dtypes(include = ["float64","int64"])
numeric_df.drop("Comp_code",axis=1, inplace = True)
numeric_df.corr()

Unnamed: 0,Total Assets,Total Debt,EPS,BVPS,ROA,ROE,DY,Price,DIV,Market Cap,normalized_total_assets,normalized_total_debt,normalized_market_cap,DAR,SIZE
Total Assets,1.0,0.987177,0.008716,0.122132,-0.024084,-0.000536,0.061582,-0.181114,0.006923,0.34848,1.0,0.987177,0.34848,0.392807,0.668215
Total Debt,0.987177,1.0,-0.003436,0.084739,-0.03155,-0.007965,0.048978,-0.15993,0.000478,0.267848,0.987177,1.0,0.267848,0.40974,0.595073
EPS,0.008716,-0.003436,1.0,0.366134,0.223009,0.065788,0.11032,0.26634,0.138903,0.155447,0.008716,-0.003436,0.155447,-0.133106,0.040798
BVPS,0.122132,0.084739,0.366134,1.0,0.136475,0.071835,0.125792,0.5395,0.218338,0.142716,0.122132,0.084739,0.142716,-0.169832,0.21044
ROA,-0.024084,-0.03155,0.223009,0.136475,1.0,0.215332,0.139341,0.160421,0.158938,0.141369,-0.024084,-0.03155,0.141369,-0.112462,0.013548
ROE,-0.000536,-0.007965,0.065788,0.071835,0.215332,1.0,0.027452,0.049476,0.028914,0.045712,-0.000536,-0.007965,0.045712,-0.096619,0.004305
DY,0.061582,0.048978,0.11032,0.125792,0.139341,0.027452,1.0,0.004184,0.760612,0.083246,0.061582,0.048978,0.083246,-0.026557,0.091747
Price,-0.181114,-0.15993,0.26634,0.5395,0.160421,0.049476,0.004184,1.0,0.149701,0.164329,-0.181114,-0.15993,0.164329,-0.181104,-0.238435
DIV,0.006923,0.000478,0.138903,0.218338,0.158938,0.028914,0.760612,0.149701,1.0,0.085115,0.006923,0.000478,0.085115,-0.050323,0.032858
Market Cap,0.34848,0.267848,0.155447,0.142716,0.141369,0.045712,0.083246,0.164329,0.085115,1.0,0.34848,0.267848,1.0,0.122615,0.603023


#### Data describe

In [82]:
numeric_df.describe()

Unnamed: 0,Total Assets,Total Debt,EPS,BVPS,ROA,ROE,DY,Price,DIV,Market Cap,normalized_total_assets,normalized_total_debt,normalized_market_cap,DAR,SIZE
count,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0,11372.0
mean,9661.177662,6021.996842,0.06669,4.949909,0.019692,0.015944,0.00308,10.233841,0.028244,5376.947591,-4.9985420000000005e-17,-2.9991250000000004e-17,2.9991250000000004e-17,0.441451,8.414375
std,20989.338966,17887.960152,0.217153,3.309466,0.056115,1.000245,0.011245,8.136355,0.112329,3298.824964,1.0,1.0,1.0,0.213131,1.128784
min,102.1,7.22,-4.74,-1.92,-1.26,-91.66,0.0,0.66,0.0,459.0,-0.4554254,-0.3362472,-1.490818,0.00617,4.625953
25%,2082.75,640.45,0.0,2.83,0.0,0.0,0.0,5.0875,0.0,2743.75,-0.3610608,-0.3008474,-0.7982229,0.279776,7.641444
50%,4018.0,1620.5,0.05,4.39,0.02,0.05,0.0,8.15,0.0,4328.5,-0.2688592,-0.2460592,-0.3178246,0.436227,8.298539
75%,8772.0,4386.25,0.14,6.33,0.04,0.11,0.0,12.69,0.0,7399.0,-0.0423633,-0.09144401,0.6129614,0.588264,9.07932
max,393155.0,361707.0,5.85,35.06,1.75,22.54,0.21,117.46,3.79,13999.0,18.27089,19.88404,2.613674,1.603604,12.881959


## Data Analysis

#### Dataset Preparation

In [83]:
Y = numeric_df["Price"]
X = numeric_df.drop(["DY","Total Debt","Total Assets", "Market Cap","Price", "normalized_total_assets","normalized_total_debt","normalized_market_cap"],axis = 1)
X

Unnamed: 0,EPS,BVPS,ROA,ROE,DIV,DAR,SIZE
39,-0.04,4.16,0.00,0.06,0.00,0.524483,9.803336
41,0.08,4.15,-0.03,-0.02,0.03,0.551533,9.861206
42,-0.02,4.07,-0.02,-0.01,0.00,0.543724,9.825039
43,-0.01,4.10,0.00,0.05,0.00,0.519860,9.779850
44,0.05,4.15,-0.03,0.09,0.00,0.524332,9.800458
...,...,...,...,...,...,...,...
16597,0.05,5.35,0.07,0.13,0.00,0.432836,8.522181
16598,0.17,5.07,0.08,0.11,0.46,0.290034,8.243546
16599,0.14,4.89,0.07,0.14,0.00,0.318503,8.248267
16600,0.17,4.76,0.03,0.08,0.00,0.323914,8.229778


In [89]:
panel_df = shs_df
panel_df["Time"] = panel_df["Time"].astype("datetime64[ns]")
panel_df = panel_df.set_index(['Comp_code','Time'])
Y_panel = panel_df["Price"]
X_panel = panel_df.drop(["DY","Total Debt","Total Assets", "Market Cap","Price", "normalized_total_assets","normalized_total_debt","normalized_market_cap"],axis = 1)
X_panel = sm.add_constant(X_panel)
X_panel

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  panel_df["Time"] = panel_df["Time"].astype("datetime64[ns]")


Unnamed: 0_level_0,Unnamed: 1_level_0,const,EPS,BVPS,ROA,ROE,DIV,DAR,SIZE
Comp_code,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
600006.0,2024-06-30,1.0,-0.04,4.16,0.00,0.06,0.00,0.524483,9.803336
600006.0,2023-12-31,1.0,0.08,4.15,-0.03,-0.02,0.03,0.551533,9.861206
600006.0,2023-09-30,1.0,-0.02,4.07,-0.02,-0.01,0.00,0.543724,9.825039
600006.0,2023-06-30,1.0,-0.01,4.10,0.00,0.05,0.00,0.519860,9.779850
600006.0,2023-03-31,1.0,0.05,4.15,-0.03,0.09,0.00,0.524332,9.800458
...,...,...,...,...,...,...,...,...,...
603920.0,2021-03-31,1.0,0.05,5.35,0.07,0.13,0.00,0.432836,8.522181
603920.0,2020-12-31,1.0,0.17,5.07,0.08,0.11,0.46,0.290034,8.243546
603920.0,2020-09-30,1.0,0.14,4.89,0.07,0.14,0.00,0.318503,8.248267
603920.0,2020-06-30,1.0,0.17,4.76,0.03,0.08,0.00,0.323914,8.229778


#### OLS model

Đối với mô hình Least Squared, hay còn gọi là mô hình Linear Regression. Đối với tập train, OLS model trả về R^2 0.722

In [85]:
X = sm.add_constant(X)
ols_model = sm.OLS(Y,X).fit()
ols_model.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.453
Model:,OLS,Adj. R-squared:,0.452
Method:,Least Squares,F-statistic:,1343.0
Date:,"Fri, 14 Mar 2025",Prob (F-statistic):,0.0
Time:,12:40:56,Log-Likelihood:,-36548.0
No. Observations:,11372,AIC:,73110.0
Df Residuals:,11364,BIC:,73170.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,27.6231,0.443,62.346,0.000,26.755,28.492
EPS,2.1310,0.285,7.470,0.000,1.572,2.690
BVPS,1.5623,0.020,78.500,0.000,1.523,1.601
ROA,12.4841,1.067,11.696,0.000,10.392,14.576
ROE,0.0117,0.058,0.202,0.840,-0.102,0.126
DIV,1.0436,0.520,2.006,0.045,0.024,2.063
DAR,7.2210,0.331,21.822,0.000,6.572,7.870
SIZE,-3.4142,0.063,-54.618,0.000,-3.537,-3.292

0,1,2,3
Omnibus:,7798.937,Durbin-Watson:,0.376
Prob(Omnibus):,0.0,Jarque-Bera (JB):,238996.777
Skew:,2.879,Prob(JB):,0.0
Kurtosis:,24.708,Cond. No.,191.0


#### GLS

In [72]:
X = sm.add_constant(X)
gls_model = sm.GLS(Y,X).fit()
gls_model.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.596
Model:,GLS,Adj. R-squared:,0.595
Method:,Least Squares,F-statistic:,737.5
Date:,"Fri, 14 Mar 2025",Prob (F-statistic):,0.0
Time:,12:18:50,Log-Likelihood:,-14719.0
No. Observations:,3504,AIC:,29450.0
Df Residuals:,3496,BIC:,29500.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,113.9380,2.878,39.591,0.000,108.296,119.581
EPS,0.7205,0.262,2.754,0.006,0.208,1.234
BVPS,2.8994,0.060,48.659,0.000,2.783,3.016
ROA,104.3186,6.370,16.377,0.000,91.829,116.808
ROE,-0.0339,0.613,-0.055,0.956,-1.235,1.167
DIV,1.6852,1.562,1.079,0.281,-1.377,4.747
DAR,31.6950,2.038,15.551,0.000,27.699,35.691
SIZE,-13.0708,0.333,-39.249,0.000,-13.724,-12.418

0,1,2,3
Omnibus:,2669.435,Durbin-Watson:,0.409
Prob(Omnibus):,0.0,Jarque-Bera (JB):,93813.157
Skew:,3.282,Prob(JB):,0.0
Kurtosis:,27.484,Cond. No.,304.0


#### FEM

In [90]:
fem_model = PanelOLS(Y_panel, X_panel, entity_effects=True)  # entity_effects=True để thêm hiệu ứng cá thể
results = fem_model.fit()
results

0,1,2,3
Dep. Variable:,Price,R-squared:,0.0465
Estimator:,PanelOLS,R-squared (Between):,0.0780
No. Observations:,11372,R-squared (Within):,0.0465
Date:,"Fri, Mar 14 2025",R-squared (Overall):,0.0573
Time:,12:42:05,Log-likelihood,-3.002e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,74.840
Entities:,622,P-value,0.0000
Avg Obs:,18.283,Distribution:,"F(7,10743)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,0.7134,1.4977,0.4763,0.6339,-2.2224,3.6491
EPS,1.9246,0.1830,10.515,0.0000,1.5658,2.2833
BVPS,0.2715,0.0521,5.2076,0.0000,0.1693,0.3737
ROA,9.4468,0.7084,13.335,0.0000,8.0581,10.835
ROE,-0.0166,0.0353,-0.4705,0.6380,-0.0857,0.0525
DIV,2.0844,0.3202,6.5101,0.0000,1.4568,2.7120
DAR,0.6630,0.5948,1.1148,0.2650,-0.5028,1.8289
SIZE,0.8926,0.2100,4.2505,0.0000,0.4810,1.3043


#### REM

In [91]:
rem_model = RandomEffects(Y_panel, X_panel)  # entity_effects=True để thêm hiệu ứng cá thể
results = rem_model.fit()
results

0,1,2,3
Dep. Variable:,Price,R-squared:,0.0970
Estimator:,RandomEffects,R-squared (Between):,0.4848
No. Observations:,11372,R-squared (Within):,0.0285
Date:,"Fri, Mar 14 2025",R-squared (Overall):,0.3775
Time:,12:42:30,Log-likelihood,-3.053e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,174.37
Entities:,622,P-value,0.0000
Avg Obs:,18.283,Distribution:,"F(7,11364)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,17.774,1.1054,16.080,0.0000,15.608,19.941
EPS,1.9100,0.1855,10.297,0.0000,1.5465,2.2736
BVPS,0.9437,0.0394,23.924,0.0000,0.8664,1.0210
ROA,9.4438,0.7170,13.171,0.0000,8.0384,10.849
ROE,-0.0106,0.0358,-0.2971,0.7664,-0.0808,0.0595
DIV,2.0779,0.3249,6.3951,0.0000,1.4410,2.7148
DAR,3.9555,0.5331,7.4206,0.0000,2.9107,5.0004
SIZE,-1.6947,0.1479,-11.462,0.0000,-1.9846,-1.4049
