**4. Phân tích mô tả**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Mount Google Drive so the dataset stored there is readable from this runtime
from google.colab import drive
drive.mount('/content/drive')

# Location of the dataset on Google Drive
file_path = "/content/drive/MyDrive/financials_features_scaled.csv" # Update this path if your file is in a different location on Drive

try:
    df = pd.read_csv(file_path)
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() appends a stray "None" line.
    df.info()
    display(df.head()) # Use display for better formatting in Colab
except FileNotFoundError:
    print(f"Error: The file was not found at '{file_path}'. Please check the file path on your Google Drive.")
    # Re-raise after the friendly message: the following cell reads `df`, so
    # continuing would only postpone the failure (or silently reuse a stale `df`).
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    # Same reasoning as above: report, then stop instead of hiding the error.
    raise

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   price_earnings               505 non-null    float64
 1   price_book                   505 non-null    float64
 2   price_sales                  505 non-null    float64
 3   valuation_z__scaled          505 non-null    float64
 4   earnings_share               505 non-null    float64
 5   dividend_yield               505 non-null    float64
 6   ebitda                       505 non-null    float64
 7   ebitda_log__scaled           505 non-null    float64
 8   profitability_ratio__scaled  505 non-null    float64
 9   market_cap                   505 non-null    float64
 10  price                        505 non-null    float64
 11  52w_ran

Unnamed: 0,price_earnings,price_book,price_sales,valuation_z__scaled,earnings_share,dividend_yield,ebitda,ebitda_log__scaled,profitability_ratio__scaled,market_cap,price,52w_range__scaled,price_to_52w_high__scaled,market_cap_log__scaled,dividend_flag__scaled
0,24.31,11.34,4.390271,0.908281,7.92,2.332862,9048000000.0,1.102991,-0.189244,138721100000.0,222.89,-0.229478,0.26995,1.477267,0.0
1,27.76,6.35,3.575483,0.502909,1.7,1.147959,601000000.0,-0.632341,-0.336064,10783420000.0,60.24,0.083849,0.112032,-0.541755,0.0
2,22.51,3.19,3.74048,0.076988,0.26,1.908982,5744000000.0,0.81221,-0.328124,102121000000.0,56.27,-0.395256,0.51718,1.235165,0.0
3,19.41,26.14,6.291571,2.228956,3.29,2.49956,10310000000.0,1.186549,-0.31895,181386300000.0,108.48,-1.79081,2.451493,1.689222,0.0
4,25.47,10.62,2.604117,0.685735,5.44,1.71447,5643228000.0,0.800883,-0.314349,98765860000.0,150.51,0.01121,0.435615,1.20876,0.0


**4.1. THỐNG KÊ CƠ BẢN**

In [None]:
# Keep only the numeric columns of the loaded frame
numeric_df = df.select_dtypes(include="number")

# Detailed descriptive statistics helper
def descriptive_stats(df):
    """Build an extended descriptive-statistics table, one row per numeric column.

    Starts from ``DataFrame.describe()`` (transposed) and appends the columns
    ``median``, ``Q1``, ``Q3``, ``IQR``, ``skew``, ``kurtosis``, ``cv``
    (coefficient of variation, std / mean), ``ci95_low`` and ``ci95_high``
    (normal-approximation 95% confidence interval for the mean).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Only numeric columns are summarised; other columns are
        ignored so every statistic is computed over the same set of columns.

    Returns
    -------
    pandas.DataFrame
        Summary table indexed by column name.
    """
    # Restrict to numeric columns once, so median/skew/kurtosis see exactly the
    # columns that describe() reports instead of failing on text columns.
    numeric = df.select_dtypes(include=[np.number])

    desc = numeric.describe().T
    desc["median"] = numeric.median()
    desc["Q1"] = numeric.quantile(0.25)
    desc["Q3"] = numeric.quantile(0.75)
    desc["IQR"] = desc["Q3"] - desc["Q1"]
    desc["skew"] = numeric.skew()
    desc["kurtosis"] = numeric.kurtosis()
    # Coefficient of variation; becomes inf/NaN or very unstable when the mean
    # is zero or close to it (e.g. standardised columns).
    desc["cv"] = desc["std"] / desc["mean"]

    # 95% confidence interval for the mean. The standard error uses the
    # per-column non-null count from describe(), not the total number of rows,
    # so columns containing NaN do not get an artificially narrow interval.
    std_err = desc["std"] / np.sqrt(desc["count"])
    desc["ci95_low"] = desc["mean"] - 1.96 * std_err
    desc["ci95_high"] = desc["mean"] + 1.96 * std_err
    return desc

# Compute the summary table for the numeric columns and echo it
summary = descriptive_stats(numeric_df)
print(summary)

# Persist the summary table as CSV
summary.to_csv("descriptive_summary_financials.csv")

# For every numeric column, write one histogram PNG and one boxplot PNG
for col in numeric_df.columns:
    # Histogram with a KDE overlay
    hist_fig, hist_ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, color="skyblue", ax=hist_ax)
    hist_ax.set_title(f"Histogram of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Tần suất")
    hist_fig.tight_layout()
    hist_fig.savefig(f"hist_{col}.png")
    plt.close(hist_fig)  # release the figure before creating the next one

    # Vertical boxplot of the same column
    box_fig, box_ax = plt.subplots(figsize=(4, 4))
    sns.boxplot(y=df[col], color="lightcoral", ax=box_ax)
    box_ax.set_title(f"Boxplot of {col}")
    box_fig.tight_layout()
    box_fig.savefig(f"box_{col}.png")
    plt.close(box_fig)

print("✅ Đã lưu bảng thống kê và biểu đồ!")

                             count          mean           std           min  \
price_earnings               505.0  2.418889e+01  2.865063e+01 -5.360960e+01   
price_book                   505.0  9.587256e+00  2.800768e+01  8.304000e-01   
price_sales                  505.0  3.930674e+00  3.406980e+00  4.081934e-01   
valuation_z__scaled          505.0  7.026387e-01  2.727031e+00 -2.763734e+00   
earnings_share               505.0  3.739051e+00  4.447504e+00 -6.874400e+00   
dividend_yield               505.0  1.877506e+00  1.453842e+00  0.000000e+00   
ebitda                       505.0  3.439166e+09  5.606562e+09 -3.712160e+08   
ebitda_log__scaled           505.0 -1.631732e+00  4.713349e+00 -1.356819e+01   
profitability_ratio__scaled  505.0  1.620277e-01  1.064887e+00 -1.737273e+00   
market_cap                   505.0  4.714421e+10  7.524540e+10  4.230380e+09   
price                        505.0  9.812043e+01  8.739460e+01  1.283280e+01   
52w_range__scaled            505.0 -1.03