In [10]:
# thư viện cần tải
!pip install ta







In [12]:
# import thư viện
import os
from ta.trend import ADXIndicator
from ta.momentum import ROCIndicator
import time
import numpy as np
import pandas as pd
import joblib

import xgboost as xgb
from vnstock import Vnstock
import matplotlib.pyplot as plt
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler,LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Thư viện vẽ biểu đồ
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from statsmodels.tsa.seasonal import STL

In [1]:
import joblib
# Load the previously trained model from disk (presumably the XGBoost model
# suggested by the file name - confirm).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so only
# load model files that come from a trusted source.
model = joblib.load("models/best_xgb_model.pkl")
# ticker_to_id = joblib.load("models/ticker_to_id.pkl")


In [None]:
# Filters out tickers with low trading volume, restricted trading or poor data.
def filter_data(df, today=None, min_rows=100, max_stale_days=90,
                recent_days=30, min_recent_sessions=10,
                volume_window=20, min_avg_volume=100000,
                min_traded_sessions=15):
    """Decide whether a ticker's price history is good enough to be used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 ``time`` column and a numeric ``volume``
        column. The volume checks look at the last rows of the frame, so the
        frame is expected to be ordered oldest-to-newest.
    today : datetime-like, optional
        Reference date for the recency checks. Defaults to
        ``pd.Timestamp.today()``; pass a fixed value for reproducible results.
    min_rows : int
        Minimum number of rows required.
    max_stale_days : int
        Maximum allowed age, in days, of the most recent row.
    recent_days, min_recent_sessions : int
        At least ``min_recent_sessions`` distinct calendar days must fall
        within the last ``recent_days`` days.
    volume_window, min_avg_volume, min_traded_sessions :
        Over the last ``volume_window`` rows the mean volume must be at least
        ``min_avg_volume`` and at least ``min_traded_sessions`` rows must have
        a volume greater than zero.

    Returns
    -------
    bool
        True when every check passes, False otherwise.
    """
    now = pd.Timestamp.today() if today is None else pd.Timestamp(today)

    # Too little history.
    if len(df) < min_rows:
        return False

    # The most recent row is too old.
    if (now - df['time'].max()).days > max_stale_days:
        return False

    # Too few distinct trading days in the recent window.
    cutoff = now.normalize() - pd.Timedelta(days=recent_days)
    session_days = df['time'].dt.normalize()
    if session_days[session_days >= cutoff].nunique() < min_recent_sessions:
        return False

    # Illiquid: low average volume, or too many rows without any trades.
    recent_volume = df['volume'].tail(volume_window)
    if recent_volume.mean() < min_avg_volume:
        return False
    if (recent_volume > 0).sum() < min_traded_sessions:
        return False

    return True

In [None]:
# Computes the RSI.
def relative_strength_idx(df, n=14):
    """Relative Strength Index of ``df['close']`` over an ``n``-row window.

    Gains and losses are averaged with a plain rolling mean. The first row has
    no price change and is dropped, so the returned Series has no entry for
    the first index label of ``df``; the first ``n - 1`` returned values are
    NaN because the rolling window is not yet full.
    """
    change = df['close'].diff().iloc[1:]
    avg_gain = change.clip(lower=0).rolling(n).mean()
    avg_loss = change.clip(upper=0).abs().rolling(n).mean()
    return 100.0 - (100.0 / (1.0 + avg_gain / avg_loss))

In [23]:
# Processing functions (XGBoost model that predicts the stock trend).
def create_labels(df, horizon=30, threshold=0.08):
    """Return a copy of ``df`` with a forward return and a binary label added.

    ``future_price`` is the relative change of ``close`` ``horizon`` rows
    ahead. ``label_price`` is 1 where that change is at least ``threshold``,
    else 0. The last ``horizon`` rows have no future close: their
    ``future_price`` is NaN and their label is therefore 0.
    """
    forward_return = df['close'].shift(-horizon).div(df['close']).sub(1)

    labelled = df.copy()
    labelled['future_price'] = forward_return
    labelled['label_price'] = forward_return.ge(threshold).astype(int)
    return labelled

In [9]:
def feature_adx_roc(df):
    """Return a copy of ``df`` with ``adx`` and ``roc`` columns added.

    ``adx`` is the ADX indicator over a 14-row window computed from the
    ``high``, ``low`` and ``close`` columns; ``roc`` is the rate of change of
    ``close`` over a 10-row window. The input frame is not modified.
    """
    adx_values = ADXIndicator(
        high=df['high'], low=df['low'], close=df['close'], window=14
    ).adx()
    roc_values = ROCIndicator(close=df['close'], window=10).roc()

    enriched = df.copy()
    enriched['adx'] = adx_values
    enriched['roc'] = roc_values
    return enriched

In [13]:
def scale_features(df_temp, feature):
    """Standardize the columns listed in ``feature`` and return ``(frame, scaler)``.

    A new StandardScaler is fitted on the frame passed in, so the scaling
    statistics come from that frame alone. The input frame is not modified;
    the returned frame is a copy with the selected columns replaced by their
    standardized values.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(df_temp[feature])

    result = df_temp.copy()
    result[feature] = standardized
    return result, scaler

In [None]:
def create_feature_data(df, horizon=30):
    """Add the technical-indicator feature columns to ``df`` and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``close``, ``high`` and ``low`` columns.
    horizon : int, default 30
        Number of rows by which the moving-average features are lagged and by
        which ``close`` is shifted forward at the end.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. NOTE: ``df`` is modified in place.
        Besides the new feature columns, its ``close`` column is overwritten
        with the close ``horizon`` rows ahead, so the last ``horizon`` rows of
        ``close`` become NaN.
    """
    close = df['close']

    # Moving averages, lagged by `horizon` rows.
    # ewm(9) passes 9 as `com` (centre of mass), not as `span`.
    df['ema_9'] = close.ewm(9).mean().shift(horizon)
    for window in (5, 10, 15, 30):
        df[f'sma_{window}'] = close.rolling(window).mean().shift(horizon)

    # RSI; undefined values inside the returned series are replaced with 0.
    df['rsi'] = relative_strength_idx(df).fillna(0)

    # MACD line (EMA12 - EMA26) and its 9-row EMA signal line.
    ema_12 = close.ewm(span=12, min_periods=12).mean()
    ema_26 = close.ewm(span=26, min_periods=26).mean()
    df['macd'] = ema_12 - ema_26
    df['macd_signal'] = df['macd'].ewm(span=9, min_periods=9).mean()

    # ADX / ROC are computed on a copy, standardized, then copied back.
    # The fitted scaler is not needed here.
    df_scaled, _ = scale_features(feature_adx_roc(df), ['adx', 'roc'])
    df['adx'] = df_scaled['adx']
    df['roc'] = df_scaled['roc']

    # Replace close with the close `horizon` rows ahead (see NOTE above).
    df['close'] = close.shift(-horizon)
    return df

In [45]:
import os
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import STL

# --- Build the inference feature matrix, one block of rows per ticker ---------

DATA_DIR = "data_vnstock"
LABEL_HORIZON = 30   # must equal the default horizon of create_labels / create_feature_data
WARMUP_ROWS = 62     # leading rows dropped because of the moving averages and the MACD line
WINDOW = 30          # number of most recent usable rows scored per ticker
STL_PERIOD = 50      # period passed to the STL decomposition

# NOTE(review): a ticker's code is its position in this list and os.listdir()
# returns entries in arbitrary order. If the model was trained with a specific
# ticker -> id mapping, that mapping should be loaded instead - confirm.
all_tickers = [f.replace('.csv', '') for f in os.listdir(DATA_DIR) if f.endswith(".csv")]
X_test_all = []
ticker_tests = []
features = [
    "trend",
    "volume", "ema_9", "sma_5", "sma_10", "rsi", "macd", "macd_signal",
    "ticker_encoded", "adx", "roc"
]
drop_cols = ["time", "high", "low", "close", "open"]

X, y = [], []
for t, ticker in enumerate(all_tickers):
    df = pd.read_csv(os.path.join(DATA_DIR, ticker + '.csv'))
    if df.empty:
        continue

    df['time'] = pd.to_datetime(df['time'])

    # A close of exactly 0 is replaced by the mean of open, high and low.
    typical_price = (df['open'] + df['high'] + df['low']) / 3
    df['close'] = df['close'].where(df['close'] != 0.0, typical_price)

    # Skip low-quality tickers.
    if not filter_data(df):
        continue

    # Numeric ticker code.
    df['ticker_encoded'] = t

    # STL decomposition of the close series; the "trend" feature is 1 when the
    # STL trend component rises from this row to the next one, else 0.
    stl_result = STL(df.set_index('time')['close'], period=STL_PERIOD).fit()
    df['trend'] = stl_result.trend.values
    df['trend'] = (df['trend'].diff().shift(-1) > 0).astype(int)

    # Labels and features.
    df = create_labels(df)
    df = create_feature_data(df)
    df = df.iloc[WARMUP_ROWS:]   # warm-up rows of the moving averages / MACD
    # Drop the last LABEL_HORIZON rows: they have no label.
    # NOTE(review): this also means the rows scored below end LABEL_HORIZON rows
    # before the newest data - confirm that this is intended for inference.
    df = df[:-LABEL_HORIZON]
    df = df.drop(drop_cols, axis=1)
    if df.empty:
        continue

    last_window = df.iloc[-WINDOW:]
    X.append(last_window[features].values)
    # One ticker code per row actually kept; fewer than WINDOW rows can remain
    # for short histories, and the two arrays must stay aligned.
    ticker_tests.append(np.full(len(last_window), t))

if not X:
    raise ValueError("No ticker in %r passed the quality filter" % DATA_DIR)

X = np.vstack(X)
ticker_tests = np.hstack(ticker_tests)


In [47]:
# Sanity check: there should be exactly one ticker code per predicted probability.
# `probas` must already exist in the kernel; no cell visible above this one
# assigns it, so on a fresh Restart & Run All this cell raises NameError.
print("Len ticker_tests:", len(ticker_tests))
print("Len probas:", len(probas))

Len ticker_tests: 9570
Len probas: 9570


In [49]:
# Distinct predicted probabilities (np.unique returns them sorted ascending).
print(np.unique(probas))  

[0.4097739  0.4098556  0.4112857  ... 0.5887341  0.58913827 0.5892089 ]


In [28]:
# Show the stacked feature matrix; columns follow the order of the `features` list.
print(X)

[[ 1.00000000e+00  1.46410000e+05  5.24122995e+00 ...  1.00000000e+00
   4.66467642e-02  5.58990531e-02]
 [ 1.00000000e+00  2.85000000e+05  5.23081735e+00 ...  1.00000000e+00
  -9.08052216e-02  8.22595673e-01]
 [ 1.00000000e+00  4.62000000e+05  5.20919436e+00 ...  1.00000000e+00
  -1.15532463e-01  1.02152462e+00]
 ...
 [ 0.00000000e+00  6.67000000e+06  1.52484986e+01 ...  1.67300000e+03
  -7.94883836e-01  4.92901187e-01]
 [ 0.00000000e+00  4.50710000e+06  1.51586487e+01 ...  1.67300000e+03
  -7.71981096e-01  3.92133170e-01]
 [ 0.00000000e+00  4.75110000e+06  1.49977838e+01 ...  1.67300000e+03
  -7.80187017e-01  3.64105790e-01]]


In [None]:
# Score every row of the feature matrix. This is the cell that defines `probas`
# and `df_pred`, which other cells of the notebook read.
# Each sample is flattened to one feature vector (a no-op when X is already 2-D).
X_input = np.asarray(X).reshape(len(X), -1)  # for an XGBoost model

# Probability of the positive class
# (or model.predict(X_input)[:, 0] if the model is an LSTM).
probas = model.predict_proba(X_input)[:, 1]

df_pred = pd.DataFrame({
    'ticker_encoded': ticker_tests,
    'proba_increase': probas
})

# Top 50 rows by probability. Each ticker contributes several rows, so the same
# ticker can appear more than once.
top_50 = df_pred.sort_values(by='proba_increase', ascending=False).head(50)
top_50['ticker'] = top_50['ticker_encoded'].map(lambda x: all_tickers[x])

print(top_50[['ticker', 'proba_increase']])

     ticker  proba_increase
73      AAS        0.589209
72      AAS        0.589209
71      AAS        0.589209
79      AAS        0.589209
4300    ITC        0.589209
4306    ITC        0.589209
4299    ITC        0.589209
4305    ITC        0.589209
4304    ITC        0.589209
2470    DST        0.589209
2467    DST        0.589209
2466    DST        0.589209
2471    DST        0.589209
2469    DST        0.589209
4298    ITC        0.589209
4297    ITC        0.589209
2468    DST        0.589209
4303    ITC        0.589209
915     BMS        0.589209
906     BMS        0.589209
5743    NVL        0.589209
4517    KHP        0.589209
4518    KHP        0.589209
917     BMS        0.589209
4302    ITC        0.589209
475     API        0.589209
4301    ITC        0.589209
473     API        0.589209
474     API        0.589209
5473    NDN        0.589209
100     AAV        0.589209
5475    NDN        0.589209
5472    NDN        0.589209
5474    NDN        0.589209
98      AAV        0

In [54]:
# Step 1: one row per scored sample (ticker code + predicted probability).
df_all = pd.DataFrame({'ticker_encoded': ticker_tests, 'proba': probas})

# Step 2: average the probabilities of each ticker.
df_mean = df_all.groupby('ticker_encoded', as_index=False)['proba'].mean()
df_mean = df_mean.rename(columns={'proba': 'proba_increase'})

# Step 3: map the numeric code back to the ticker symbol.
df_mean['ticker'] = [all_tickers[code] for code in df_mean['ticker_encoded']]

# Step 4: keep the 50 tickers with the highest mean probability.
top_50 = df_mean.sort_values(by='proba_increase', ascending=False).iloc[:50]

# Step 5: print the result.
print(top_50[['ticker', 'proba_increase']])

    ticker  proba_increase
82     DST        0.566855
145    JVC        0.565584
141    IMP        0.563637
159    LDG        0.562439
143    ITC        0.562424
101    G36        0.561272
151    KLB        0.561110
142    IPA        0.559096
62     DC4        0.558393
285    VE9        0.557607
272    TV2        0.557157
63     DCL        0.557096
86     DXG        0.556678
89     EIB        0.555768
304    VNE        0.554632
3      AAV        0.554539
157    LAS        0.553739
150    KHP        0.553445
250    TCB        0.553266
226    REE        0.552634
248    SVN        0.551876
79     DRH        0.551510
257    TDH        0.549346
166    MBB        0.549137
15     API        0.546605
188    NTL        0.546011
209    POW        0.545863
185    NLG        0.545757
66     DDV        0.545596
24     BFC        0.545205
191    NVL        0.545162
41     CEO        0.545065
252    TCH        0.545026
260    TLD        0.544611
48     CRE        0.544137
233    SCR        0.544038
2

In [55]:
# Export the ranking. With orient='records' and lines=True pandas writes one JSON
# object per line (JSON Lines), not a single JSON array, so readers must parse it
# line by line (e.g. pd.read_json(path, lines=True)).
top_50[['ticker', 'proba_increase']].to_json("top50_tickers.json", orient='records', lines=True)


In [39]:
# Top 50 rows of `df_pred` by predicted probability.
# Requires `df_pred` to already exist in the kernel.
# NOTE(review): this cell's execution count is lower than that of the cells above
# it, so the saved output may come from an earlier, different `df_pred` - re-run
# to confirm.
top_stocks = df_pred.sort_values(by='proba_increase', ascending=False).head(50)
print("Top cổ phiếu có xác suất tăng trưởng cao nhất:")
print(top_stocks)


Top cổ phiếu có xác suất tăng trưởng cao nhất:
        ticker_encoded  proba_increase
439756            1063        0.589209
439757            1063        0.589209
222302             528        0.589209
222303             528        0.589209
615296            1554        0.589209
222528             528        0.589209
157468             390        0.589209
157474             390        0.589209
222295             528        0.589209
337307             793        0.589209
615297            1554        0.589209
615292            1554        0.589209
615293            1554        0.589209
337308             793        0.589209
162755             401        0.589209
439512            1063        0.589209
439509            1063        0.589209
439510            1063        0.589209
157146             390        0.589209
615298            1554        0.589209
615299            1554        0.589209
615302            1554        0.589209
162756             401        0.589209
650670           