# Init

In [50]:
# List of tickers for Korean stocks
tickers = {
    "삼성전자": "005930", "SK": "034730", "한화": "000880",
    "두산": "000150", "기아": "000270", "현대차": "005380",
    "LG": "003550", "NAVER": "035420", "카카오": "035720", "롯데지주": "004990"
}

# Date range for the stock data
start_date = "20200101"
end_date = "20250101"

In [51]:
# Target ticker for analysis
TARGET_TICKER = "삼성전자"
ticker_code = tickers[TARGET_TICKER]

## Analyzing the target stock against all other stocks in the list

In [52]:
import pandas as pd
from ta import add_all_ta_features
from pykrx import stock

print()
print(f"--- Loading stock data for ticker: {TARGET_TICKER} ({ticker_code}) ---")
df_stock = pd.read_parquet(f"{ticker_code}.parquet")

# Load OHLCV data for the specified ticker and date range
print(f"--- Loading OHLCV data from KRX for ticker: {TARGET_TICKER} ({ticker_code}) ---")
df_ohlcv = stock.get_market_ohlcv_by_date(start_date, end_date, ticker_code)
df_ohlcv.reset_index(inplace=True)
df_ohlcv.rename(columns={'날짜':'date', '시가':'open', '고가':'high', '저가':'low', '종가':'close', '거래량':'volume'}, inplace=True)

print("--- Adding technical indicators using 'ta' library ---")
# Add all technical indicators using the 'ta' library
df_ohlcv = add_all_ta_features(
    df_ohlcv, open="open", high="high", low="low", close="close", volume="volume", fillna=True
)

# Remove unnecessary columns and handle missing values
df_ohlcv.drop(columns=['open', 'high', 'low', 'volume', 'close', '등락률'], inplace=True)

# Merge the existing stock data with OHLCV data on 'date'
df_stock = pd.merge(df_stock, df_ohlcv, on='date', how='left')

print(f"\n--- Data shape after adding indicators: {df_stock.shape}")

df_stock


--- Loading stock data for ticker: 삼성전자 (005930) ---
--- Loading OHLCV data from KRX for ticker: 삼성전자 (005930) ---
--- Adding technical indicators using 'ta' library ---

--- Data shape after adding indicators: (1227, 93)


  self._psar.iloc[i] = self._psar.iloc[i - 1] + (


Unnamed: 0,date,close,kospi_close,target,news,sentiment,ma5,volume_adi,volume_obv,volume_cmf,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
0,2020-01-08,56800,2151.31,2,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...,0.496105,55760.0,-4.413388e+06,72205383,-0.061123,...,0.341003,0.105714,0.235289,4.516718,0.621321,3.895397,56060.028457,1.792115,1.776246,2.898551
1,2020-01-09,58600,2186.45,2,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...,0.211507,56440.0,1.968919e+07,96307962,0.204440,...,0.746674,0.233906,0.512767,9.498463,2.396749,7.101713,56070.600659,3.169014,3.119837,6.159420
2,2020-01-10,59500,2206.39,1,"[삼성전자 액면분할 전 2000억 '몰빵' 슈퍼개미 근황은… - 한국경제, 스마트홈...",0.222219,57240.0,3.111788e+07,112308132,0.277076,...,1.180111,0.423147,0.756964,8.895797,3.696559,5.199238,56084.874954,1.535836,1.524162,7.789855
3,2020-01-13,60000,2229.26,1,"[삼성전자, 미국 5G·4G LTE 망설계·최적화 전문기업 텔레월드 솔루션즈 인수 ...",0.339890,58140.0,4.247702e+07,123667271,0.343478,...,1.572078,0.652933,0.919145,5.916898,4.140627,1.776271,56101.171000,0.840336,0.836825,8.695652
4,2020-01-14,60000,2238.88,0,"[삼성전자, 미국 5G·4G LTE 망설계·최적화 전문기업 텔레월드 솔루션즈 인수 ...",0.282625,58980.0,2.864460e+07,140573566,0.203769,...,1.856986,0.893744,0.963242,6.426200,4.597742,1.828459,56117.399216,0.000000,0.000000,8.695652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1222,2024-12-23,53500,2442.01,2,"[삼성전자, 미국 반도체 보조금 7조원 받는다 - 블로터, SK하이닉스, 내년엔 삼...",0.352443,53740.0,-1.718161e+09,338601556,-0.070158,...,-1.279904,-1.302150,0.022245,-10.259350,-8.361296,-1.898055,69004.659057,0.943396,0.938974,-3.079710
1223,2024-12-24,54400,2440.52,0,"[삼성전자, CES 2025서 ‘가정용 히트펌프 EHS’ 美 시장에 첫 선봬 - S...",0.205407,53780.0,-1.709112e+09,350236233,-0.058435,...,-1.155257,-1.272771,0.117514,-13.008088,-9.290654,-3.717434,68943.869634,1.682243,1.668250,-1.449275
1224,2024-12-26,53600,2429.67,1,"[삼성전자 360조 투자 '용인 반도체 국가산단' 2026년 착공 - 중부일보, 삼...",0.231703,53520.0,-1.717717e+09,339719158,-0.045524,...,-1.162075,-1.250632,0.088557,-15.725975,-10.577718,-5.148257,68880.003371,-1.470588,-1.481509,-2.898551
1225,2024-12-27,53700,2404.77,1,"[반도체 진출 50년만에···나홀로 뒷걸음질한 삼성전자 - 매일경제, 삼성전자 반도...",0.272901,53640.0,-1.716522e+09,350466354,-0.018542,...,-1.139708,-1.228447,0.088739,-17.824807,-12.027136,-5.797671,68816.819174,0.186567,0.186393,-2.717391


In [53]:
# Split the dataset into training and testing sets (80% train, 20% test)
import numpy as np
from sklearn.model_selection import train_test_split
print("\nSplitting the dataset into training and testing sets (80% train, 20% test)")

features = df_stock.select_dtypes(include=np.number)
X = features.drop(columns=['target', 'close'])
y = df_stock['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)
print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")


Splitting the dataset into training and testing sets (80% train, 20% test)
Training data shape: (981, 89), Testing data shape: (246, 89)


In [54]:
import numpy as np

# Data augmentation using noise
augmentation = 1

def augment_data_noise(X_input, y_input, n, noise_level=0.01):
    if n <= 1:
        return X_input, y_input
    print(f"\n--- Data augmentation: {n} times (Noise level: {noise_level}) ---")
        
    augmented_X = [X_input]
    augmented_y = [y_input]
    
    for _ in range(n - 1):
        noise = np.random.normal(loc=0.0, scale=noise_level, size=X_input.shape)
        augmented_X.append(X_input + noise)
        augmented_y.append(y_input)
        
    X_final = np.concatenate(augmented_X, axis=0)
    y_final = np.concatenate(augmented_y, axis=0)
    
    return X_final, y_final

print(f"\n--- Data shape before augmentation: X_train: {X_train.shape}, y_train: {y_train.shape}")
if augmentation > 1:
    X_train, y_train = augment_data_noise(X_train, y_train, augmentation)
    
    print(f"--- Data shape after augmentation: X_train: {X_train.shape}, y_train: {y_train.shape}")
else:
    print("\n--- Not using data augmentation ---")


--- Data shape before augmentation: X_train: (981, 89), y_train: (981,)

--- Not using data augmentation ---


In [55]:
import xgboost as xgb

print("\n--- Training XGBoost model ---")
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False
)
model.fit(X_train, y_train)
print("\n--- Model training completed ---")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Training XGBoost model ---

--- Model training completed ---


## Evaluation

In [56]:
from sklearn.metrics import accuracy_score, classification_report
print("\n--- Model evaluation ---")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\n--- Classification report ---")
print(classification_report(y_test, y_pred, target_names=['하락', '보합', '상승']))


--- Model evaluation ---
Accuracy: 0.4187

--- Classification report ---
              precision    recall  f1-score   support

          하락       0.36      0.17      0.23        76
          보합       0.49      0.70      0.58       112
          상승       0.23      0.21      0.22        58

    accuracy                           0.42       246
   macro avg       0.36      0.36      0.34       246
weighted avg       0.39      0.42      0.39       246



In [57]:
# --- Feature importance extraction and sorting ---
feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})

In [58]:
# Sort features by importance
sorted_features = feature_importances.sort_values(by='importance', ascending=False)

# https://github.com/bukosabino/ta/issues/181
# data leakage issue
# remove `volume_vpt, trend_vortex_ind_pos, trend_vortex_ind_neg, trend_vortex_ind_diff, trend_trix, trend_dpo, trend_kst, trend_kst_sig, trend_kst_diff, trend_visual_ichimoku_a, trend_visual_ichimoku_b, others_dr`
# + 'trend_ichimoku_base, trend_ichimoku_b' from the list
sorted_features = sorted_features[~sorted_features['feature'].isin([
    'volume_vpt', 'trend_vortex_ind_pos', 'trend_vortex_ind_neg',
    'trend_vortex_ind_diff', 'trend_trix', 'trend_dpo', 'trend_kst',
    'trend_kst_sig', 'trend_kst_diff', 'trend_visual_ichimoku_a',
    'trend_visual_ichimoku_b', 'others_dr', 'trend_ichimoku_base',
    "trend_ichimoku_b"
])]

print("\n--- Feature importance (Top 30) ---")
print(sorted_features.head(30))


--- Feature importance (Top 30) ---
                      feature  importance
66    trend_psar_up_indicator    0.047176
27             volatility_dcl    0.032617
28             volatility_dch    0.028358
30             volatility_dcw    0.018240
72               momentum_tsi    0.016693
14             volatility_bbh    0.016465
11                 volume_mfi    0.015758
8               volume_sma_em    0.015217
3                  volume_adi    0.014722
69         momentum_stoch_rsi    0.014620
4                  volume_obv    0.014503
68               momentum_rsi    0.013856
85              momentum_kama    0.013776
24             volatility_kcp    0.013542
7                   volume_em    0.013082
67  trend_psar_down_indicator    0.012996
35          trend_macd_signal    0.012746
36            trend_macd_diff    0.012589
5                  volume_cmf    0.012436
55                  trend_adx    0.012276
0                 kospi_close    0.012259
83        momentum_pvo_signal    0.0121

In [59]:
# --- Select top N features based on importance ---
top_n = 10
top_features = sorted_features['feature'].head(top_n).tolist()

print(f"\n--- Selected top {top_n} features ---")
for feature in top_features:
    print(feature)


--- Selected top 10 features ---
trend_psar_up_indicator
volatility_dcl
volatility_dch
volatility_dcw
momentum_tsi
volatility_bbh
volume_mfi
volume_sma_em
volume_adi
momentum_stoch_rsi


In [60]:
# view columns with top features
print("\n--- Data with top features ---")
df_top_features = df_stock[top_features + ['target' , 'momentum_stoch']]
df_top_features


--- Data with top features ---


Unnamed: 0,trend_psar_up_indicator,volatility_dcl,volatility_dch,volatility_dcw,momentum_tsi,volatility_bbh,volume_mfi,volume_sma_em,volume_adi,momentum_stoch_rsi,target,momentum_stoch
0,0.0,54600.0,57400.0,5.021521,100.000000,56867.068200,82.892383,1.943463e+06,-4.413388e+06,0.000000,2,78.571429
1,0.0,54600.0,58600.0,7.113219,100.000000,58579.014945,87.975195,2.899025e+06,1.968919e+07,0.000000,2,100.000000
2,0.0,54600.0,59700.0,8.994709,100.000000,59853.229637,89.983561,3.874172e+06,3.111788e+07,0.000000,1,96.078431
3,0.0,54600.0,60000.0,9.455023,100.000000,60781.883463,91.053890,3.943251e+06,4.247702e+07,0.000000,1,100.000000
4,0.0,54600.0,61000.0,11.143355,100.000000,61340.022753,92.292080,4.182320e+06,2.864460e+07,0.000000,0,84.375000
...,...,...,...,...,...,...,...,...,...,...,...,...
1222,0.0,51900.0,58900.0,12.842858,-10.618431,57190.125695,45.292302,3.380816e+05,-1.718161e+09,0.336391,2,34.042553
1223,0.0,51900.0,57800.0,10.863561,-9.719687,56354.407004,52.865942,9.211053e+05,-1.709112e+09,0.584670,0,53.191489
1224,0.0,51900.0,56600.0,8.675588,-9.860530,56023.107140,45.970402,5.213910e+05,-1.717717e+09,0.310771,1,36.170213
1225,0.0,51900.0,56600.0,8.690025,-9.840733,55839.166469,48.393501,4.149109e+05,-1.716522e+09,0.353258,1,38.297872
