## KNN

In [None]:
import FinanceDataReader as fdr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import json

In [None]:
# Load the lookup table used below to resolve 'KODEX 200' into the symbol
# passed to FinanceDataReader. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('data/stock2code.json', 'r') as f:
    stock2code = json.load(f)

# Price history for KODEX 200 between 2015-01-01 and 2020-01-01.
df = fdr.DataReader(stock2code['KODEX 200'], '2015-01-01', '2020-01-01')
df.head()

In [None]:
# Keep only the four price columns used as model features.
price_columns = ['Open', 'High', 'Low', 'Close']
ohlc = df.loc[:, price_columns]
ohlc.tail()

### 데이터 가공

In [None]:
# Feature matrix: the OHLC frame is used as-is (this is an alias, not a copy).
X_data = ohlc
X_data

In [None]:
# Label each row by the NEXT row's session: 1 if that session's close is above
# its open, otherwise -1.
next_close = ohlc['Close'].shift(-1)
next_open = ohlc['Open'].shift(-1)
Y = np.where(next_close > next_open, 1, -1)
Y

In [None]:
# The label of the final row depends on the next session, which does not
# exist: shift(-1) yields NaN there and the comparison silently produces -1.
# Exclude that row from both the features and the labels.
X_labeled = X_data.iloc[:-1]
y_labeled = Y[:-1]

# Split chronologically (no shuffling): the first 70% of sessions train the
# model and the last 30% test it, so no future session leaks into training
# and the split is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.3, shuffle=False
)

In [None]:
# Accuracy on the train and test splits for every neighbour count k in 2..19.
tr_accuracy, te_accuracy = [], []

for k in range(2, 20):
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    pred = model.predict(X_test)

    # Test accuracy is the fraction of matching predictions; train accuracy
    # comes from the estimator's own score().
    te_accuracy.append(np.mean(pred == y_test))
    tr_accuracy.append(model.score(X_train, y_train))

In [None]:
# Plot train vs. test accuracy against k (same 2..19 range as the sweep).
plt.figure(figsize=(14, 5))
for curve_name, curve in (('train', tr_accuracy), ('test', te_accuracy)):
    plt.plot(range(2, 20), curve, label=curve_name)
plt.legend()
plt.show()

In [None]:
# Refit with k=10 and report accuracy on both splits (train first, then test).
final_model = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

tr_acc = accuracy_score(y_true=y_train, y_pred=final_model.predict(X_train))
te_acc = accuracy_score(y_true=y_test, y_pred=final_model.predict(X_test))

print(tr_acc, te_acc, sep='\n')

## K-Means

In [None]:
import FinanceDataReader as fdr
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fetch the S&P500 listing; later cells read its `Symbol` column.
data = fdr.StockListing('S&P500')
data.tail()

In [None]:
# Download the closing prices of the first 100 listed symbols and assemble
# them into one frame with a column per ticker.
ls = []
for ticker in data.Symbol[:100]:
    try:
        close = fdr.DataReader(ticker, '2016-01-01', '2020-12-31')['Close']
    except Exception as exc:
        # Best effort: skip tickers that cannot be downloaded, but say so
        # instead of hiding the failure (and let KeyboardInterrupt through).
        print(f'skipping {ticker}: {exc!r}')
        continue
    # ['Close'] is a Series, so it is named with rename(); assigning to
    # `.columns` on a Series does not relabel it.
    ls.append(close.rename(ticker))

# Concatenate once, after the loop. Every column already carries its own
# ticker, so the labels stay correct even when some downloads were skipped.
total_df = pd.concat(ls, axis=1)
total_df.columns.name = 'Symbol'
total_df.sort_index(inplace=True)
total_df

In [None]:
# Check missing values: show the tickers with more than 10 missing prices.
na_per_ticker = total_df.isna().sum()
total_df.T.loc[na_per_ticker > 10]

In [None]:
# Discard these two tickers entirely. errors='ignore' keeps the cell
# runnable when one of them is not among the downloaded columns.
total_df.drop(['AMCR', 'CARR'], axis=1, inplace=True, errors='ignore')
# Back-fill the remaining gaps of the other tickers. bfill() replaces the
# deprecated fillna(method='bfill') spelling.
total_df.bfill(inplace=True)

In [None]:
# Period-over-period returns; rows containing NaN are dropped, then the frame
# is transposed so that each row is one ticker.
pct_returns = total_df.pct_change()
df = pct_returns.dropna().transpose()
df

In [None]:
# Row labels (tickers) and the raw return matrix as a NumPy array.
tickers = df.index.tolist()
returns = df.to_numpy()

### 피처 스케일링

In [None]:
# Min-max scale every column of the return matrix, then restore the original
# column labels and ticker index on the scaled values.
scaler = MinMaxScaler()
scaled = scaler.fit(df).transform(df)
df_scaled = pd.DataFrame(data=scaled, columns=df.columns)
final = df_scaled.set_axis(df.index, axis=0)
final.tail()

### 클러스터링

In [None]:
# Fit KMeans for 2..10 clusters and record each model's inertia divided by 100
# (cluster cohesion; smaller is better).
err = []
for n in range(2, 11):
    model = KMeans(n_clusters=n).fit(final)
    err.append(model.inertia_ / 100)

In [None]:
# Tabulate the recorded error against the number of clusters.
result = pd.DataFrame(dict(n_clusters=[*range(2, 11)], err=err))
result

In [None]:
# Plot the error against the number of clusters.
plt.figure(figsize=(14, 10))
plt.plot(result['n_clusters'], result['err'])
plt.show()

In [None]:
# Final clustering with 10 clusters; show the cluster id assigned to each row.
model = KMeans(n_clusters=10).fit(final)
model.labels_

In [None]:
# Pair every clustered ticker with its cluster id. The model was fitted on
# `final`, whose rows carry df's index, so labels_ follows df.index order.
labels_by_ticker = pd.Series(model.labels_, index=df.index)

# Take a copy of the matching listing rows so that adding a column does not
# write into a slice of `data`, and attach labels by ticker rather than by
# position so they cannot be misaligned with the listing's row order.
result = data.loc[data.Symbol.isin(labels_by_ticker.index)].copy()
result['label'] = result.Symbol.map(labels_by_ticker)
display(result)

print("\n****각 클러스터별 속한 종목의 수****")
display(result.groupby('label').size())