# 0. 准备环境

## 0.1 import libs

In [1]:
# fetch and install tushare package
pip install tushare



In [0]:
import tushare as ts

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib.collections import LineCollection
from sklearn import cluster, covariance, manifold
from IPython.display import clear_output

## 0.2 用于plot的中文显示


In [0]:
import matplotlib as mpl
import matplotlib.font_manager as fm

# Build a FontProperties object from the Noto Sans CJK SC font file so that
# Chinese stock names can be rendered in plots (it is passed to plt.text via
# `fontproperties=` in the visualization cell).
# NOTE(review): the file at this path is put in place by the `!cp` cell
# further down -- confirm that cell has been run before anything is drawn.
path = '/usr/share/fonts/truetype/NotoSansCJKsc-Regular.otf'
fontprop = fm.FontProperties(fname=path)

## 0.3 准备数据和字体文件
（可以直接上传文件，也可以使用 Google Drive 中的文件）

In [0]:
# Extract HSData.zip into the current working directory. Later cells read
# the per-stock CSV files from ./HSData/data/ and copy a font file from
# HSData/fonts/.
!unzip HSData.zip 


In [0]:
# Copy the bundled Noto Sans CJK SC font from the extracted archive to the
# system font path that the font-setup cell points `path` at.
!cp /content/HSData/fonts/NotoSansCJKsc-Regular.otf /usr/share/fonts/truetype/NotoSansCJKsc-Regular.otf

# 1. 获取数据
1.1 从 Tushare 获取股票代号（symbol）和股票名称（name），<br>也可以从预存的文件中获取

In [0]:
# Query the Tushare Pro API for the codes and names of all listed stocks.
import os

# The API token is read from the TUSHARE_TOKEN environment variable when it
# is set, so the credential does not have to live in the notebook. The
# literal below is kept only as a backward-compatible fallback.
# NOTE(review): a token committed to source should be treated as leaked --
# rotate it on the Tushare website and then drop the fallback.
tushare_token = os.environ.get(
    'TUSHARE_TOKEN',
    '676be26deafece306aa1bfa5c77b9a313d4fb6a75b7953b861b6d95f')
pro = ts.pro_api(tushare_token)
data = pro.query('stock_basic', exchange='', list_status='L', fields='ts_code,symbol,name')

# Keep the ts_code and name columns as separate series; both come from the
# same frame, so they share its row order.
hscode = data.ts_code
tu_namelist = data.name

1.2 **从文件中获取股票价格数据**

In [18]:
# Load the price history of every stock, one CSV file per code, into a list
# that keeps the same order as hscode.
quotes = []

for code in hscode:
    csv_path = './HSData/data/{}.csv'.format(code)
    frame = pd.read_csv(csv_path)
    quotes.append(frame)
    # Replace the previous progress line instead of piling up output.
    clear_output(wait=True)
    print('processing {}'.format(code))

processing 688399.SH


# 2. 处理数据以符合分析函数的要求
- 某些股票的数据可能为空，需要剔除
- 某些股票的数据行数可能明显少于平均水平，也需要剔除；剔除之前必须先确定剔除的条件，例如：行数少于 600
- 最后，要确保每只股票的数据行数一致，否则后续分析会出错



In [0]:
# Keep only the stocks whose price table has at least 600 rows, collecting
# the table, the code and the name at the same position in three parallel
# lists.
ref_quotes, ref_symbols, ref_names = [], [], []

for idx, frame in enumerate(quotes):
    if len(frame) < 600:
        # Too little history for the analysis; skip this stock.
        continue
    ref_quotes.append(frame)
    ref_symbols.append(hscode[idx])
    ref_names.append(tu_namelist[idx])

In [20]:
# Sanity check: the three filtered lists are expected to have equal lengths.
print(*(len(kept) for kept in (ref_quotes, ref_names, ref_symbols)))

3062 3062 3062


In [21]:
# Hand shallow copies of the filtered lists to the variable names used by
# the analysis, then print their lengths as a check.
quotes = list(ref_quotes)
symbols = list(ref_symbols)
names = list(ref_names)
print(*map(len, (quotes, names, symbols)))

3062 3062 3062


In [0]:
# Optional speed-up: analyse only the first MAX_STOCKS stocks. This cell can
# be skipped when the full run is fast enough.
MAX_STOCKS = 100

# Truncate the working list of price tables together with all three ref_*
# lists. Leaving ref_quotes at full length while ref_symbols and ref_names
# are cut would put the parallel lists out of step as soon as the copy cell
# is run again.
for stock_list in (quotes, ref_quotes, ref_symbols, ref_names):
    del stock_list[MAX_STOCKS:]

In [22]:
# Report every new running minimum of the row counts (value and position),
# then cut each price table down to its first 600 rows so that all stocks
# contribute the same number of rows.
shortlen = len(quotes[0])

for pos, frame in enumerate(quotes):
    row_count = len(frame)
    if row_count < shortlen:
        shortlen = row_count
        print(shortlen, pos)

# Drop everything from row position 600 onwards; the list is updated in
# place so its identity does not change.
quotes[:] = [frame.drop(frame.index[600:]) for frame in quotes]

726 1
657 2
614 4
608 49
604 54
601 413
600 604


In [0]:
# Turn the codes and names into NumPy arrays; `names` is later indexed with
# a boolean mask, which a plain list does not support.
symbols, names = (np.array(items).T for items in (ref_symbols, ref_names))


def _stack_column(column):
    """Stack one column of every price table into an array, one row per stock."""
    return np.vstack([frame[column] for frame in quotes])


close_prices = _stack_column('close')
open_prices = _stack_column('open')

# The daily variations of the quotes are what carry most information
variation = close_prices - open_prices

# 3. 用 scikit-learn 的相关模型进行分析，并用 matplotlib 画出图表

In [0]:

# #############################################################################
# Learn a graphical structure from the correlations

# Standardize the time series first (one column per stock, each divided by
# its own standard deviation): using correlations rather than covariance
# is more efficient for structure recovery.
X = variation.copy().T
X /= np.std(X, axis=0)

# Sparse inverse-covariance estimator, fitted on the standardized series.
edge_model = covariance.GraphicalLassoCV()
edge_model.fit(X)

# #############################################################################
# Cluster using affinity propagation

# The estimated covariance matrix serves as the similarity matrix; only the
# per-stock cluster labels of the result are used.
_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = np.max(labels)

# List the members of each cluster, numbering the clusters from 1.
for cluster_id in range(n_labels + 1):
    members = names[labels == cluster_id]
    print('Cluster %i: %s' % (cluster_id + 1, ', '.join(members)))

# #############################################################################
# Find a low-dimension embedding for visualization: compute a position for
# each node (stock) on a 2D plane.

# A dense eigen_solver is used for reproducibility (arpack is initiated with
# random vectors that we don't control).
lle_options = dict(n_components=2, eigen_solver='dense', n_neighbors=6)
node_position_model = manifold.LocallyLinearEmbedding(**lle_options)

# X.T has one row per stock; transposing the result gives one row of x
# coordinates and one row of y coordinates.
node_coordinates = node_position_model.fit_transform(X.T)
embedding = node_coordinates.T

# #############################################################################
# Visualization

# One large white figure whose single axes fills the whole canvas, with the
# axis decorations switched off.
plt.figure(1, facecolor='w', figsize=(40, 32))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')

# Display a graph of the partial correlations: rescale the precision matrix
# by d along columns and then along rows, and keep only the entries above
# the diagonal whose magnitude exceeds 0.02 as edges.
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d.reshape(-1, 1)
upper_triangle = np.triu(partial_correlations, k=1)
non_zero = np.abs(upper_triangle) > 0.02

# Plot the nodes using the coordinates of our embedding; marker area scales
# with d squared and the colour encodes the cluster label.
plt.scatter(embedding[0], embedding[1],
            s=100 * d ** 2,
            c=labels,
            cmap=plt.cm.nipy_spectral)

# Plot the edges: one two-point segment per retained matrix entry.
start_idx, end_idx = np.nonzero(non_zero)
segments = []
for start, stop in zip(start_idx, end_idx):
    segments.append([embedding[:, start], embedding[:, stop]])

# Edge colour and width both follow the magnitude of the partial correlation.
values = np.abs(partial_correlations[non_zero])
edge_norm = plt.Normalize(0, .7 * values.max())
lc = LineCollection(segments, zorder=0, cmap=plt.cm.hot_r, norm=edge_norm)
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)



# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels: each label is
# pushed away from the node that is closest along the other axis.

# Denominator for mapping a cluster label onto the colormap range. When
# only one cluster was found n_labels is 0, and dividing by it would give a
# NaN colour lookup, so the scale is clamped to at least 1.
color_scale = float(max(n_labels, 1))

for index, (name, label, (x, y)) in enumerate(
        zip(names, labels, embedding.T)):

    # Offsets from this node to every node. The node's own entry is set to
    # 1 so that its zero offset is never picked as the nearest one.
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    # Horizontal offset to the node nearest in y, and vertical offset to
    # the node nearest in x.
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        # That neighbour lies to the left: grow the text to the right.
        horizontalalignment = 'left'
        x = x + .002
    else:
        horizontalalignment = 'right'
        x = x - .002
    if this_dy > 0:
        # That neighbour lies below: grow the text upwards.
        verticalalignment = 'bottom'
        y = y + .002
    else:
        verticalalignment = 'top'
        y = y - .002
    plt.text(x, y, name, size='60',
             horizontalalignment=horizontalalignment,
             verticalalignment=verticalalignment,
             bbox=dict(facecolor='w',
                       edgecolor=plt.cm.nipy_spectral(label / color_scale),
                       alpha=.6),
             fontproperties=fontprop)

# Pad the axis limits by a fraction of the coordinate range so that nodes
# and labels near the border stay inside the figure. np.ptp() is used
# because the ndarray.ptp() method no longer exists in NumPy 2.0.
x_span = np.ptp(embedding[0])
y_span = np.ptp(embedding[1])

plt.xlim(embedding[0].min() - .15 * x_span,
         embedding[0].max() + .10 * x_span)
plt.ylim(embedding[1].min() - .03 * y_span,
         embedding[1].max() + .03 * y_span)

plt.show()

  * coefs)
  * coefs)
