In [1]:
import os
import scipy
import numpy as np
import pandas as pd
import cvxpy as cp

from matplotlib import pyplot as plt
%matplotlib inline

# 固定随机种子

In [2]:
from numpy.random import default_rng
seed = 1024
np.random.seed(seed)
rng = default_rng(seed = seed)

# 1. 第一问

## 1.1 生成数据

In [3]:
days = 10          # 取过去两周作为总周期
num_industry = 40   # 总行业数
num_stock = 3000    # 总股票数

industries = np.arange(num_industry)
times = np.arange(0, days)

# 假设所有股票的个体回报独立同分布
mu = 0.05
sigma = 10
stock_return = rng.normal(mu, sigma, size = (num_stock, days))

# 假设所有的行业回报独立同分布
mu = 0.05
sigma = 5
industry_return = rng.normal(mu, sigma, size = (num_industry, days))

# 建立每只股票所属的行业全集字典
sectors = dict()
sizes = np.random.randint(low = 1, high = 4, size = num_stock)
for i in range(num_stock):
    sectors[i] = np.random.randint(low = 0, high = num_industry, size = sizes[i])


## 1.2 凸优化求解

In [4]:
betas1 = []
for i in range(num_stock):
    ind_idx = sectors[i] # 获取当前股票所属的行业全集
    num_ind = ind_idx.shape[0]

    beta = cp.Variable(shape = num_ind, name = "beta")                                          # 设置凸优化的变量
    objective1 = cp.Minimize(cp.sum((stock_return[i] - beta@industry_return[ind_idx])**2))   # 设置凸优化的目标
    constraints1 = [cp.sum(beta) == 1, beta >= 0]                                                           # 设置凸优化的约束条件
    problem1 = cp.Problem(objective1, constraints1)

    result1 = problem1.solve()    # 求解
    betas1.append(beta.value)    # 将求得的解存储

## 1.3 结果打印

In [12]:
print("%-9s\t%-20s\t%-40s"%("Stock", "Industries", "Betas"))
for i in range(num_stock):
    ind_idx = sectors[i]
    print("%-9d\t%-20s\t%-40s"%(i, repr(list(ind_idx)), np.array_repr(np.round(betas1[i], decimals=4))))

Stock    	Industries          	Betas                                   
0        	[36, 25]            	array([0.9304, 0.0696])                 
1        	[24, 27]            	array([0.1011, 0.8989])                 
2        	[35]                	array([1.])                             
3        	[0, 36]             	array([0.5915, 0.4085])                 
4        	[31, 27]            	array([1., 0.])                         
5        	[15, 5]             	array([0.1505, 0.8495])                 
6        	[6]                 	array([1.])                             
7        	[17, 20]            	array([ 1., -0.])                       
8        	[6]                 	array([1.])                             
9        	[2, 24, 21]         	array([ 0.6313, -0.    ,  0.3687])      
10       	[26]                	array([1.])                             
11       	[38]                	array([1.])                             
12       	[17, 10]            	array([-0.,  1.])                

# 2. 第二问

## 2.1 生成数据

可以发现并不需要生成新数据，使用第一问的数据即可，只需要假装不知道每个行业的回报就好了

## 2.2 EM算法设置参数

In [6]:
num_iterations = 1000   # 最大迭代次数
delta = 1/2*1e-4        # 设置误差阈值，误差小于delta则认为收敛成功

## 2.3 EM算法

In [7]:
# 初始化
betas2 = []
beta_matrix = np.zeros((num_stock, num_industry))
for i in range(num_stock):
    ind_idx = sectors[i]
    num_ind = ind_idx.shape[0]
    betas2.append(np.ones(num_ind)/num_ind)
    beta_matrix[i, ind_idx] = betas2[i]
error = 1

# EM算法
for iter in range(num_iterations):
    # E-step
    ind_return = cp.Variable(shape = (num_industry, days), name = "industry_return")
    constraints2 = []
    objective2 = cp.Minimize(cp.sum((stock_return - beta_matrix@ind_return)**2))
    problem = cp.Problem(objective2, constraints2)
    result = problem.solve()
    ind_return.value

    # M-step
    betas2 = []
    new_beta_matrix = np.zeros((num_stock, num_industry))
    ind_return = ind_return.value
    for i in range(num_stock):
        ind_idx = sectors[i]
        num_ind = ind_idx.shape[0]
        beta = cp.Variable(shape = num_ind, name = "beta")
        objective2 = cp.Minimize(cp.sum((stock_return[i] - beta@ind_return[ind_idx])**2))
        constraints2 = [cp.sum(beta) == 1, beta >= 0]
        problem2 = cp.Problem(objective2, constraints2)
        result2 = problem2.solve()
        betas2.append(beta.value)
        new_beta_matrix[i, ind_idx] = betas2[i]

    error = np.sqrt(np.sum((new_beta_matrix - beta_matrix)**2))/num_stock
    beta_matrix = new_beta_matrix
    print("%d / %d, error = %f"%(iter+1, num_iterations, error))
    if(error < delta):
        print("收敛成功！")
        break


1 / 1000, error = 0.010027
2 / 1000, error = 0.003312
3 / 1000, error = 0.001744
4 / 1000, error = 0.001313
5 / 1000, error = 0.000961
6 / 1000, error = 0.000693
7 / 1000, error = 0.000522
8 / 1000, error = 0.000394
9 / 1000, error = 0.000307
10 / 1000, error = 0.000261
11 / 1000, error = 0.000244
12 / 1000, error = 0.000253
13 / 1000, error = 0.000277
14 / 1000, error = 0.000286
15 / 1000, error = 0.000258
16 / 1000, error = 0.000211
17 / 1000, error = 0.000179
18 / 1000, error = 0.000151
19 / 1000, error = 0.000156
20 / 1000, error = 0.000175
21 / 1000, error = 0.000173
22 / 1000, error = 0.000159
23 / 1000, error = 0.000126
24 / 1000, error = 0.000100
25 / 1000, error = 0.000088
26 / 1000, error = 0.000080
27 / 1000, error = 0.000074
28 / 1000, error = 0.000070
29 / 1000, error = 0.000067
30 / 1000, error = 0.000063
31 / 1000, error = 0.000060
32 / 1000, error = 0.000057
33 / 1000, error = 0.000054
34 / 1000, error = 0.000052
35 / 1000, error = 0.000051
36 / 1000, error = 0.000049
收

## 2.4 结果打印

In [13]:
print("%-9s\t%-20s\t%-40s"%("Stock", "Industries", "Betas"))
for i in range(num_stock):
    ind_idx = sectors[i]
    print("%-9d\t%-20s\t%-40s"%(i, repr(list(ind_idx)), np.array_repr(np.round(betas2[i], decimals=4))))

Stock    	Industries          	Betas                                   
0        	[36, 25]            	array([ 1., -0.])                       
1        	[24, 27]            	array([ 1., -0.])                       
2        	[35]                	array([1.])                             
3        	[0, 36]             	array([ 1., -0.])                       
4        	[31, 27]            	array([-0.,  1.])                       
5        	[15, 5]             	array([-0.,  1.])                       
6        	[6]                 	array([1.])                             
7        	[17, 20]            	array([-0.,  1.])                       
8        	[6]                 	array([1.])                             
9        	[2, 24, 21]         	array([ 0.2894,  0.7106, -0.    ])      
10       	[26]                	array([1.])                             
11       	[38]                	array([1.])                             
12       	[17, 10]            	array([0.6381, 0.3619])          

In [23]:
print("行业回报为：")
pd.DataFrame(np.round(ind_return, decimals=4))

行业回报为：


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.5039,-1.556,2.189,-3.4996,2.7641,2.8421,1.7185,-5.4106,-2.1595,-3.8727
1,1.565,-2.6823,4.943,5.5773,-2.2634,-3.122,-1.8905,2.0031,-4.349,0.4755
2,2.2214,0.5713,4.8095,-5.1667,-3.4218,-0.8263,-4.0253,-4.1329,1.3546,2.9783
3,-0.1497,-4.0725,1.9112,-1.9484,0.0773,1.6301,0.6512,-5.1229,5.1116,-0.6364
4,1.042,-2.9021,1.1026,-1.1256,2.8973,-1.7086,-2.6698,1.973,3.0111,0.9355
5,0.6031,-1.4163,2.4821,-1.3727,4.3388,-2.1456,4.6483,-0.0393,-1.4368,0.4126
6,-0.1473,0.1164,-2.8092,-2.1557,4.1037,-5.4472,-3.0514,-0.0677,0.7213,-0.1281
7,2.2329,-4.2965,-3.8134,2.9283,1.1853,-2.0915,-1.787,-1.5315,2.5642,-0.5919
8,-1.7272,-0.2645,-3.0557,4.7014,0.3067,2.9802,-2.7094,2.8481,-1.0066,1.906
9,-1.8265,3.6253,3.0768,-4.4429,-0.5649,2.4198,-4.2793,2.9467,4.4878,1.2282
