In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline

from greenseer.dataset.china_dataset import fetch_train_set, TRAIN_SET_ALL, RELEASE_AT_INDEX_NAME
from greenseer.plots.boundary_plots import plot_gaussian_mixture_boundaries
from greenseer.plots.normal_plots import DistributeGramEntry, plot_distribute_gram
from greenseer.preprocessing.clean_data import remove_inf_and_na, RemoveAbnormalFilter
from greenseer.preprocessing.transformers import pick_annual_report_china, regular_expression_column_filter, \
    re_percent_column_transform, append_industry_transform
from greenseer.utils.tools import refresh_report_data, enable_matplotlib_chinese, save_fig, save_csv
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline

enable_matplotlib_chinese()

# 数据说明
- reports: 数据全集
- analysis_reports: 2018年前的年报
- latest_reports: 2019年的年报

In [3]:
refresh_report_data("cash_in_assert_report")
reports = fetch_train_set(TRAIN_SET_ALL)

In [4]:
reports.sort_index(ascending=True,inplace=True)
idx = pd.IndexSlice

analysis_reports = reports.loc[idx[:,:'2018'], :].fillna(0)
latest_reports = reports.loc[idx[:,'2019-12-31'], :].fillna(0)

在这里，取得用于分析的基础数据，存储在变量data中。
其中包含了基本的货币资金，短期短款，资产总计，以及他们存在的比值。

In [5]:
subjects = [r'\w*货币资金\w*|\w*短期借款\w*|\w*资产总计\w*']
cash_subject_name = 'cash'
st_debt_name = 'short-term debt'

prepare_pipeline = Pipeline([
        ('pick_annual_report', pick_annual_report_china()),
        ('pick_subject', regular_expression_column_filter(patterns=subjects,rename={"资产总计(万元)":"base"})),
        ('percent cache', re_percent_column_transform(numerator=[r'\w*货币资金\w*'], denominator=[r'base'],new_name=cash_subject_name)),
        ('percent receivable', re_percent_column_transform(numerator=[r'\w*短期借款\w*'], denominator=[r'base'],new_name=st_debt_name)),
        ('append industry',append_industry_transform()),
        ('clean data',remove_inf_and_na()),
        ('remove abnormal',RemoveAbnormalFilter([cash_subject_name, st_debt_name],quantile=0.98,mode="high"))
])

data = prepare_pipeline.fit_transform(analysis_reports)

这里。打算计算每个年份的所有现金的平均值。有个简单的比较

In [6]:

annual_avg_data =data.reset_index().groupby(RELEASE_AT_INDEX_NAME).mean()

plt.plot(annual_avg_data.index.year,annual_avg_data[cash_subject_name],'-r',label='现金比例')
plt.plot(annual_avg_data.index.year,annual_avg_data[st_debt_name],'-b',label='现金债比例')
annual_x_axis = np.array(annual_avg_data.index.year)
plt.legend( loc='upper right')
plt.xticks(annual_x_axis[::2])
save_fig("annual_avg_all_fig")
save_csv(annual_avg_data,"annual_avg_all_fig")
plt.show()

KeyError: 'releaseAt'

用高斯分布，用现金比例和短期债，按照行业来进行分类

In [None]:
industry_mean = data.groupby("industry").mean()
industry_mean_model = GaussianMixture(n_components=3,n_init=10)
X = industry_mean[[cash_subject_name,st_debt_name]].values
y_pred = industry_mean_model.fit(X).predict(X)
industry_mean["group"] = y_pred
save_csv(data=industry_mean,name="industry_mean_group")

x_limits = X[:,0].min()-0.1,X[:,0].max()+0.1
y_limits = X[:,1].min()-0.1,X[:,1].max()+0.1
plot_data = [
    DistributeGramEntry(x=X[y_pred==0, 0],y= X[y_pred==0, 1],fmt="yo",label="组0"),
    DistributeGramEntry(x=X[y_pred==1, 0],y= X[y_pred==1, 1],fmt="bs",label="组1"),
    DistributeGramEntry(x=X[y_pred==2, 0],y= X[y_pred==2, 1],fmt="g^",label="组2"),
]
figure,(av1,av2) = plt.subplots(2,1,figsize=(10,10))
subject_label = ["现金比例","短期债务"]
plot_distribute_gram(av1,plot_data,subject_label)
av1.set_ylabel("短期债务")
av1.set_xlim(x_limits)
size = (np.array([X[:,0].min()-0.1,X[:,1].min()-0.1]),np.array([X[:,0].max()+0.1,X[:,1].max()+0.1]))
plot_gaussian_mixture_boundaries(av2,industry_mean_model,size,cm=plt.cm.YlOrBr)
av2.set_xlim(x_limits)
av2.set_xlabel("现金比例")
av2.set_ylabel("短期债务")
save_fig("industry_mean_group")
plt.show()

 ## 行业的分布图
根据行业的信息，用高斯来做分类

In [None]:
industry_company_count = data.groupby("industry").count()
company_sum = industry_company_count['base'].sum()
industry_mean["weight"] = industry_company_count['base'].values/company_sum
company_mean_model = GaussianMixture(n_components=3,n_init=10,
                                     means_init=industry_mean[[cash_subject_name,st_debt_name]].values,
                                     weights_init=industry_mean["weight"].values)

In [None]:
fig, ax = plt.subplots()

ax.scatter(industry_mean[cash_subject_name].values,
           industry_mean[st_debt_name].values,
           s=industry_mean["weight"].values*50000,
           c= plt.cm.hot(industry_mean['weight']*100),
           alpha=0.5)
plt.show()



## 预测一年新的数据

这里，想要取2017年(包括2017年)的之前五年的数据。然后2018年的数据微target建立预测模型。



In [None]:
predict_2018_data = data.copy()
predict_2018_data.sort_index(ascending=True,inplace=True)
data_2012_to_2017 = predict_2018_data.loc[idx[:,'2012':'2017'], :]
data_2012_to_2017 = data_2012_to_2017[cash_subject_name]
stock_ids = data_2012_to_2017.index.levels[0]

data_2012_to_2017.index.names

In [None]:
stock_ids = data_2012_to_2017.index.levels[0]
pending_array = []

for stock_id in stock_ids:
    try:
        one_stock = data_2012_to_2017.loc[stock_id]
    except KeyError as err:
        print("{} can't get the data".format(err))
        continue

    if len(one_stock) <= 5 :
        continue
    one_stock = one_stock.to_frame().T
    one_stock.index= [stock_id]
    one_stock.fillna(0)
    pending_array.append(one_stock)

data_2012_to_2017 = pd.concat(pending_array)

In [None]:
lin_reg = LinearRegression()
index_2012_to_2017 = data_2012_to_2017.index
predict_2018 = predict_2018_data.loc[idx[:,'2018'], :]
predict_2018 = predict_2018.reset_index().set_index("code")
predict_2018_index = predict_2018.index

In [None]:
predict_index = predict_2018_index.intersection(index_2012_to_2017)
#predict_index =  predict_2018.loc[predict_index].index.levels[0].intersection(predict_index)
X = data_2012_to_2017.loc[predict_index.values].values
y = predict_2018.loc[predict_index][cash_subject_name].values
len(X),len(y),len(predict_index),len(index_2012_to_2017),len(predict_2018_index)

In [None]:
lin_reg.fit(X,y)


In [None]:
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1)
sgd_reg.fit(X, y)


