In [1]:
import sys 
import numpy as np
import pandas as pd
from sklearn.metrics import  accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
raw_datapath = 'E:\\code\\python\\qoe_model\\raw_data\\3w_data.csv'
data = pd.read_csv(raw_datapath)

column_names = ['InitialBufferTime', 'VideoPlayDuration','StallingRatio', 'VIDEO_BITRATE', 'VIDEO_CLARITY', 'VIDEO_ALL_PEAK_RATE', 
                'VIDEO_AVERAGE_RATE', 'USERBUFFERTIME', 'VIDEOSIZE', 'SCREEN_RESOLUTION_LONG', 'VIDEO_BUFFERING_PEAK_RATE', 
                'EVMOS', 'ELOADING', 'ESTALLING', 'USER_SCORE']
#########################################################
############ 将 name 列的离散数据进行编号 ###############
#########################################################
def class_normalization(name, X):
    
    # name不是list,是str
    a = X[name]
    b = a.value_counts()
    c = b.index

    list1 = []
    list2 = []
    for i in range(len(c)):
        list1.append(i)
        list2.append(c[i])
        
    b = a.replace(list2, list1)
    
    data1 = X.drop([name], axis=1)
    data1.insert(2, name, b)
    
    return data1

##########################################################
#################### 移除 name 列 ########################
##########################################################
def remove_col(name, all_name):
    
    list = []
    for i in range(len(column_names)):
        if column_names[i] != name:
            list.append(column_names[i])
    return list

# 生成每一个batch
def generatebatch(X,Y,n_examples, batch_size): 
    for batch_i in range(n_examples // batch_size): 
        start = batch_i * batch_size 
        end = start + batch_size 
        batch_xs = X[start:end] 
        batch_ys = Y[start:end] 
        yield batch_xs, batch_ys 

name = 'VIDEO_CLARITY'
data1 = class_normalization(name, data)
data1 = shuffle(data1)
data1 = data1.reset_index(drop = True)

X1 = data1[remove_col(name, column_names)]
X2 = data1[[name]]

# 选取第i个分数
Y1 = data1[column_names[11]]
Y2 = data1[column_names[12]]
Y3 = data1[column_names[13]]
Y4 = data1[column_names[14]]

scaler = MinMaxScaler()
X1_data = scaler.fit_transform(X1)
X2_data = OneHotEncoder().fit_transform(X2.values.reshape(-1, 1)).todense()
X_data = np.hstack((X1_data, X2_data)).getA()

# 随机采样25%的数据用于测试，剩下的75%用于构建训练集合。
X1_train, X1_test, y1_train, y1_test = train_test_split(X_data, Y1, test_size=0.25, random_state = 33)

#X_train = X_train.reshape(-1,8,8,1)
#X_test = X_test.reshape(-1,8,8,1)

X1_train = X1_train.tolist()
X1_test = X1_test.tolist()
y1_train = OneHotEncoder().fit_transform(y1_train.values.reshape(-1, 1)).todense().getA().tolist()
y1_test_code = OneHotEncoder().fit_transform(y1_test.values.reshape(-1, 1)).todense().getA().tolist()


In [3]:
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 15 15:25:03 2018

@author: lj
"""
import numpy as np
import copy

class node:
    # 树的节点的类
    def __init__(self, fea=-1, value=None, results=None, right=None, left=None):
        self.fea = fea  # 用于切分数据集的属性的列索引值
        self.value = value  # 设置划分的值
        self.results = results  # 存储叶节点的值
        self.right = right  # 右子树
        self.left = left  # 左子树

class CART_RT(object):
    # CART算法的类
    def __init__(self,data_X,data_Y,min_sample, min_err):
        # 初始化CART类参数
        self.data_X = data_X #待回归样本数据的特征
        self.data_Y = data_Y #待回归样本数据的标签
        self.min_sample = min_sample # 每个叶节点最多的样本数
        self.min_err = min_err #最小方差
        
    def fit(self):
        '''
            构建树
            input:  data(list):训练样本
                    min_sample(int):叶子节点中最少的样本数
                    min_err(float):最小的error
            output: node:树的根结点
        '''  
        # 将样本特征与样本标签合成完整的样本
        data = combine(self.data_X,self.data_Y)
        # 构建决策树，函数返回该决策树的根节点
        if len(data) <= self.min_sample:
            return node(results=leaf(data))
            
        # 1、初始化
        best_err = err_cnt(data)
        bestCriteria = None  # 存储最佳切分属性以及最佳切分点
        bestSets = None  # 存储切分后的两个数据集
    
        # 2、开始构建CART回归树
        feature_num = len(data[0]) - 1
        for fea in range(0, feature_num):
            feature_values = {}
            for sample in data:
                feature_values[sample[fea]] = 1
            for value in feature_values.keys():
                # 2.1、尝试划分
                (set_1, set_2) = split_tree(data, fea, value)
                combine_set_1 = combine(set_1[0],set_1[1])
                combine_set_2 = combine(set_2[0],set_2[1])
                if len(combine_set_1) < 2 or len(combine_set_2) < 2:
                    continue
                # 2.2、计算划分后的error值
                now_err = err_cnt(combine_set_1) + err_cnt(combine_set_2)
                # 2.3、更新最优划分
                if now_err < best_err and len(combine_set_1) > 0 and len(combine_set_2) > 0:
                    best_err = now_err
                    bestCriteria = (fea, value)
                    bestSets = (set_1, set_2)

        # 3、判断划分是否结束
        if best_err > self.min_err:
            right = CART_RT(bestSets[0][0],bestSets[0][1], self.min_sample, self.min_err).fit()
            left = CART_RT(bestSets[1][0],bestSets[1][1], self.min_sample, self.min_err).fit()
            return node(fea=bestCriteria[0], value=bestCriteria[1], right=right, left=left)
        else:
            return node(results=leaf(data))  # 返回当前的类别标签作为最终的类别标签

def combine(data_X,data_Y):
    '''样本特征与标签合并
    input:data_X(list):样本特征
          data_Y(list):样本标签
    output:data(list):样本数据
    '''
    m = len(data_X)
    data = copy.deepcopy(data_X)
    for i in range(m):
        data[i].append(data_Y[i])
    return data
        
def err_cnt(data):
    '''回归树的划分指标
    input:  data(list):训练数据
    output: m*s^2(float):总方差
    '''
    data = np.mat(data)
    return np.var(data[:, -1]) * np.shape(data)[0]

def split_tree(data, fea, value):
    '''根据特征fea中的值value将数据集data划分成左右子树
    input:  data(list):训练样本
            fea(float):需要划分的特征index
            value(float):指定的划分的值
    output: (set_1, set_2)(tuple):左右子树的聚合
    '''
    set_1 = []  # 右子树的集合
    set_2 = []  # 左子树的集合
    tmp_11 = []
    tmp_12 = []
    tmp_21 = []
    tmp_22 = []
    for x in data:
        if x[fea] >= value:
            tmp_11.append(x[0:-1])
            tmp_12.append(x[-1])
        else:
            tmp_21.append(x[0:-1])
            tmp_22.append(x[-1])
    set_1.append(tmp_11)
    set_1.append(tmp_12)
    set_2.append(tmp_21)
    set_2.append(tmp_22)
    return (set_1, set_2)

def leaf(dataSet):
    '''计算叶节点的值
    input:  dataSet(list):训练样本
    output: mean(data[:, -1])(float):均值
    '''
    data = np.mat(dataSet)
    return np.mean(data[:, -1])

def predict(sample, tree):
    '''对每一个样本sample进行预测
    input:  sample(list):样本
            tree:训练好的CART回归树模型
    output: results(float):预测值
    '''
    # 1、只是树根
    if tree.results != None:
        return tree.results
    else:
    # 2、有左右子树
        val_sample = sample[tree.fea]  # fea处的值
        branch = None
        # 2.1、选择右子树
        if val_sample >= tree.value:
            branch = tree.right
        # 2.2、选择左子树
        else:
            branch = tree.left
        return predict(sample, branch)

def cal_error(data_X,data_Y, tree):
    ''' 评估CART回归树模型
    input:  data(list):
            tree:训练好的CART回归树模型
    output: err/m(float):均方误差
    '''
    m = len(data_X)  # 样本的个数   
    n = len(data_X[0])  # 样本中特征的个数
    err = 0.0
    for i in range(m):
        tmp = []
        for j in range(n):
            tmp.append(data_X[i][j])
        pre = predict(tmp, tree)  # 对样本计算其预测值
        # 计算残差
        err += (data_Y[i] - pre) * (data_Y[i] - pre)
    return err / m


In [4]:
class GBDT_RT(object):
    '''
    GBDT回归算法类
    '''
    def __init__(self):
        self.trees = None ##用于存放GBDT的树
        self.learn_rate = learn_rate ## 学习率，防止过拟合
        self.init_value = None ##初始数值
        self.fn = lambda x: x
        
    def get_init_value(self,y):
        '''
        计算初始数值为平均值
        input:y(list):样本标签列表
        output:average(float):样本标签的平均值
        '''
        average = sum(y)/len(y)
        return average
    
    def get_residuals(self,y,y_hat):
        '''
        计算样本标签标签与预测列表的残差
        input:y(list):样本标签列表
              y_hat(list):预测标签列表
        output:y_residuals(list):样本标签标签与预测列表的残差
        '''
        y_residuals = []
        for i in range(len(y)):
            y_residuals.append(y[i] - y_hat[i])
        return y_residuals
    
    def fit(self,data_X,data_Y,n_estimators,learn_rate,min_sample, min_err):
        '''
        训练GBDT模型
        input:self(object):GBDT_RT类
              data_X(list):样本特征
              data_Y(list):样本标签
              n_estimators(int):GBDT中CART树的个数
              learn_rate(float):学习率
              min_sample(int):学习CART时叶节点的最小样本数
              min_err(float):学习CART时最小方差
        '''
        ## 初始化预测标签和残差
        self.init_value = self.get_init_value(data_Y)
        
        n = len(data_Y)
        y_hat = [self.init_value] * n ##初始化预测标签
        y_residuals = self.get_residuals(data_Y,y_hat)
        
        self.trees = []
        self.learn_rate = learn_rate
        ## 迭代训练GBDT
        for j in range(n_estimators):
            idx = range(n)
            X_sub = [data_X[i] for i in idx] ## 样本特征列表
            residuals_sub = [y_residuals[i] for i in idx] ## 标签残差列表
            
            tree = CART_RT(X_sub,residuals_sub, min_sample, min_err).fit()
            res_hat = [] ##残差的预测值
            for m in range(n):
                res_hat.append(predict(data_X[m],tree))
            ## 计算此时的预测值等于原预测值加残差预测值
            y_hat = [y_hat[i] + self.learn_rate * res_hat[i] for i in idx]
            y_residuals = self.get_residuals(data_Y,y_hat)
            self.trees.append(tree)
            
    def GBDT_predict(self,xi):
        '''预测一个样本
        '''
        return self.fn(self.init_value + sum(self.learn_rate * predict(xi,tree) for tree in self.trees))
    
    def GBDT_predicts(self,X):
        '''预测多个样本
        '''
        return [self.GBDT_predict(xi) for xi in X]

def error(Y_test,predict_results):
    '''计算预测误差
    input:Y_test(list):测试样本标签
          predict_results(list):测试样本预测值
    output:error(float):均方误差
    '''
    Y = np.mat(Y_test)
    results = np.mat(predict_results)
    error = np.square(Y - results).sum() / len(Y_test)
    return error

def load_data(data_file):
    '''
    导入训练数据
    input:  data_file(string):保存训练数据的文件
    output: data(list):训练数据
    '''
    data_X = []
    data_Y = []
    f = open(data_file)
    for line in f.readlines():
        sample = []
        lines = line.strip().split("\t")
        data_Y.append(float(lines[-1]))
        for i in range(len(lines) - 1):
            sample.append(float(lines[i]))  # 转换成float格式
        data_X.append(sample)
    f.close()    
    return data_X,data_Y

In [12]:
print ("------------- 1.load data ----------------")
X_train = X1_train[0:150]
Y_train = y1_train[0:150]
X_test = X1_test[150:200]

print('------------2.Parameters Setting-----------')
n_estimators = 4
learn_rate = 0.5
min_sample = 30
min_err = 0.3

print ("--------------3.build GBDT ---------------")
gbdt_rt = GBDT_RT()
gbdt_rt.fit(X1_train,y1_train,n_estimators,learn_rate,min_sample, min_err)

print('-------------4.Predict Result--------------')
predict_results = gbdt_rt.GBDT_predicts(X1_test)

print('--------------5.Predict Error--------------')
error = error(y1_test,predict_results)
print('Predict error is: ',error)

------------- 1.load data ----------------
------------2.Parameters Setting-----------
--------------3.build GBDT ---------------


TypeError: unsupported operand type(s) for +: 'int' and 'list'

In [11]:
X_data

[[0.19035],
 [0.306657],
 [0.017568],
 [0.122328],
 [0.076274],
 [0.614127],
 [0.220722],
 [0.08943],
 [0.278817],
 [0.520287],
 [0.726976],
 [0.180485],
 [0.801524],
 [0.474273],
 [0.345116],
 [0.981951],
 [0.127349],
 [0.75712],
 [0.345419],
 [0.314532],
 [0.250828],
 [0.431255],
 [0.386669],
 [0.143794],
 [0.470839],
 [0.093065],
 [0.205377],
 [0.083329],
 [0.243475],
 [0.062389],
 [0.764116],
 [0.018287],
 [0.973603],
 [0.458826],
 [0.5112],
 [0.712587],
 [0.464745],
 [0.984328],
 [0.414291],
 [0.799551],
 [0.499037],
 [0.966757],
 [0.756594],
 [0.444938],
 [0.410167],
 [0.532335],
 [0.343909],
 [0.854302],
 [0.846882],
 [0.740758],
 [0.150668],
 [0.177606],
 [0.445289],
 [0.734653],
 [0.559488],
 [0.232311],
 [0.934435],
 [0.219089],
 [0.636525],
 [0.307605],
 [0.713198],
 [0.116343],
 [0.680737],
 [0.48473],
 [0.929408],
 [0.008507],
 [0.872161],
 [0.75553],
 [0.620671],
 [0.47226],
 [0.257488],
 [0.130654],
 [0.512333],
 [0.74771],
 [0.669948],
 [0.644856],
 [0.894206],
 [0.8204

In [1]:
dict = {}
dict['you'] = ['a', 'n']

In [2]:
dict

{'you': ['a', 'n']}