In [13]:
# 多类别分类数据处理
import pandas as pd
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plot

target_url = ("https://archive.ics.uci.edu/ml/machine-"
              "learning-databases/glass/glass.data")
#print target_url

glass = pd.read_csv(target_url,header=None, prefix="V")
#print glass
glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si',
                 'K', 'Ca', 'Ba', 'Fe', 'Type']

#print glass
# print glass['Type'].groupby(axis=1).count() 分类统计都忘记了。。。
glass.info()

#generate statistical summaries
summary = glass.describe()
print(summary)
ncol1 = len(glass.columns)
print ncol1

glassNormalized = glass.iloc[:, 1:ncol1]
print glassNormalized.head()
ncol2 = len(glassNormalized.columns)
summary2 = glassNormalized.describe()

for i in range(ncol2):
    mean = summary2.iloc[1, i]
    sd = summary2.iloc[2, i]
    glassNormalized.iloc[:,i:(i + 1)] = \
        (glassNormalized.iloc[:,i:(i + 1)] - mean) / sd

array = glassNormalized.values
boxplot(array)
plot.xlabel("Attribute Index")
plot.ylabel(("Quartile Ranges - Normalized "))
show()

TypeError: You have to supply one of 'by' and 'level'

## 特征工程
特征工程一般需要通过一个由人工参与的、迭代的过程来完成特征选择，决定*可能最优的特征*，并且尝试不同的特征组合。

## 模型性能评估

### 回归问题
+ 均方误差（Mean Square error）
$$MSE=(\frac{1}{m})\sum_{i=1}^m(y_i-pred(x_i))^2$$
$$RMSE=\sqrt{MSE}$$
+ 平均绝对误差（Mean absolute error）
$$MAE=(\frac{1}{m})\sum_{i=1}^m|y_i-pred(x_i)|$$



### 分类问题

## 影响算法选择及性能的因素
### 模型复杂度
如果问题很复杂，一个拥有大量数据的复杂模型可以很精确的生成结果。然而，如果真实模型不复杂或者没有足够多的数据，一个线性模型可能是最好的答案。暂且可初步得出这样一个结论：对于列比行多的数据集或相对简单的问题，倾向于使用线性模型，反之，使用非线性模型

### 数据量



In [19]:
## 分类器性能
#use scikit learn package to perform linear regression
#read in the rocks versus mines data set from uci.edu data repository
import urllib2
import numpy
import random
from sklearn import datasets, linear_model
from sklearn.metrics import roc_curve, auc
import pylab as pl


def confusionMatrix(predicted, actual, threshold):
    if len(predicted) != len(actual): return -1
    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for i in range(len(actual)):
        if actual[i] > 0.5: #labels that are 1.0  (positive examples)
            if predicted[i] > threshold:
                tp += 1.0 #correctly predicted positive
            else:
                fn += 1.0 #incorrectly predicted negative
        else:              #labels that are 0.0 (negative examples)
            if predicted[i] < threshold:
                tn += 1.0 #correctly predicted negative
            else:
                fp += 1.0 #incorrectly predicted positive
    rtn = [tp, fn, fp, tn]
    return rtn


#read data from uci data repository
target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
data = urllib2.urlopen(target_url)
#print data.head()

#arrange data into list for labels and list of lists for attributes
xList = []
labels = []
for line in data:
    #split on comma
    row = line.strip().split(",")
    #assign label 1.0 for "M" and 0.0 for "R"
    if(row[-1] == 'M'):
        labels.append(1.0)
    else:
        labels.append(0.0)
    #remove label from row
    row.pop()
    #convert row to floats
    floatRow = [float(num) for num in row]
    xList.append(floatRow)
# print xList
# print labels
#divide attribute matrix and label vector into training(2/3 of data) and test sets (1/3 of data)
indices = range(len(xList))
xListTest = [xList[i] for i in indices if i%3 == 0 ]
xListTrain = [xList[i] for i in indices if i%3 != 0 ]
labelsTest = [labels[i] for i in indices if i%3 == 0]
labelsTrain = [labels[i] for i in indices if i%3 != 0]

#form list of list input into numpy arrays to match input class for scikit-learn linear model
xTrain = numpy.array(xListTrain);
yTrain = numpy.array(labelsTrain); 
xTest = numpy.array(xListTest);
yTest = numpy.array(labelsTest)

#check shapes to see what they look like
print("Shape of xTrain array", xTrain.shape)
print("Shape of yTrain array", yTrain.shape)
print("Shape of xTest array", xTest.shape)
print("Shape of yTest array", yTest.shape)

#train linear regression model
rocksVMinesModel = linear_model.LinearRegression()
rocksVMinesModel.fit(xTrain,yTrain)

#generate predictions on in-sample error
trainingPredictions = rocksVMinesModel.predict(xTrain)
print("Some values predicted by model", trainingPredictions[0:5], trainingPredictions[-6:-1])

#generate confusion matrix for predictions on training set (in-sample
confusionMatTrain = confusionMatrix(trainingPredictions, yTrain, 0.5)
#pick threshold value and generate confusion matrix entries
tp = confusionMatTrain[0];
fn = confusionMatTrain[1]; 
fp = confusionMatTrain[2];
tn = confusionMatTrain[3]

print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n')

#generate predictions on out-of-sample data
testPredictions = rocksVMinesModel.predict(xTest)

#generate confusion matrix from predictions on out-of-sample data
conMatTest = confusionMatrix(testPredictions, yTest, 0.5)
#pick threshold value and generate confusion matrix entries
tp = conMatTest[0]; 
fn = conMatTest[1]; 
fp = conMatTest[2]; 
tn = conMatTest[3]
print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n')

#generate ROC curve for in-sample

fpr, tpr, thresholds = roc_curve(yTrain,trainingPredictions)
print 'the fbr is',fpr
print '',tpr
print thresholds
roc_auc = auc(fpr, tpr)
print( 'AUC for in-sample ROC curve: %f' % roc_auc)

# Plot ROC curve
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('In sample ROC rocks versus mines')
pl.legend(loc="lower right")
pl.show()

#generate ROC curve for out-of-sample
fpr, tpr, thresholds = roc_curve(yTest,testPredictions)
roc_auc = auc(fpr, tpr)
print( 'AUC for out-of-sample ROC curve: %f' % roc_auc)

# Plot ROC curve
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Out-of-sample ROC rocks versus mines')
pl.legend(loc="lower right")
pl.show()

('Shape of xTrain array', (138L, 60L))
('Shape of yTrain array', (138L,))
('Shape of xTest array', (70L, 60L))
('Shape of yTest array', (70L,))
('Some values predicted by model', array([-0.10240253,  0.42090698,  0.38593034,  0.36094537,  0.31520494]), array([ 1.11094176,  1.12242751,  0.77626699,  1.02016858,  0.66338081]))
tp = 68.0	fn = 6.0
fp = 7.0	tn = 57.0

tp = 28.0	fn = 9.0
fp = 9.0	tn = 24.0

[ 0.        0.        0.015625  0.015625  0.046875  0.046875  0.078125
  0.078125  0.09375   0.09375   0.140625  0.140625  0.171875  0.171875
  0.1875    0.1875    0.203125  0.203125  0.21875   0.21875   1.      ]
[ 0.01351351  0.78378378  0.78378378  0.83783784  0.83783784  0.89189189
  0.89189189  0.90540541  0.90540541  0.91891892  0.91891892  0.93243243
  0.93243243  0.95945946  0.95945946  0.97297297  0.97297297  0.98648649
  0.98648649  1.          1.        ]
[ 1.4146433   0.64195949  0.63982219  0.61712091  0.60364498  0.56456686
  0.54019672  0.52149739  0.51821399  0.51175085  0