In [None]:
# 查看当前挂载的数据集目录
!ls /home/kesci/input/

In [None]:
# 查看个人持久化工作区文件
!ls /home/kesci/work/

In [None]:
# 查看当前kernerl下的package
!pip list --format=columns

In [None]:
# 显示cell运行时长
%load_ext klab-autotime

## 1 Plotting for Exploratory data analysis(EDA)
### 1.1 Basic Terminology

* What is EDA?
* Data-point/vector/Observation
* Data-set
* Feature/Variable/Input-variable/Dependent-variable
* Label/Independence-variable/Output-variable/Class/Class-label/Response Label
* Vector:2-D,3-D,4-D,..n-D

In [1]:
!ls

lost+found


## 2 load the dataset(Iris Flower dataset)

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

x = np.linspace(-10,10,1000)
y = 1/(np.exp(x)+x)
plt.plot(x,y,'r--')
plt.show()

In [6]:
iris_path = '/home/kesci/input/iris/iris.csv'
data = pd.read_csv(iris_path)

In [7]:
data.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [20]:
# The columns names in our datasets
data.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

In [21]:
# How many data points for each class are present?
# How many flowers for each species are present?
data["Species"].value_counts()

setosa        50
virginica     50
versicolor    50
Name: Species, dtype: int64

## 3 2-D Scatter Plot

In [24]:
#2-D Scatter plot
#ALWAYS understand the axis:labels and scale
data.plot(kind='scatter',x='Sepal.Length',y='Sepal.Width')
plt.show()
 
# Cannot make much sense out it
# What if we color the points by their class-label/flower-type

In [27]:
# 2-D Scatter plot with color-coding for each flower type/class
# Here 'sns' corresponding to deaborn
sns.set_style("whitegrid")
sns.FacetGrid(data,hue="Species",size=4) \
   .map(plt.scatter,"Sepal.Length","Sepal.Width")\
   .add_legend()
plt.show()



**Notice that the blue points can be easily seperated from red and greed by drawing a line.But
 Red and green data points cannot be easily seperated.**
 * Can we draw multiple 2-D scatter plots for each combination of features?
 * How many combinations exist? 4C2 = 6

**3-D Scatter Plot**
https://plot.ly/pandas/3d-scatter-plots/

In [28]:
# Pairwise scatter plot:Pair-Plot
# Dis-advantages:
# Can be used when number of features are high
# Cannot visualize higer dimension patterns in 3-D and 4-D
# Only possible to view 2D patterns
plt.close();
sns.set_style("whitegrid")
sns.pairplot(data,hue="Species",size=3,diag_kind="kde")
plt.show()



**Observations:**
1. petal.length and petal.width are the most useful features to identity various flower types
2. While Setosa can be easily identitied(linearly seperatable),Virnica and Versicolor have some overlap(almost linearly separate)
3. We can find "Lines" and "if-else" conditions to build a simple model to classifity flower types

In [30]:
sns.violinplot(y='Species',x='Sepal.Length',data=data,inner='quartile')
plt.show()

In [31]:
sns.violinplot(y='Species',x='Sepal.Width',data=data,inner='quartile')
plt.show()

In [32]:
sns.violinplot(y='Species',x='Petal.Length',data=data,inner='quartile')
plt.show()

In [34]:
sns.violinplot(y='Species',x='Petal.Width',data=data,inner='quartile')
plt.show()

## 4 Modeling with scikit-learn

In [36]:
X = data.drop(['Species'],axis=1)
y = data['Species']
print(X.shape)
print(y.shape)

(150, 4)
(150,)


**Train and test on the same dataset**
* This method is not suggested since the end goal is to predict iris species using a datases the model has not seen before
* There is also a risk of overfitting the training data

In [38]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [39]:
# experiments with different n value
k_range = list(range(1,26))
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X,y)
    y_pred = knn.predict(X)
    scores.append(metrics.accuracy_score(y,y_pred))
    
plt.plot(k_range,scores)
plt.xlabel('Values of k for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of k of K-Nearest-Neighbors')
plt.show()

**Logistic Regression**

In [47]:
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X,y)
y_pred = logreg.predict(X)
print(metrics.accuracy_score(y,y_pred))

0.9533333333333334




**创建训练集与测试集**

In [48]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=5)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(90, 4)
(90,)
(60, 4)
(60,)


In [50]:

k_range = list(range(1,26))
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test,y_pred))
    
plt.plot(k_range,scores,'r')
plt.xlabel('Values of k for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of k of K-Nearest-Neighbors')
plt.show()

In [52]:
lr = LogisticRegression(penalty='l2',solver='newton-cg',multi_class='multinomial')
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print(metrics.accuracy_score(y_test,y_pred))

0.9833333333333333


### 5 模型搭建与分类器训练
1. 导入模型，调用逻辑回归LogisticRegression()函数
    * penalty:正则化选择参数（惩罚项的种类），默认方式为L2正则化
		* C:正则化系数的倒数
		* solver:对于多分类任务，使用'newton-cg','sag','saga' and 'lbfgs'来解决多项式loss
		* multi_class:默认'ovr'适用于二分类问题，对于多分类问题，用'multinominal'在全局的概率分布上最小化损失
2. 训练LogisticRegression分类器
    * 调用fit(x,y)的方法来训练模型，其中的x为数据的属性,y为所属类型
		
3. 利用训练得到的模型对数据集进行预测predict(),y为所属类型

In [53]:
lr = LogisticRegression(penalty='l2',solver='newton-cg',multi_class='multinomial')
lr.fit(X_train,y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

**模型评估**

In [55]:
print("Logistic Regression模型训练集的准确率：%.3f" %lr.score(X_train, y_train))
print("Logistic Regression模型测试集的准确率：%.3f" %lr.score(X_test, y_test))

Logistic Regression模型训练集的准确率：0.967
Logistic Regression模型测试集的准确率：0.983


**LogisticRegression分类器正确率分析**

In [56]:
y_pred = lr.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_pred)
print("Logistic Regression模型正确率： %.3f" %accuracy)

Logistic Regression模型正确率： 0.983


In [58]:
target_names = ['setosa','versicolor','virgninica']
print(metrics.classification_report(y_test,y_pred,target_names=target_names))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        20
  versicolor       1.00      0.95      0.98        21
  virgninica       0.95      1.00      0.97        19

   micro avg       0.98      0.98      0.98        60
   macro avg       0.98      0.98      0.98        60
weighted avg       0.98      0.98      0.98        60



## 6 可视化结果
绘制逻辑回归分类器在鸢尾花数据集上的决策边界
1. 确定坐标轴范围，x,y轴各表示一个特征
- 先取二维数组的第一列特征（花萼长度）的最大值最小值和步长h=0.02生成数组
- 先取二维数组的第二列特征（花萼宽度）的最大值最小值和步长h=0.02生成数组

最后由meshgrid（）函数在网格[x_min,x_max]×[y_min,y_max]中绘制出，生成两个网格矩阵

In [66]:
X_data = X.as_matrix()
X_train_data = X_train.as_matrix()
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x1_min, x1_max = X_data[:, 0].min() - .5, X_data[:, 0].max() + .5 # 第0列的范围
x2_min, x2_max = X_data[:, 1].min() - .5, X_data[:, 1].max() + .5 # 第1列的范围
h = .02
x1, x2 = np.meshgrid(np.arange(x1_min, x1_max, h), np.arange(x2_min, x2_max, h)) # 生成网格采样点

  """Entry point for launching an IPython kernel.
  


In [67]:
logreg = LogisticRegression(penalty='l2',solver='newton-cg',multi_class='multinomial')
logreg.fit(X_train_data[:,:2],y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [74]:
grid_test = np.stack((x1.flat, x2.flat), axis=1)  # 测试点
grid_hat = logreg.predict(grid_test)                  # 预测分类值
# grid_hat = lr.predict(np.c_[x1.ravel(), x2.ravel()])
grid_hat = grid_hat.reshape(x1.shape)             # 使之与输入的形状相同
grid_hat

array([['versicolor', 'versicolor', 'versicolor', ..., 'virginica',
        'virginica', 'virginica'],
       ['versicolor', 'versicolor', 'versicolor', ..., 'virginica',
        'virginica', 'virginica'],
       ['versicolor', 'versicolor', 'versicolor', ..., 'virginica',
        'virginica', 'virginica'],
       ...,
       ['setosa', 'setosa', 'setosa', ..., 'virginica', 'virginica',
        'virginica'],
       ['setosa', 'setosa', 'setosa', ..., 'virginica', 'virginica',
        'virginica'],
       ['setosa', 'setosa', 'setosa', ..., 'virginica', 'virginica',
        'virginica']], dtype=object)

In [73]:
plt.figure(1, figsize=(6, 5))
# 预测值的显示, 输出为三个颜色区块，分布表示分类的三类区域
plt.pcolormesh(x1, x2, grid_hat,cmap=plt.cm.Paired) 

# plt.scatter(X[:, 0], X[:, 1], c=Y,edgecolors='k', cmap=plt.cm.Paired)
plt.scatter(X_data[:50, 0], X_data[:50, 1], marker = '*', edgecolors='red', label='setosa')
plt.scatter(X_data[50:100, 0], X_data[50:100, 1], marker = '+', edgecolors='k', label='versicolor')
plt.scatter(X_data[100:150, 0], X_data[100:150, 1], marker = 'o', edgecolors='k', label='virginica')
plt.xlabel('花萼长度-Sepal length')
plt.ylabel('花萼宽度-Sepal width')
plt.legend(loc = 2)

plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
plt.title("Logistic Regression 鸢尾花分类结果", fontsize = 15)
plt.xticks(())
plt.yticks(())
plt.grid()

plt.show()

TypeError: iteration over a 0-d array

<Figure size 432x360 with 1 Axes>

In [75]:
from sklearn import datasets
iris = datasets.load_iris()
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## 数据处理（清洗）

In [9]:
data.shape

(150, 5)

In [10]:
X_copy = data.select_dtypes(include=['object'])

In [11]:
X_copy.head()

Unnamed: 0,Species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa


**Label Encoding with numercial values**

In [12]:
from sklearn.preprocessing import LabelEncoder
X_encoder = X_copy.copy()
X_encoder = X_encoder.apply(LabelEncoder().fit_transform)


In [13]:
X_encoder.head()

Unnamed: 0,Species
0,0
1,0
2,0
3,0
4,0


In [14]:
value_data = data.drop(X_copy.columns,axis=1)

In [15]:
value_data.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


**合并数据集**

In [17]:
mergedata = pd.concat([value_data,X_encoder],axis=1)

In [19]:
mergedata.head(8)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
