 # 感知机模型

In [142]:
import pandas as pd
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

try:
    # Recent scikit-learn releases no longer bundle joblib; use the standalone package.
    import joblib
except ImportError:
    # Older scikit-learn releases shipped joblib under sklearn.externals.
    from sklearn.externals import joblib

## 将原始数据划分为训练集和测试集（划分结果会保存到文件，之后可直接读取已保存的数据）

1. 打乱数据
2. 20%留做测试集,80%作为训练集和验证集,分别存储下来

In [143]:
# Load the raw samples and shuffle the rows.
# A fixed random_state makes the shuffle (and therefore the train/test files
# written from `df`) identical on every Restart & Run All.
df = pd.read_csv("output.csv")
total = len(df)
print(total)
df = shuffle(df, random_state=0)
# Renumber the rows 0..n-1 without keeping the old labels as a column, then
# drop the first column read from the CSV (presumably a saved row-number
# column such as "Unnamed: 0" -- TODO confirm against output.csv).
df = df.reset_index(drop=True)
df = df.iloc[:, 1:]
df.head(2)

4000


Unnamed: 0,dfp,nean,pnian,nvc,nnm,nnmp,vcp,njf,nsf,we,class
0,0.042969,1,0,5,108,0.837209,0.0,0,20,7.442584,1
1,0.002476,1,0,5,6,0.272727,0.0,2,8,7.733537,1


In [144]:
# Hold out the first 20% of the rows of `df` as the test set; the remaining
# 80% is kept for training and validation. Each part is written to a
# tab-separated file whose name carries its row count.
split_at = int(0.2 * total)

test_data = df.iloc[:split_at]
total_test = len(test_data)
test_data.to_csv("test_data_{}.csv".format(total_test), sep="\t", header=True, index=False)
# To reload the saved test set later:
# test_data = pd.read_csv("test_data_800.csv", header=0, sep='\t')

data = df.iloc[split_at:]
total_data = len(data)
data.to_csv("data_{}.csv".format(total_data), sep="\t", header=True, index=False)

## 读取训练集数据,并划分为80%的训练集,20%的验证集

In [145]:
# Load the saved training/validation portion (tab-separated, first row is the header).
data = pd.read_csv("data_3200.csv", sep="\t", header=0)

# Every column except the last goes into the feature matrix `x`;
# the last column is the label vector `y`.
x, y = data.iloc[:, :-1], data.iloc[:, -1]

# Random 80/20 split with a fixed seed.
# Note: test_X / test_y are the *validation* part of this file, not the
# separately saved hold-out test set.
train_X, test_X, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=0
)

## 标准化特征

In [146]:
# Standardise the features before training.
# The scaler's per-feature mean and standard deviation are estimated on the
# training split only.
sc = StandardScaler().fit(train_X)

# Apply the same fitted parameters to both splits so that the validation
# data is on the same scale as the training data.
train_X_std, test_X_std = sc.transform(train_X), sc.transform(test_X)

# Persist the fitted scaler to disk.
# To restore it later: sc = joblib.load("StandardScaler_model.m")
joblib.dump(sc, "StandardScaler_model.m")

  return self.partial_fit(X, y)
  """
  


['StandardScaler_model.m']

## 训练感知机模型

In [147]:
# Train a perceptron on the standardised training split and report the
# accuracy on the validation split.
#
# max_iter:     maximum number of passes (epochs) over the training data.
#               Replaces the `n_iter` argument, which current scikit-learn
#               releases no longer accept.
# tol=None:     disables the tolerance-based stopping criterion so that exactly
#               max_iter epochs are run, as a fixed `n_iter` did.
# eta0:         constant by which the weight updates are multiplied
#               (the learning rate).
# random_state: seed used when shuffling the training data between epochs,
#               so repeated runs give the same model.
ppn = Perceptron(max_iter=4000, tol=None, eta0=0.05, random_state=0)
ppn.fit(train_X_std, train_y)
y_pred = ppn.predict(test_X_std)
accuracy_score(test_y, y_pred)



0.965625

## 5-fold 交叉验证

In [148]:
# 5-fold cross-validation of the perceptron on the standardised features.
# The scaled matrix is stored under a new name instead of overwriting `x`,
# so re-running this cell does not standardise already-standardised data.
# NOTE(review): `sc` was fitted on the training split of this same data, so
# each fold is scaled with statistics that include some of its validation rows;
# wrapping scaler + model in a Pipeline would avoid that -- confirm whether it
# matters for the reported scores.
x_std = sc.transform(x)
accs = cross_val_score(ppn, x_std, y, cv=5)
print('交叉验证结果:',accs)

  """Entry point for launching an IPython kernel.


交叉验证结果: [0.95625   0.9546875 0.9203125 0.8859375 0.9421875]


## 存储和加载模型

In [149]:
# Persist the fitted perceptron to disk.
# To restore it later: ppn = joblib.load("train_model.m")
joblib.dump(value=ppn, filename="train_model.m")

['train_model.m']

## 测试集上的准确度

In [150]:
# Restore the fitted scaler and perceptron from the files on disk.
sc = joblib.load("StandardScaler_model.m")
ppn = joblib.load("train_model.m")

# Read the hold-out test set (tab-separated, first row is the header);
# the last column is the label, everything before it is a feature.
test_data = pd.read_csv("test_data_800.csv", sep="\t", header=0)
test_data_x, label = test_data.iloc[:, :-1], test_data.iloc[:, -1]

# Scale the test features with the parameters stored in the loaded scaler so
# they are comparable to the data the model was trained on.
test_data_x_std = sc.transform(test_data_x)

# Predict and report the accuracy on the test set.
y_pred = ppn.predict(test_data_x_std)
accuracy_score(label, y_pred)

  


0.95