# 划分数据集

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Download the UCI Wine data set. header=None makes pandas treat the first
# line as data rather than as column names.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
)

In [4]:
# Set the column titles: the class label first, then the 13 features.
df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [5]:
# Show the distinct class labels present in the data set.
print(
    'Class labels',
    np.unique(df_wine['Class label']),
)

Class labels [1 2 3]


In [6]:
# Preview the first rows to check that the column titles line up with the data.
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Features are every column after the first; the class label is column 0.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

In [9]:
# Hold out 30% of the samples as the test set.
# random_state pins the shuffle so the split, and every number computed from
# it below, is reproducible under Restart & Run All. stratify=y keeps the
# class proportions the same in the training and test subsets.
# NOTE: outputs saved in this notebook came from an unseeded split; re-run
# the notebook to refresh them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# 特征缩放

In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
# Min-max normalization rescales each feature to the [0, 1] range.
# Standardization instead centers each feature at zero mean with unit
# variance; its values are NOT bounded to [-1, 1]. Standardized features are
# generally more convenient for weight updates in gradient-based learners.
mms = MinMaxScaler()

In [12]:
# Learn the per-feature min/max from the training data and rescale it.
X_train_norm = mms.fit_transform(X_train)

In [13]:
# Rescale the test data with the already-fitted scaler (transform only, no
# refit), so no test-set statistics leak into the preprocessing.
X_test_norm = mms.transform(X_test)

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Standardization: rescale each feature to zero mean and unit variance.
stdsc = StandardScaler()

In [16]:
# Learn the per-feature mean/std from the training data and standardize it.
X_train_std = stdsc.fit_transform(X_train)

In [17]:
# Standardize the test data with the already-fitted scaler (transform only,
# no refit).
X_test_std = stdsc.transform(X_test)

In [19]:
# First five min-max-scaled training rows.
X_train_norm[:5]

array([[0.83947368, 0.64229249, 0.52941176, 0.10638298, 0.68493151,
        0.69655172, 0.56962025, 0.13207547, 0.65354331, 0.32593857,
        0.2991453 , 0.88627451, 0.3381295 ],
       [0.31052632, 0.08893281, 0.03267974, 0.29787234, 1.        ,
        0.3       , 0.19831224, 0.01886792, 0.81889764, 0.13395904,
        0.63247863, 0.70588235, 0.30791367],
       [0.21315789, 0.42490119, 0.34640523, 0.36170213, 0.46575342,
        0.25517241, 0.20675105, 0.56603774, 0.20866142, 0.1168942 ,
        0.35897436, 0.49019608, 0.15107914],
       [0.65      , 0.47035573, 0.60130719, 0.68085106, 0.61643836,
        0.14482759, 0.25949367, 0.16981132, 0.32677165, 0.62457338,
        0.04273504, 0.01176471, 0.15107914],
       [0.71578947, 0.19565217, 0.46405229, 0.25531915, 0.15068493,
        0.55862069, 0.51054852, 0.30188679, 0.54724409, 0.36860068,
        0.52136752, 0.63921569, 0.74100719]])

In [20]:
# First five standardized training rows.
X_train_std[:5]

array([[ 1.42262271,  1.38799354,  0.5100237 , -1.88704901,  2.04812621,
         1.07444098,  0.96290566, -1.33529508,  0.90011585,  0.05779626,
        -0.35491152,  1.32889244,  0.02112007],
       [-0.96894942, -1.06435676, -2.41361583, -0.83394403,  3.69587118,
        -0.76908307, -0.76739102, -1.81407076,  1.67113614, -0.97520541,
         1.35630695,  0.65162274, -0.10444307],
       [-1.40918907,  0.42457021, -0.56710665, -0.48290903,  0.90186883,
        -0.97748144, -0.72806609,  0.5000117 , -1.17429588, -1.06702778,
        -0.04776975, -0.15815625, -0.75617558],
       [ 0.56594015,  0.62601327,  0.93318206,  1.27226595,  1.68992078,
        -1.49046204, -0.48228531, -1.17570318, -0.6235671 ,  1.66468774,
        -1.67123343, -1.95439328, -0.75617558],
       [ 0.86339937, -0.59140349,  0.12533429, -1.06796736, -0.74587614,
         0.43321522,  0.68763119, -0.61713156,  0.40445995,  0.28735218,
         0.78590079,  0.40132742,  1.6952953 ]])

# 特征选择

### L1正则化满足数据稀疏化

In [21]:
from sklearn.linear_model import LogisticRegression

In [37]:
# L1-penalized logistic regression: the L1 penalty pushes some weights to
# exactly zero, which gives a sparse model and so acts as feature selection.
lr = LogisticRegression(
    penalty='l1',
    C=1.0,
    multi_class='auto',
    solver='liblinear',
)

In [38]:
# Train on the standardized training features.
lr.fit(X_train_std,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [39]:
# Mean accuracy on the standardized training set.
print('Training accuracy:', lr.score(X_train_std, y_train))

Training accuracy: 1.0


In [41]:
# Mean accuracy on the standardized held-out test set.
print('test accuracy:', lr.score(X_test_std, y_test))

test accuracy: 1.0


In [42]:
# Intercept (bias) terms of the fitted model.
lr.intercept_

array([-1.18717509, -1.13245193, -2.52229744])

In [43]:
# Weight coefficients. There are three weight vectors because the multiclass
# problem is handled with the one-vs-rest (OvR) scheme.
lr.coef_

array([[ 1.13436625,  0.        ,  0.79723613, -1.37919061,  0.        ,
         0.        ,  0.59539487,  0.        ,  0.        ,  0.        ,
         0.        ,  1.0694832 ,  2.32098364],
       [-1.35875765, -0.32683693, -1.14239746,  0.6864539 ,  0.        ,
         0.06930373,  0.67975579,  0.10279708,  0.        , -1.90581578,
         0.90729652,  0.        , -2.07025126],
       [ 0.13821619,  0.1311182 ,  0.42639177,  0.        ,  0.        ,
         0.        , -2.38590563,  0.        ,  0.        ,  1.47293043,
        -0.86049005, -0.60210613,  0.        ]])

### 正则化效果图（省略）

### 序列特征选择算法SBS