# python 基础

## 字典

In [None]:
# Demo dictionary whose values are a scalar, a list and a nested dict.
map_abc = dict(
    a=0,
    b=[1, 2, 3],
    c=dict(cc=4),
)

In [None]:
# Look up a scalar value, a list value (plus one of its elements) and a
# value inside the nested dictionary.
print(map_abc["a"])
b_items = map_abc["b"]
print(b_items, b_items[1])
print(map_abc["c"]["cc"])

# Walk over every key/value pair of the dictionary.
for k, v in map_abc.items():
    print("key:", k, "\nvalue:", v)

## numpy 处理缺失数据

In [None]:
import numpy as np


# Readings with one missing value (NaN) at position 3.
np_temp = [36.2, 36.3, 36.4, np.nan, 36.3, 36.2, 36.4]
missing_idx = 3

# The plain mean propagates NaN, while nanmean ignores the missing entry.
naive_mean = np.mean(np_temp)
mean_without_nan = np.nanmean(np_temp)
print(naive_mean)
print(mean_without_nan)

# Impute the missing reading with the NaN-aware mean, then average again.
np_temp[missing_idx] = mean_without_nan
print(np.mean(np_temp))



## pandas one hot 编码

In [None]:
import pandas as pd

# Small table with a categorical "job" column to be one-hot encoded.
onehot_names = [u"张三", u"李四", u"王五", u"李武"]
onehot_jobs = [u"工人", u"农民", u"军人", u"工人"]
df_onehot_example = pd.DataFrame({"name": onehot_names, "job": onehot_jobs})

# Replace the "job" column with one indicator column per distinct job.
pd.get_dummies(df_onehot_example, columns=["job"])

## matplotlib 作图与过拟合欠拟合

In [None]:
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

def true_fun(X):
    """Return cos(1.5 * pi * X), the noise-free curve the models try to fit."""
    angle = 1.5 * np.pi * X
    return np.cos(angle)

# Seed the RNG so the noisy samples, and therefore the figure, are repeatable.
np.random.seed(42)

n_samples = 30
degrees = [1, 4, 15]

# Sorted random inputs and targets: the true curve plus Gaussian noise
# scaled by 0.1.
X = np.sort(np.random.rand(n_samples))
y = true_fun(X) + np.random.randn(n_samples) * 0.1

# scikit-learn estimators expect a 2-D feature matrix, so view X as a column.
X_col = X.reshape(-1, 1)

plt.figure(figsize=(14, 5))
for i, degree in enumerate(degrees):
    ax = plt.subplot(1, len(degrees), i + 1)
    plt.setp(ax, xticks=(), yticks=())

    # Polynomial expansion followed by ordinary least squares.
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X_col, y)

    # 10-fold cross-validation; the scorer returns negated MSE, so the sign
    # is flipped again when the title is built.
    scores = cross_val_score(pipeline, X_col, y,
                             scoring="neg_mean_squared_error", cv=10)

    # Dense grid on [0, 1] for drawing the fitted and the true curve.
    X_test = np.linspace(0, 1, 100)
    y_model = pipeline.predict(X_test.reshape(-1, 1))
    plt.plot(X_test, y_model, label="Model")
    plt.plot(X_test, true_fun(X_test), label="True function")
    plt.scatter(X, y, edgecolor='b', s=20, label="Samples")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xlim((0, 1))
    plt.ylim((-2, 2))
    plt.legend(loc="best")
    title = "Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
        degree, -scores.mean(), scores.std())
    plt.title(title)
plt.show()

# 鸢尾花数据集

## 预览数据

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import datasets

%matplotlib inline
sns.set_style("white")

# Load the iris dataset and dump every field of the returned container,
# each under a banner carrying the field name.
data = datasets.load_iris()
for k, field in data.items():
    print("############\n##%s##\n############\n" % k)
    print(field)

In [None]:
# Feature matrix as a DataFrame with the dataset's feature names as columns.
df = pd.DataFrame(data.data, columns=data.feature_names)

# Translate each integer target code into its species name.
species_lookup = data['target_names']
df['species'] = [species_lookup[code] for code in data.target]
df.head()

In [None]:
# Count rows per species. The output columns are named explicitly
# ('index' = species label, 'species' = number of rows) because the default
# names produced by value_counts().reset_index() differ between pandas 1.x
# ('index', 'species') and pandas >= 2.0 ('species', 'count'); pinning them
# keeps code that reads df_cnt['index'] / df_cnt['species'] working on both.
df_cnt = (
    df['species']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='species')
)
df_cnt

In [None]:
# Bar chart of df_cnt: column 'index' on the x axis, column 'species' on the y axis.
sns.barplot(x='index', y='species', data=df_cnt)

### 均值方差

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Reshape to long format: one row per (species, variable, value) triple.
df.melt(id_vars=['species'])

### 正态分布检验

In [None]:
from scipy import stats

# One probability plot per feature (first four feature names), arranged in a
# 2x2 grid; scipy's probplot compares against a normal distribution by default.
for i, name in enumerate(data.feature_names[:4]):
    ax = plt.subplot(2, 2, i + 1)
    stats.probplot(df[name], plot=ax)
    ax.set_title(name)


### 分组观察

In [None]:
# Per-species mean and variance of every feature.
# The aggregations are given by name: passing np.mean / np.var as callables is
# deprecated in pandas >= 2.1 (pandas currently substitutes its own mean/var,
# but will later call the NumPy functions directly, which would switch the
# variance from the sample (ddof=1) to the population (ddof=0) estimate).
# The string names keep today's results and the 'mean' / 'var' column labels.
(
    pd.melt(df, id_vars=['species'])
    .pivot_table(index=['species'], columns=['variable'], aggfunc=['mean', 'var'])
)

In [None]:
# Pairwise scatter plots of all features, coloured by species.
sns.pairplot(data=df, hue="species")

In [None]:
fig = plt.figure(figsize=(12, 4))

# Probability plot of the third feature, drawn separately for each of the
# first three species so that each class can be inspected on its own.
third_feature = data.feature_names[2]
for i, name in enumerate(data.target_names[:3]):
    ax = plt.subplot(1, 3, i + 1)
    species_values = df.loc[df['species'] == name, third_feature]
    stats.probplot(species_values, plot=ax)
    ax.set_title(name)


### 基于 PCA 数据降维

In [None]:
from sklearn.decomposition import PCA
# Fit a PCA on the first three features only, then project the same rows
# onto the principal axes.
df_sub = df[data.feature_names[:3]]
pca = PCA().fit(df_sub)
pca_result = pca.transform(df_sub)

# Scatter of the first two principal components, coloured by class code.
fig, ax = plt.subplots(figsize=(4, 4))
first_pc, second_pc = pca_result[:, 0], pca_result[:, 1]
ax.scatter(first_pc, second_pc, c=data.target, cmap=plt.cm.Set3)

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Draw the three selected features in 3-D together with a translucent surface
# built from the scaled PCA component vectors.
plane_show_size_ratio = 5
plane_show_shift = df_sub.mean().values   # per-feature mean, used to shift the surface
pca_score = pca.explained_variance_ratio_
V = pca.components_
l_pca_axis = V.T * plane_show_size_ratio

# For every row of the scaled, transposed component matrix build a 2x2 grid
# from its first two entries and their negated, reversed copy; the three grids
# serve as the X, Y and Z coordinates of the surface.
l_pca_plane = [
    np.r_[pca_axis[:2], -pca_axis[1::-1]].reshape(2, 2)
    for pca_axis in l_pca_axis
]

fig = plt.figure(figsize=(4, 4))
# Create the 3-D axes through the figure so they are actually attached to it:
# on matplotlib >= 3.4 a bare Axes3D(fig, ...) is no longer added to the
# figure automatically, which leaves the plot empty.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=150, azim=-34)
# The original call passed '.' as the 4th positional argument, which
# Axes3D.scatter binds to `zdir` (not to the marker); it is dropped here so
# the default zdir='z' is used explicitly and the rendering stays the same.
ax.scatter(df_sub.values[:, 0], df_sub.values[:, 1], df_sub.values[:, 2],
           c=data.target, cmap=plt.cm.Set3)
ax.plot_surface(l_pca_plane[0] + plane_show_shift[0],
                l_pca_plane[1] + plane_show_shift[1],
                l_pca_plane[2] + plane_show_shift[2], alpha=0.1)

### 基于流形假设降维

In [None]:
from sklearn.manifold import Isomap, MDS, SpectralEmbedding

# Compare three manifold-learning embeddings of the four iris features,
# side by side and coloured by class code.
n_components = 2
n_neighbors = 10
X = df.drop(['species'], axis=1)
color = data.target

# Estimator arguments are passed by keyword: recent scikit-learn releases make
# them keyword-only, so the positional form raises a TypeError there.
embeddings = [
    ("Isomap", Isomap(n_neighbors=n_neighbors, n_components=n_components)),
    ("MDS", MDS(n_components=n_components, max_iter=100, n_init=1)),
    ("SpectralEmbedding", SpectralEmbedding(n_components=n_components,
                                            n_neighbors=n_neighbors)),
]

fig = plt.figure(figsize=(12, 4))
for position, (title, estimator) in enumerate(embeddings, start=1):
    Y = estimator.fit_transform(X)
    ax = fig.add_subplot(1, len(embeddings), position)
    ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Set3)
    # Each panel is labelled with the method that produced it (the third
    # panel used to be mislabelled "Isomap").
    ax.set_title(title)

## 数据预处理

### 划分训练集验证集并进行标准化

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

# Use df as the working dataset: 80% becomes the training set and the
# remaining 20% the validation set.
df_train, df_val = train_test_split(df, train_size=0.8, random_state=0)

# Features: every column except the class label.
X_train = df_train.drop(columns=['species'])
X_val = df_val.drop(columns=['species'])
# Targets: the class label column.
Y_train = df_train.loc[:, 'species']
Y_val = df_val.loc[:, 'species']

# Estimate the scaling statistics on the training set only (fit), then apply
# that same transformation to both the training and the validation features.
X_scaler = StandardScaler().fit(X_train)
X_trainT = X_scaler.transform(X_train)
X_valT = X_scaler.transform(X_val)

In [None]:
# The scaler was fitted on the training set, so that set has zero mean and
# unit variance per feature; the validation set is not guaranteed to, but
# should not be far off.
for scaled in (X_trainT, X_valT):
    print(scaled.mean(axis=0), scaled.var(axis=0))

# 特征挖掘

## 观察特征后，基于数据性质手动组合重要特征

In [None]:
from sklearn.utils import shuffle
import matplotlib as mpl
from cycler import cycler
# Colour cycle: red for the first plotted series, blue for the second.
mpl.rcParams['axes.prop_cycle'] = cycler(color='rb')

# Synthetic two-class data in polar form: Class1 has radius in [4.5, 6.5),
# Class2 has radius in [0.5, 2.5).
np.random.seed(42)
pseudoNum1 = 300
pseudoNum2 = 300
np_pho1   = 4.5 + np.random.rand(pseudoNum1)*2
np_pho2   = 0.5 + np.random.rand(pseudoNum2)*2
# Angles drawn in degrees over [0, 360) and converted to radians.
# The earlier expression `rand*360 / 2*np.pi` evaluated as (rand*180)*pi
# because `/` and `*` associate left to right, which is not a
# degree-to-radian conversion.
np_theta1 = np.deg2rad(np.random.rand(pseudoNum1) * 360)
np_theta2 = np.deg2rad(np.random.rand(pseudoNum2) * 360)

# Polar -> Cartesian.
np_x1 = np_pho1 * np.cos(np_theta1)
np_y1 = np_pho1 * np.sin(np_theta1)
np_x2 = np_pho2 * np.cos(np_theta2)
np_y2 = np_pho2 * np.sin(np_theta2)

# Stack both classes, shuffle the rows reproducibly and renumber them 0..n-1.
pd_circ = shuffle(pd.DataFrame({
    "X" : list(np_x1) + list(np_x2),
    "Y" : list(np_y1) + list(np_y2),
    "label" : ["Class1"] * pseudoNum1 + ["Class2"] * pseudoNum2,
}), random_state=0).reset_index(drop=True)
# Untouched copy holding only the raw X / Y / label columns.
pd_circ0 = pd_circ.copy()
pd_circ.head()

In [None]:
# Plot the points of each class as dots, one series per class.
for sub in ("Class1", "Class2"):
    in_class = pd_circ['label'] == sub
    pd_sub = pd_circ.loc[in_class]
    plt.plot(pd_sub["X"], pd_sub["Y"], ".", label=sub)

plt.legend()

In [None]:
# Pairwise scatter plots of the raw coordinates, coloured by class label.
sns.pairplot(data=pd_circ, hue="label")

In [None]:
# Hand-crafted feature combinations: sum, product and squared radius.
col_x = pd_circ['X']
col_y = pd_circ['Y']
pd_circ['X_add_Y'] = col_x.add(col_y)
pd_circ['X_time_Y'] = col_x.mul(col_y)
pd_circ['X2_add_Y2'] = col_x.mul(col_x).add(col_y.mul(col_y))

# Re-draw the pair plot including the new columns.
sns.pairplot(pd_circ, hue="label")

## 基于正态分布的模型，可以用正态分布的特性理解非线性数据

In [None]:
# Per-class mean and variance of the raw X and Y coordinates
# (rows: variable, columns: statistic / 'value' / class label).
# The aggregations are given by name: passing np.mean / np.var as callables is
# deprecated in pandas >= 2.1 and would eventually switch the variance to
# NumPy's population (ddof=0) definition. The strings keep the current
# results and the 'mean' / 'var' column labels that are used to index the table.
pd_circ_melt = (
    pd_circ0
    .melt(id_vars=['label'])
    .pivot_table(index=['variable'], columns=['label'], aggfunc=['mean', 'var'])
)
pd_circ_melt

![](https://docs.scipy.org/doc/numpy/_images/math/3f40671c78b1cb1d6a6f4a306a2b39a6d55921cf.png)

In [None]:
# Pull the per-class statistics out of the pivot table as (2, 1) column
# vectors, one entry per class label column.
mean_table = pd_circ_melt['mean']['value']
var_table = pd_circ_melt['var']['value']

mean_X = mean_table.loc['X'].to_numpy().reshape(2, 1)
mean_Y = mean_table.loc['Y'].to_numpy().reshape(2, 1)
var_X = var_table.loc['X'].to_numpy().reshape(2, 1)
var_Y = var_table.loc['Y'].to_numpy().reshape(2, 1)

In [None]:
def _gaussian_pdf(values, mean, var):
    """Evaluate the normal density with the given mean and variance at ``values``.

    Ordinary NumPy broadcasting applies, so a 1-D array of n values combined
    with (k, 1) mean/variance columns yields a (k, n) array.
    """
    return 1 / np.sqrt(2*np.pi*var) * np.exp(-1*(values - mean)**2 / (2*var))


# Per-class likelihood of every point's X and Y coordinate. Broadcasting
# against the (2, 1) statistics replaces the former duplicate-and-reshape to
# a hard-coded (2, 600), so the cell no longer breaks when the number of
# generated points changes.
probX = _gaussian_pdf(pd_circ['X'].values, mean_X, var_X)
probY = _gaussian_pdf(pd_circ['Y'].values, mean_Y, var_Y)

In [None]:
# Joint likelihood per class (column 0 and column 1), treating X and Y as
# independent: one row per point.
pd2 = pd.DataFrame(probX.T*probY.T)

# Predict "Class2" where column 1 is strictly larger, otherwise "Class1".
pd_circ['pred'] = np.where(pd2[0] < pd2[1], "Class2", "Class1")
pd_circ

In [None]:
# Rows whose predicted class differs from the true label.
pd_circ.loc[pd_circ['label'] != pd_circ['pred']]

# 模型训练

## 交叉熵

In [None]:
# Binary cross-entropy of a sample whose true label is 0, as a function of the
# predicted probability np_yhat on [0, 1].
from scipy.special import xlogy, xlog1py

np_yhat = np.linspace(0, 1, 101)
y_true = 0
# xlogy(a, b) = a*log(b) and xlog1py(a, b) = a*log(1 + b), both defined as 0
# when a == 0. With plain NumPy the term 0*np.log(0) at np_yhat == 0 is NaN,
# which turned the first point of the curve into NaN instead of 0.
# The value at np_yhat == 1 is +inf (a certain but wrong prediction).
np_h = -(xlogy(y_true, np_yhat) + xlog1py(1 - y_true, -np_yhat))
# Plot the loss np_h against the predicted probability np_yhat.
plt.plot(np_yhat, np_h)

## 动手写简单的逻辑斯蒂回归

In [None]:
# Binary target: class 0 versus the other two classes merged into class 1.
# Work on a copy: relabelling `data.target` itself would silently change the
# dataset (and every array aliasing it, such as the colours used in the
# plots), so re-running earlier cells would give different results.
y = data.target.copy()
y[y == 2] = 1

# Design matrix: the raw features plus a trailing column of ones (intercept).
X = np.hstack([data.data, np.ones((len(y), 1))])
C = 1          # weight applied to the data term of the gradient
alpha = 0.1    # gradient step size

In [None]:
# Ten steps of gradient ascent on the logistic-regression log-likelihood,
# starting from reproducible random weights.
np.random.seed(42)
omega = np.random.random(X.shape[1]).reshape(5, 1)
y_col = y.reshape(-1, 1)
for i in range(10):
    logits = X.dot(omega)
    y_hat = 1 / (1 + np.exp(-logits))      # sigmoid of the linear score
    residual = y_col - y_hat
    dL = X.T.dot(C * residual)             # gradient w.r.t. omega
    omega += alpha * dL

omega

In [None]:
# Predicted probability (sigmoid of the linear score) for every sample, in row order.
sample_scores = X.dot(omega)
plt.plot(1 / (1 + np.exp(-sample_scores)))

### 考虑 L2 正则化

In [None]:
# Gradient ascent on the L2-regularised objective
#     C * log-likelihood(omega) - 0.5 * ||omega||^2
# whose gradient is  C * X^T (y - y_hat) - omega.
np.random.seed(42)
omega = np.random.random(X.shape[1]).reshape(-1, 1)
for i in range(10):
    y_hat = 1 / (1 + np.exp(-X.dot(omega)))
    # The penalty term must be subtracted: since the update below *adds* the
    # gradient, `+ omega` would push the weights away from zero, i.e. the
    # opposite of regularisation.
    dL = X.T.dot(C * (y.reshape(-1, 1) - y_hat)) - omega
    omega += alpha * dL

omega

In [None]:
# Predicted probability (sigmoid of the linear score) for every sample, in row order.
sample_scores = X.dot(omega)
plt.plot(1 / (1 + np.exp(-sample_scores)))

## 使用 sklearn 库进行逻辑斯蒂回归

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Same task with scikit-learn: fit on the full data and plot the predicted
# class of every sample, in row order.
model = LogisticRegression(C=1).fit(X, y)
predicted_labels = model.predict(X)
plt.plot(predicted_labels)

## 10折交叉验证与网格搜索 

In [None]:
from sklearn.model_selection import GridSearchCV
# 10-fold cross-validated grid search over the regularisation strength C.
parameters = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
model = LogisticRegression()
# return_train_score=True is required for cv_results_ to contain
# 'mean_train_score' (training scores are not computed by default), which the
# train-vs-test curve reads.
clf = GridSearchCV(model, parameters, cv=10, return_train_score=True)
clf.fit(X, y)

In [None]:
# Mean cross-validation score on the training and test folds against log10(C).
c_grid = np.array([0.0001, 0.001, 0.01, 0.1, 1, 10, 100])
log_c = np.log10(c_grid)
for result_key, curve_label in (('mean_train_score', "train"),
                                ('mean_test_score', "test")):
    plt.plot(log_c, clf.cv_results_[result_key], label=curve_label)
plt.xlabel("log10 C")
plt.legend()
plt.ylabel("Cross Validation Accuracy")

## AUC值用于评价数据分布不均情况下的模型质量

In [None]:
from sklearn.metrics import roc_curve,auc
np.random.seed(42)
# Ground truth: 10,000 patients of whom only the last 10 are actually ill.
np_real = np.zeros(10000, dtype=bool)
np_real[-10:] = True
real_as_int = np.array(np_real, dtype=int)

# Prediction 1: every score is below 0.1, i.e. everybody is called healthy
# (9,990 of 10,000 correct -> 99.90% accuracy at any threshold above 0.1).
np_pred_allf = 0.1*np.random.random(10000)

# Prediction 2: same scores, but the 10 ill patients get 0.99
# (100% accuracy at the same threshold).
np_pred_true = np_pred_allf.copy()
np_pred_true[-10:] = 0.99

# ROC curve and area under it for each set of scores.
fpr, tpr, thresholds = roc_curve(real_as_int, np_pred_allf)
AUC_value = auc(fpr, tpr)

fpr2, tpr2, thresholds2 = roc_curve(real_as_int, np_pred_true)
AUC_value2 = auc(fpr2, tpr2)

# The accuracies are nearly identical, yet the AUC values differ widely.
print(AUC_value, AUC_value2)

In [None]:
# Prepend a 0 to both coordinate arrays of the second curve.
fpr2 = np.concatenate(([0], fpr2))
tpr2 = np.concatenate(([0], tpr2))

plt.figure(figsize=(5,5))
lw = 2
# One line per ROC curve, each labelled with its AUC.
roc_curves = (
    (fpr, tpr, 'darkorange', 'ROC curve 1 (area = %0.2f)' % AUC_value),
    (fpr2, tpr2, 'g', 'ROC curve 2 (area = %0.2f)' % AUC_value2),
)
for curve_fpr, curve_tpr, curve_color, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=lw, label=curve_label)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()