# mmhelper

---

## Medical Manuscript Helper

## 医学论文助手

### 1. 导入数据
* 将pd.read_csv('...')内数据文件链接替换为相关数据文件的网址
* 数据第一列为分组标识

In [5]:
import pandas as pd
# Load the study table from CSV into `data`; every later cell reads from this frame.
data = pd.read_csv('./data/duct/duct.csv') # for Excel workbooks use pd.read_excel('...') instead

### 2. 数据基本信息

* 数据前5项

In [6]:
# Preview the first five rows of the loaded table.
data.head(5)

Unnamed: 0,group,age,menopause,gestation,pain,ectasia,multiple,distance,size
0,0,47,0,1,1,0,1,0.0,1.0
1,0,47,0,1,1,0,1,0.0,0.0
2,0,44,0,1,0,0,1,0.0,0.0
3,0,44,0,1,0,0,1,,1.0
4,0,44,0,1,0,0,1,0.0,1.0


* 行数，列数

In [None]:
# (number of rows, number of columns) of the loaded table.
data.shape

* 数据基本特征

In [None]:
# Print a per-column overview (non-null counts and dtypes) of the table.
data.info()
# data.dtypes # column dtypes only

* 数据基本统计信息

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

* 分组情况（按第一列分组）

In [None]:
# Count the rows in every (group, gestation) combination; the result is a
# Series with a two-level index that the chi-square cell reads from.
chisq = data.groupby(['group', 'gestation']).size()
print(chisq)

In [None]:
# Contingency table of group (rows) against gestation (columns),
# with row/column totals appended by margins=True.
pd.crosstab(index=data['group'], columns=data['gestation'], margins=True)

* 卡方检验（2*2表格）
    - 所有的理论数T≥5并且总样本量n≥40，用Pearson卡方进行检验。
    - 如果理论数T＜5但T≥1，并且n≥40，用连续性校正的卡方进行检验。
    - 如果有理论数T＜1或n＜40，则用Fisher’s检验。
    - 非2*2表格直接使用stats.chi2_contingency(chisq_matrix)

* R×C表卡方检验应用条件：
    - R×C表中理论数小于5的格子不能超过1／5；
    - 不能有小于1的理论数。如果实验中有不符合R×C表的卡方检验，可以通过增加样本数、列合并来实现。

In [None]:
from scipy import stats
import numpy as np
# Assemble the 2x2 contingency table from the grouped counts:
# rows are group 0/1, columns are gestation 0/1.
chisq_matrix = np.array([[chisq[row, col] for col in (0, 1)] for row in (0, 1)])
# Pearson chi-square test: continuity correction is switched off.
stats.chi2_contingency(chisq_matrix, correction=False)
# stats.chi2_contingency(chisq_matrix) # with continuity correction
# stats.fisher_exact(chisq_matrix)

* 小数位数设置

In [None]:
# Show floats with 3 decimal places in printed/rendered frames.
# The fully qualified key is required: the short form 'precision' also matches
# 'styler.format.precision' on newer pandas and is rejected as ambiguous.
pd.set_option('display.precision', 3)

* 数据相关性

In [None]:
# Pairwise correlation matrix between the columns (pandas' default method is Pearson).
data.corr()

* 数据高斯分布偏离

In [None]:
# Per-column skewness; values far from 0 indicate an asymmetric (non-Gaussian) distribution.
data.skew()

### 3. 数据可视化

In [None]:
from matplotlib import pyplot
# IPython magic (valid only inside a Jupyter/IPython session): render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
import seaborn as sns

* 单变量直方图

In [None]:
# One histogram per column on a 10x10 inch figure; the trailing semicolon
# suppresses the notebook's text echo of the returned axes.
data.hist(figsize=(10,10));

* 单变量密度图

In [None]:
# Density curve per column, each on its own axes with independent x ranges;
# layout=(3,3) leaves room for at most nine columns.
data.plot(kind='density', subplots=True, figsize=(10,10), layout=(3,3), sharex=False);

* 单变量箱图

In [None]:
# Box plot per column, each on its own axes with independent x ranges;
# layout=(3,3) leaves room for at most nine columns.
data.plot(kind='box', subplots=True, figsize=(10,10), layout=(3,3), sharex=False);

* 多变量相关矩阵图

In [None]:
import numpy as np
# Heat-map of the pairwise correlation matrix, colour-scaled to [-1, 1].
correlations = data.corr()
fig = pyplot.figure()
# The subplot spec is passed as the integer 111 (1 row, 1 column, first axes);
# the string form '111' is rejected by newer Matplotlib releases.
ax = fig.add_subplot(111)
ax.matshow(correlations, vmin=-1, vmax=1)
# Derive ticks and labels from the correlation matrix itself so the tick count
# always equals the label count, whatever the number of columns.
ticks = np.arange(len(correlations.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(correlations.columns)
ax.set_yticklabels(correlations.columns)
fig.set_size_inches(10,10)

* 散点矩阵图

In [None]:
# pd.plotting.scatter_matrix(data, figsize=(15,15));

### 4. 数据预处理

* 数据缺失值处理

In [None]:
# Keep only complete cases: any row containing at least one missing value is dropped.
traindata = data.dropna(axis=0, how='any')

* 分离数据

In [None]:
from sklearn.model_selection import train_test_split
# Work on the raw NumPy values of the cleaned table.
array = traindata.values
# Column 0 is the target (group label); every remaining column is a feature.
Y, X = array[:, 0], array[:, 1:]

1. 调整尺度

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# transformer = MinMaxScaler(feature_range=(0,1))
# X = transformer.fit_transform(X)

2. 正态化数据

In [None]:
# from sklearn.preprocessing import StandardScaler
# transformer = StandardScaler().fit(X)
# X = transformer.transform(X)

3. 标准化数据

In [None]:
# from sklearn.preprocessing import Normalizer
# transformer = Normalizer().fit(X)
# X = transformer.transform(X)

4. 二值数据

In [None]:
# from sklearn.preprocessing import Binarizer
# transformer = Binarizer().fit(X)
# X = transformer.transform(X)

### 5. 特征选择

* 单变量特征选择

In [None]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# testchi = chi2(X, Y)
# test = SelectKBest(score_func=chi2, k=2)
# fit = test.fit(X, Y)
# print(fit.scores_)
# features = fit.transform(X)
# test.get_support()

* 递归特征消除（RFE）

In [None]:
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression(solver='lbfgs')
# rfe = RFE(model, n_features_to_select=3)
# fit = rfe.fit(X, Y)
# print(fit.n_features_)
# print(fit.support_)
# print(fit.ranking_)

* 主要成分分析（PCA）

In [None]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components=3)
# fit = pca.fit(X)
# print(fit.explained_variance_ratio_)
# print(fit.components_)

* 特征重要性

In [None]:
# from sklearn.ensemble import ExtraTreesClassifier
# model = ExtraTreesClassifier(n_estimators=100)
# fit = model.fit(X, Y)
# print(fit.feature_importances_)

### 6. 评估算法

* 数据分组

In [None]:
# Hold out 20% of the rows as the test set.
testsize = 0.2
# Fixed seed so the split is reproducible; the cross-validation cell reuses it.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=testsize, random_state=seed)

* 评估模型

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Candidate classifiers, keyed by a short label. Insertion order is preserved,
# so `results` below lines up with this order.
models = {
    'LR': LogisticRegression(solver='liblinear'),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': KNeighborsClassifier(),
    'CART': DecisionTreeClassifier(),
    'NB': GaussianNB(),
    'SVM': SVC(gamma='scale'),
}

# One shared 10-fold splitter. random_state only has an effect when shuffling,
# and newer scikit-learn raises ValueError if it is set while shuffle=False,
# so shuffling is enabled explicitly. With an integer seed every call to
# split() yields the same folds, so all models are scored on identical folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Per-model array of the 10 fold accuracies on the training portion.
results = []
for key in models:
    cv_results = cross_val_score(models[key], X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)

* 箱图比较算法

In [None]:
# One box per entry of `results`, i.e. per model in the insertion order of
# `models` (LR, LDA, KNN, CART, NB, SVM); the semicolon suppresses the text echo.
pyplot.boxplot(results);

### 7. 预测

In [None]:
# Train the SVM on the training portion (fit returns the estimator itself),
# then label the held-out rows.
svm = SVC(gamma='scale').fit(X_train, y_train)
predictions = svm.predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_test, predictions)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, predictions)

In [None]:
# Per-class precision, recall, F1 and support on the held-out rows.
print(classification_report(y_test, predictions))