In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix 
from sklearn.feature_selection import RFE 
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectPercentile 

In [2]:
# 读取数据
ori_df = pd.read_table('data_for_ml_after_filter', header = None, sep=' ', dtype = 'str')

In [3]:
# replace nan str to np.nan
replace_df = ori_df.replace('nan', np.nan)

In [4]:
# 分特征和标签
all_features = replace_df.iloc[:, 1:].values
all_labels = replace_df.iloc[:, :1].values.ravel()

In [5]:
# impute缺失值
imputer = KNNImputer(n_neighbors=5, weights="uniform")
all_features_imputed_df=imputer.fit_transform(all_features)

In [6]:
# 数据分割
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    all_features_imputed_df, all_labels, random_state=1, test_size=0.2)

In [7]:
# 使用f_classif（默认值）和SelectPercentile来选择1%的特征 
select = SelectPercentile(percentile=1) 
select.fit(X_train, y_train) 

SelectPercentile(percentile=1)

In [8]:
# 对训练集进行变换 
X_train_selected = select.transform(X_train) 
X_test_selected = select.transform(X_test) 
 
print("X_train.shape: {}".format(X_train.shape)) 
print("X_train_selected.shape: {}".format(X_train_selected.shape))
print("X_test_selected.shape: {}".format(X_test_selected.shape))

X_train.shape: (182, 117382)
X_train_selected.shape: (182, 1174)
X_test_selected.shape: (46, 1174)


In [20]:
forest = RandomForestClassifier(n_estimators=3000, max_features=300, random_state=1) 
forest.fit(X_train_selected, y_train) 
print("Accuracy on training set: {:.4f}".format(forest.score(X_train_selected, y_train))) 
print("Accuracy on test set: {:.4f}".format(forest.score(X_test_selected, y_test)))

Accuracy on training set: 1.0000
Accuracy on test set: 0.9348


In [10]:
from sklearn.ensemble import GradientBoostingClassifier 
gbrt = GradientBoostingClassifier(random_state=0) 
gbrt.fit(X_train_selected, y_train) 
 
print("Accuracy on training set: {:.4f}".format(gbrt.score(X_train_selected, y_train))) 
print("Accuracy on test set: {:.4f}".format(gbrt.score(X_test_selected, y_test)))

Accuracy on training set: 1.0000
Accuracy on test set: 0.8261


In [12]:
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_selected, y_train)
print("Accuracy on training set: {:.4f}".format(knn.score(X_train_selected, y_train))) 
print("Accuracy on test set: {:.4f}".format(knn.score(X_test_selected, y_test)))

Accuracy on training set: 1.0000
Accuracy on test set: 0.5217


In [13]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_selected, y_train)
print("Accuracy on training set: {:.4f}".format(gnb.score(X_train_selected, y_train))) 
print("Accuracy on test set: {:.4f}".format(gnb.score(X_test_selected, y_test)))

Accuracy on training set: 0.9945
Accuracy on test set: 0.3913


In [14]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train_selected, y_train)
print("Accuracy on training set: {:.4f}".format(svc.score(X_train_selected, y_train))) 
print("Accuracy on test set: {:.4f}".format(svc.score(X_test_selected, y_test)))

Accuracy on training set: 0.7527
Accuracy on test set: 0.5217


In [23]:
from sklearn.tree import DecisionTreeClassifier 
tree = DecisionTreeClassifier(random_state=1) 
tree.fit(X_train_selected, y_train) 
print("Accuracy on training set: {:.4f}".format(tree.score(X_train_selected, y_train))) 
print("Accuracy on test set: {:.4f}".format(tree.score(X_test_selected, y_test)))

Accuracy on training set: 1.0000
Accuracy on test set: 0.7174
