In [1]:
import os
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

hyperparameters

In [2]:
random_seed = 42
kf=10
score = 'f1_weighted'

#### somte sampling

In [3]:
def Smoter(X, y, is_random=False):
    if is_random == True:
        random_lst = list(np.random.randint(0, 1000, 4))
    elif is_random == False:
        random_lst = [0] * 4

    print("rs:", random_lst)
    sm = SMOTE(random_state=random_lst[2], kind = 0.24)
    X_smote, y_smote = sm.fit_sample(X, y)

    return X_smote, y_smote

# load data_four_features

In [4]:
path = os.getcwd()+'/../data/20122018freshwater_four_feature.csv'
data_four_features = pd.read_csv(path, na_values = np.nan)

print(data_four_features.dtypes)
print(data_four_features.shape)

pH             float64
DO(mg/l)       float64
CODMn(mg/l)    float64
NH3-N(mg/l)    float64
本周水质             int64
dtype: object
(33612, 5)


In [5]:
X = data_four_features.drop(['本周水质'], axis=1) # Series
y = data_four_features['本周水质'] # Series

In [6]:
print("水质分布情况:")
print(y.value_counts())
print("\n各特征类型分布情况:")
print(data_four_features.dtypes.value_counts())

水质分布情况:
2    13272
3     8797
4     5472
1     2438
6     2146
5     1487
Name: 本周水质, dtype: int64

各特征类型分布情况:
float64    4
int64      1
dtype: int64


In [7]:
data_four_features.head()

Unnamed: 0,pH,DO(mg/l),CODMn(mg/l),NH3-N(mg/l),本周水质
0,7.09,10.0,5.7,0.33,3
1,6.94,12.0,5.4,0.4,3
2,7.2,9.6,4.9,0.34,3
3,6.8,11.6,6.3,0.59,4
4,6.75,11.0,6.2,0.64,4


In [8]:
print("============ train_test_split ============")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                       stratify = y, random_state = random_seed)
print("67%% train: %d/%d, 33%% test: %d/%d" %(X_train.shape[0], X.shape[0], X_test.shape[0], X.shape[0]))

67% train: 26889/33612, 33% test: 6723/33612


### normalize  train data

fulfill the Na with median, then standardized the data, output type ndarray

In [9]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.fit_transform(X_test)

In [10]:
print("============ train_valid_split ============")

X_train2, X_valid, y_train2, y_valid = train_test_split(X_train, y_train, test_size=0.25, 
                                       stratify = y_train, random_state = random_seed)




## model selection based on 5 flod cross validation

In [11]:
models = [
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    LinearSVC(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=random_seed),
    ExtraTreesClassifier(random_state=random_seed),
]
# CV = 5

# entries = []
# for model in models:
#     model_name = model.__class__.__name__
#     accuracies = cross_val_score(model, X_train, y_train, scoring='f1_weighted', cv=CV)
#     for fold_idx, accuracy in enumerate(accuracies):
#         entries.append((model_name, fold_idx, accuracy))
# cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'f1_weighted'])

In [12]:
# print(cv_df)

In [13]:
# entries = []
# for model in models:
#     model_name = model.__class__.__name__
#     accuracies = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=CV)
#     for fold_idx, accuracy in enumerate(accuracies):
#         entries.append((model_name, fold_idx, accuracy))
# acc_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [14]:
# print(acc_df)

## metrics on test set

In [15]:
test_entries = []
for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train2, y_train2)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    acc = accuracy_score(y_test, y_pred)
    test_entries.append((model_name, f1, acc))

test_df = pd.DataFrame(test_entries, columns=['model_name', 'f1_weighted', 'accuracy'])

  'precision', 'predicted', average, warn_for)


In [16]:
print("model_test")
print(test_df)

model_test
                   model_name  f1_weighted  accuracy
0          LogisticRegression     0.620574  0.670980
1  LinearDiscriminantAnalysis     0.576099  0.625019
2                   LinearSVC     0.587685  0.640042
3      DecisionTreeClassifier     0.970106  0.969954
4         ExtraTreeClassifier     0.914437  0.914324
5                  GaussianNB     0.822296  0.824037
6        KNeighborsClassifier     0.897799  0.897813
7      RandomForestClassifier     0.976184  0.976052
8        ExtraTreesClassifier     0.962632  0.962517


In [17]:
for model in models:
    model_name = model.__class__.__name__
    y_pred = model.predict(X_test)
    print("=================" + model_name + "=================")
    print(classification_report(y_test, y_pred, digits=4))

    cm = confusion_matrix(y_test, y_pred)
    i=0
    acc_all = np.zeros(6)
    for c in cm:
        acc_all[i] = c[i]/np.sum(c)
        print("%d accuaracy: %f" %(i+1, acc_all[i]))
        i=i+1
    print("acc:", np.sum(y_test == y_pred)/y_pred.shape[0])
    print('f1_weighted', f1_score(y_test, y_pred, average='weighted'))
    
        
    f1 = f1_score(y_test, y_pred, average='weighted')
    acc = accuracy_score(y_test, y_pred)
#     print((model_name, f1, acc, acc_all[5]))
    test_entries.append((model_name, f1, acc, acc_all[5]))

test_df = pd.DataFrame(test_entries, columns=['model_name', 'f1_weighted', 'accuracy', "6_accuracy"])

             precision    recall  f1-score   support

          1     0.9333    0.0574    0.1081       488
          2     0.6672    0.9951    0.7988      2655
          3     0.6628    0.5517    0.6022      1760
          4     0.6360    0.4808    0.5476      1094
          5     0.0000    0.0000    0.0000       297
          6     0.7818    0.8019    0.7917       429

avg / total     0.6581    0.6710    0.6206      6723

1 accuaracy: 0.057377
2 accuaracy: 0.995104
3 accuaracy: 0.551705
4 accuaracy: 0.480804
5 accuaracy: 0.000000
6 accuaracy: 0.801865
acc: 0.6709802171649561
f1_weighted 0.6205742071342153
             precision    recall  f1-score   support

          1     0.0000    0.0000    0.0000       488
          2     0.6016    0.9992    0.7510      2655
          3     0.6036    0.3858    0.4707      1760
          4     0.7075    0.5283    0.6049      1094
          5     0.5361    0.2997    0.3844       297
          6     0.9902    0.4732    0.6404       429

avg / total  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


f1_weighted 0.8977994538909129
             precision    recall  f1-score   support

          1     0.9065    0.9939    0.9482       488
          2     0.9927    0.9755    0.9840      2655
          3     0.9835    0.9801    0.9818      1760
          4     0.9779    0.9698    0.9738      1094
          5     0.9295    0.9327    0.9311       297
          6     0.9593    0.9883    0.9736       429

avg / total     0.9767    0.9761    0.9762      6723

1 accuaracy: 0.993852
2 accuaracy: 0.975518
3 accuaracy: 0.980114
4 accuaracy: 0.969835
5 accuaracy: 0.932660
6 accuaracy: 0.988345
acc: 0.976052357578462
f1_weighted 0.9761838790295564
             precision    recall  f1-score   support

          1     0.8905    0.9836    0.9348       488
          2     0.9803    0.9721    0.9762      2655
          3     0.9710    0.9528    0.9619      1760
          4     0.9506    0.9497    0.9502      1094
          5     0.9189    0.9158    0.9174       297
          6     0.9701    0.9837    0

In [18]:
test_df

Unnamed: 0,model_name,f1_weighted,accuracy,6_accuracy
0,LogisticRegression,0.620574,0.67098,
1,LinearDiscriminantAnalysis,0.576099,0.625019,
2,LinearSVC,0.587685,0.640042,
3,DecisionTreeClassifier,0.970106,0.969954,
4,ExtraTreeClassifier,0.914437,0.914324,
5,GaussianNB,0.822296,0.824037,
6,KNeighborsClassifier,0.897799,0.897813,
7,RandomForestClassifier,0.976184,0.976052,
8,ExtraTreesClassifier,0.962632,0.962517,
9,LogisticRegression,0.620574,0.67098,0.801865
