In [1]:
import os
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

#### SMOTE sampling

In [2]:
def Smoter(X, y, is_random=False):
    """Oversample (X, y) with SMOTE and return the resampled pair.

    Parameters
    ----------
    X : feature matrix handed unchanged to the SMOTE sampler.
    y : target labels handed unchanged to the SMOTE sampler.
    is_random : bool, default False
        When truthy, the seed is drawn from NumPy's global random state;
        otherwise the fixed seed 0 is used so the result is reproducible.

    Returns
    -------
    (X_smote, y_smote) : the resampled features and labels as returned by
        the sampler.
    """
    # Four seeds are still generated and printed (only index 2 is used) so
    # that the "rs:" log line and the consumption of NumPy's global random
    # state stay the same as before.
    if is_random:
        random_lst = [int(seed) for seed in np.random.randint(0, 1000, 4)]
    else:
        # Any falsy value (False, None, 0) now falls back to the fixed seed
        # instead of leaving `random_lst` undefined.
        random_lst = [0] * 4

    print("rs:", random_lst)

    # The former `kind=0.24` argument was dropped: `kind` only ever accepted
    # a string variant name and no longer exists in current imbalanced-learn
    # releases, so a float could never be a valid value.
    sm = SMOTE(random_state=random_lst[2])

    # `fit_sample` was renamed to `fit_resample`; prefer the new name and
    # fall back to the old one on legacy installations.
    resample = getattr(sm, "fit_resample", None)
    if resample is None:
        resample = sm.fit_sample
    X_smote, y_smote = resample(X, y)

    return X_smote, y_smote

# load all data

In [3]:
# Location of the raw freshwater CSV, relative to the current working directory.
path = "".join((os.getcwd(), '/../data/water/csv/20122018freshwater.csv'))

# Read the whole file into a DataFrame.
data = pd.read_csv(filepath_or_buffer=path, na_values=np.nan)

# Report the column dtypes followed by the (rows, columns) shape.
print(data.dtypes, data.shape, sep="\n")

水系              object
点位名称            object
河流名称            object
pH             float64
DO(mg/l)       float64
CODMn(mg/l)    float64
NH3-N(mg/l)    float64
本周水质            object
dtype: object
(33614, 8)


In [4]:
# Columns to discard before modelling (none at this stage).
drop_columns = list()

# Numeric measurement columns.
continuous_features = [
    'pH',
    'DO(mg/l)',
    'CODMn(mg/l)',
    'NH3-N(mg/l)',
]

# Categorical columns that get one-hot encoded.
cat_features = [
    '水系',
    '点位名称',
    '河流名称',
]

In [5]:
# One-hot encode the categorical columns.
# The indicator dtype is pinned to uint8: pandas 2.0 changed the default to
# bool, which would turn the 0/1 columns into True/False in the exported CSV.
data_dummies = pd.get_dummies(data, columns=cat_features, dtype=np.uint8)

In [6]:
# Discard unused features.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer
# accepts; rebinding instead of `inplace=True` keeps the step chain-friendly.
data_dummies = data_dummies.drop(columns=drop_columns)

In [7]:
# Remove every row that has at least one missing value.
data_dummies = data_dummies.dropna(axis='index', how='any')

In [8]:
# Sanity check: list rows that still contain a missing value (expected: none).
# A row-wise mask shows each affected row once; indexing with the 2-D boolean
# array repeated a row once per null cell.
data_dummies[data_dummies.isnull().any(axis=1)]

Unnamed: 0,pH,DO(mg/l),CODMn(mg/l),NH3-N(mg/l),本周水质,水系_内陆河流太湖流域,水系_太湖流域,水系_巢湖流域,水系_巢湖流域滇池流域,水系_松花江流域,...,河流名称_闽江,河流名称_颖河,河流名称_额尔古纳河,河流名称_额尔齐斯河,河流名称_饮马河,河流名称_鸭绿江,河流名称_黄河,河流名称_黎河,河流名称_黑茨河,河流名称_黑龙江


In [9]:
# Feature matrix: every column except the label (a DataFrame).
X = data_dummies.drop(columns=['本周水质'])
# Label column (a Series).
y = data_dummies.loc[:, '本周水质']

In [10]:
# Class balance of the label, heading first and counts on the following lines.
print("水质分布情况:", y.value_counts(), sep="\n")
# Number of columns per dtype in the encoded frame, preceded by a blank line.
print("\n各特征类型分布情况:", data_dummies.dtypes.value_counts(), sep="\n")

水质分布情况:
2    13272
3     8797
4     5472
1     2438
6     2146
5     1487
Name: 本周水质, dtype: int64

各特征类型分布情况:
uint8      338
float64      4
object       1
dtype: int64


In [11]:
# Preview the first five rows of the encoded frame.
data_dummies.head(n=5)

Unnamed: 0,pH,DO(mg/l),CODMn(mg/l),NH3-N(mg/l),本周水质,水系_内陆河流太湖流域,水系_太湖流域,水系_巢湖流域,水系_巢湖流域滇池流域,水系_松花江流域,...,河流名称_闽江,河流名称_颖河,河流名称_额尔古纳河,河流名称_额尔齐斯河,河流名称_饮马河,河流名称_鸭绿江,河流名称_黄河,河流名称_黎河,河流名称_黑茨河,河流名称_黑龙江
0,7.09,10.0,5.7,0.33,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6.94,12.0,5.4,0.4,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7.2,9.6,4.9,0.34,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6.8,11.6,6.3,0.59,4,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6.75,11.0,6.2,0.64,4,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Destination of the one-hot encoded data set, relative to the working directory.
output_path = "".join((os.getcwd(), '/../data/water/csv/20122018freshwater_dummies.csv'))
# Write without the index column, encoded as UTF-8.
data_dummies.to_csv(path_or_buf=output_path, index=False, encoding='utf-8')

In [13]:
# Also store a version of `data` that drops the categorical columns.
drop_columns = ['水系', '点位名称', '河流名称']
# `columns=` replaces the positional axis argument, which pandas 2.x no longer
# accepts. Rebinding with errors='ignore' (instead of an in-place drop) lets
# the cell be re-run without a KeyError once the columns are already gone.
data = data.drop(columns=drop_columns, errors='ignore')
# Drop rows with missing values.
data = data.dropna(axis=0)

In [14]:
# Sanity check: list rows that still contain a missing value (expected: none).
# A row-wise mask shows each affected row once; indexing with the 2-D boolean
# array repeated a row once per null cell.
data[data.isnull().any(axis=1)]

Unnamed: 0,pH,DO(mg/l),CODMn(mg/l),NH3-N(mg/l),本周水质


In [15]:
# Destination of the reduced data set, relative to the working directory.
output_path = "".join((os.getcwd(), '/../data/water/csv/20122018freshwater_four_feature.csv'))
# Write without the index column, encoded as UTF-8.
data.to_csv(path_or_buf=output_path, index=False, encoding='utf-8')