In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# 鸢尾花数据集:load_iris()
# 手写数字数据集:load_digitals()
# 糖尿病数据集:load_diabetes()
# 乳腺癌数据集:load_breast_cancer()
# 波士顿房价数据集:load_boston()
# 体能训练数据集:load_linnerud()

# 载入原始数据

In [3]:
iris = pd.read_csv("../data/iris/iris.data", header=None)

adult_names = ['age', 'workclass', 'fnlwgt', 'education', 'educationnum', 'maritalstatus', 'occupation', 'relationship', 'race',
        'sex', 'capitalgain', 'capitalloss', 'hoursperweek', 'nativecountry', 'label']
adult = pd.read_csv("../data/adult/adult.data", header=None, names=adult_names)
adult_test = pd.read_csv("../data/adult/adult.test", header=None, skiprows=[0],names=adult_names)

letter = pd.read_csv("../data/letter/letter-recognition.data", header=None)

yeast = pd.read_csv("../data/yeast/yeast.data", header=None, sep="\s+")

# 数据预处理

https://anifacc.github.io/deeplearning/machinelearning/python/2017/08/14/dlwp-project01-iris-data/

### 处理iris

In [4]:
iris.head()
print("Iris", iris.shape)

Iris (150, 5)


In [5]:
le = preprocessing.LabelEncoder()

In [6]:
iris[4] = le.fit_transform(iris[4].values)
iris['label'] = iris[4]
iris = iris.drop([4])
iris.head()

Unnamed: 0,0,1,2,3,4,label
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
5,5.4,3.9,1.7,0.4,0,0


### 处理letter

In [7]:
print("Letter", letter.shape)
tmp = letter.pop(0)
letter.insert(16, 'label', tmp)

Letter (20000, 17)


In [8]:
letter['label'] = le.fit_transform(letter['label'].values)
letter.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,label
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8,19
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10,8
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9,3
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8,13
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10,6


### 处理adult

https://blog.csdn.net/haluoluo211/article/details/78943332

In [9]:
print("Adult_train:", adult.shape)
print("Adult_test:", adult_test.shape)

Adult_train: (32561, 15)
Adult_test: (16281, 15)


In [10]:
# 含有空缺值的样本删除
adult = adult.replace(' ?', np.nan).dropna()
adult_test = adult_test.replace(' ?', np.nan).dropna()

In [11]:
print("drop nan")
print("Adult_train:", adult.shape)
print("Adult_test:", adult_test.shape)

drop nan
Adult_train: (30162, 15)
Adult_test: (15060, 15)


In [12]:
adult_test['label'] = adult_test.label.replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})

> 字符串类型转化为数值类型,为了保证测试集和训练集的encoding类型一致，我们首先将两个表join，编码完成之后，在分开到原始的表中:

In [13]:
combined_set = pd.concat([adult, adult_test], axis=0)

In [14]:
# combined_set.info()

In [15]:
for feature in combined_set.columns: 
    if combined_set[feature].dtype == 'object': 
        combined_set[feature] = pd.Categorical(combined_set[feature]).codes

In [16]:
adult = combined_set[:adult.shape[0]]
adult_test = combined_set[adult.shape[0]:]
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationnum,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,nativecountry,label
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0


### 处理yeast

In [17]:
yeast = yeast.drop([0], axis=1)
print(yeast.shape)

(1484, 9)


In [18]:
yeast[yeast.shape[1]] = le.fit_transform(yeast[yeast.shape[1]].values)

In [19]:
yeast['label'] = yeast[9]
yeast = yeast.drop([9])
yeast.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,label
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,6,6
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,6,6
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,6,6
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,7,7
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,6,6


### 保存为csv

In [20]:
iris.to_csv('../data/iris/iris.csv', index=False, encoding="utf-8")
adult.to_csv('../data/adult/adult.csv', index=False, encoding="utf-8")
adult_test.to_csv('../data/adult/adult_test.csv', index=False, encoding="utf-8")
letter.to_csv('../data/letter/letter.csv', index=False, encoding="utf-8")
yeast.to_csv('../data/yeast/yeast.csv', index=False, encoding="utf-8")