# 读取数据

In [1]:
import pandas as pd

# Location of the raw wine data file on the UCI machine-learning repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'

# NOTE(review): the header rendered for this cell is made of numeric values
# (1, 14.23, 1.71, ...), which suggests the file has no header row and the
# first record is being consumed as column names. If that record should be
# kept as data, pass header=None -- confirm before changing, since every
# stored output further down would shift by one row.
wine = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows.
wine.head()

Unnamed: 0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
0,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
1,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
2,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
3,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735
4,1,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450


# 删除 某几列  

首先看一下drop函数

DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')

这是drop函数的所有参数

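labels 是指要删除的标签，可以是单个标签，也可以是列表形式的多个标签；axis 是指沿哪一个轴删除（0 表示行，1 表示列）；index 和 columns 分别直接指定要删除的行标签和列标签；level 是指层级，针对多重索引的情况；inplace 表示是否直接修改原来的 DataFrame；errors 表示标签不存在时是否报错。更详细的说明可以参阅官网 https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html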


In [2]:
# Remove seven columns, selected by position (0, 3, 6, 8, 11, 12, 13); the
# positional lookup is translated to labels and handed to drop() via the
# `columns` keyword, so a new frame without those columns is returned.
wine = wine.drop(columns=wine.columns[[0, 3, 6, 8, 11, 12, 13]])

# Preview the remaining columns.
wine.head()

Unnamed: 0,14.23,1.71,15.6,127,3.06,2.29,5.64
0,13.2,1.78,11.2,100,2.76,1.28,4.38
1,13.16,2.36,18.6,101,3.24,2.81,5.68
2,14.37,1.95,16.8,113,3.49,2.18,7.8
3,13.24,2.59,21.0,118,2.69,1.82,4.32
4,14.2,1.76,15.2,112,3.39,1.97,6.75


# 重命名列名

In [3]:
# Give the seven remaining columns descriptive names, in positional order.
wine.columns = [
    'alcohol',
    'malic_acid',
    'alcalinity_of_ash',
    'magnesium',
    'flavanoids',
    'proanthocyanins',
    'hue',
]

# Preview the frame with its new header.
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,13.2,1.78,11.2,100,2.76,1.28,4.38
1,13.16,2.36,18.6,101,3.24,2.81,5.68
2,14.37,1.95,16.8,113,3.49,2.18,7.8
3,13.24,2.59,21.0,118,2.69,1.82,4.32
4,14.2,1.76,15.2,112,3.39,1.97,6.75


# 把 alcohol 列的前三行赋值为 NaN

In [4]:
import numpy as np

# Blank out the first three rows (positions 0, 1, 2) of the first column,
# which is `alcohol` after the rename above.
wine.iloc[:3, 0] = np.nan

# Preview the result; the first three alcohol values are now missing.
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,,1.78,11.2,100,2.76,1.28,4.38
1,,2.36,18.6,101,3.24,2.81,5.68
2,,1.95,16.8,113,3.49,2.18,7.8
3,13.24,2.59,21.0,118,2.69,1.82,4.32
4,14.2,1.76,15.2,112,3.39,1.97,6.75


# 把 magnesium 列的第 3 到 4 行赋值为 NaN

In [5]:
# `magnesium` is parsed as an integer column (it is displayed as 100, 101, ...
# before this cell), and an integer column cannot hold NaN. Writing NaN into
# it relies on pandas silently upcasting the column to float, which is
# deprecated in recent pandas releases. Cast explicitly first so the
# assignment is well defined on every version; the resulting column is
# float64 either way.
wine['magnesium'] = wine['magnesium'].astype('float64')

# Blank out positions 2 and 3 (the third and fourth rows) of column 3,
# i.e. `magnesium`.
wine.iloc[2:4, 3] = np.nan

# Preview the result.
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,,1.78,11.2,100.0,2.76,1.28,4.38
1,,2.36,18.6,101.0,3.24,2.81,5.68
2,,1.95,16.8,,3.49,2.18,7.8
3,13.24,2.59,21.0,,2.69,1.82,4.32
4,14.2,1.76,15.2,112.0,3.39,1.97,6.75


# 填充 缺失数据

In [6]:
# Replace missing values with fixed defaults: 10 for alcohol, 100 for magnesium.
#
# The filled column is assigned back to the frame instead of calling
# `wine.<col>.fillna(..., inplace=True)`. The in-place form mutates a Series
# obtained through attribute access; under pandas Copy-on-Write that Series is
# a temporary copy, so the frame itself would be left unchanged (pandas 2.x
# already warns about this pattern).
wine['alcohol'] = wine['alcohol'].fillna(10)
wine['magnesium'] = wine['magnesium'].fillna(100)

# Preview the result; no NaN should remain in either column.
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,10.0,1.78,11.2,100.0,2.76,1.28,4.38
1,10.0,2.36,18.6,101.0,3.24,2.81,5.68
2,10.0,1.95,16.8,100.0,3.49,2.18,7.8
3,13.24,2.59,21.0,100.0,2.69,1.82,4.32
4,14.2,1.76,15.2,112.0,3.39,1.97,6.75


# 缺失值计数

In [7]:
# Number of missing values per column (isna is the same check as isnull).
wine.isna().sum()

alcohol              0
malic_acid           0
alcalinity_of_ash    0
magnesium            0
flavanoids           0
proanthocyanins      0
hue                  0
dtype: int64

# 使用dropna  删除含有nan 的行 

使用DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
参数说明：

axis:
axis=0: 删除包含缺失值的行

axis=1: 删除包含缺失值的列

how: 与axis配合使用

how='any'：只要有缺失值出现，就删除该行或列

how='all'：所有的值都缺失，才删除该行或列

thresh： axis中至少有thresh个非缺失值，否则删除

比如 axis=0，thresh=10：表示如果该行中非缺失值的数量小于 10，将删除该行
subset: list

在哪些列中查看是否有缺失值

inplace: 是否在原数据上操作。如果为真，则直接修改原数据并返回 None；否则返回一个去掉了缺失值的新副本


In [8]:
# Keep only the rows that contain no missing value in any column.
wine = wine.dropna(how='any', axis='index')

# Preview the result.
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,10.0,1.78,11.2,100.0,2.76,1.28,4.38
1,10.0,2.36,18.6,101.0,3.24,2.81,5.68
2,10.0,1.95,16.8,100.0,3.49,2.18,7.8
3,13.24,2.59,21.0,100.0,2.69,1.82,4.32
4,14.2,1.76,15.2,112.0,3.39,1.97,6.75


In [9]:
# Re-introduce missing values at positions 2 and 3 of column 3 (`magnesium`)
# so the row-removal examples that follow have something to remove.
wine.iloc[[2, 3], 3] = np.nan

# Preview the result.
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,10.0,1.78,11.2,100.0,2.76,1.28,4.38
1,10.0,2.36,18.6,101.0,3.24,2.81,5.68
2,10.0,1.95,16.8,,3.49,2.18,7.8
3,13.24,2.59,21.0,,2.69,1.82,4.32
4,14.2,1.76,15.2,112.0,3.39,1.97,6.75


# 删除 某列含有nan 的行

In [10]:
# Select the rows whose `magnesium` value is present. The result is only
# displayed, not assigned, so `wine` itself is left untouched.
wine[wine['magnesium'].notna()]


Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,10.00,1.78,11.2,100.0,2.76,1.28,4.38
1,10.00,2.36,18.6,101.0,3.24,2.81,5.68
4,14.20,1.76,15.2,112.0,3.39,1.97,6.75
5,14.39,1.87,14.6,96.0,2.52,1.98,5.25
6,14.06,2.15,17.6,121.0,2.51,1.25,5.05
...,...,...,...,...,...,...,...
172,13.71,5.65,20.5,95.0,0.61,1.06,7.70
173,13.40,3.91,23.0,102.0,0.75,1.41,7.30
174,13.27,4.28,20.0,120.0,0.69,1.35,10.20
175,13.17,2.59,20.0,120.0,0.68,1.46,9.30


# 或者这样 

In [11]:
# Look up the index labels of the rows where `magnesium` is missing and drop
# those rows from `wine` in place.
wine.drop(index=wine.index[wine['magnesium'].isna()], inplace=True)


In [12]:
# Display the frame after the in-place drop in the previous cell.
wine

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,10.00,1.78,11.2,100.0,2.76,1.28,4.38
1,10.00,2.36,18.6,101.0,3.24,2.81,5.68
4,14.20,1.76,15.2,112.0,3.39,1.97,6.75
5,14.39,1.87,14.6,96.0,2.52,1.98,5.25
6,14.06,2.15,17.6,121.0,2.51,1.25,5.05
...,...,...,...,...,...,...,...
172,13.71,5.65,20.5,95.0,0.61,1.06,7.70
173,13.40,3.91,23.0,102.0,0.75,1.41,7.30
174,13.27,4.28,20.0,120.0,0.69,1.35,10.20
175,13.17,2.59,20.0,120.0,0.68,1.46,9.30


# 参考
https://blog.csdn.net/dss_dssssd/article/details/82814673
https://blog.csdn.net/weixin_42575020/article/details/95338407
