In [0]:
import pandas as pd
import numpy as np
import io

In [0]:
data = pd.DataFrame({
    'id': np.arange(101, 111),
    'date': pd.date_range(start = '20200310', periods = 10),
    'money': [5, 4, 65, -10, 15, 20, 35, 16, 6, 20],
    'department': ['饮料', '饮料', '零食', '调味品', '水果', 'nan', '日用品', '蔬菜', '日用品', '零食'],
    'product': ['苏打水', '可乐', '牛肉干', '老干妈', '菠萝', '冰激凌', '洗面奶', '洋葱','牙膏','薯片'],
    'origin': ['China','China','US','China','Thailand','China','us','China','China','Japan']
})

In [0]:
data.to_csv("shopping.csv",index=False)

In [0]:
data = pd.read_csv('shopping.csv')

In [5]:
data.shape               #行数列数
data.dtypes              #所有列的数据类型
data['id'].dtypes        #某一列的数据类型
data.ndim                #数据维度
data.index               #行索引
data.columns             #列索引
data.values              #对象值

array([[101, '2020-03-10', 5, '饮料', '苏打水', 'China'],
       [102, '2020-03-11', 4, '饮料', '可乐', 'China'],
       [103, '2020-03-12', 65, '零食', '牛肉干', 'US'],
       [104, '2020-03-13', -10, '调味品', '老干妈', 'China'],
       [105, '2020-03-14', 15, '水果', '菠萝', 'Thailand'],
       [106, '2020-03-15', 20, nan, '冰激凌', 'China'],
       [107, '2020-03-16', 35, '日用品', '洗面奶', 'us'],
       [108, '2020-03-17', 16, '蔬菜', '洋葱', 'China'],
       [109, '2020-03-18', 6, '日用品', '牙膏', 'China'],
       [110, '2020-03-19', 20, '零食', '薯片', 'Japan']], dtype=object)

In [6]:
data.head() 
data.tail()
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          10 non-null     int64 
 1   date        10 non-null     object
 2   money       10 non-null     int64 
 3   department  9 non-null      object
 4   product     10 non-null     object
 5   origin      10 non-null     object
dtypes: int64(2), object(4)
memory usage: 608.0+ bytes


Unnamed: 0,id,money
count,10.0,10.0
mean,105.5,17.6
std,3.02765,20.576146
min,101.0,-10.0
25%,103.25,5.25
50%,105.5,15.5
75%,107.75,20.0
max,110.0,65.0


In [7]:
#查看某一列的唯一值
for i in data:
  print(i +":" + str(data[i].unique()))

id:[101 102 103 104 105 106 107 108 109 110]
date:['2020-03-10' '2020-03-11' '2020-03-12' '2020-03-13' '2020-03-14'
 '2020-03-15' '2020-03-16' '2020-03-17' '2020-03-18' '2020-03-19']
money:[  5   4  65 -10  15  20  35  16   6]
department:['饮料' '零食' '调味品' '水果' nan '日用品' '蔬菜']
product:['苏打水' '可乐' '牛肉干' '老干妈' '菠萝' '冰激凌' '洗面奶' '洋葱' '牙膏' '薯片']
origin:['China' 'US' 'Thailand' 'us' 'Japan']


In [8]:
data.isnull()
data['department'].isnull()

0    False
1    False
2    False
3    False
4    False
5     True
6    False
7    False
8    False
9    False
Name: department, dtype: bool

In [9]:
data.isnull().sum().sort_values(ascending = False)

department    1
origin        0
product       0
money         0
date          0
id            0
dtype: int64

In [10]:
data.sort_values(by='money', ascending=False)

Unnamed: 0,id,date,money,department,product,origin
2,103,2020-03-12,65,零食,牛肉干,US
6,107,2020-03-16,35,日用品,洗面奶,us
5,106,2020-03-15,20,,冰激凌,China
9,110,2020-03-19,20,零食,薯片,Japan
7,108,2020-03-17,16,蔬菜,洋葱,China
4,105,2020-03-14,15,水果,菠萝,Thailand
8,109,2020-03-18,6,日用品,牙膏,China
0,101,2020-03-10,5,饮料,苏打水,China
1,102,2020-03-11,4,饮料,可乐,China
3,104,2020-03-13,-10,调味品,老干妈,China


**空值处理**

pandas.DataFrame.fillna（value = None，method = None，inplace = False）

value：用于填充的值，可以是具体值、字典和数组，不能是列表；

method：填充方法，有 ffill 和 bfill 等；

inplace默认无False，如果为True，则将修改此对象上的所有其他视图。

In [0]:
data['department'].fillna(method = 'ffill') # 向上填充
data['department'].fillna(method = 'bfill')
data['department'].fillna(value='冷冻食品', inplace=True) #替换为具体值，并在原对象上修改

In [12]:
#空格处理，只针对object类型数据
for i in data:
  if pd.api.types.is_object_dtype(data[i]):
    data[i]= data[i].str.strip() #去空格


data['origin'].unique  #验证

<bound method Series.unique of 0       China
1       China
2          US
3       China
4    Thailand
5       China
6          us
7       China
8       China
9       Japan
Name: origin, dtype: object>

In [13]:
#首字母大写
data['origin'].str.title() 
data['origin'].str.capitalize()
data['origin'].str.upper() #全部大写

0       CHINA
1       CHINA
2          US
3       CHINA
4    THAILAND
5       CHINA
6          US
7       CHINA
8       CHINA
9       JAPAN
Name: origin, dtype: object

In [14]:
data['origin'].str.lower() #全部小写

0       china
1       china
2          us
3       china
4    thailand
5       china
6          us
7       china
8       china
9       japan
Name: origin, dtype: object

In [0]:
#先替换后大小写，先大小写后替换结果不一样
data['origin'].replace('us', 'US', inplace=True)

In [16]:
data['origin']

0       China
1       China
2          US
3       China
4    Thailand
5       China
6          US
7       China
8       China
9       Japan
Name: origin, dtype: object

In [17]:
data['money'].replace(-10, np.nan, inplace = True)
data['money'].replace(np.nan, data['money'].mean(), inplace= True)
data['money']

0     5.000000
1     4.000000
2    65.000000
3    20.666667
4    15.000000
5    20.000000
6    35.000000
7    16.000000
8     6.000000
9    20.000000
Name: money, dtype: float64

In [18]:
data1 = data[data.origin != 'US'] #去掉origin为us的行 /方法1
data1

Unnamed: 0,id,date,money,department,product,origin
0,101,2020-03-10,5.0,饮料,苏打水,China
1,102,2020-03-11,4.0,饮料,可乐,China
3,104,2020-03-13,20.666667,调味品,老干妈,China
4,105,2020-03-14,15.0,水果,菠萝,Thailand
5,106,2020-03-15,20.0,冷冻食品,冰激凌,China
7,108,2020-03-17,16.0,蔬菜,洋葱,China
8,109,2020-03-18,6.0,日用品,牙膏,China
9,110,2020-03-19,20.0,零食,薯片,Japan


In [19]:
data2 = data[(data != 'Japan').all(1)] #去掉所有包含japan的行/不等于Japan的行为真，则返回 /方法2
data2

  res_values = method(rvalues)


Unnamed: 0,id,date,money,department,product,origin
0,101,2020-03-10,5.0,饮料,苏打水,China
1,102,2020-03-11,4.0,饮料,可乐,China
2,103,2020-03-12,65.0,零食,牛肉干,US
3,104,2020-03-13,20.666667,调味品,老干妈,China
4,105,2020-03-14,15.0,水果,菠萝,Thailand
5,106,2020-03-15,20.0,冷冻食品,冰激凌,China
6,107,2020-03-16,35.0,日用品,洗面奶,US
7,108,2020-03-17,16.0,蔬菜,洋葱,China
8,109,2020-03-18,6.0,日用品,牙膏,China


In [24]:
data['origin'].drop_duplicates() #默认删除后面出现的重复值，既保留第一次出现的重复值

0       China
2          US
4    Thailand
9       Japan
Name: origin, dtype: object

In [25]:
data['origin'].drop_duplicates(keep = 'last') #保留最后一次出现的重复值

4    Thailand
6          US
8       China
9       Japan
Name: origin, dtype: object

In [28]:
data['id'].astype('str')  #将id列的类型转换为字符串类型

0    101
1    102
2    103
3    104
4    105
5    106
6    107
7    108
8    109
9    110
Name: id, dtype: object

In [29]:
data.rename(columns={'id': 'ID', 'origin':' 产地'}) 

Unnamed: 0,ID,date,money,department,product,产地
0,101,2020-03-10,5.0,饮料,苏打水,China
1,102,2020-03-11,4.0,饮料,可乐,China
2,103,2020-03-12,65.0,零食,牛肉干,US
3,104,2020-03-13,20.666667,调味品,老干妈,China
4,105,2020-03-14,15.0,水果,菠萝,Thailand
5,106,2020-03-15,20.0,冷冻食品,冰激凌,China
6,107,2020-03-16,35.0,日用品,洗面奶,US
7,108,2020-03-17,16.0,蔬菜,洋葱,China
8,109,2020-03-18,6.0,日用品,牙膏,China
9,110,2020-03-19,20.0,零食,薯片,Japan
