In [1]:
import pandas as pd 
import numpy as np

# Build the demo frame from a dict of columns (a list of records would work too).
# Some values are planted on purpose so the later cleaning steps have work to do.
data = pd.DataFrame(
    {
        # Sequential ids 101..110 (np.arange excludes the stop value 111).
        'id': np.arange(101, 111),
        # Ten dates starting 2020-03-10; `periods` has to equal the row count.
        'date': pd.date_range(start='20200310', periods=10),
        # The -10 is a planted invalid amount that gets repaired further down.
        'money': [5, 4, 65, -10, 15, 20, 35, 16, 6, 20],
        'product': [
            '苏打水', '可乐', '牛肉干', '老干妈', '菠萝',
            '冰激凌', '洗面奶', '洋葱', '牙膏', '薯片',
        ],
        # np.nan is a planted missing value.
        'department': [
            '饮料', '饮料', '零食', '调味品', '水果',
            np.nan, '日用品', '蔬菜', '日用品', '零食',
        ],
        # 'america' (lower case) is a planted inconsistency; note that
        # ' China' also carries a leading space.
        'origin': [
            'China', ' China', 'America', 'China', 'Thailand',
            'China', 'america', 'China', 'China', 'Japan',
        ],
    }
)

In [2]:
data              # display the whole dataset

Unnamed: 0,id,date,money,product,department,origin
0,101,2020-03-10,5,苏打水,饮料,China
1,102,2020-03-11,4,可乐,饮料,China
2,103,2020-03-12,65,牛肉干,零食,America
3,104,2020-03-13,-10,老干妈,调味品,China
4,105,2020-03-14,15,菠萝,水果,Thailand
5,106,2020-03-15,20,冰激凌,,China
6,107,2020-03-16,35,洗面奶,日用品,america
7,108,2020-03-17,16,洋葱,蔬菜,China
8,109,2020-03-18,6,牙膏,日用品,China
9,110,2020-03-19,20,薯片,零食,Japan


In [3]:
# Round-trip the frame through an Excel file: write it without the index
# column, then load it back so `data` now comes from 'shopping.xlsx'.
data.to_excel('shopping.xlsx',index=False)
data = pd.read_excel('shopping.xlsx')

In [4]:
data.shape     # (number of rows, number of columns)

(10, 6)

In [5]:
data.dtypes  # data type of every column

id                     int64
date          datetime64[ns]
money                  int64
product               object
department            object
origin                object
dtype: object

In [6]:
# Data type of a single column.
data['id'].dtype

dtype('int64')

In [7]:
# Number of dimensions of the frame.
data.ndim

2

In [8]:
# Row index of the frame.
data.index

RangeIndex(start=0, stop=10, step=1)

In [9]:
# Column labels of the frame.
data.columns

Index(['id', 'date', 'money', 'product', 'department', 'origin'], dtype='object')

In [10]:
# Contents of the frame as a NumPy array (object dtype here, since the columns mix types).
data.values

array([[101, Timestamp('2020-03-10 00:00:00'), 5, '苏打水', '饮料', 'China'],
       [102, Timestamp('2020-03-11 00:00:00'), 4, '可乐', '饮料', ' China'],
       [103, Timestamp('2020-03-12 00:00:00'), 65, '牛肉干', '零食',
        'America'],
       [104, Timestamp('2020-03-13 00:00:00'), -10, '老干妈', '调味品',
        'China'],
       [105, Timestamp('2020-03-14 00:00:00'), 15, '菠萝', '水果',
        'Thailand'],
       [106, Timestamp('2020-03-15 00:00:00'), 20, '冰激凌', nan, 'China'],
       [107, Timestamp('2020-03-16 00:00:00'), 35, '洗面奶', '日用品',
        'america'],
       [108, Timestamp('2020-03-17 00:00:00'), 16, '洋葱', '蔬菜', 'China'],
       [109, Timestamp('2020-03-18 00:00:00'), 6, '牙膏', '日用品', 'China'],
       [110, Timestamp('2020-03-19 00:00:00'), 20, '薯片', '零食', 'Japan']],
      dtype=object)

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max); the recorded
# output covers the id and money columns.
data.describe()

Unnamed: 0,id,money
count,10.0,10.0
mean,105.5,17.6
std,3.02765,20.576146
min,101.0,-10.0
25%,103.25,5.25
50%,105.5,15.5
75%,107.75,20.0
max,110.0,65.0


In [12]:
# Print a concise summary: index range, per-column non-null counts and dtypes,
# and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          10 non-null     int64         
 1   date        10 non-null     datetime64[ns]
 2   money       10 non-null     int64         
 3   product     10 non-null     object        
 4   department  9 non-null      object        
 5   origin      10 non-null     object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 608.0+ bytes


In [13]:
# List the distinct values found in every column, one "name:values" line each.
for column in data.columns:
    print(f"{column}:{data[column].unique()}")

id:[101 102 103 104 105 106 107 108 109 110]
date:['2020-03-10T00:00:00.000000000' '2020-03-11T00:00:00.000000000'
 '2020-03-12T00:00:00.000000000' '2020-03-13T00:00:00.000000000'
 '2020-03-14T00:00:00.000000000' '2020-03-15T00:00:00.000000000'
 '2020-03-16T00:00:00.000000000' '2020-03-17T00:00:00.000000000'
 '2020-03-18T00:00:00.000000000' '2020-03-19T00:00:00.000000000']
money:[  5   4  65 -10  15  20  35  16   6]
product:['苏打水' '可乐' '牛肉干' '老干妈' '菠萝' '冰激凌' '洗面奶' '洋葱' '牙膏' '薯片']
department:['饮料' '零食' '调味品' '水果' nan '日用品' '蔬菜']
origin:['China' ' China' 'America' 'Thailand' 'america' 'Japan']


In [14]:
# Boolean frame of the same shape as `data`: True wherever a value is missing.
data.isnull()

Unnamed: 0,id,date,money,product,department,origin
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,True,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [16]:
# Count the missing values in each column and list the worst columns first.
(
    data
    .isnull()
    .sum()
    .sort_values(ascending=False)
)

department    1
origin        0
product       0
money         0
date          0
id            0
dtype: int64

In [17]:
# Preview a forward fill: each missing department takes the value of the row
# above it. The result is only displayed, not stored back into `data`.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
data['department'].ffill()

0     饮料
1     饮料
2     零食
3    调味品
4     水果
5     水果
6    日用品
7     蔬菜
8    日用品
9     零食
Name: department, dtype: object

In [18]:
# Fill the missing department with a fixed label and assign the result back.
# Calling fillna(..., inplace=True) on the `data['department']` selection is
# chained assignment: under pandas Copy-on-Write it only updates a temporary
# object and leaves `data` untouched, so the column is reassigned explicitly.
data['department'] = data['department'].fillna(value='冷冻食品')

In [19]:
# Display the frame again to check the department column after the fill.
data

Unnamed: 0,id,date,money,product,department,origin
0,101,2020-03-10,5,苏打水,饮料,China
1,102,2020-03-11,4,可乐,饮料,China
2,103,2020-03-12,65,牛肉干,零食,America
3,104,2020-03-13,-10,老干妈,调味品,China
4,105,2020-03-14,15,菠萝,水果,Thailand
5,106,2020-03-15,20,冰激凌,冷冻食品,China
6,107,2020-03-16,35,洗面奶,日用品,america
7,108,2020-03-17,16,洋葱,蔬菜,China
8,109,2020-03-18,6,牙膏,日用品,China
9,110,2020-03-19,20,薯片,零食,Japan


In [21]:
# Trim leading/trailing whitespace from every object-dtype column, then show
# the distinct origins to confirm the padded duplicate is gone.
text_columns = [
    name for name in data.columns
    if pd.api.types.is_object_dtype(data[name])
]
for name in text_columns:
    data[name] = data[name].str.strip()
data['origin'].unique()

array(['China', 'America', 'Thailand', 'america', 'Japan'], dtype=object)

In [22]:
# Preview the origins in title case; the result is not assigned, so `data`
# itself is unchanged by this cell.
data['origin'].str.title()

0       China
1       China
2     America
3       China
4    Thailand
5       China
6     America
7       China
8       China
9       Japan
Name: origin, dtype: object

In [23]:
# Normalise the lower-case 'america' entry to 'America'.
# The result is assigned back instead of using replace(..., inplace=True) on
# the `data['origin']` selection: that chained in-place call does not update
# `data` under pandas Copy-on-Write.
data['origin'] = data['origin'].replace('america', 'America')

In [24]:
# Display the frame again to check the origin column after the replacement.
data

Unnamed: 0,id,date,money,product,department,origin
0,101,2020-03-10,5,苏打水,饮料,China
1,102,2020-03-11,4,可乐,饮料,China
2,103,2020-03-12,65,牛肉干,零食,America
3,104,2020-03-13,-10,老干妈,调味品,China
4,105,2020-03-14,15,菠萝,水果,Thailand
5,106,2020-03-15,20,冰激凌,冷冻食品,China
6,107,2020-03-16,35,洗面奶,日用品,America
7,108,2020-03-17,16,洋葱,蔬菜,China
8,109,2020-03-18,6,牙膏,日用品,China
9,110,2020-03-19,20,薯片,零食,Japan


In [26]:
# Mark the invalid amount -10 as missing (NaN) so it can be imputed afterwards;
# introducing NaN turns the column into floats.
# The result is assigned back instead of using replace(..., inplace=True) on
# the `data['money']` selection: that chained in-place call does not update
# `data` under pandas Copy-on-Write.
data['money'] = data['money'].replace(-10, np.nan)

In [27]:
# Display the frame again to check the money column after marking -10 as missing.
data

Unnamed: 0,id,date,money,product,department,origin
0,101,2020-03-10,5.0,苏打水,饮料,China
1,102,2020-03-11,4.0,可乐,饮料,China
2,103,2020-03-12,65.0,牛肉干,零食,America
3,104,2020-03-13,,老干妈,调味品,China
4,105,2020-03-14,15.0,菠萝,水果,Thailand
5,106,2020-03-15,20.0,冰激凌,冷冻食品,China
6,107,2020-03-16,35.0,洗面奶,日用品,America
7,108,2020-03-17,16.0,洋葱,蔬菜,China
8,109,2020-03-18,6.0,牙膏,日用品,China
9,110,2020-03-19,20.0,薯片,零食,Japan


In [28]:
# Impute the missing amount with the mean of the remaining values (mean()
# skips NaN by default, and it is evaluated before the fill is applied).
# fillna plus an explicit assignment replaces replace(np.nan, ..., inplace=True)
# on the `data['money']` selection, which does not update `data` under pandas
# Copy-on-Write.
data['money'] = data['money'].fillna(data['money'].mean())

In [29]:
# Display the frame again to check the money column after imputing the mean.
data

Unnamed: 0,id,date,money,product,department,origin
0,101,2020-03-10,5.0,苏打水,饮料,China
1,102,2020-03-11,4.0,可乐,饮料,China
2,103,2020-03-12,65.0,牛肉干,零食,America
3,104,2020-03-13,20.666667,老干妈,调味品,China
4,105,2020-03-14,15.0,菠萝,水果,Thailand
5,106,2020-03-15,20.0,冰激凌,冷冻食品,China
6,107,2020-03-16,35.0,洗面奶,日用品,America
7,108,2020-03-17,16.0,洋葱,蔬菜,China
8,109,2020-03-18,6.0,牙膏,日用品,China
9,110,2020-03-19,20.0,薯片,零食,Japan


In [34]:
# Keep only the rows whose origin is not 'America' (boolean-mask row filter),
# then display the filtered frame.
data1 = data.loc[data['origin'] != 'America']
data1

Unnamed: 0,id,date,money,product,department,origin
0,101,2020-03-10,5.0,苏打水,饮料,China
1,102,2020-03-11,4.0,可乐,饮料,China
3,104,2020-03-13,20.666667,老干妈,调味品,China
4,105,2020-03-14,15.0,菠萝,水果,Thailand
5,106,2020-03-15,20.0,冰激凌,冷冻食品,China
7,108,2020-03-17,16.0,洋葱,蔬菜,China
8,109,2020-03-18,6.0,牙膏,日用品,China
9,110,2020-03-19,20.0,薯片,零食,Japan


In [33]:
# Drop every row in which any cell equals 'Japan': compare the whole frame
# element-wise, keep rows where all columns differ, then display the result.
data2 = data.loc[(data != 'Japan').all(axis=1)]
data2

Unnamed: 0,id,date,money,product,department,origin
0,101,2020-03-10,5.0,苏打水,饮料,China
1,102,2020-03-11,4.0,可乐,饮料,China
2,103,2020-03-12,65.0,牛肉干,零食,America
3,104,2020-03-13,20.666667,老干妈,调味品,China
4,105,2020-03-14,15.0,菠萝,水果,Thailand
5,106,2020-03-15,20.0,冰激凌,冷冻食品,China
6,107,2020-03-16,35.0,洗面奶,日用品,America
7,108,2020-03-17,16.0,洋葱,蔬菜,China
8,109,2020-03-18,6.0,牙膏,日用品,China


In [35]:
# Distinct origin values, keeping the first occurrence of each together with
# its original row label. The result is only displayed, not stored.
data['origin'].drop_duplicates()

0       China
2     America
4    Thailand
9       Japan
Name: origin, dtype: object

In [36]:
# Preview the id column converted to strings; the result is not assigned back,
# so the column in `data` keeps its current dtype.
data['id'].astype('str')

0    101
1    102
2    103
3    104
4    105
5    106
6    107
7    108
8    109
9    110
Name: id, dtype: object

In [37]:
# Preview the frame with two columns relabelled; the renamed copy is only
# displayed, so `data` keeps its original headers.
data.rename(
    columns={
        'id': 'ID',
        'origin': '产地',
    }
)

Unnamed: 0,ID,date,money,product,department,产地
0,101,2020-03-10,5.0,苏打水,饮料,China
1,102,2020-03-11,4.0,可乐,饮料,China
2,103,2020-03-12,65.0,牛肉干,零食,America
3,104,2020-03-13,20.666667,老干妈,调味品,China
4,105,2020-03-14,15.0,菠萝,水果,Thailand
5,106,2020-03-15,20.0,冰激凌,冷冻食品,China
6,107,2020-03-16,35.0,洗面奶,日用品,America
7,108,2020-03-17,16.0,洋葱,蔬菜,China
8,109,2020-03-18,6.0,牙膏,日用品,China
9,110,2020-03-19,20.0,薯片,零食,Japan
