In [1]:
import numpy as np
import pandas as pd

In [2]:
# Row labels: one entry per person; the index itself carries the name "name".
index = pd.Index(["Tom", "Bob", "Mary", "James"], name="name")

# Column-oriented records; each list lines up positionally with `index`.
data = dict(
    age=[18, 30, 25, 40],
    city=["BeiJing", "ShangHai", "GuangZhou", "ShenZhen"],
    sex=["male", "male", "female", "male"],
)

# The demo frame every later cell operates on.
user_info = pd.DataFrame(data, index=index)
user_info  # bare final expression -> rendered as the cell output

Unnamed: 0_level_0,age,city,sex
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tom,18,BeiJing,male
Bob,30,ShangHai,male
Mary,25,GuangZhou,female
James,40,ShenZhen,male


In [3]:
# A sensible first step with any new dataset is to get an overall picture of
# it; the info method prints that summary (index range, per-column non-null
# counts and dtypes, memory usage).
user_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Tom to James
Data columns (total 3 columns):
age     4 non-null int64
city    4 non-null object
sex     4 non-null object
dtypes: int64(1), object(2)
memory usage: 128.0+ bytes


In [5]:
"""
    如果我们的数据量非常大，我想看看数据长啥样，我当然不希望查看所有的数据了，
    这时候我们可以采用只看头部的 n 条或者尾部的 n 条。
    查看头部的 n 条数据可以使用 head 方法，
    查看尾部的 n 条数据可以使用 tail 方法。
"""
user_info.head(2)
user_info.tail(2)

Unnamed: 0_level_0,age,city,sex
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tom,18,BeiJing,male
Bob,30,ShangHai,male


In [6]:
# .shape returns the dimensions of the data as a (rows, columns) tuple.
user_info.shape

(4, 3)

In [7]:
# .T returns the transpose: the original columns become rows and the names
# in the index become the column headers.
user_info.T

name,Tom,Bob,Mary,James
age,18,30,25,40
city,BeiJing,ShangHai,GuangZhou,ShenZhen
sex,male,male,female,male


In [8]:
# Get the underlying data as a NumPy ndarray. to_numpy() is the accessor the
# pandas documentation recommends over the older .values attribute; for this
# frame both return the same array. Because the columns mix integers and
# strings, the resulting array has dtype=object.
user_info.to_numpy()

array([[18, 'BeiJing', 'male'],
       [30, 'ShangHai', 'male'],
       [25, 'GuangZhou', 'female'],
       [40, 'ShenZhen', 'male']], dtype=object)

In [11]:
# Single-value summary statistics of the age column. print() is used here
# because a cell only auto-renders its last expression and we want all five.
print(user_info.age.max())     # maximum
print(user_info.age.min())     # minimum
print(user_info.age.mean())    # mean
# median() states the intent directly; the previous quantile() call only
# produced the median because its q argument defaults to 0.5.
print(user_info.age.median())  # median
print(user_info.age.sum())     # sum

40
18
28.25
27.5
113


In [12]:
# To get several summary statistics in one go, call describe(). As the last
# (and only) expression of the cell its result is rendered automatically, so
# no print() wrapper is needed.
user_info.age.describe()

count     4.000000
mean     28.250000
std       9.251126
min      18.000000
25%      23.250000
50%      27.500000
75%      32.500000
max      40.000000
Name: age, dtype: float64


In [13]:
# To see summary statistics for the non-numeric columns, pass
# include=["object"]; the result reports count / unique / top / freq.
user_info.describe(include=["object"])

Unnamed: 0,city,sex
count,4,4
unique,4,2
top,GuangZhou,male
freq,1,3


In [14]:
# Sort the rows by their index labels (the names) in ascending order.
user_info.sort_index()

Unnamed: 0_level_0,age,city,sex
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bob,30,ShangHai,male
James,40,ShenZhen,male
Mary,25,GuangZhou,female
Tom,18,BeiJing,male


In [15]:
# axis=1 sorts the column labels instead of the row labels;
# ascending=False puts them in descending order (sex, city, age).
user_info.sort_index(axis=1, ascending=False)

Unnamed: 0_level_0,sex,city,age
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tom,male,BeiJing,18
Bob,male,ShangHai,30
Mary,female,GuangZhou,25
James,male,ShenZhen,40


In [17]:
# Sort rows by the values of one column, then by several columns.
# Both results go through display() (provided by the IPython/Jupyter kernel):
# a cell only auto-renders its final expression, so the single-column sort
# would otherwise be computed and thrown away.
display(user_info.sort_values(by="age"))

# Note: the order of the names in the list sets the sort priority -
# rows are ordered by "age" first, and "city" only breaks ties.
display(user_info.sort_values(by=["age", "city"]))

Unnamed: 0_level_0,age,city,sex
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tom,18,BeiJing,male
Mary,25,GuangZhou,female
Bob,30,ShangHai,male
James,40,ShenZhen,male


In [18]:
# Capitalise the three column labels. rename() hands back a new frame;
# user_info itself keeps its lowercase column names.
user_info.rename(
    columns=dict(age="Age", city="City", sex="Sex"),
)

Unnamed: 0_level_0,Age,City,Sex
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tom,18,BeiJing,male
Bob,30,ShangHai,male
Mary,25,GuangZhou,female
James,40,ShenZhen,male


In [19]:
# Convert the age column to floating point. The result is a new Series;
# the age column stored in user_info is not modified.
user_info.age.astype("float64")

name
Tom      18.0
Bob      30.0
Mary     25.0
James    40.0
Name: age, dtype: float64

In [20]:
# Add a height column holding strings rather than numbers. The last value
# ("180cm") is not a valid number, which is what the pd.to_numeric cells
# that follow rely on to show the different errors= behaviours.
# Note: this modifies user_info in place.
user_info["height"] = ["178", "168", "178", "180cm"]

In [21]:
# Display the frame again to confirm the new height column is present.
user_info

Unnamed: 0_level_0,age,city,sex,height
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tom,18,BeiJing,male,178
Bob,30,ShangHai,male,168
Mary,25,GuangZhou,female,178
James,40,ShenZhen,male,180cm


In [22]:
"""
     设置 errors='coerce' 可以在强转失败时将有问题的元素赋值为 pd.NaT（对于datetime和timedelta）或 np.nan（数字）
"""
pd.to_numeric(user_info.height, errors="coerce") 

name
Tom      178.0
Bob      168.0
Mary     178.0
James      NaN
Name: height, dtype: float64

In [23]:
# Fall back to the original data when the conversion fails.
# This used to be spelled errors="ignore", but that option is deprecated as
# of pandas 2.2; the documented replacement is to catch the conversion error
# explicitly, which yields the same result: the untouched object-dtype column.
try:
    height_or_original = pd.to_numeric(user_info.height)
except (ValueError, TypeError):
    # At least one element could not be parsed ("180cm" here): keep the input.
    height_or_original = user_info.height
height_or_original

name
Tom        178
Bob        168
Mary       178
James    180cm
Name: height, dtype: object

In [24]:
# With the default errors="raise", a failed conversion raises ValueError.
# The exception is the point of this cell, so it is caught and its message
# printed; an uncaught error would abort "Restart & Run All" at this cell.
try:
    pd.to_numeric(user_info.height)
except ValueError as exc:
    print(f"ValueError: {exc}")

ValueError: Unable to parse string "180cm" at position 3