# 数据处理


## math 数学工具

$\color{red}{\text{只适用于单个数值计算}}$


### 特殊数值


In [1]:
import math

# Named constants provided by the math module, keyed by their display label.
vals = dict(
    pi=math.pi,  # ratio of a circle's circumference to its diameter
    e=math.e,  # base of the natural logarithm
    tau=math.tau,  # two times pi
    inf=math.inf,  # positive infinity
    nan=math.nan,  # "not a number"
)
# Print one "label: value" line per constant, in insertion order.
for label in vals:
    print(f"{label}: {vals[label]}")

pi: 3.141592653589793
e: 2.718281828459045
tau: 6.283185307179586
inf: inf
nan: nan


In [2]:
from math import (
    gcd,
    lcm,
    factorial,
    sqrt,
    isqrt,
    comb,
    ceil,
    floor,
    trunc,
    fabs,
    exp,
    log,
    log2,
    log10,
    pow,
)

### 数值操作


In [3]:
# Sample inputs: a float for the rounding functions, an int for fabs.
x, y = 12.23, 702

operations = {}
operations["ceil(x)"] = ceil(x)  # round up to the nearest integer
operations["floor(x)"] = floor(x)  # round down to the nearest integer
operations["trunc(x)"] = trunc(x)  # drop the fractional part (round toward zero)
operations["fabs(y)"] = fabs(y)  # absolute value, returned as a float

# One "label: result" line per entry, in insertion order.
print("\n".join(f"{label}: {result}" for label, result in operations.items()))

ceil(x): 13
floor(x): 12
trunc(x): 12
fabs(y): 702.0


### math.gcd() 最大公因数 math.lcm() 最小公倍数


In [4]:
# Greatest common divisor and least common multiple of the same integer pair.
# Both functions accept integers only.
operations = dict(
    zip(
        ("24, 18最大公约数", "24, 18最小公倍数"),
        (gcd(24, 18), lcm(24, 18)),
    )
)
for label, result in operations.items():
    print(f"{label}: {result}")

24, 18最大公约数: 6
24, 18最小公倍数: 72


### 数值计算


In [5]:
x = 10
y = 2
# Each label maps to the result of the corresponding math function applied to x
# (and y for pow). Every key appears exactly once: a repeated key in a dict
# literal silently overwrites the earlier entry.
operations = {
    "factorial(x)": factorial(x),  # 10! = 3628800
    "exp(x)": exp(x),  # e raised to the power x
    "sqrt(x)": sqrt(x),  # square root, returned as a float
    "isqrt(x)": isqrt(x),  # integer square root, returned as an int
    "log(x)": log(x),  # natural logarithm ln(x)
    "log10(x)": log10(x),  # base-10 logarithm
    "log2(x)": log2(x),  # base-2 logarithm
    "pow(x, y)": pow(x, y),  # x raised to the power y (math.pow returns a float)
}
for key, value in operations.items():
    print(f"{key}: {value}")

factorial(x)：: 3628800
exp(x): 22026.465794806718
sqrt(x): 3.1622776601683795
isqrt(x): 3
log(x): 2.302585092994046
log10(x): 1.0
log2(x): 3.321928094887362
pow(x, y): 100.0


### math.comb(m,n) 组合

实现$C^{n}_{m}$，功能类似于 itertools.combinations()，只不过这个只计算组合数量


In [6]:
print(comb(5, 2))  # 10: number of ways to choose 2 items out of 5

10


## statistics 统计工具

$\color{red}{\text{适用于list和tuple小规模计算}}$


In [7]:
import statistics as stats  # 统计模块，这种方式导入模块可以给它起个别名

### 中心趋势


In [8]:
data = [1, 3, 3, 6, 7, 8, 9, 10]

# Measures of central tendency, evaluated in display order:
#   mean        - arithmetic mean
#   median      - middle value (here the average of the two middle values)
#   mode        - most common value
#   multimode   - list of all most common values (Python 3.8+)
#   median_low  - the lower of the two middle values
#   median_high - the higher of the two middle values
results = {
    f"{name}(data)": getattr(stats, name)(data)
    for name in ("mean", "median", "mode", "multimode", "median_low", "median_high")
}
for label, result in results.items():
    print(f"{label}: {result}")

mean(data): 5.875
median(data): 6.5
mode(data): 3
multimode(data): [3]
median_low(data): 6
median_high(data): 7


### 离散程度


In [9]:
# Spread of `data`: the sample statistics divide by n-1, the population ones by n.
results = {}
results["样本方差 variance"] = stats.variance(data)  # sample variance
results["总体方差 pvariance"] = stats.pvariance(data)  # population variance
results["样本标准差 standard deviation"] = stats.stdev(data)  # sqrt(variance)
results["总体标准差 population standard deviation"] = stats.pstdev(data)  # sqrt(pvariance)

# One "label: result" line per entry, in insertion order.
print("\n".join(f"{label}: {result}" for label, result in results.items()))

样本方差 variance: 10.410714285714286
总体方差 pvariance: 9.109375
样本标准差 standard deviation: 3.2265638511757806
总体标准差 population standard deviation: 3.018174116912409


## numpy 大规模计算工具

$\color{red}{\text{支持大规模数据，数组和矩阵的计算}}$


In [10]:
import numpy as np
import pandas as pd  # kept from the original cell, moved up from after the print loop

data = [1, 2, 3, 4]

# NumPy accepts the plain list directly. Reductions (mean, sum, ...) return one
# scalar; element-wise functions (sqrt, log, ...) return an array with one
# result per input element.
results = {
    "numpy.mean(data)": np.mean(data),  # arithmetic mean
    "numpy.median(data)": np.median(data),  # median
    "numpy.std(data)": np.std(data),  # population standard deviation (ddof=0 by default)
    "numpy.var(data)": np.var(data),  # population variance (ddof=0 by default)
    "numpy.sqrt(data)": np.sqrt(data),  # element-wise square root
    "numpy.log(data)": np.log(data),  # element-wise natural logarithm
    "numpy.log10(data)": np.log10(data),  # element-wise base-10 logarithm
    "numpy.log2(data)": np.log2(data),  # element-wise base-2 logarithm
    "numpy.exp(data)": np.exp(data),  # element-wise e**x
    "numpy.power(data, 2)": np.power(data, 2),  # element-wise x**2
    "numpy.ceil(data)": np.ceil(data),  # element-wise round up
    "numpy.floor(data)": np.floor(data),  # element-wise round down
    "numpy.trunc(data)": np.trunc(data),  # element-wise round toward zero
    "numpy.abs(data)": np.abs(data),  # element-wise absolute value
    "numpy.sum(data)": np.sum(data),  # sum of all elements
    "numpy.prod(data)": np.prod(data),  # product of all elements
    "numpy.min(data)": np.min(data),  # smallest element
    "numpy.max(data)": np.max(data),  # largest element
}
for key, value in results.items():
    print(f"{key}: {value}")

numpy.mean(data): 2.5
numpy.median(data): 2.5
numpy.std(data): 1.118033988749895
numpy.var(data): 1.25
numpy.sqrt(data): [1.         1.41421356 1.73205081 2.        ]
numpy.log(data): [0.         0.69314718 1.09861229 1.38629436]
numpy.log10(data): [0.         0.30103    0.47712125 0.60205999]
numpy.log2(data): [0.        1.        1.5849625 2.       ]
numpy.exp(data): [ 2.71828183  7.3890561  20.08553692 54.59815003]
numpy.power(data, 2): [ 1  4  9 16]
numpy.ceil(data): [1 2 3 4]
numpy.floor(data): [1 2 3 4]
numpy.trunc(data): [1 2 3 4]
numpy.abs(data): [1 2 3 4]
numpy.sum(data): 10
numpy.prod(data): 24
numpy.min(data): 1
numpy.max(data): 4


## pandas 数据处理库


### Series 一维带标签的数组（类似于 Excel 的一列）


In [11]:
import pandas as pd

# A Series built from a plain list gets the default integer index 0, 1, 2.
a = pd.Series([1, 2, 3])
# Show the whole Series, then the element stored under index label 0.
print(a, a[0], sep="\n")

0    1
1    2
2    3
dtype: int64
1


In [12]:
# Series with explicit string labels: the dict keys become the index,
# the dict values become the data.
s = pd.Series({"a": 10, "b": 20, "c": 30})
print(s)

a    10
b    20
c    30
dtype: int64


In [13]:
print(s.index)  # the index labels of the Series

Index(['a', 'b', 'c'], dtype='object')


In [14]:
print(s.values)  # the underlying values as an array

[10 20 30]


In [15]:
print(s["a"])  # value stored under the index label "a"

10


In [16]:
# Positional access goes through .iloc: on a Series with a string index, s[1]
# relies on the deprecated "integer key treated as a position" fallback of
# Series.__getitem__ and emits a FutureWarning.
print(s.iloc[1])  # second value, selected by position

20


  print(s[1])  # 访问第二个值


In [17]:
print(s[1:])  # integer slice, positional here: everything from the second element on

b    20
c    30
dtype: int64


In [18]:
print(s[["a", "b"]])  # select several index labels at once by passing a list

a    10
b    20
dtype: int64


### DataFrame 二维表格数据结构


#### DataFrame 创建


In [19]:
# Option 1: a dict mapping each column name to the list of that column's values.
data = dict(Name=["Alice", "Bob"], Age=[25, 30])
df = pd.DataFrame(data)
print(df)

    Name  Age
0  Alice   25
1    Bob   30


In [20]:
# Option 2: a list of records, one dict per row, keyed by column name.
data = [dict(zip(("Name", "Age"), row)) for row in (("Alice", 25), ("Bob", 30))]
df = pd.DataFrame(data)
print(df)

    Name  Age
0  Alice   25
1    Bob   30


In [21]:
# Option 3: a list of rows plus a separate list naming the columns.
columns = ["Name", "Age"]
data = [["Alice", 25], ["Bob", 30]]
df = pd.DataFrame(data=data, columns=columns)
print(df)

    Name  Age
0  Alice   25
1    Bob   30


#### DataFrame 查看数据


##### 数据整体描述


In [22]:
# Sample employee table, written row by row; the later cells index into `df`.
df = pd.DataFrame(
    [
        ("Alice", 25, 50000, "New York"),
        ("Bob", 30, 60000, "Los Angeles"),
        ("Charlie", 35, 70000, "Chicago"),
        ("Marone", 40, 80000, "Houston"),
        ("Jack", 45, 90000, "Phoenix"),
        ("Tom", 50, 100000, "Philadelphia"),
    ],
    columns=["Name", "Age", "Salary", "City"],
)
df.head(5)  # first 5 rows (5 is also the default)

Unnamed: 0,Name,Age,Salary,City
0,Alice,25,50000,New York
1,Bob,30,60000,Los Angeles
2,Charlie,35,70000,Chicago
3,Marone,40,80000,Houston
4,Jack,45,90000,Phoenix


In [23]:
df.tail(3)  # last 3 rows

Unnamed: 0,Name,Age,Salary,City
3,Marone,40,80000,Houston
4,Jack,45,90000,Phoenix
5,Tom,50,100000,Philadelphia


In [24]:
df.shape  # dimensions as a (rows, columns) tuple

(6, 4)

In [25]:
df.info()  # structure overview: index range, per-column non-null counts and dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    6 non-null      object
 1   Age     6 non-null      int64 
 2   Salary  6 non-null      int64 
 3   City    6 non-null      object
dtypes: int64(2), object(2)
memory usage: 324.0+ bytes


In [26]:
df.describe()  # summary statistics of the numeric columns

Unnamed: 0,Age,Salary
count,6.0,6.0
mean,37.5,75000.0
std,9.354143,18708.286934
min,25.0,50000.0
25%,31.25,62500.0
50%,37.5,75000.0
75%,43.75,87500.0
max,50.0,100000.0


##### 查看列数据


In [27]:
# A single column can be read with bracket syntax (df["Name"]) or with
# attribute syntax (df.Age); print the Name column, then the Age column.
print(df["Name"], df.Age, sep="\n")

0      Alice
1        Bob
2    Charlie
3     Marone
4       Jack
5        Tom
Name: Name, dtype: object
0    25
1    30
2    35
3    40
4    45
5    50
Name: Age, dtype: int64


In [28]:
# Select several columns at once by passing a list of column names.
print(df[["Name", "Age"]])  # the Name and Age columns

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3   Marone   40
4     Jack   45
5      Tom   50


In [29]:
# Summary statistics for one specific column.
df["Age"].describe()  # statistics of the Age column

count     6.000000
mean     37.500000
std       9.354143
min      25.000000
25%      31.250000
50%      37.500000
75%      43.750000
max      50.000000
Name: Age, dtype: float64

##### 查看行数据


.iloc[] 是基于位置索引查找，不包含结束位置


In [30]:
# Look at row data.
print(df.iloc[0])  # first row, returned as a Series

Name         Alice
Age             25
Salary       50000
City      New York
Name: 0, dtype: object


In [31]:
print(df.iloc[1:3])  # second and third rows (the end position 3 is excluded)

      Name  Age  Salary         City
1      Bob   30   60000  Los Angeles
2  Charlie   35   70000      Chicago


In [32]:
print(df.iloc[[0, 2, 4]])  # first, third and fifth rows

      Name  Age  Salary      City
0    Alice   25   50000  New York
2  Charlie   35   70000   Chicago
4     Jack   45   90000   Phoenix


In [33]:
print(df.iloc[:, 0])  # all rows of the first column

0      Alice
1        Bob
2    Charlie
3     Marone
4       Jack
5        Tom
Name: Name, dtype: object


In [34]:
print(df.iloc[:, 1:3])  # all rows of the second and third columns

   Age  Salary
0   25   50000
1   30   60000
2   35   70000
3   40   80000
4   45   90000
5   50  100000


In [35]:
print(df.iloc[:, [0, 2]])  # all rows of the first and third columns

      Name  Salary
0    Alice   50000
1      Bob   60000
2  Charlie   70000
3   Marone   80000
4     Jack   90000
5      Tom  100000


In [36]:
print(df.iloc[0:3, 1:3])  # first three rows, second and third columns

   Age  Salary
0   25   50000
1   30   60000
2   35   70000


In [37]:
print(df.iloc[[0, 2], [1, 3]])  # first and third rows, second and fourth columns

   Age      City
0   25  New York
2   35   Chicago


In [38]:
print(df.iloc[0:3, [1, 3]])  # first three rows, second and fourth columns

   Age         City
0   25     New York
1   30  Los Angeles
2   35      Chicago


.loc[] 是基于标签查找，包含结束位置，使用行列名


In [39]:
# Show the current row labels before relabelling.
print(df.index.values)
# Replace row label 3 with the string "我". rename returns a new frame,
# so the result is bound back to df.
df = df.rename({3: "我"}, axis="index")
df

[0 1 2 3 4 5]


Unnamed: 0,Name,Age,Salary,City
0,Alice,25,50000,New York
1,Bob,30,60000,Los Angeles
2,Charlie,35,70000,Chicago
我,Marone,40,80000,Houston
4,Jack,45,90000,Phoenix
5,Tom,50,100000,Philadelphia


In [40]:
df.loc["我"]  # the row whose index label is "我"

Name       Marone
Age            40
Salary      80000
City      Houston
Name: 我, dtype: object

In [41]:
df.loc["我", "Name"]  # single cell: row "我", column "Name"

'Marone'

In [42]:
df.loc["我", "Name":"Salary"]  # row "我", columns "Name" through "Salary" (end label included)

Name      Marone
Age           40
Salary     80000
Name: 我, dtype: object

#### DataFrame 编辑


In [43]:
df  # display the frame in its current state before editing

Unnamed: 0,Name,Age,Salary,City
0,Alice,25,50000,New York
1,Bob,30,60000,Los Angeles
2,Charlie,35,70000,Chicago
我,Marone,40,80000,Houston
4,Jack,45,90000,Phoenix
5,Tom,50,100000,Philadelphia


替换某一列为索引

In [44]:
# Promote the "Name" column to the row index. The guard keeps the cell
# re-runnable: once "Name" is the index it is no longer a column, and a second
# set_index("Name") would raise KeyError. Rebinding df replaces inplace=True.
if "Name" in df.columns:
    df = df.set_index("Name")
print(df)  # show the frame
df.index  # show the index

         Age  Salary          City
Name                              
Alice     25   50000      New York
Bob       30   60000   Los Angeles
Charlie   35   70000       Chicago
Marone    40   80000       Houston
Jack      45   90000       Phoenix
Tom       50  100000  Philadelphia


Index(['Alice', 'Bob', 'Charlie', 'Marone', 'Jack', 'Tom'], dtype='object', name='Name')

还原索引

In [45]:
# reset_index returns a new frame in which "Name" is an ordinary column again
# and the rows are numbered from 0; df itself is left untouched.
df_reset = df.reset_index()
print(df, df_reset, sep="\n")  # the old frame first, then the reset copy

         Age  Salary          City
Name                              
Alice     25   50000      New York
Bob       30   60000   Los Angeles
Charlie   35   70000       Chicago
Marone    40   80000       Houston
Jack      45   90000       Phoenix
Tom       50  100000  Philadelphia
      Name  Age  Salary          City
0    Alice   25   50000      New York
1      Bob   30   60000   Los Angeles
2  Charlie   35   70000       Chicago
3   Marone   40   80000       Houston
4     Jack   45   90000       Phoenix
5      Tom   50  100000  Philadelphia


In [46]:
# df.reindex()  # re-index; left commented out, so this cell executes nothing

#### DataFrame 读取文件

可以读取 csv、xlsx 文件，例如 `pd.read_csv("data.csv")`、`pd.read_excel("data.xlsx")`。


## re, textwrap 字符串处理


In [47]:
import re
import textwrap

text = "hello 123 world 456"
digit_run = re.compile(r"\d+")  # one or more consecutive digits
print(digit_run.findall(text))  # ['123', '456']
print(digit_run.sub("X", text))  # 'hello X world X'

# Break the sentence into lines of at most 10 characters each.
wrapped_text = textwrap.wrap(
    "This is a very long sentence that needs wrapping.",
    width=10,
)
print(*wrapped_text, sep="\n")  # one wrapped line per output line

['123', '456']
hello X world X
This is a
very long
sentence
that needs
wrapping.
