In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Data preprocessing
# Load the three CSV files and show the first 6 rows of each to sanity-check them.
from matplotlib import category


df, bf, pf = (
    pd.read_csv(csv_path, encoding='gbk')
    for csv_path in ('死亡人数.csv', '出生人数.csv', 'population-and-demography.csv')
)
# Raise the column limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', 100000)
display(df.head(6), bf.head(6), pf.head(6))

In [None]:
# Merge the three data sets on their shared (country, year) key.
merge_keys = ['Country name', 'Year']
df = df.set_index(merge_keys, drop=True)
bf = bf.set_index(merge_keys, drop=True)
pf = pf.set_index(merge_keys, drop=True)
# Two chained merges on the index level names build the combined frame.
bd = df.merge(bf, on=merge_keys).merge(pf, on=merge_keys)
display(bd)

In [None]:
# Data cleaning
def data_fill(data):
    """Fill missing values in every numeric column with that column's mean.

    The frame is modified in place and the very same object is returned, so
    both ``data_fill(frame)`` and ``frame = data_fill(frame)`` work.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. Non-numeric columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The ``data`` object that was passed in, after filling.

    Notes
    -----
    Each mean is taken over all rows of ``data`` (missing values skipped), so
    the result depends on which rows the caller passes in.
    """
    numeric_columns = data.select_dtypes(include=['number']).columns
    # Work out every replacement value first, then fill in a single pass
    # instead of running one full-frame fillna per column.
    fill_values = {column: data[column].mean() for column in numeric_columns}
    if fill_values:
        data.fillna(fill_values, inplace=True)
    return data
# Mean-fill the numeric columns of the merged frame.
bd = bd.pipe(data_fill)


In [None]:
# Clean string formatting: strip punctuation from the text (object) columns.
import dis
import re

# `numpy.disp` is not imported here: it was unused, and the name no longer
# exists in NumPy 2.0, where importing it fails.
for i in bd.columns:
    if bd[i].dtype == 'object':
        # The pattern removes every character that is neither a word character
        # nor whitespace (so '.', ',' and '-' are removed as well).
        # Non-string entries such as NaN/None are passed through unchanged
        # instead of making re.sub raise TypeError.
        bd[i] = bd[i].apply(
            lambda x: re.sub(r'[^\w\s]', '', x) if isinstance(x, str) else x
        )

# Show the data
pd.set_option('display.max_columns',100)
display(bd.head())

In [None]:
# Data analysis
# Overview
# Worldwide totals: the 'World' rows summed over all years.
total_birth = bd.loc['World', 'Births'].astype('float64').sum()  # type: ignore
display(total_birth)
total_death = bd.loc['World', 'Deaths'].astype('float64').sum()  # type: ignore
display(total_death)

# Totals for every entry of the first index level (country name).
# MultiIndex.levels[0] holds the unique labels of that level.
index = bd.index.levels[0]  # type: ignore
display(index)
# A single grouped aggregation replaces a Python loop that performed two
# .loc lookups per country.
country_totals = (
    bd[['Births', 'Deaths']]
    .astype('float64')
    .groupby(level=0)
    .sum()
)
# Show floats in fixed-point notation; swap '{:f}' for another format if needed.
pd.set_option('display.float_format', '{:f}'.format)
births_sum = country_totals['Births']
deaths_sum = country_totals['Deaths']
bd_sum = pd.DataFrame({'Births_sum': births_sum, 'Deaths_sum': deaths_sum})
display(bd_sum)

In [None]:
# Rank by total births and by total deaths and show the top ten of each.
# 'World' is the global aggregate row (it is what the worldwide totals are read
# from), not a country, so it is left out of both rankings.
# NOTE(review): other aggregate rows (e.g. continents or income groups) may
# exist in the source data and would still be ranked — confirm against the CSVs.
bd_sum = bd_sum.sort_values(by='Births_sum', ascending=False)
b_country = bd_sum.drop(index='World', errors='ignore').head(10).index
print("出生人数前十:")
display(b_country)
bd_sum = bd_sum.sort_values(by='Deaths_sum', ascending=False)
d_country = bd_sum.drop(index='World', errors='ignore').head(10).index
print("死亡人数前十:")
display(d_country)

In [None]:
# Time-series analysis
# Yearly trend of worldwide births and deaths.

# Take an explicit copy of the 'World' rows: the frame is filled in place and
# gets new columns below, which should never act on (or warn about) a slice of bd.
world_bd = bd.loc['World', ['Births', 'Deaths', 'Population']].copy()  # type: ignore
world_bd = data_fill(world_bd)  # mean-fill any missing numeric values
# Rates are plain fractions of the population (not per 1,000 people).
world_bd['births_rate'] = world_bd['Births'] / world_bd['Population']
world_bd['deaths_rate'] = world_bd['Deaths'] / world_bd['Population']
display(world_bd.describe(include='all'))  # summary statistics
display(world_bd.head())  # first few rows

# Line chart of the absolute numbers.
ax = world_bd[['Births', 'Deaths']].plot(figsize=(12, 6))
ax.set_title('World Births and Deaths')
plt.show()

# Yearly trend of the worldwide birth and death rates.
ax = world_bd[['births_rate', 'deaths_rate']].plot(figsize=(12, 6))
ax.set_title('World Births and Deaths Rate')
plt.show()



In [None]:
def data_select(Country_name):
    """Show summary statistics and trend charts for one country.

    Selects the Births, Deaths and Population columns for ``Country_name``
    from the global ``bd`` frame, mean-fills missing values with
    ``data_fill``, adds birth and death rates (as fractions of the
    population), then displays the country name, a ``describe`` table and
    two line charts.

    Parameters
    ----------
    Country_name : str
        Label looked up in ``bd`` with ``bd.loc``.

    Returns
    -------
    None
    """
    # Explicit copy: the selection is filled in place and extended with new
    # columns, which should never act on (or warn about) a slice of bd.
    bds = data_fill(bd.loc[Country_name, ['Births', 'Deaths', 'Population']].copy())
    bds['births_rate'] = bds['Births'] / bds['Population']
    bds['deaths_rate'] = bds['Deaths'] / bds['Population']
    display(Country_name)
    display(bds.describe(include='all'))

    # Absolute numbers of births and deaths.
    ax = bds[['Births', 'Deaths']].plot(figsize=(12, 6))
    ax.set_title('Births_Deaths')
    plt.show()

    # Birth and death rates.
    ax = bds[['births_rate', 'deaths_rate']].plot(figsize=(12, 6))
    ax.set_title('births_rate_deaths_rate')
    plt.show()

# Produce the per-country report for a few selected countries.
index = ['China', 'India', 'United States','United Kingdom']
for country_name in index:
    data_select(country_name)
