# 数据处理

## 数据处理 - 寿命

FEH_00200502_190805180824.csv

标准化死亡比，h20-24表

>     col 0     不同级别的地点，目前只关心县一级的数据，编号为 [01,02,03,...,47]  
>     col 1,2   总标准化死亡比（男, 女）  
>     col 3,4   癌症总标准化死亡比（男, 女）  
>     col 5,6   胃癌标准化死亡比（男, 女）  
>     col 7,8   肠癌标准化死亡比（男, 女）  
>     col 9,10  肝癌标准化死亡比（男, 女）  
>     col 11,12 肺癌标准化死亡比（男, 女）

In [78]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
# pandas display settings used by the notebook previews.
pd.set_option('display.width', 200)    # maximum characters per output line
# Use the fully-qualified key: the bare 'precision' pattern is ambiguous on pandas
# versions that also define 'styler.format.precision' and raises OptionError there.
pd.set_option('display.precision', 3)  # number of decimal places shown
pd.set_option('display.max_rows', 25)  # maximum rows shown in a preview
pd.set_option('display.float_format', lambda x : '%.2f' % x)  # fixed-point, no scientific notation

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms

# Apply the style sheet first: a style sheet overwrites every rcParam it defines,
# so applying it after the settings below could undo them (e.g. the font list).
# The bundled seaborn styles were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6, so use whichever name this matplotlib installation provides.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
  if style_name in plt.style.available:
    plt.style.use(style_name)
    break

plt.rcParams['figure.figsize'] = 18, 9
plt.rcParams['axes.unicode_minus'] = False     # render the minus sign with the selected font
plt.rcParams['font.sans-serif'] = ['SimHei']   # font able to display Chinese text
mpl.rcParams['figure.dpi'] = 80
mpl.rcParams['savefig.dpi'] = 100
mpl.rcParams['font.size'] = 12
mpl.rcParams['legend.fontsize'] = 'large'
mpl.rcParams['figure.titlesize'] = 'medium'

import seaborn as sns

def explore(df, n=5, describe=False, info=False):
  """Print a quick overview of ``df`` and return a reproducible random sample.

  Args:
    df: DataFrame to inspect.
    n: Number of rows to sample. Capped at ``len(df)`` so small frames do not
      raise; ``None`` is passed through to ``DataFrame.sample`` unchanged.
    describe: If true, also print ``df.describe()``.
    info: If true, also print the ``df.info()`` report.

  Returns:
    The rows drawn by ``df.sample`` with ``random_state=100``.
  """
  print('rows x cols:', df.shape)
  # Column labels may be non-strings (e.g. ints), which str.join rejects.
  print('column name:', '\t'.join(map(str, df.columns)))
  print()
  if describe:
    print('==== describe ====')
    print(df.describe())
  if info:
    print('==== info ====')
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would only add a stray "None" line.
    df.info()
  print('==== sample ====')
  sample_size = n if n is None else min(n, len(df))
  return df.sample(n=sample_size, random_state=100)

# explore_df(df)

def find_in(df, col, pat):
  """Return the rows of ``df`` whose value in column ``col`` matches ``pat``.

  Args:
    df: DataFrame to search.
    col: Name of the column to test.
    pat: If a string, a fuzzy search via ``Series.str.contains`` (the pattern
      is interpreted by pandas, by default as a regular expression). If a
      list or set, an exact membership test. Any other value is treated as a
      single exact value.

  Returns:
    The matching rows of ``df``.
  """
  if isinstance(pat, str):
    # na=False: cells that are missing or not strings count as "no match";
    # without it they produce NaN in the mask and the indexing raises.
    return df[df[col].str.contains(pat, na=False)]
  if not isinstance(pat, (list, set)):
    pat = [pat]
  return df[df[col].isin(pat)]


### 预览数据（FEH_00200502_190805180824.csv，实际读取的文件为 h20-24＿第５表.xlsx）

In [79]:
# Read the sheet using row index 4 (0-based) as the header row, then turn the
# resulting index into ordinary columns.
life_df = pd.read_excel("../h20-24＿第５表.xlsx", header=4).reset_index()

In [80]:
# Preview the first 18 rows of the table as loaded.
life_df.head(18)

Unnamed: 0,level_0,level_1,level_2,総 数,Unnamed: 1,胃,Unnamed: 3,大 腸,Unnamed: 5,肝及び肝内胆管,...,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33
0,,男性,女性,男性,女性,男性,女性,男性,女性,男性,...,男性,女性,男性,女性,男性,女性,男性,女性,男性,女性
1,全 国,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,01北海道,101,97.60,107.70,108,95.20,93.80,105.30,110.20,92.30,...,81.60,100.70,128.50,131.70,61.10,62.70,87.10,74.20,110.30,105.70
3,10　札幌市保健所,97.50,94.40,105.70,109.30,91.90,87,100,109,96.80,...,75.30,104.40,119.70,116.80,44.20,43.50,69.20,62,99.50,101.10
4,札幌市,97.50,94.40,105.70,109.30,91.90,87,100,109,96.80,...,75.30,104.40,119.70,116.80,44.20,43.50,69.20,62,99.50,101.10
5,01101　中央区,96.60,89.30,110.40,118.70,90.70,96.80,112.20,119.10,117.90,...,69.30,76.20,105.50,98.30,30.30,32.20,75.90,55.10,92.70,89.80
6,01102　北区,99.70,94.10,109.70,109.20,98.90,91.20,104.60,123.50,104.70,...,82.10,112.50,105.50,122.30,39.70,37.90,67.80,61.20,107,118.70
7,01103　東区,102,99.30,107.30,109,97.10,76.70,94.70,106.80,98.50,...,69.90,128.90,125.30,107.70,41.90,45.20,74,66.50,103.50,105.70
8,01104　白石区,110.70,101.90,113.80,118.60,109,89.10,107.50,117.40,103,...,96.10,122,156.90,142.50,54.60,35.40,85,72.40,111.80,120.60
9,01105　豊平区,97.40,97.30,104.90,111.20,83.20,95.70,113.90,112.10,106.80,...,64.80,100.90,131.10,122,48.70,61.80,71.80,64.10,100.30,101.40


In [81]:
# Keep the overall figures plus total / stomach / colorectal / liver / lung
# cancer figures, each split into male and female.
# Each entry is (column name in the loaded sheet, new column name).
columns = [
  ('level_0', '地区'),
  ('level_1', '总死亡率男'),
  ('level_2', '总死亡率女'),

  ('総 数', '癌症总死亡率男'),
  ('Unnamed: 1', '癌症总死亡率女'),
  ('胃', '胃癌死亡率男'),
  ('Unnamed: 3', '胃癌死亡率女'),
  ('大 腸', '肠癌死亡率男'),
  ('Unnamed: 5', '肠癌死亡率女'),
  ('肝及び肝内胆管', '肝癌死亡率男'),
  ('Unnamed: 7', '肝癌死亡率女'),
  ('気管、気管支及び肺', '肺癌死亡率男'),
  ('Unnamed: 9', '肺癌死亡率女'),
]

# Split the pairs into the names to select and the names to assign.
source_names, renamed = zip(*columns)
life_df = life_df[list(source_names)]
life_df.columns = list(renamed)

In [82]:
# Preview the first 25 rows of the current table.
life_df.head(25)

Unnamed: 0,地区,总死亡率男,总死亡率女,癌症总死亡率男,癌症总死亡率女,胃癌死亡率男,胃癌死亡率女,肠癌死亡率男,肠癌死亡率女,肝癌死亡率男,肝癌死亡率女,肺癌死亡率男,肺癌死亡率女
0,,男性,女性,男性,女性,男性,女性,男性,女性,男性,女性,男性,女性
1,全 国,100,100,100,100,100,100,100,100,100,100,100,100
2,01北海道,101,97.60,107.70,108,95.20,93.80,105.30,110.20,92.30,84.90,116.70,123.40
3,10　札幌市保健所,97.50,94.40,105.70,109.30,91.90,87,100,109,96.80,90,113.60,127.90
4,札幌市,97.50,94.40,105.70,109.30,91.90,87,100,109,96.80,90,113.60,127.90
5,01101　中央区,96.60,89.30,110.40,118.70,90.70,96.80,112.20,119.10,117.90,99.90,106.90,136.20
6,01102　北区,99.70,94.10,109.70,109.20,98.90,91.20,104.60,123.50,104.70,78.30,116.90,128.50
7,01103　東区,102,99.30,107.30,109,97.10,76.70,94.70,106.80,98.50,85.30,121.50,121.20
8,01104　白石区,110.70,101.90,113.80,118.60,109,89.10,107.50,117.40,103,82.70,116.90,146.10
9,01105　豊平区,97.40,97.30,104.90,111.20,83.20,95.70,113.90,112.10,106.80,96,108.50,131


In [83]:
# From `地区`, keep only the rows whose label is a two-digit code followed
# directly by the name (e.g. '01北海道').

def is_county(s):
  """Return True if ``s`` looks like '<two-digit code><name>'.

  Rejected: non-strings (e.g. NaN), strings shorter than three characters,
  labels whose code is followed by whitespace such as '10　札幌市保健所', and
  labels with a longer numeric code such as '01101　中央区'.
  """
  if not isinstance(s, str) or len(s) < 3:
    return False
  following = s[2]
  # The third character must start the name: neither another digit (a longer
  # code) nor any kind of space (including the full-width space U+3000).
  return s[:2].isdigit() and not (following.isdigit() or following.isspace())

# Build the mask from life_df itself (health_df is not defined anywhere in the
# visible notebook). Copy the result so that later column assignments act on an
# independent frame rather than on a slice of the original.
life_df = life_df[life_df['地区'].map(is_county)].copy()
life_df

Unnamed: 0,地区,总死亡率男,总死亡率女,癌症总死亡率男,癌症总死亡率女,胃癌死亡率男,胃癌死亡率女,肠癌死亡率男,肠癌死亡率女,肝癌死亡率男,肝癌死亡率女,肺癌死亡率男,肺癌死亡率女
2,01北海道,101,97.60,107.70,108,95.20,93.80,105.30,110.20,92.30,84.90,116.70,123.40
222,02青　森,119.80,109.60,116.40,108.10,117.90,112.70,135.90,122.60,95.30,88.90,113.90,96.20
270,03岩　手,113.70,110.40,98.40,96.20,94.10,85.20,108.30,115.10,73.30,75.80,94.80,83.60
314,04宮　城,107,111.90,97.50,99.10,95.20,93.90,96.10,109.60,71.70,81.60,101.70,97.40
367,05秋　田,109.60,104.20,108.20,100.40,132.60,131.40,121.70,111.10,65.20,64.20,92.80,81.90
402,06山　形,100.90,101.10,99.50,99.70,120.70,123,96.60,103.40,72.40,83.80,97.90,84.20
442,07福　島,106.30,104.80,99.30,95.30,105.60,101.50,105,101.40,80.60,82.60,97.30,88.90
510,08茨　城,104.20,105.50,100.70,98.50,114.70,110.40,105.40,99.50,93.70,85,96.60,86.80
567,09栃　木,105.60,107.60,99.40,99.70,112.20,109.40,107.20,105.40,96.30,85.20,96.60,89.10
600,10群　馬,100.70,102.40,96.90,97.40,102.90,102.50,106,102.70,89.70,99.60,96.80,90.60


In [84]:
# Split `地区` into a two-character ID and a name with full-width spaces removed.
life_df = life_df.copy()  # independent frame: avoids SettingWithCopyWarning on a filtered slice
region = life_df['地区']
life_df['地区ID'] = region.map(lambda s: s[:2])
life_df['地区'] = region.map(lambda s: s[2:].replace('　', ''))

# Move `地区ID` to the first column. Selecting it by name (rather than "the
# last column") keeps the order right even if the column already existed.
correct_columns_seq = ['地区ID'] + [c for c in life_df.columns if c != '地区ID']
life_df = life_df[correct_columns_seq]

life_df = life_df.reset_index(drop=True)
life_df

Unnamed: 0,地区ID,地区,总死亡率男,总死亡率女,癌症总死亡率男,癌症总死亡率女,胃癌死亡率男,胃癌死亡率女,肠癌死亡率男,肠癌死亡率女,肝癌死亡率男,肝癌死亡率女,肺癌死亡率男,肺癌死亡率女
0,01,北海道,101,97.60,107.70,108,95.20,93.80,105.30,110.20,92.30,84.90,116.70,123.40
1,02,青森,119.80,109.60,116.40,108.10,117.90,112.70,135.90,122.60,95.30,88.90,113.90,96.20
2,03,岩手,113.70,110.40,98.40,96.20,94.10,85.20,108.30,115.10,73.30,75.80,94.80,83.60
3,04,宮城,107,111.90,97.50,99.10,95.20,93.90,96.10,109.60,71.70,81.60,101.70,97.40
4,05,秋田,109.60,104.20,108.20,100.40,132.60,131.40,121.70,111.10,65.20,64.20,92.80,81.90
5,06,山形,100.90,101.10,99.50,99.70,120.70,123,96.60,103.40,72.40,83.80,97.90,84.20
6,07,福島,106.30,104.80,99.30,95.30,105.60,101.50,105,101.40,80.60,82.60,97.30,88.90
7,08,茨城,104.20,105.50,100.70,98.50,114.70,110.40,105.40,99.50,93.70,85,96.60,86.80
8,09,栃木,105.60,107.60,99.40,99.70,112.20,109.40,107.20,105.40,96.30,85.20,96.60,89.10
9,10,群馬,100.70,102.40,96.90,97.40,102.90,102.50,106,102.70,89.70,99.60,96.80,90.60


In [85]:
# Let pandas write the file directly: the handle is closed when the call
# returns, the text is encoded as UTF-8 instead of the platform's locale
# encoding, and no extra blank line is appended by print().
life_df.to_csv('寿命.csv', encoding='utf-8')