In [None]:
import os
import sys
import numpy as np
import pandas as pd
import re
import json
import time
from datetime import date, datetime

In [None]:
# Wall-clock start time; the last cell subtracts it from time.time() to report elapsed seconds.
stime = time.time() 

# 【df_survey_combine.csv】

In [None]:
# Directory layout, kept as lists of path components (joined later with os.path.join).
year = '109'  # year label, used as a sub-directory name under the input folder
folder_src = ['farmer_income_survey', 'resources']
folder_input = [*folder_src, 'input']
folder_output = [*folder_src, 'output']
# Per-year input directories.
folder_csv = [*folder_input, year, 'csv']
folder_db_cache = [*folder_input, year, 'db_cache']

In [None]:
# File names (without the .csv extension) expected under the csv directory.
# The list position matters: later cells read files by index (path_csv_list[1] .. path_csv_list[12]).
csv_list = [
    '00_update', # not used for year 109
    '01_調查名冊',
    '02_農保',
    '03_農職',
    '04_國保給付1-8月',
    '05_勞就保給付1-8',
    '06_勞退1-8月退休金',
    '07_農保給付1-8月',
    '08_住院',
    '09_門診',
    '10_健保應收保險費',
    '11_勞就保1-8月應計保險費',
    '12_國保1-8月實收保險費',
    # --------------------------------------
    '990_主選名冊',
    '991_備選3套名冊',
    '992_備選4-6套名冊',
]
# Full path of every file, in the same order as csv_list.
path_csv_list = [os.path.join(*folder_csv, f'{name}.csv') for name in csv_list]
# Names of the files that are missing on disk.
tmp = [
    f'{name}.csv'
    for name, path_csv in zip(csv_list, path_csv_list)
    if not os.path.isfile(path_csv)
]
if not tmp:
    print('csv目錄下的檔案都存在')
else:
    print(tmp, '不存在')

In [None]:
# Base names of the files expected under the db_cache directory (stored on disk as db_<name>.csv).
db_cache_list = [
    'household',  # household registry (all members)
    'fallow_declare',  # fallow / crop-conversion declarations
    'fallow_transfer_subsidy',  # fallow / crop-conversion subsidies
    'small_large_tenant_information_landlord_id',  # small-landlord / big-tenant programme: landlord IDs
    'small_large_tenant_information_tenant_id',  # small-landlord / big-tenant programme: tenant IDs
    'small_large_tenant_transfer',  # small-landlord / big-tenant programme: conversion subsidy
    'small_large_landlord_rent',  # small-landlord / big-tenant programme: rent payments
    'small_large_landlord_retire',  # small-landlord / big-tenant programme: retirement incentive
    'disaster_subsidy',  # natural-disaster relief
    'child_scholarship',  # children's scholarships
    'livestock',  # livestock
    'elder_allowance',  # elderly farmer allowance
]
# Map each base name to its full path.
path_db_cache = {
    name: os.path.join(*folder_db_cache, f'db_{name}.csv')
    for name in db_cache_list
}
# Names of the files that are missing on disk.
tmp = [
    f'db_{name}.csv'
    for name, path_name in path_db_cache.items()
    if not os.path.isfile(path_name)
]
if not tmp:
    print('db_cache目錄下的檔案都存在')
else:
    print(tmp, '不存在')

In [None]:
# Keyword arguments shared by the pd.read_csv calls in this notebook.
init = {
    # 'engine': 'python',
    'encoding': 'utf-8',
    'sep': ',',
    'keep_default_na': False,  # turn off the default NA markers (e.g. blank cells are not auto-converted to NaN)
}

# Uniform string normalisation for ID columns
def SE_upper(se):
    """Normalise a string Series: strip, replace '次' with '??', drop spaces, upper-case.

    Both replacements are regular-expression character classes, so ``regex=True``
    is passed explicitly. pandas < 2.0 treated multi-character patterns as regex
    by default, pandas >= 2.0 treats them as literals, which would silently leave
    the spaces in place.

    Parameters
    ----------
    se : pandas.Series
        Series expected to hold strings.

    Returns
    -------
    pandas.Series
        The normalised Series. If ``se`` cannot be processed with the ``.str``
        accessor (e.g. it is numeric), a message is printed and ``se`` is
        returned unchanged (best-effort behaviour kept from the original).
    """
    try:
        cleaned = se.str.strip()
        cleaned = cleaned.str.replace('[次]', '??', regex=True)
        cleaned = cleaned.str.replace('[ \u3000　]', '', regex=True)  # ASCII and full-width spaces
        return cleaned.str.upper()
    except Exception:  # keep best-effort, but no longer swallow KeyboardInterrupt/SystemExit
        print(f'{se.name}: SE_upper有問題，dtype={se.dtype}')
        return se

# Attach per-household data to a roster, then export the primary and backup samples
def combine_to_csv(df1, df2, fname, killhhn=False):
    """Left-join ``df2`` onto ``df1`` by household number and write two CSV files.

    ``new_{fname}_0.csv`` receives the primary sample and ``new_{fname}_1.csv``
    the backup samples. The '樣本套號' column is always dropped from the output;
    with ``killhhn=True`` the 'householdNumber' column is dropped as well and the
    primary/backup split is decided by the last character of 'link_num'
    instead of by '樣本套號'.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Roster with 'householdNumber' and '樣本套號' columns (plus 'link_num'
        when ``killhhn`` is true).
    df2 : pandas.DataFrame
        Per-household data keyed by 'householdNumber'.
    fname : str
        Middle part of the output file names.
    killhhn : bool, default False
        See above.
    """
    merged = df1.merge(df2, on='householdNumber', how='left')
    if killhhn:
        unwanted = ['樣本套號', 'householdNumber']
    else:
        unwanted = ['樣本套號']

    # Primary sample
    primary = True & merged['樣本套號'].str.contains('0', regex=True)
    if killhhn:
        primary = merged['link_num'].str[-1] == '0'
    merged[primary].drop(columns=unwanted).to_csv(f'new_{fname}_0.csv', index=False)

    # Backup samples
    backup = True & merged['樣本套號'].str.contains('[123456]', regex=True)
    if killhhn:
        backup = merged['link_num'].str[-1].str.contains('[123456]', regex=True)
    merged[backup].drop(columns=unwanted).to_csv(f'new_{fname}_1.csv', index=False)

# 0.調查名冊 df_survey

In [None]:
# Column names of the survey roster (presumably the full CSV header - TODO confirm); each is mapped to str below.
cols = [
    'SEQ', '編號', '層別', '樣本套號', '戶長姓名', '市內電話', '分機', '手機號碼', '地址',
    '總銷售收入組距', '非對象註記', '總處連結編號', 'ID', '連結編號', '主備標記', '縣市', '區鄉鎮名稱',
    'adcode', '農戶編號', '可耕作地面積', '主要經營型態', '戶內15歲以上人口',
    '調查員', '刪除',
    'from', 'tel_u', 'phone_u', 'addr_u', 'row_u', 'tel_new', 'phone_new', 'addr_new', '更新市內', '更新手機', '更新地址'
]
dtype = {col: str for col in cols}
# usecols: columns actually loaded; usecols10: the 10 columns kept at the end of this cell.
usecols = ['農戶編號', 'ID', '戶長姓名', '層別', '樣本套號', '主備標記', '連結編號', 'tel_new', 'phone_new', 'addr_new', '地址']
usecols10 = usecols[:7]+['電話', '地址', 'county']
# Read the roster (second entry of csv_list) and put the columns in usecols order.
df_survey = pd.read_csv(path_csv_list[1], **init, dtype=dtype, usecols=usecols)[usecols]
# Column handling
df_survey['連結編號'] = df_survey['連結編號'].str.zfill(5)  # left-pad with zeros to five characters
df_survey['電話'] = (df_survey['tel_new']+'/\n'+df_survey['phone_new']).str.strip('/\n')  # landline + mobile merged into one column
# Address handling: the county is taken from the roster's old address, because (per the
# original note) the updated address very often lacks the county.
df_survey['county_new'] = df_survey['addr_new'].str.replace('^.*?(..[縣市]).*$', r'\1', regex=True).str[:3]
df_survey['county'] = df_survey['地址'].str.replace('^.*?(..[縣市]).*$', r'\1', regex=True).str[:3]
# Where the two extracted counties differ, prefix the old county onto the new address.
# NOTE(review): this also fires when addr_new names a genuinely different county - confirm that is intended.
where = df_survey['county_new'] != df_survey['county']
df_survey['addr_new'] = np.where(where, df_survey['county']+df_survey['addr_new'], df_survey['addr_new'])
del df_survey['地址']
del df_survey['county_new']
df_survey.rename(columns={'addr_new': '地址'}, inplace=True)
# The same person may appear in several sample sets: sort, then keep only the row with the
# smallest set number per ID. Per the original note, rows that had no ID were given IDs starting with 'ZID'.
print(df_survey.shape)
df_survey = df_survey.sort_values(by=['ID', '樣本套號']).drop_duplicates(subset=['ID'], keep='first').reset_index(drop=True)[usecols10]
print(df_survey.shape)
df_survey[:3]

In [None]:
# Expected county / city names (three characters each, matching the extracted 'county' column).
county_all = [
    '臺北市',
    '新北市',
    '桃園市',
    '臺中市',
    '臺南市',
    '高雄市',
    '新竹縣',
    '苗栗縣',
    '彰化縣',
    '南投縣',
    '雲林縣',
    '嘉義縣',
    '屏東縣',
    '宜蘭縣',
    '花蓮縣',
    '臺東縣',
    '澎湖縣',
    '金門縣',
    '連江縣',
    '基隆市',
    '新竹市',
    '嘉義市',
]
# Sanity check: display the rows whose county is not in the expected list.
where = df_survey['county'].isin(county_all)
df_survey[~where]

# 1.全戶戶籍檔 df_hh

In [None]:
cols = ['pid', 'birth', 'householdNumber', 'address', 'role', 'annotation']
dtype = {col: str for col in cols}
cols_map = {'pid': 'ID'}
# (1) Read the household registry and drop duplicated IDs
df_hh = pd.read_csv(path_db_cache['household'], **init, dtype=dtype).rename(columns=cols_map).drop_duplicates(subset='ID')
# (2) The first three characters are the ROC year and the last four are month/day;
#     '-02' means two years before ROC year 1. Leading zeros are stripped before the int cast.
df_hh['birth'] = df_hh['birth'].str.zfill(7).str[:3].str.lstrip('0').str.replace('-0', '-').astype(int)  # ROC birth year
# NOTE(review): age is relative to the year the notebook is run, not to the survey `year` - confirm this is intended.
df_hh['age'] = (date.today().year - 1911) - df_hh['birth']
# (3) Household head handling
df_hh['role_order'] = np.where(df_hh['role'] == '戶長', 0, 1)  # household head sorts first
df_hh['ID_leader'] = np.where(df_hh['role'] == '戶長', df_hh['ID'], np.nan)
df_hh = df_hh.sort_values(by=['householdNumber', 'role_order'])  # head comes first within each household
# Forward-fill the head's ID *within each household*. The previous ungrouped
# fillna(method='ffill') (a deprecated API) copied the preceding household's head
# into any household that has no head row; such households now keep NaN.
df_hh['ID_leader'] = df_hh.groupby('householdNumber')['ID_leader'].ffill()
del df_hh['role_order']
df_hh = df_hh.sort_values(by=['householdNumber', 'ID']).reset_index(drop=True)
#
print(df_hh.shape)  # number of people in the household registry
df_hh[:3]

In [None]:
# Frequency of each value in the registry's 'annotation' column.
df_hh.annotation.value_counts()

# 2.調查名冊 + 【戶籍資料】

In [None]:
# Look up the household-registry record of every ID in the survey roster (left join on ID),
# then order the result by household number and role.
on_how = {'on': 'ID', 'how': 'left'}
orderby = ['householdNumber', 'role']
df_survey = df_survey.merge(df_hh, **on_how).sort_values(by=orderby).reset_index(drop=True)
#
print(df_survey.shape)
df_survey[:2]

In [None]:
# Deliberately set this respondent's household number equal to their own ID.
# Uses .loc because chained assignment (df[col][idx] = value) may write to a
# temporary copy and is a no-op under pandas Copy-on-Write.
where = df_survey['ID'] == 'T102811151'
idx = df_survey[where].index
df_survey.loc[idx, 'householdNumber'] = 'T102811151'
df_survey[where]

# 3.戶籍檔 +【樣本套號】

In [None]:
# Every ID in the household registry shares a household with at least one roster ID;
# the roster ID's sample-set number becomes the registry member's sample-set number.
# Two roster entries can belong to the same household with equal or different set numbers
# (e.g. H121797480 and U220247830 in the roster, a couple in sets 0 and 5 respectively),
# so a registry member may carry more than one set number: the strings are concatenated via sum().
df_survey_flag = df_survey[['householdNumber', '樣本套號']].dropna(subset=['householdNumber']).drop_duplicates()
df_survey_flag = df_survey_flag.groupby('householdNumber', as_index=False).sum()
# Attach the concatenated set numbers to every registry member of the household.
df_hh = df_hh.merge(df_survey_flag, on='householdNumber', how='left')
df_hh

# 4.勞健保資料 (讀取csv目錄中的11個.csv)

### 農保 / 農職

In [None]:
def _read_sorted_ids(path):
    """Read the '身份證字號' column of ``path``, de-duplicate, normalise and sort it."""
    ids = pd.read_csv(path, usecols=['身份證字號'], **init).drop_duplicates()
    ids['身份證字號'] = SE_upper(ids['身份證字號'])
    return ids.sort_values('身份證字號')

# 02 - farmers' health insurance (csv_list[2])
df_02 = _read_sorted_ids(path_csv_list[2])

# 03 - farmers' occupational insurance (csv_list[3])
df_03 = _read_sorted_ids(path_csv_list[3])

### 國保 / 勞就保

In [None]:
# 04 - national pension payments, Jan-Aug (csv_list[4]) ____________________________________________
df_04 = pd.read_csv(path_csv_list[4], **init, dtype={'給付種類': str, '核付年月': str})
df_04['身分證號'] = SE_upper(df_04['身分證號'])
df_04 = df_04.sort_values(by=['身分證號', '給付種類', '核付年月']).reset_index(drop=True)
# (1) For each ID / payment type, take the amount of the last month on file and extend it to
#     the remaining months of the year (12 - month), except 60 (maternity) and 66 (funeral),
#     which are one-off payments.
#     `delta` and the grouped frame below are matched by position: both list the
#     (ID, payment type) groups in the sorted order established above.
delta = df_04.groupby(by=['身分證號', '給付種類']).tail(1).reset_index(drop=True)
delta = delta['金額']*(12-delta['核付年月'].str[-2:].astype(int)) * ~delta['給付種類'].isin(['60', '66'])
# numeric_only=True makes explicit what pandas 1.x did implicitly: string columns such as
# '核付年月' are left out of the sum instead of being concatenated (the pandas 2.x default).
df_04 = df_04.groupby(by=['身分證號', '給付種類'], as_index=False).sum(numeric_only=True)
df_04['金額'] = df_04['金額'] + delta
# (2) Total per ID across payment types
df_04 = df_04.groupby('身分證號').sum(numeric_only=True).reset_index()
df_04.rename(columns={'身分證號': 'ID', '金額': 'df_04'}, inplace=True)
#
# 05 - labor / employment insurance payments, Jan-Aug (csv_list[5]) ________________________________
# Labor insurance only. Per the original note, one record (P120440487) has a negative amount that
# was exported to CSV as (xxx); the cell format has to be fixed in Excel before exporting.
df_05 = pd.read_csv(path_csv_list[5], **init, dtype={'給付種類': str, '核付年月': str, '金額（元）': int})
df_05['身分證號'] = SE_upper(df_05['身分證號'])
df_05 = df_05.sort_values(by=['身分證號', '給付種類', '核付年月']).reset_index(drop=True)
# (1) Same extension as above; here only the payment types listed in `isin` recur monthly,
#     every other type is treated as one-off.
delta = df_05.groupby(by=['身分證號', '給付種類']).tail(1).reset_index(drop=True)
isin = ['45', '48', '35', '36', '37', '38', '55', '56', '57', '59']
delta = delta['金額（元）']*(12-delta['核付年月'].str[-2:].astype(int)) * delta['給付種類'].isin(isin)
df_05 = df_05.groupby(by=['身分證號', '給付種類'], as_index=False).sum(numeric_only=True)
df_05['金額（元）'] = df_05['金額（元）'] + delta
# (2) Total per ID across payment types
df_05 = df_05.groupby('身分證號').sum(numeric_only=True).reset_index()
df_05.rename(columns={'身分證號': 'ID', '金額（元）': 'df_05'}, inplace=True)

### 剩下 7 個 / 老農津貼

In [None]:
# 06 - labor pension payments, Jan-Aug (csv_list[6]) _______________________________________________
df_06 = pd.read_csv(path_csv_list[6], **init, dtype={'請領種類': str, '核付年月': str})
df_06['身分證'] = SE_upper(df_06['身分證'])
# Total per ID regardless of claim type. numeric_only=True keeps the string columns out of the
# sum explicitly (pandas 1.x dropped them implicitly, pandas 2.x would concatenate them).
df_06 = df_06.sort_values('身分證').groupby(['身分證'], as_index=False).sum(numeric_only=True)
df_06.rename(columns={'身分證': 'ID', '金額(元)': 'df_06'}, inplace=True)

# 07 - farmers' insurance payments, Jan-Aug (csv_list[7]) __________________________________________
df_07 = pd.read_csv(path_csv_list[7], **init, dtype={'給付種類': str, '給付年月': str})
df_07['身分證字號'] = SE_upper(df_07['身分證字號'])
# Total per ID regardless of payment type / month
df_07 = df_07.sort_values('身分證字號').groupby(['身分證字號'], as_index=False).sum(numeric_only=True)
df_07.rename(columns={'身分證字號': 'ID', '核付總金額(元)': 'df_07'}, inplace=True)

# 08 - hospitalisation (csv_list[8]) _______________________________________________________________
df_08 = pd.read_csv(path_csv_list[8], **init, usecols=['ID', '住院日數'])
df_08['ID'] = SE_upper(df_08['ID'])
df_08 = df_08.sort_values('ID').rename(columns={'住院日數': 'df_08'})

# 09 - outpatient visits (csv_list[9]) _____________________________________________________________
df_09 = pd.read_csv(path_csv_list[9], **init, usecols=['ID', '1-8月門診件數'])
df_09['ID'] = SE_upper(df_09['ID'])
df_09 = df_09.sort_values('ID').rename(columns={'1-8月門診件數': 'df_09'})

# 10 - health-insurance premiums receivable (csv_list[10]) _________________________________________
cols_map = {
    '受訪者身分證字號': 'ID',
    '本會調查表健保身分別': 'df_10a', '被保險人註記': 'df_10b',
    '應繳眷口數(人)': 'df_10c', '1-8月自付金額': 'df_10d',
}
df_10 = pd.read_csv(path_csv_list[10], **init, dtype={'本會調查表健保身分別': str}, usecols=cols_map.keys())
df_10['受訪者身分證字號'] = SE_upper(df_10['受訪者身分證字號'])
df_10 = df_10.sort_values('受訪者身分證字號').rename(columns=cols_map)

# 11 - labor / employment insurance premiums due, Jan-Aug (csv_list[11]) ___________________________
first = ['身分證字號']
rest = ['被保險人負擔勞保費', '被保險人負擔就保費', '保費年月']
df_11 = pd.read_csv(path_csv_list[11], **init, dtype={'保費年月': str}, usecols=first+rest)
df_11['身分證字號'] = SE_upper(df_11['身分證字號'])
df_11.sort_values(by=['身分證字號', '保費年月'], inplace=True)
# Keep only the last month per ID; .copy() so the column assignments below act on an
# independent frame rather than on a slice of the original.
df_11 = df_11.groupby('身分證字號').tail(1).copy()
# Blank cells count as 0. Raw strings avoid the invalid '\s' escape of a normal string literal.
df_11['被保險人負擔勞保費'] = df_11['被保險人負擔勞保費'].replace(r'^\s*$', 0, regex=True).astype(int)
df_11['被保險人負擔就保費'] = df_11['被保險人負擔就保費'].replace(r'^\s*$', 0, regex=True).astype(int)
df_11['df_11'] = df_11['被保險人負擔勞保費'] + df_11['被保險人負擔就保費']
df_11 = df_11.drop(columns=rest).rename(columns={'身分證字號': 'ID'})

# 12 - national pension premiums received, Jan-Aug (csv_list[12]) __________________________________
usecols = ['被保險人IDN', '實收保險費(元)', '繳費年月']
df_12 = pd.read_csv(path_csv_list[12], **init, dtype={'實收保險費(元)': int, '繳費年月': str}, usecols=usecols)
df_12['被保險人IDN'] = SE_upper(df_12['被保險人IDN'])
df_12.sort_values(by=['被保險人IDN', '繳費年月'], inplace=True)
df_12 = df_12.groupby('被保險人IDN').tail(1)  # keep only the last month per ID
df_12 = df_12.drop(columns=['繳費年月']).rename(columns={'被保險人IDN': 'ID', '實收保險費(元)': 'df_12'})

# Elderly farmer allowance; per the original note, pid has no duplicates ===========================
df_EA = pd.read_csv(path_db_cache['elder_allowance'], **init)
df_EA['pid'] = SE_upper(df_EA['pid'])
df_EA.rename(columns={'pid': 'ID', 'amount': 'df_EA'}, inplace=True)

## 4-1: 戶籍檔 + 【勞健保資料】

In [None]:
def _int_or_blank(se):
    """Cast ``se`` to int and show missing values as ''.

    -123 is used as a temporary placeholder for NaN so the int cast succeeds;
    a genuine value of -123 would therefore also be shown as ''.
    """
    return se.fillna(-123).astype(int).replace(-123, '')

df_hh['annotation'] = df_hh['annotation'].str.replace('現住人口', '')  # blank out the dominant 'current resident' value; only the rarer ones (e.g. deceased) are of interest
# df_0203: farmers' health / occupational insurance flags, only for people aged 15 or more ________
df_hh['df_02'] = np.where(df_hh['ID'].isin(df_02['身份證字號']), 'Y', 'N')
df_hh['df_03'] = np.where(df_hh['ID'].isin(df_03['身份證字號']), '/Y', '/N')
df_hh['df_0203'] = np.where(df_hh['age'] >= 15, df_hh['df_02'] + df_hh['df_03'], '')
df_hh.drop(columns=['df_02', 'df_03'], inplace=True)
# df_EA: elderly farmer allowance, only for people aged 65 or more
df_hh = df_hh.merge(df_EA, on='ID', how='left')
df_hh['df_EA'] = np.where(df_hh['age'] >= 65, df_hh['df_EA'], np.nan)
df_hh['df_EA'] = _int_or_blank(df_hh['df_EA'])
# df_04: national pension payments
df_hh = df_hh.merge(df_04, on='ID', how='left')
df_hh['df_04'] = _int_or_blank(df_hh['df_04'])
# df_05: labor insurance payments
df_hh = df_hh.merge(df_05, on='ID', how='left')
df_hh['df_05'] = _int_or_blank(df_hh['df_05'])
# df_06: labor pension payments, Jan-Aug
df_hh = df_hh.merge(df_06, on='ID', how='left')
df_hh['df_06'] = _int_or_blank(df_hh['df_06'])
# df_07: farmers' insurance payments, Jan-Aug
df_hh = df_hh.merge(df_07, on='ID', how='left')
df_hh['df_07'] = _int_or_blank(df_hh['df_07'])
# df_08: days in hospital
df_hh = df_hh.merge(df_08, on='ID', how='left')
df_hh['df_08'] = df_hh['df_08'].fillna(0).astype(int)
# df_09: number of outpatient visits
df_hh = df_hh.merge(df_09, on='ID', how='left')
df_hh['df_09'] = df_hh['df_09'].fillna(0).astype(int)
# df_10: health insurance - status / insured-person flag / number of dependants / out-of-pocket amount
df_hh = df_hh.merge(df_10, on='ID', how='left')
# Assign the result back: calling fillna(inplace=True) on a selected column is chained
# in-place mutation, which does not update the frame under pandas Copy-on-Write.
df_hh['df_10a'] = df_hh['df_10a'].fillna('')
df_hh['df_10b'] = df_hh['df_10b'].fillna('')
df_hh['df_10c'] = _int_or_blank(df_hh['df_10c'])
df_hh['df_10d'] = df_hh['df_10d'].fillna(0).astype(int)
# df_11: last month's labor + employment insurance premiums
df_hh = df_hh.merge(df_11, on='ID', how='left')
df_hh['df_11'] = _int_or_blank(df_hh['df_11'])
# df_12: last month's national pension premium received
df_hh = df_hh.merge(df_12, on='ID', how='left')
df_hh['df_12'] = _int_or_blank(df_hh['df_12'])
#
del df_hh['address']
df_hh.sort_values(by=['householdNumber', 'birth', 'role', 'ID'], inplace=True)
#
df_hh[:3]

## 4-2: 每一戶號，集中戶員勞健保資訊

In [None]:
# Per-member fields that are collected into one list per household.
cols_tar = [
    'birth', 'role', 'annotation',
    'df_0203', 'df_EA', 'df_04', 'df_05', 'df_06', 'df_07', 'df_08',
    'df_09', 'df_10a', 'df_10b', 'df_10c', 'df_10d', 'df_11', 'df_12',
    'ID',
]
# Each member row becomes a one-element list holding that member's field list ([:, None, :]),
# so that sum() within a household concatenates them into a list of member lists.
df_hh_combine = pd.DataFrame({'householdNumber': df_hh['householdNumber'], 'df_hh': df_hh[cols_tar].values[:, None, :].tolist()})
df_hh_combine = df_hh_combine.groupby('householdNumber', as_index=False).sum()
#
print(df_hh_combine.shape)
df_hh_combine[:3]

# 5.休耕轉作【申報】 db_fallow_declare

## 5-1: 作物名稱處理

In [None]:
# Load the fallow / crop-conversion declarations and attach each applicant's household number and age.
df_FD = pd.read_csv(path_db_cache['fallow_declare'], **init)
df_FD['applicantId'] = SE_upper(df_FD['applicantId'])
df_FD = df_FD.merge(df_hh[['ID', 'householdNumber', 'age']], left_on='applicantId', right_on='ID', how='left').drop(columns=['ID'])
df_FD = df_FD[df_FD['age'] >= 20]  # applicants must be at least 20 (usually the household head applies, who must be an adult)
# Derive rice names from the approved areas. An area can be 0 while a name is still present because
# the declaration was not approved, so only a truthy area yields a name; otherwise ''.
# NOTE(review): astype(bool) assumes the area columns are numeric; a string such as '0' would be truthy - confirm dtype.
df_FD['Rice1'] = df_FD['japonicaApproveArea'].astype(bool) * np.array(['梗稻'], dtype=object)
df_FD['Rice2'] = df_FD['indicaApproveArea'].astype(bool) * np.array(['秈稻'], dtype=object)
df_FD['Rice3'] = df_FD['glutinousApproveArea'].astype(bool) * np.array(['糯稻'], dtype=object)
# Keep a conversion-crop name only when its approved area is truthy
df_FD['TCrop1'] = df_FD['approveTransferArea1'].astype(bool) * df_FD['approveTransferCrop1']
df_FD['TCrop2'] = df_FD['approveTransferArea2'].astype(bool) * df_FD['approveTransferCrop2']
df_FD['TCrop3'] = df_FD['approveTransferArea3'].astype(bool) * df_FD['approveTransferCrop3']
# Keep only the household number and the six crop-name columns.
cols_tar = ['householdNumber', 'Rice1', 'Rice2', 'Rice3', 'TCrop1', 'TCrop2', 'TCrop3']
df_FD = df_FD[cols_tar].sort_values('householdNumber')
#
print(df_FD.shape)
df_FD[:3]

## 5-2: 每一戶號，集中戶員的作物名稱

In [None]:
# One list of crop names per declaration row; sum() concatenates the lists within a household.
df_FD_combine = pd.DataFrame({'householdNumber': df_FD['householdNumber'], 'df_FD': df_FD[cols_tar[1:]].values.tolist()})
df_FD_combine = df_FD_combine.groupby('householdNumber', as_index=False).sum()
# De-duplicate, drop empty names, sort, and join into a single ', '-separated string.
df_FD_combine['df_FD'] = df_FD_combine['df_FD'].map(lambda x: ', '.join(sorted(list(filter(None, set(x))))))
#
print(df_FD_combine.shape)
df_FD_combine[:3]

# 6.休耕轉作【補貼】 db_fallow_transfer_subsidy

## 6-1: 讀檔

In [None]:
# Load the fallow / crop-conversion subsidies and attach each farmer's household number and age.
df_FTS = pd.read_csv(path_db_cache['fallow_transfer_subsidy'], **init)
df_FTS['farmerId'] = SE_upper(df_FTS['farmerId'])
df_FTS = df_FTS.merge(df_hh[['ID', 'householdNumber', 'age']], left_on='farmerId', right_on='ID', how='left').drop(columns=['ID'])
# Grouping / sort keys, reused by the next cell.
cols = ['householdNumber', 'period', 'subName']
df_FTS = df_FTS[df_FTS['age'] >= 20].drop(columns=['age']).sort_values(by=cols)  # applicants must be at least 20 (usually the household head applies, who must be an adult)
#
print(df_FTS.shape)
df_FTS[:3]

## 6-2: 每一戶號，集中戶員的補貼資訊

In [None]:
# Total per household / period / crop. numeric_only=True keeps string columns such as
# 'farmerId' out of the sum explicitly (pandas 1.x dropped them implicitly, pandas 2.x
# would concatenate them).
df_FTS_combine = df_FTS.groupby(by=cols, as_index=False).sum(numeric_only=True)
tmp1 = df_FTS_combine[['subName', 'subsidy', 'period']].values.tolist()
def tmp2(x):
    """Wrap one [crop, amount, period] row as a one-element list holding a dict."""
    return [dict(zip(['crop', 'amount', 'period'], x))]
# One single-dict list per row, paired with its household number by position.
df_FTS_combine = pd.DataFrame({'householdNumber': df_FTS_combine['householdNumber'], 'df_FTS': pd.Series(tmp1).map(tmp2)})
df_FTS_combine = df_FTS_combine.groupby('householdNumber', as_index=False).sum()  # concatenate all dicts of a household into one list
#
print(df_FTS_combine.shape)
df_FTS_combine[:3]

# 7.天然災害現金救助 db_disaster_subsidy

## 7-1: 讀檔

In [None]:
# Load the natural-disaster relief records. The minimum applicant age is unknown, so no age filter is applied.
df_DS = pd.read_csv(path_db_cache['disaster_subsidy'], **init)
df_DS['applicantId'] = SE_upper(df_DS['applicantId'])
df_DS = df_DS.merge(df_hh[['ID', 'householdNumber']], left_on='applicantId', right_on='ID', how='left').drop(columns=['ID'])
# Grouping / sort keys, reused by the next cell.
cols = ['householdNumber', 'eventName', 'approveCrop']
df_DS.sort_values(by=cols, inplace=True)
#
print(df_DS.shape)
df_DS[:3]

## 7-2: 每一戶號，集中戶員的天災救助資訊

In [None]:
# Total per household / event / crop. numeric_only=True is required for the positional
# mapping below: without it pandas 2.x keeps (concatenated) string columns such as
# 'applicantId', which would shift the columns picked up by iloc[:, 1:].
df_DS_combine = df_DS.groupby(by=cols, as_index=False).sum(numeric_only=True)
# Every column except householdNumber, in frame order.
# NOTE(review): assumes the first two numeric columns are area and amount - confirm against the CSV header.
tmp1 = df_DS_combine.iloc[:, 1:].values.tolist()
def tmp2(x):
    """Wrap one row as a one-element list holding a dict (zip stops after the four named fields)."""
    return [dict(zip(['event_name', 'crop', 'area', 'amount'], x))]
# One single-dict list per row, paired with its household number by position.
df_DS_combine = pd.DataFrame({'householdNumber': df_DS_combine['householdNumber'], 'df_DS': pd.Series(tmp1).map(tmp2)})
df_DS_combine = df_DS_combine.groupby('householdNumber', as_index=False).sum()  # concatenate all dicts of a household into one list
#
print(df_DS_combine.shape)
df_DS_combine[:3]

# 8.畜牧資料 livestock

## 8-1: 讀檔

In [None]:
# Load the livestock survey and attach each farmer's household number and age.
df_LS = pd.read_csv(path_db_cache['livestock'], **init, dtype={'investigateYear': 'str'})
df_LS['farmerId'] = SE_upper(df_LS['farmerId'])
df_LS = df_LS.merge(df_hh[['ID', 'householdNumber', 'age']], left_on='farmerId', right_on='ID', how='left').drop(columns=['ID'])
# Farmers must be at least 18; farmerId T102811151 is additionally kept regardless of age
where = df_LS['age'] >= 18
where |= df_LS['farmerId'] == 'T102811151'
df_LS = df_LS[where].drop(columns=['age'])
# Column normalisation
df_LS['investigateSeason'] = SE_upper(df_LS['investigateSeason'])
df_LS['fieldName'] = SE_upper(df_LS['fieldName'])
df_LS['animalName'] = SE_upper(df_LS['animalName'])
# (1) Egg-laying poultry: when the raised count is 0 and the slaughter count is neither 0 nor '',
#     show '出清' in place of the raised count
where = True
where &= df_LS['animalName'].str.match('^.*蛋.*(雞|鴨|鵝|鵪鶉|鴿)')
where &= df_LS['raiseCount'] == 0
where &= (df_LS['slaughterCount'] != 0) & (df_LS['slaughterCount'] != '')
df_LS['raiseCount'] = df_LS['raiseCount'].where(~where, '出清')  # Series.where is used because, per the original note, np.where turns 0 into '0'
# (2) Poultry whose name has no '蛋' before the species: hide the slaughter count
where = True
where &= df_LS['animalName'].str.match('^[^蛋]*(雞|鴨|鵝|鵪鶉|鴿)')
df_LS['slaughterCount'] = df_LS['slaughterCount'].where(~where, '')
# (3) Milk yield / antler / eggs (per the original note, egg counts are already divided by 1000).
#     Later assignments win, so the precedence is eggs > antler > milk.
df_LS['MAE'] = None
df_LS['MAE'] = np.where(df_LS['milkCount'] > 0, '產乳量\n(公斤)', df_LS['MAE'])
df_LS['MAE'] = np.where(df_LS['antlerCount'] > 0, '鹿茸', df_LS['MAE'])
df_LS['MAE'] = np.where(df_LS['eggCount'] > 0, '產蛋量\n(千個)', df_LS['MAE'])
df_LS['MAE_count'] = 0
df_LS['MAE_count'] = df_LS['MAE_count'].astype(object)   # object dtype so ints and floats coexist and ints are not shown with a decimal point
df_LS['MAE_count'] = np.where(df_LS['milkCount'] > 0, df_LS['milkCount'], df_LS['MAE_count'])
df_LS['MAE_count'] = np.where(df_LS['antlerCount'] > 0, df_LS['antlerCount'], df_LS['MAE_count'])
df_LS['MAE_count'] = np.where(df_LS['eggCount'] > 0, df_LS['eggCount'], df_LS['MAE_count'])
# (4) Explicit sort order for dairy cattle / goats; every other animal gets 9
df_LS['nameOrder'] = np.where(df_LS['animalName'] == '泌乳牛', 0, 9)
df_LS['nameOrder'] = np.where(df_LS['animalName'] == '乾乳牛', 1, df_LS['nameOrder'])
df_LS['nameOrder'] = np.where(df_LS['animalName'] == '未產女牛', 2, df_LS['nameOrder'])
df_LS['nameOrder'] = np.where(df_LS['animalName'] == '泌乳羊', 3, df_LS['nameOrder'])
df_LS['nameOrder'] = np.where(df_LS['animalName'] == '乾乳羊', 4, df_LS['nameOrder'])
df_LS['nameOrder'] = np.where(df_LS['animalName'] == '未產女羊', 5, df_LS['nameOrder'])
# ______________________________________________________________________________________________________
# Column order doubles as sort priority; nameOrder is only a sort key and is dropped afterwards.
cols_tar = [
    'householdNumber', 'fieldName',
    'investigateYear', 'investigateSeason',
    'nameOrder', 'animalName',
    'raiseCount', 'slaughterCount', 'MAE', 'MAE_count',
    'milkCount', 'antlerCount', 'eggCount', 'farmerId'
]
df_LS = df_LS[cols_tar].sort_values(by=cols_tar).drop(columns=['nameOrder'])
cols_tar.remove('nameOrder')
#
print(df_LS.shape)
df_LS[:5]

In [None]:
# Give this farmer a household number equal to their own ID, so that df_survey can be
# matched to df_LS by household number.
# Uses .loc because chained assignment (df[col][idx] = value) may write to a
# temporary copy and is a no-op under pandas Copy-on-Write.
where = df_LS.farmerId == 'T102811151'
idx = df_LS[where]['householdNumber'].index
df_LS.loc[idx, 'householdNumber'] = 'T102811151'
df_LS[where]

## 8-2: 每一戶號，集中畜牧場資訊

In [None]:
# Start from householdNumber + fieldName. .copy() makes this an independent frame, so the
# column assignment below does not target a slice of df_LS (avoids SettingWithCopyWarning).
df_LS_combine = df_LS[cols_tar[:2]].copy()
# Each survey row becomes a one-element list holding its display fields (cols_tar[2:-4]).
df_LS_combine['df_LS'] = df_LS[cols_tar[2:-4]].values[:, None, :].tolist()
# Concatenate the rows of every (household, farm) pair.
df_LS_combine = df_LS_combine.groupby(by=cols_tar[:2], as_index=False).sum()
# Re-wrap as [[fieldName, rows]] so that the next sum() yields a list of (fieldName, rows) pairs.
df_LS_combine['df_LS'] = df_LS_combine.iloc[:, 1:].values[:, None, :].tolist()
del df_LS_combine['fieldName']
df_LS_combine = df_LS_combine.groupby('householdNumber', as_index=False).sum()
df_LS_combine['df_LS'] = df_LS_combine['df_LS'].map(lambda x: dict(x))  # keyed by farm name
#
print(df_LS_combine.shape)
df_LS_combine[:3]

# 9.小大 small_large_data

## 9-1: 讀檔

In [None]:
# Landlord IDs and tenant IDs (each a normalised Series)
df_LID = SE_upper(pd.read_csv(path_db_cache['small_large_tenant_information_landlord_id'], **init)['ownerId'])
df_TID = SE_upper(pd.read_csv(path_db_cache['small_large_tenant_information_tenant_id'], **init)['tenantId'])
# ====================================================================================
# >>> Each of the three frames has only its own amount column; reindex adds the two it lacks
#     (tenant conversion subsidy / landlord rent payment / retirement incentive), filled with 0.
cols_tar = ['pid', 'period', 'small_large_tenant_transfer', 'small_large_landlord_rent', 'small_large_landlord_retire']
df_SL1 = pd.read_csv(path_db_cache['small_large_tenant_transfer'], **init).rename(columns={"subsidy": "small_large_tenant_transfer"})
df_SL2 = pd.read_csv(path_db_cache['small_large_landlord_rent'], **init).rename(columns={"subsidy": "small_large_landlord_rent"})
df_SL3 = pd.read_csv(path_db_cache['small_large_landlord_retire'], **init).rename(columns={"subsidy": "small_large_landlord_retire"})
df_SL1 = df_SL1.reindex(cols_tar, axis=1).fillna(0)
df_SL2 = df_SL2.reindex(cols_tar, axis=1).fillna(0)
df_SL3 = df_SL3.reindex(cols_tar, axis=1).fillna(0)
# ====================================================================================
# >>> Stack the three tables vertically and cast the three amount columns to int
df_SL = pd.concat([df_SL1, df_SL2, df_SL3]).rename(columns={'pid': 'ID'}).astype(dict(zip(cols_tar[-3:], [int]*3)))
df_SL['ID'] = SE_upper(df_SL['ID'])
df_SL = df_SL.groupby(['ID', 'period'], as_index=False).sum()  # the three amounts per person per period
df_SL = df_SL.merge(df_hh[['ID', 'householdNumber', 'role']], on='ID', how='left').merge(df_survey[['ID', '戶長姓名']], on='ID', how='left')
df_SL['name_or_role'] = np.where(df_SL['戶長姓名'].notna(), df_SL['戶長姓名'], df_SL['role'])  # show the person's name when known, otherwise their household role
# landlord_or_tenant
df_SL['landlord_or_tenant'] = np.where(df_SL['ID'].isin(df_LID), '小地主', '')
df_SL['landlord_or_tenant'] = np.where(df_SL['ID'].isin(df_TID), df_SL['landlord_or_tenant']+'大專業農', df_SL['landlord_or_tenant'])
df_SL['landlord_or_tenant'] = df_SL['landlord_or_tenant'].str.replace('小地主大專業農', '小地主/大專業農')
# Replace 0 with '' (per the original note, to match the format used by a colleague's code)
df_SL['small_large_tenant_transfer'] = df_SL['small_large_tenant_transfer'].replace(0, '')
df_SL['small_large_landlord_rent'] = df_SL['small_large_landlord_rent'].replace(0, '')
df_SL['small_large_landlord_retire'] = df_SL['small_large_landlord_retire'].replace(0, '')
# Final column selection and order
cols_tar = [
    'householdNumber', 'name_or_role',
    'small_large_tenant_transfer', 'small_large_landlord_rent', 'small_large_landlord_retire',
    'period', 'landlord_or_tenant'
]
df_SL = df_SL[cols_tar].sort_values(by=cols_tar[:2]+['period']).reset_index(drop=True)  # renumber the index from 0 so the Series combined in the next cell line up
#
df_SL[:3]

## 9-2: 每一戶號，集中小大資訊

In [None]:
# Turn every row (all columns except householdNumber) into a one-element list holding a dict.
tmpSe = pd.Series(df_SL[cols_tar[1:]].values.tolist()).map(lambda x: [dict(zip(cols_tar[1:], x))])
df_SL_combine = pd.DataFrame({'householdNumber': df_SL['householdNumber'], 'df_SL': tmpSe})  # both Series must share the same 0-based running index to line up
# Concatenate all dicts of a household into one list.
df_SL_combine = df_SL_combine.groupby('householdNumber', as_index=False).sum()
#
print(df_SL_combine.shape)
df_SL_combine[:3]

# 10.子女獎助學金 child_scholarship

In [None]:
usecols = ['applicantId', 'studentName', 'amount']
df_CS = pd.read_csv(path_db_cache['child_scholarship'], **init, usecols=usecols).rename(columns={'applicantId': 'ID'})
df_CS['ID'] = SE_upper(df_CS['ID'])
df_CS['studentName'] = SE_upper(df_CS['studentName'])
# Parents are assumed to be at least 30, because (per the original note) a child must be 15 or
# older (senior high school) to apply for the scholarship
df_CS = df_CS.merge(df_hh[['ID', 'householdNumber', 'age']], on='ID', how='left').sort_values(by=['householdNumber', 'studentName'])
df_CS = df_CS[df_CS['age'] >= 30].drop(columns=['age'])
# Total per household / student across periods. numeric_only=True keeps the string 'ID'
# column out of the result: under the pandas 2.x default it would be concatenated, survive
# into df_CS_combine and add an unexpected column to the final merge.
df_CS = df_CS.groupby(['householdNumber', 'studentName'], as_index=False).sum(numeric_only=True)
df_CS['studentName'] = df_CS['studentName'] + '-' + df_CS['amount'].astype(str) + ', '
del df_CS['amount']
# Per-household summary: concatenate the 'name-amount, ' strings _____________________
df_CS_combine = df_CS.groupby('householdNumber', as_index=False).sum().rename(columns={'studentName': 'df_CS'})
# ', $' is a regular expression (trailing separator), so regex=True must be explicit:
# pandas >= 2.0 treats the pattern as a literal by default and would leave the separator in place.
df_CS_combine['df_CS'] = df_CS_combine['df_CS'].str.replace(', $', '', regex=True)
#
print(df_CS_combine.shape)
df_CS_combine[:3]

# 11.調查名冊總整理 df_survey_combine

In [None]:
# Source columns taken from df_survey, in output order.
cols_src = [
    '農戶編號', 'ID', '戶長姓名', 'birth', '層別', '連結編號', '電話', '地址',
    'householdNumber', 
    'county',
    # 'name', 'tel', 'phone', 'addr'
]
# Output column names: the first 10 rename cols_src one-to-one; 8 more columns follow 'county'.
cols_tar = [
    'farmer_num', 'id', 'name', 'birth', 'layer', 'link_num', 'tel', 'addr',
    'householdNumber',
    'county',    
    'household', 'fallow_declare', 'fallow_transfer_subsidy', 'disaster_subsidy',
    'livestock', 'small_large_data', 'crop_name', 'child_scholarship',
]

## 11-1: 名冊每個人增加戶內人口資料

In [None]:
# (The first five statements of this cell were pasted twice; the first copy was immediately
# overwritten by the second, so a single copy is kept.)
df_survey_combine = df_survey[cols_src].reset_index(drop=True)
# Column handling: show a missing birth year as '' (-123 is a temporary placeholder for NaN)
df_survey_combine['birth'] = df_survey_combine['birth'].fillna(-123).astype(int).replace(-123, '')
# Rename the 10 source columns to the first 10 target names
df_survey_combine = df_survey_combine.set_axis(cols_tar[:10], axis=1)
# Combine 8 more columns ____________________________________________________________________
# Households without data get an empty container ([] / {} / '') instead of NaN.
# 1. All members of the same household from the registry
df_survey_combine = df_survey_combine.merge(df_hh_combine, on='householdNumber', how='left')
df_survey_combine['df_hh'] = np.where(df_survey_combine['df_hh'].isna(), pd.Series([[]]), df_survey_combine['df_hh'])
# 2. Fallow / crop-conversion declarations per household
df_survey_combine = df_survey_combine.merge(df_FD_combine, on='householdNumber', how='left')
# Assign the result back: fillna(inplace=True) on a selected column is chained in-place
# mutation, which does not update the frame under pandas Copy-on-Write.
df_survey_combine['df_FD'] = df_survey_combine['df_FD'].fillna('')
# 3. Fallow / crop-conversion subsidies per household
df_survey_combine = df_survey_combine.merge(df_FTS_combine, on='householdNumber', how='left')
df_survey_combine['df_FTS'] = np.where(df_survey_combine['df_FTS'].isna(), pd.Series([[]]), df_survey_combine['df_FTS'])
# 4. Natural-disaster relief per household
df_survey_combine = df_survey_combine.merge(df_DS_combine, on='householdNumber', how='left')
df_survey_combine['df_DS'] = np.where(df_survey_combine['df_DS'].isna(), pd.Series([[]]), df_survey_combine['df_DS'])
# 5. Livestock data per household
df_survey_combine = df_survey_combine.merge(df_LS_combine, on='householdNumber', how='left')
df_survey_combine['df_LS'] = np.where(df_survey_combine['df_LS'].isna(), pd.Series([{}]), df_survey_combine['df_LS'])
# 6. Small-landlord / big-tenant data per household
df_survey_combine = df_survey_combine.merge(df_SL_combine, on='householdNumber', how='left')
df_survey_combine['df_SL'] = np.where(df_survey_combine['df_SL'].isna(), pd.Series([[]]), df_survey_combine['df_SL'])
# 7. crop: union of the crop names from declarations / subsidies / disaster relief
df_survey_combine['df_FD_crop'] = df_survey_combine['df_FD'].map(lambda x: x.split(', '))
df_survey_combine['df_FTS_crop'] = df_survey_combine['df_FTS'].map(lambda x: list(set([item['crop'] for item in x])))
df_survey_combine['df_DS_crop'] = df_survey_combine['df_DS'].map(lambda x: list(set([item['crop'] for item in x])))
df_survey_combine['crop'] = df_survey_combine['df_FD_crop'] + df_survey_combine['df_FTS_crop'] + df_survey_combine['df_DS_crop']
df_survey_combine['crop'] = df_survey_combine['crop'].map(lambda x: ', '.join(sorted(list(filter(None, set(x))))))
df_survey_combine.drop(columns=['df_FD_crop', 'df_FTS_crop', 'df_DS_crop'], inplace=True)
# 8. Children's scholarships
df_survey_combine = df_survey_combine.merge(df_CS_combine, on='householdNumber', how='left')
df_survey_combine['df_CS'] = df_survey_combine['df_CS'].fillna('')
# ________________________________________________________________________
# Apply the final column names (positional: 10 renamed + 8 combined columns)
df_survey_combine = df_survey_combine.set_axis(cols_tar, axis=1).sort_values(by='farmer_num')
#
print(df_survey_combine.shape)
df_survey_combine[:2]

## 11-2: 增加欄位【調查員】 (man)

In [None]:
# Add the investigator column ('man') with the same placeholder value on every row.
df_survey_combine['man'] = '暫時'
#
df_survey_combine[:1]

## 11-3: 輸出【df_survey_combine.csv】

In [None]:
# Write the combined roster to df_survey_combine.csv in the current working directory, without the index.
df_survey_combine.to_csv('df_survey_combine.csv', index=False)

In [None]:
# Seconds elapsed since stime was recorded at the top of the notebook.
time.time() - stime