#### 気象庁からDLしてきたデータを整形する

日別気温と日別湿度について、同地域を4か月毎に分けてDL  
対象地域と対象期間が分かるようにあらかじめファイル名へ埋め込み

In [251]:
import pandas as pd
import numpy as np
import glob

In [256]:
# Empty accumulator frame that the cleaned per-location results are appended to
all_location_df = pd.DataFrame(
    columns=[
        "ymd",
        "pref",
        "pref_sub",
        "area",
        "temperature",
        "humidity",
    ]
)

In [257]:
# Location keys to process, in alphabetical order.
# The spellings are used verbatim as identifiers, so they must not be "corrected".
t_pref = [
    "aichi", "akita", "aomori",
    "ehime",
    "fukushima",
    "gifu", "gunma",
    "hirosima",
    # Hokkaido entries carry a sub-region suffix
    "hokkaido_abashiri", "hokkaido_hidaka", "hokkaido_hiyama", "hokkaido_iburi",
    "hokkaido_ishikari", "hokkaido_kamikawa", "hokkaido_kushiro", "hokkaido_nemuro",
    "hokkaido_oshima", "hokkaido_rumoi", "hokkaido_shiribeshi", "hokkaido_sorachi",
    "hokkaido_soya", "hokkaido_tokachi",
    "hukui", "hukuoka", "hyogo",
    "ibaraki", "isikawa", "iwate",
    "kagawa", "kagosima", "kanagawa", "kochi", "kumamoto", "kyoto",
    "mie", "miyagi", "miyazaki",
    "nagano", "nagasaki", "nara", "nigata",
    "oita", "okayama", "okinawa", "osaka",
    "saga", "saitama", "siga", "simane", "sizuoka",
    "tiba", "tochigi", "tokushima", "tokyo", "tottori", "toyama",
    "wakayama",
    "yamagata", "yamaguchi", "yamanashi",
]

In [242]:
# Reshape the somewhat messy CSV exports downloaded from the Japan Meteorological Agency
def create_cleaned_df(t_name):
    """Load the three 2018 CSV exports of one location key and return them in long form.

    The files ``{t_name}_2018_1_4.csv``, ``{t_name}_2018_5_8.csv`` and
    ``{t_name}_2018_9_12.csv`` are read from
    ``../data/temperature_humidity_index/raw/`` (encoding ``sjis``, first two
    lines skipped, no header row).  Each file is treated as four label rows
    followed by data rows; column 0 holds the date and every following group
    of six columns belongs to one observation point.

    Args:
        t_name: Location key embedded in the file names; it is also written
            to the ``pref`` column of the result.

    Returns:
        A DataFrame with the columns ``ymd``, ``pref``, ``pref_sub``,
        ``area``, ``temperature`` and ``humidity``.  Cell values are kept as
        read from the CSV (no numeric conversion is done here).  The index
        restarts at 0 for every observation point.  If no observation point
        qualifies, an empty frame with the same columns is returned.

    Raises:
        FileNotFoundError: If one of the three input files is missing
            (raised by ``pd.read_csv``).
    """
    out_columns = ["ymd", "pref", "pref_sub", "area", "temperature", "humidity"]
    label_rows = 4      # rows at the top of each file that hold labels, not data
    cols_per_area = 6   # width of one observation point's column group

    # Load the three four-month files with identical parser settings
    parts = [
        pd.read_csv(
            "../data/temperature_humidity_index/raw/{0}_2018_{1}.csv".format(t_name, period),
            encoding="sjis",
            skiprows=2,
            header=None,
        )
        for period in ("1_4", "5_8", "9_12")
    ]
    # Keep the label rows of the first file only; the later files contribute data rows
    df = pd.concat([parts[0]] + [part[label_rows:] for part in parts[1:]])
    df = df.reset_index(drop=True)

    # Collect the per-point frames in a list and concatenate once at the end;
    # growing a DataFrame with concat inside the loop re-copies it every pass.
    area_frames = []
    for c_i in range(0, df.shape[1] - 1, cols_per_area):
        # Date column plus the six columns of this observation point
        area_df = df.loc[:, [0] + list(range(c_i + 1, c_i + cols_per_area + 1))]

        # Skip points whose second group column is "0" on the first data row
        # (presumably a flag meaning temperature is not recorded — TODO confirm
        # against the JMA file format)
        if area_df.iloc[label_rows, 2] == "0":
            continue

        # Pick up the classification labels before dropping the label rows
        pref_sub = area_df.iloc[0, 1]  # prefecture (JMA's own division)
        area = area_df.iloc[1, 1]      # observation area

        # Keep date, first group column (temperature) and fourth (humidity)
        area_df = area_df.iloc[label_rows:, [0, 1, 4]].reset_index(drop=True)
        area_df.columns = ["ymd", "temperature", "humidity"]

        # Attach the classification columns and fix the column order
        area_df["pref"] = t_name
        area_df["pref_sub"] = pref_sub
        area_df["area"] = area
        area_frames.append(area_df[out_columns])

    if not area_frames:
        return pd.DataFrame(columns=out_columns)
    return pd.concat(area_frames)

In [None]:
# Clean every location and add the results to the accumulator with a single
# concat; concatenating inside a loop re-copies the growing frame each time.
all_location_df = pd.concat(
    [all_location_df] + [create_cleaned_df(p_i) for p_i in t_pref]
)

In [None]:
# Save the combined table as an intermediate result (row index is not written)
all_location_df.to_csv(
    "../data/temperature_humidity_index/processed/all_location.csv",
    index=False,
)

In [334]:
# Compute the discomfort index for the rows that have a humidity value, then save
humidity_df = all_location_df[all_location_df["humidity"].notna()].copy()
# Cast both inputs to float64 before doing arithmetic on them
humidity_df["temperature"] = humidity_df["temperature"].astype(np.float64)
humidity_df["humidity"] = humidity_df["humidity"].astype(np.float64)

# Discomfort index, with T = temperature and H = humidity:
#   DI = 0.81 * T + 0.01 * H * (0.99 * T - 14.3) + 46.3
# Written as whole-column arithmetic, not per-element apply() calls.
humidity_df["hukai_index"] = (
    humidity_df["temperature"] * 0.81
    + humidity_df["humidity"] * 0.01 * (humidity_df["temperature"] * 0.99 - 14.3)
    + 46.3
)

# NOTE(review): this writes under ./data, while the other paths in this file
# point to ../data/temperature_humidity_index/... — confirm the target is intended.
humidity_df.to_csv("./data/location_with_humidity.csv", index=False)