# New approach for O$_3$ analysis over Japan. Part 2

## Init

In [1]:
import urllib.request
import zipfile
import glob, os, sys
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

## Parameters

In [2]:
# Root of the unzipped input files; run_1df appends '<prefecture>/<year>/<file name>' to it.
u_path = "../inp_data/2009-2017/uzip/"

## Data processing

### Prefectures of Japan

In [3]:
# Lookup table: first column of pref.txt -> third column of pref.txt.
# Presumably the columns are prefecture code, Japanese name, English name
# (tab-separated, with one header line) - verify against the file.
dic_p = {}
with open("../inp_data/pref.txt", "r", encoding='utf-8') as file:
    next(file, None)  # skip the header line; do not fail on an empty file
    for line in file:
        line = line.strip()
        if not line:
            # A blank line (typically the trailing one) would otherwise
            # break the three-way unpack below with a ValueError.
            continue
        key, _jp, eng = line.split("\t")
        dic_p[key] = eng

### Regions of Japan

In [4]:
def reg_names(col):
    """Return the region of Japan that contains prefecture *col*.

    *col* is compared against the English prefecture names listed below;
    any value that matches none of them yields ``None``.
    """
    region_members = (
        ('Hokkaido', ('Hokkaido',)),
        ('Tohoku', ('Aomori', 'Iwate', 'Miyagi', 'Akita', 'Yamagata', 'Fukushima')),
        ('Kanto', ('Ibaraki', 'Tochigi', 'Gunma', 'Saitama', 'Chiba', 'Tokyo', 'Kanagawa')),
        ('Chubu', ('Niigata', 'Toyama', 'Ishikawa', 'Fukui', 'Yamanashi', 'Nagano', 'Gifu',
                   'Shizuoka', 'Aichi')),
        ('Kansai', ('Mie', 'Shiga', 'Kyoto', 'Osaka', 'Hyogo', 'Nara', 'Wakayama')),
        ('Chugoku', ('Tottori', 'Shimane', 'Okayama', 'Hiroshima', 'Yamaguchi')),
        ('Shikoku', ('Tokushima', 'Kagawa', 'Ehime', 'Kochi')),
        ('Kyushu', ('Fukuoka', 'Saga', 'Nagasaki', 'Kumamoto', 'Oita', 'Miyazaki',
                    'Kagoshima', 'Okinawa')),
    )
    # Walk the table in order and stop at the first region listing the name.
    for region, prefectures in region_members:
        if col in prefectures:
            return region
    return None

### Collect df

In [5]:
def run_1df(year):
    """Build the hourly OX table for one year and pickle it.

    For every prefecture code 01..``pref_max`` the cp932-encoded CSV file
    ``<u_path><PP>/<YYYY>/j<PP><YYYY>_06.txt`` is read. The 24 hourly columns
    are melted into one row per hour, rows with an impossible calendar date,
    a missing value or an undefined reading (>= 9900) are dropped, prefecture
    and region names are attached, and the frame is written to
    ``../out_data/jap_ox_<YYYY>``.

    Uses the module-level names ``u_path``, ``pref_max``, ``dic_p`` and
    ``reg_names``.

    Args:
        year: Year used to build the input file names and the output name.

    Raises:
        SystemExit: If one of the input files does not exist; its path is
            printed first.
    """
    s_yr = str(year)

    # Read files and make df
    frames = []
    for pr in range(1, pref_max + 1):
        s_pr = str(pr).zfill(2)
        f_name = f'{u_path}{s_pr}/{s_yr}/j{s_pr}{s_yr}_06.txt'
        try:
            frames.append(pd.read_csv(f_name, encoding='cp932'))
        except FileNotFoundError:
            # Only a missing file is reported as such; parse or encoding
            # errors propagate with their own traceback instead of being
            # mislabelled as "No file".
            print(f'No file: {f_name}')
            sys.exit()

    df_s = pd.concat(frames)
    df_s = df_s.drop(columns=['測定項目コード', '測定単位コード'])
    print('\tFull df len:', len(df_s))

    # Add hour data: one row per (station, day, hour) instead of 24 columns.
    hour_cols = [f'{h:02d}h' for h in range(1, 25)]  # '01h' ... '24h'
    df_t = pd.melt(df_s,
                   id_vars=['測定年度', '測定局コード', '市町村コード', '測定月', '測定日'],
                   value_vars=hour_cols)
    # '01h' -> 0, ..., '24h' -> 23
    df_t['hour'] = df_t['variable'].str[:-1].astype(int) - 1

    # Cut wrong days, add index and drop columns.
    # errors='coerce' turns impossible dates into NaT, which dropna removes
    # together with any row that has a missing value.
    # NOTE(review): 測定年度 translates to "fiscal year"; if a file spans
    # April-March, the January-March rows would need year + 1 - confirm
    # against the data.
    df_t['index'] = pd.to_datetime(dict(year=df_t['測定年度'], month=df_t['測定月'],
                                        day=df_t['測定日'], hour=df_t['hour']),
                                   errors='coerce')
    df_t.dropna(inplace=True)

    # Replace Japanese names and index by timestamp
    df_t = (
        df_t.drop(columns=['variable'])
        .set_index('index')
        .rename(columns={
            'value': 'OX',
            '測定局コード': 'site_id',
            '市町村コード': 'city_id',
            '測定年度': 'year',
            '測定月': 'month',
            '測定日': 'day',
        })
    )

    # Remove undef ~9999
    df_t['OX'] = df_t['OX'].where(df_t['OX'] < 9900)
    df_t.dropna(inplace=True)

    # Add prefecture names: the prefecture code is the station code without
    # its last six digits. This is taken from the column itself rather than
    # through a row-wise apply, because a row of mixed int/float columns is
    # upcast to float and the code would then be rendered as e.g.
    # '1101010.0', giving a wrong slice.
    df_t['pref'] = df_t['site_id'].astype(str).str[:-6].map(dic_p)

    # Add region names
    df_t['region'] = df_t['pref'].apply(reg_names)

    # Check
    print(df_t.head())
    print(df_t.tail())

    # Save
    df_t.to_pickle('../out_data/jap_ox_' + s_yr)

### Main run

In [6]:
# Parent directory of the 2009-2017 input data (not referenced by the code shown in this notebook).
d_path = '../inp_data/2009-2017/'
# Inclusive [first, last] range of years to process.
year = [2017, 2017]
# Highest prefecture code; run_1df reads one file for each code 1..pref_max.
pref_max = 47
for yr in range(year[0], year[1]+1):
    print(yr)  # progress marker for the year about to be processed
    run_1df(yr)

2017
No file: ../inp_data/2009-2017/uzip/01/2017/j012017_06.txt


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
