## Setup: run these cells first (required by both the Experiment Code and the Full Run)

In [1]:
import numpy as np
import pandas as pd
import unidecode

In [2]:
# Keep DataFrame previews compact: at most 10 rows and 10 columns are shown,
# and the display width is limited to 20 characters.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10
pd.options.display.width = 20

In [3]:
# Directory that holds the source Excel workbooks.
input_path = '../../dataset/bentre'

# Workbook file name -> list of sheets to process.
# The integers are passed to `pd.read_excel(sheet_name=...)`, i.e. they are
# zero-based sheet positions.
input_files = {'So Lieu Man Ben Tre 2016.xlsx': [0, 1, 3, 4, 6, 7]}

In [4]:
# Directory the generated `<year>-<station>.csv` files are written to.
output_path = "../../dataset/csv/type2"

In [5]:
# Column layout of every generated CSV file: five fixed columns followed by
# one column per odd hour of the day (h01, h03, ..., h23).
output_columns = ['code', 'date', 'time', 'min', 'max']
output_columns += [f'h{hour:02d}' for hour in range(1, 24, 2)]

## Experiment Code

In [None]:
# experiment with the first configured workbook
file = [*input_files][0]
file

In [None]:
# sheets configured for the selected workbook
sheets = input_files[file]
sheets

In [None]:
# open the workbook once; individual sheets are parsed from this handle
filename = '/'.join([input_path, file])
excel = pd.ExcelFile(filename)

In [None]:
# parse one configured sheet (entry 5 of `sheets`) into a raw DataFrame
df = pd.read_excel(io=excel, sheet_name=sheets[5])

# summary of the raw frame: column dtypes and non-null counts
df.info()

In [None]:
# preview the first five raw rows
df.head(n=5)

In [None]:
# extract the station name and convert it to a station code
#
# The label is read from row 2, column 0 with a single positional lookup
# (`iloc[2, 0]`). The previous `df.iloc[2][0]` indexed a label-indexed Series
# with an integer key, which relies on a positional fallback that pandas has
# deprecated.
station = df.iloc[2, 0]
# keep the text after the first ':' (the whole text if there is no ':')
station = station[station.find(':') + 1:].strip()
# strip diacritics, upper-case and remove spaces to form the code
station = unidecode.unidecode(station).upper().replace(' ', '')

station

In [None]:
# extract year: the last four characters of the text in row 2, column 14
#
# `iloc[2, 14]` is a single positional lookup; the previous `df.iloc[2][14]`
# relied on the deprecated positional fallback of integer keys on a
# label-indexed Series.
# Surrounding whitespace is stripped first so that a trailing blank cannot
# shift the four-character window (e.g. '... 2016 ' would otherwise give 16).
year = df.iloc[2, 14].strip()
year = int(year[-4:])

year

In [None]:
# promote the 4th row (position 3) to be the column labels
header_row = df.iloc[3]
df.columns = header_row

df.head()

In [None]:
# discard the rows above the data: index labels 0-3, which include the
# header row that was just promoted to column labels
df = df.drop(index=range(4))

df.head()

In [None]:
# remove the mean column, whichever of the two labels ('BQ' or 'Mean') the
# sheet uses; labels that are absent are skipped silently
df = df.drop(columns=['BQ', 'Mean'], errors='ignore')

df.head()

In [None]:
# prepend the station code as the first column (same value on every row)
df.insert(loc=0, column=output_columns[0], value=station)

df.head()

In [None]:
# add an empty (all-NaN) time column at position 2
df.insert(loc=2, column=output_columns[2], value=np.nan)

df.head()

In [None]:
# map the sheet's header labels onto the output column names
column_names = {
    # the date column appears under either of these two labels
    'Ngày/Giờ': output_columns[1],
    'Date/Hour': output_columns[1],
    'Min': output_columns[3],
    'Max': output_columns[4],
}
# hourly headers 1.0, 3.0, ..., 23.0 map onto output_columns[5:] in order
column_names.update(zip(map(float, range(1, 24, 2)), output_columns[5:]))

# labels that are not present in the sheet are ignored
df = df.rename(columns=column_names, errors='ignore')

df.head()

In [None]:
# parse the date column and store it back as 'YYYY-MM-DD' text
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates.dt.strftime('%Y-%m-%d')

df.head()

In [None]:
# keep only rows that have at least 3 non-missing values
df = df.dropna(thresh=3)

df.head()

In [None]:
# select the output columns, in the order defined by `output_columns`
df = df.loc[:, output_columns]

df.head()

In [None]:
# write the result as '<year>-<station>.csv' without the index column
csv_file = f'{output_path}/{year}-{station}.csv'
df.to_csv(csv_file, index=False)

## Full Run: Process all configured files and their sheets

In [6]:
# Header labels found in the sheets -> output column names.
# Built once because it is identical for every sheet.
column_renames = {
    # the date column appears under either of these two labels
    'Ngày/Giờ': output_columns[1],
    'Date/Hour': output_columns[1],

    'Min': output_columns[3],
    'Max': output_columns[4],
    1.0: output_columns[5],
    3.0: output_columns[6],
    5.0: output_columns[7],
    7.0: output_columns[8],
    9.0: output_columns[9],
    11.0: output_columns[10],
    13.0: output_columns[11],
    15.0: output_columns[12],
    17.0: output_columns[13],
    19.0: output_columns[14],
    21.0: output_columns[15],
    23.0: output_columns[16],
}


def extract_station_code(raw: pd.DataFrame) -> str:
    """Return the station code derived from the text in row 2, column 0.

    The text after the first ':' (the whole text if there is none) is
    stripped of diacritics, upper-cased and has its spaces removed.
    """
    # single positional lookup; `raw.iloc[2][0]` would rely on the deprecated
    # positional fallback of integer keys on a label-indexed Series
    label = raw.iloc[2, 0]
    name = label[label.find(':') + 1:].strip()
    return unidecode.unidecode(name).upper().replace(' ', '')


def extract_year(raw: pd.DataFrame) -> int:
    """Return the year given by the last four characters of row 2, column 14.

    Surrounding whitespace is stripped first so that a trailing blank cannot
    shift the four-character window.
    """
    text = raw.iloc[2, 14].strip()
    return int(text[-4:])


def tidy_sheet(raw: pd.DataFrame, station: str) -> pd.DataFrame:
    """Convert one raw sheet into a frame with the `output_columns` layout.

    `raw` is not modified; a new DataFrame is returned.
    """
    # drop non-data rows (labels 0-3) and use the 4th row as column names
    tidy = raw.drop(index=range(4))
    tidy.columns = raw.iloc[3]

    # drop the BQ/Mean column, whichever label the sheet uses
    tidy = tidy.drop(columns=['BQ', 'Mean'], errors='ignore')

    # add the station code column and an empty (all-NaN) time column
    tidy.insert(0, output_columns[0], station)
    tidy.insert(2, output_columns[2], value=np.nan)

    tidy = tidy.rename(columns=column_renames, errors='ignore')

    # parse the dates and store them back as 'YYYY-MM-DD' text
    tidy['date'] = pd.to_datetime(tidy['date']).dt.strftime('%Y-%m-%d')

    # keep only rows that have at least 3 non-missing values
    tidy = tidy.dropna(thresh=3)

    # reorder columns
    return tidy[output_columns]


for file, sheets in input_files.items():
    # read excel file
    filename = f'{input_path}/{file}'
    print(f'Processing file: {filename}')

    # the context manager closes the workbook handle once all of its sheets
    # have been processed, even if one of them raises
    with pd.ExcelFile(filename) as excel:
        for sheet in sheets:
            print(f'Processing sheet: {sheet}')

            # read a sheet
            raw = pd.read_excel(excel, sheet_name=sheet)

            station = extract_station_code(raw)
            year = extract_year(raw)
            df = tidy_sheet(raw, station)

            # save to csv file
            df.to_csv(f'{output_path}/{year}-{station}.csv', index=False)

Processing file: ../../dataset/bentre/So Lieu Man Ben Tre 2016.xlsx
Processing sheet: 0
Processing sheet: 1
Processing sheet: 3
Processing sheet: 4
Processing sheet: 6
Processing sheet: 7
