## Run the following cells for either Experiment Code or Full Run

In [1]:
import numpy as np
import pandas as pd

import unidecode
from datetime import datetime

In [2]:
# configure how pandas renders DataFrames in cell outputs
for _option, _value in (
    ('display.max_rows', 10),
    ('display.max_columns', 10),
    ('display.width', 20),
):
    pd.set_option(_option, _value)

In [3]:
# folder that holds the source Excel workbooks
input_path = '../../dataset/bentre'

# workbook file name -> sheets to process; each entry is later passed
# to pd.read_excel(..., sheet_name=<entry>)
input_files = {'So Lieu Man Ben Tre 2016.xlsx': [2, 5]}

In [4]:
# folder the generated '<year>-<station>.csv' files are written to
output_path = '../../dataset/csv/type1'

In [5]:
# column names of the generated CSV, in output order; later cells refer to
# them by position (0: station code, 1: date, 2: min, 3: max)
output_columns = ['code', 'date', 'min', 'max']

In [6]:
# define method to be applied toward new column 'date'
def calculate_date(row, year=None):
    """Build the calendar date for one row of the melted (long-format) table.

    Parameters
    ----------
    row : mapping-like (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must provide ``row['month']`` and ``row['day']``. The month value is
        converted with ``(row['month'] // 2) + 1``, so 1, 3, ..., 23 map to
        the calendar months 1..12.
    year : int, optional
        Year of the date. When omitted, the notebook-level variable ``year``
        is used, so existing ``df.apply(calculate_date, axis=1)`` calls keep
        working unchanged.

    Returns
    -------
    datetime.datetime or None
        ``None`` when the day is missing or the day/month combination is not
        a real date (such as 02/31).

    Raises
    ------
    NameError
        If ``year`` is not passed and no notebook-level ``year`` exists.
    """
    if year is None:
        # fall back to the notebook-level `year`, looked up at call time
        try:
            year = globals()['year']
        except KeyError:
            raise NameError("name 'year' is not defined") from None

    day = row['day']
    # a blank day cell (NaN/None) would make datetime() raise TypeError
    if pd.isna(day):
        return None

    try:
        return datetime(year=year, month=(row['month'] // 2) + 1, day=day)
    except ValueError:  # handle invalid days of months, such as 02/31
        return None

## Experiment Code

In [None]:
# experiment with the first configured workbook
file = list(input_files)[0]
file

In [None]:
# sheets configured for the selected workbook
sheets = input_files[file]
sheets

In [None]:
# read excel file: open the workbook once; the next cell reads a sheet from `excel`
filename = f'{input_path}/{file}'
excel = pd.ExcelFile(filename)

In [None]:
# read a sheet: the second configured one (sheets[1]) of the opened workbook
df = pd.read_excel(excel, sheet_name=sheets[1])

# column dtypes and non-null counts of the raw sheet
df.info()

In [None]:
# preview the first rows of the raw sheet
df.head()

In [None]:
# extract station and convert it to station code
# The label sits in the third row, first column. Use explicit positional
# [row, column] indexing: df.iloc[2][0] looks up the integer 0 on a Series
# indexed by column labels, which only works through the deprecated
# "integer key as position" fallback.
station = df.iloc[2, 0]
# keep the text after the first ':' (the whole text when there is no ':')
station = station[station.find(':') + 1:].strip()
# transliterate to ASCII, upper-case and remove spaces
station = unidecode.unidecode(station).upper().replace(' ', '')

station

In [None]:
# extract year
# The cell in the third row, twelfth column holds text whose last four
# characters are the year. Use explicit positional [row, column] indexing
# instead of df.iloc[2][11], which depends on the deprecated positional
# fallback of integer keys on a label-indexed Series.
year = df.iloc[2, 11]
year = int(year[-4:])

year

In [None]:
# relabel the columns with their positions 0..n-1
df.columns = list(range(df.shape[1]))

df.head()

In [None]:
# drop non-data rows: the ones labelled 0 through 4 (first 5 of the sheet)
df.drop(index=list(range(5)), inplace=True)

df.head()

In [None]:
# merge max + min column pairs into ones for unpivot: each odd column is
# overwritten with the text '<following column>/<this column>'
for c in range(1, len(df.columns), 2):
    first_part = df.iloc[:, c + 1].astype('str')
    second_part = df.iloc[:, c].astype('str')
    df.iloc[:, c] = first_part + '/' + second_part

df.head()

In [None]:
# drop even columns (min ones): their values now live in the merged odd columns
df.drop(columns=list(range(2, len(df.columns), 2)), inplace=True)

df.head()

In [None]:
# unpivot the dataframe from wide (column based) format to long (row based) one:
# column 0 is kept as the identifier, each remaining column label goes into
# 'month' and its cell text into 'min/max'
df = df.melt(id_vars=[0], var_name='month', value_name='min/max')

df.head()

In [None]:
# give the identifier column (label 0) its real name
df = df.rename(columns={0: 'day'})

df.head()

In [None]:
# remove summary rows if any: those whose 'day' cell is the text 'Max/min'
is_summary = df['day'] == 'Max/min'
df.drop(df.index[is_summary], errors='ignore', inplace=True)

df.tail()

In [None]:
# add station code column: first column, named output_columns[0] ('code'),
# filled with the same station value on every row
df.insert(0, output_columns[0], station)

df.head()

In [None]:
# calculate 'date' column (output_columns[1]) row by row; calculate_date
# yields None for day/month combinations that are not real dates
df[output_columns[1]] = df.apply(calculate_date, axis=1)

df.head()

In [None]:
# reconstruct min and max columns: the text before '/' goes to
# output_columns[2] ('min'), the text after it to output_columns[3] ('max')
min_max_parts = df['min/max'].str.split('/', expand=True)
df[output_columns[2:4]] = min_max_parts

df.head()

In [None]:
# drop the helper columns that were only needed to build 'date', 'min' and 'max'
df.drop(columns=['day', 'month', 'min/max'], inplace=True)

df.head()

In [None]:
# review the rows for which no valid date could be built
df.loc[df['date'].isna()]

In [None]:
# check row counts before dropping the rows without a valid date
df.info()

In [None]:
# drop all rows that have an invalid (null) date
rows_without_date = df.index[df['date'].isnull()]
df.drop(rows_without_date, inplace=True)

df.info()

In [None]:
# review head rows after the rows without a valid date were removed
df.head()

In [None]:
# review tail rows after the rows without a valid date were removed
df.tail()

In [None]:
# convert min and max columns to float dtype; text that is not a number
# becomes NaN (errors='coerce')
for _column in ('min', 'max'):
    df[_column] = pd.to_numeric(df[_column], downcast='float', errors='coerce')

df.info()

In [None]:
# drop all rows that have less than 3 columns filled out: both min and max are missing
# (thresh=3 keeps a row only if at least 3 of its values are non-null)
df.dropna(thresh=3, inplace=True)

df.info()

In [None]:
# preview the last rows of the cleaned table
df.tail()

In [None]:
# save to csv file named '<year>-<station>.csv' inside output_path, without the index column
df.to_csv(f'{output_path}/{year}-{station}.csv', index = False)

## Full Run: Process all configured files and their sheets

In [8]:
# Full run: convert every configured sheet of every configured workbook into
# one CSV file with the columns listed in output_columns.
for file, sheet_ids in input_files.items():
    # read excel file
    filename = f'{input_path}/{file}'
    print(f'Processing file: {filename}')

    # open the workbook in a with-block so its file handle is closed once all
    # sheets are processed, or as soon as one of them raises
    with pd.ExcelFile(filename) as excel:
        for sheet in sheet_ids:
            print(f'Processing sheet: {sheet}')

            # read a sheet
            df = pd.read_excel(excel, sheet_name=sheet)

            # extract station and convert it to station code
            # (explicit positional [row, column] lookup; df.iloc[2][0] relies on
            # the deprecated positional fallback of integer keys on a Series)
            station = df.iloc[2, 0]
            # keep the text after the first ':' (the whole text when there is no ':')
            station = station[station.find(':') + 1:].strip()
            station = unidecode.unidecode(station).upper().replace(' ', '')

            # extract year: the last four characters of the cell at row 2, column 11
            # NOTE: `year` must stay a notebook-level variable because
            # calculate_date looks it up as a global
            year = df.iloc[2, 11]
            year = int(year[-4:])

            # name columns by position 0..n-1
            df.columns = list(range(len(df.columns)))

            # drop non-data rows: first 5 ones
            df.drop([*range(5)], inplace=True)

            # merge max + min column pairs into ones for unpivot: each odd column
            # becomes the text '<following column>/<this column>'
            for c in range(1, len(df.columns), 2):
                df.iloc[:, c] = df.iloc[:, c + 1].astype('str') + '/' + df.iloc[:, c].astype('str')

            # drop even columns (min ones)
            df.drop([*range(2, len(df.columns), 2)], axis=1, inplace=True)

            # unpivot the dataframe from wide (column based) format to long (row based) one
            df = df.melt(id_vars=[0], var_name='month', value_name='min/max')

            # rename first column
            df.rename(columns={0: 'day'}, inplace=True)

            # remove summary row if any
            df.drop(df[df['day'] == 'Max/min'].index, errors='ignore', inplace=True)

            # add station code column
            df.insert(0, output_columns[0], station)

            # calculate 'date' column
            df[output_columns[1]] = df.apply(calculate_date, axis=1)

            # reconstruct min and max columns
            df[[output_columns[2], output_columns[3]]] = df['min/max'].str.split('/', expand=True)

            # drop unnecessary columns
            df.drop(['day', 'month', 'min/max'], axis=1, inplace=True)

            # drop all rows that have an invalid (null) date
            df.drop(df[df['date'].isnull()].index, inplace=True)

            # convert min and max columns to float dtype (non-numeric text becomes NaN)
            df['min'] = pd.to_numeric(df['min'], downcast='float', errors='coerce')
            df['max'] = pd.to_numeric(df['max'], downcast='float', errors='coerce')

            # drop all rows that have less than 3 columns filled out: both min and max are missing
            df.dropna(thresh=3, inplace=True)

            # save to csv file
            df.to_csv(f'{output_path}/{year}-{station}.csv', index=False)

Processing file: ../../dataset/bentre/So Lieu Man Ben Tre 2016.xlsx
Processing sheet: 2
Processing sheet: 5
