## Setup: run the following cells before either the Experiment Code or the Automation Code (full run)

In [1]:
import numpy as np
import pandas as pd

import unidecode
from datetime import datetime

In [2]:
# configure pandas display options (keep previews of frames compact)
for option_name, option_value in (
    ('display.max_rows', 10),
    ('display.max_columns', 10),
    ('display.width', 20),
):
    pd.set_option(option_name, option_value)

In [3]:
# Directory that contains the source Excel workbooks.
input_path = '../../dataset/bentre'
# Maps each workbook (path relative to input_path) to the list of sheet indexes to process.
input_files = {
    'So Lieu Man Ben Tre 2016.xlsx': [2, 5],  # GHoa, MHoa
    'So Lieu Man Ben Tre 2017.xlsx': [4],  # MHoa
    'So Lieu Man Ben Tre 2018.xlsx': [0, 1, 2, 3, 4, 5, 6, 7],  # Vam Giong, Binh Dai, Giao Hoa, An Thuan, Son Doc, My Hoa, Ben Trai, Huong My
    
    # 2018 additional workbooks: only sheet 0 is processed for each of them.
    # NOTE: these keys start with '/', so joining them to input_path with '/' yields a double slash.
    '/solieuman Do bo sung_2018/001 PHƯỚC LONG  NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/002 TIÊN THỦY NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/003 MỎ CÀY NĂM 2018.xlsx': [0], 
    '/solieuman Do bo sung_2018/004 TÂN THIỀNG NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/005 TÍCH THIỆN NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/006 SƠN PHÚ NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/007 SƠN ĐỐC NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/008 CÁI MÍT NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/009 CẨM SƠN NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/010 THÀNH THỚI A NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/011 CỐNG NÀNG ÂM NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/012 MỸ THÀNH NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/013 LỘC THUẬN NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/014 VŨNG LIÊM NĂM 2018.xlsx': [0],
    '/solieuman Do bo sung_2018/015 HƯNG KHÁNH TRUNG B NĂM 2018.xlsx': [0],
}

In [4]:
# Directory that receives the generated CSV files (named '<year>-<station>.csv').
output_path = '../../dataset/csv/type1'

In [5]:
# column layout of the generated CSV files, referenced by position further down:
# [0] station code, [1] date, [2] min value, [3] max value
output_columns = ['code', 'date', 'min', 'max']

In [6]:
# define method to be applied toward new column 'date'
def calculate_date(row, base_year=None):
    """Build the calendar date for one row of the unpivoted sheet.

    Parameters
    ----------
    row : mapping with 'month' and 'day' entries
        'month' is the position of the month's column in the wide sheet
        (1, 3, 5, ...); it is converted to a month number with
        ``position // 2 + 1``. 'day' is the day of the month.
    base_year : int, optional
        Year of the date. When omitted, the notebook-level ``year`` variable
        is used, so ``df.apply(calculate_date, axis=1)`` keeps working as is.

    Returns
    -------
    datetime or None
        None when the values do not form a valid date (such as 02/31) or
        when 'month' / 'day' hold non-numeric text or NaN.
    """
    # the global is only looked up when no explicit year is supplied
    target_year = year if base_year is None else base_year
    try:
        month_number = int(row['month']) // 2 + 1
        return datetime(year=target_year, month=month_number, day=int(row['day']))
    except ValueError:  # handle invalid days of months, such as 02/31
        return None

## Experiment Code

In [7]:
# pick one workbook by its position in the configuration (17 -> the last entry)
file = list(input_files)[17]

file

'/solieuman Do bo sung_2018/015 HƯNG KHÁNH TRUNG B NĂM 2018.xlsx'

In [8]:
# sheet indexes configured for the selected workbook
sheets = input_files[file]

sheets

[0]

In [9]:
# open the workbook: its path is the input folder joined with the configured key
filename = '/'.join((input_path, file))
excel = pd.ExcelFile(filename)

filename

'../../dataset/bentre//solieuman Do bo sung_2018/015 HƯNG KHÁNH TRUNG B NĂM 2018.xlsx'

In [10]:
# load the first configured sheet, reading all values with dtype=str
sheet_index = sheets[0]
df = pd.read_excel(excel, sheet_name=sheet_index, dtype=str)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 11 columns):
KẾT QUẢ ĐO MẶN (g/l) TRẠM HƯNG KHÁNH TRUNG B NĂM 2018    35 non-null object
Unnamed: 1                                               34 non-null object
Unnamed: 2                                               33 non-null object
Unnamed: 3                                               31 non-null object
Unnamed: 4                                               30 non-null object
Unnamed: 5                                               34 non-null object
Unnamed: 6                                               33 non-null object
Unnamed: 7                                               33 non-null object
Unnamed: 8                                               32 non-null object
Unnamed: 9                                               34 non-null object
Unnamed: 10                                              33 non-null object
dtypes: object(11)
memory usage: 3.1+ KB


In [11]:
# preview the raw sheet; the header rows are still part of the data at this point
df.head()

Unnamed: 0,KẾT QUẢ ĐO MẶN (g/l) TRẠM HƯNG KHÁNH TRUNG B NĂM 2018,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,...,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Tháng,1,,2,,...,,4,,5,
1,Ngày,Max,Min,Max,Min,...,Min,Max,Min,Max,Min
2,1,0,0,0,0,...,0,0,0,0,0
3,2,0,0,0,0,...,0,0,0,0,0
4,3,0,0,0,0,...,0,0,0,0,0


In [12]:
# extract marker to detect what type of sheet
# (one positional lookup: the chained form df.iloc[0][0] relies on the deprecated
# integer-key fallback of a label-indexed Series)
marker = df.iloc[0, 0]

marker

'Tháng'

In [13]:
# normalise the marker text (transliterated, upper case, trimmed) and map it to a
# sheet-type code: 18 -> 2018 additional, 0 -> the rest
marker_text = unidecode.unidecode(str(marker)).upper().strip()
marker = 18 if marker_text in ('THANG', 'MONTH') else 0

marker

18

In [14]:
# extract station and convert it to station code
if marker == 18:  # 2018 additional
    # the station name sits in the sheet title, between 'TRAM' and the last 'NAM'
    station = unidecode.unidecode(df.columns[0]).upper()
    station = station[station.find('TRAM') + len('TRAM'): station.rfind('NAM')].strip()
else:
    # the station name follows the ':' in the first cell of the third row;
    # df.iloc[2, 0] is a single positional lookup (the chained df.iloc[2][0]
    # relies on the deprecated integer-key fallback of a label-indexed Series)
    station = unidecode.unidecode(df.iloc[2, 0]).upper()
    station = station[station.find(':') + 1:].strip()

# the station code carries no spaces
station = station.replace(' ', '')

station

'HUNGKHANHTRUNGB'

In [15]:
# extract year
if marker == 18:  # 2018 additional
    # the last word of the sheet title
    year = df.columns[0].split()[-1]
else:
    # third row, twelfth column; df.iloc[2, 11] is a single positional lookup
    # (the chained df.iloc[2][11] relies on the deprecated integer-key fallback
    # of a label-indexed Series)
    year = df.iloc[2, 11]

# keep only the trailing four characters and convert them to a number
year = int(year[-4:])

year

2018

In [16]:
# replace the column labels with their positions (0, 1, 2, ...)
df.columns = list(range(len(df.columns)))

df.head()

Unnamed: 0,0,1,2,3,4,...,6,7,8,9,10
0,Tháng,1,,2,,...,,4,,5,
1,Ngày,Max,Min,Max,Min,...,Min,Max,Min,Max,Min
2,1,0,0,0,0,...,0,0,0,0,0
3,2,0,0,0,0,...,0,0,0,0,0
4,3,0,0,0,0,...,0,0,0,0,0


In [17]:
# drop non-data rows: the first 2 for the 2018 additional layout, the first 5 otherwise
header_rows = 2 if marker == 18 else 5
df.drop(list(range(header_rows)), inplace=True)

df.head()

Unnamed: 0,0,1,2,3,4,...,6,7,8,9,10
2,1,0,0,0,0,...,0,0,0,0,0
3,2,0,0,0,0,...,0,0,0,0,0
4,3,0,0,0,0,...,0,0,0,0,0
5,4,0,0,0,0,...,0,0,0,0,0
6,5,0,0,0,0,...,0,0,0,0,0


In [18]:
# fold each (max, min) column pair into the max column as 'min/max' text for unpivot
for max_pos in range(1, len(df.columns), 2):
    min_text = df.iloc[:, max_pos + 1].astype('str')
    max_text = df.iloc[:, max_pos].astype('str')
    df.iloc[:, max_pos] = min_text + '/' + max_text

df.head()

Unnamed: 0,0,1,2,3,4,...,6,7,8,9,10
2,1,0/0,0,0/0,0,...,0,0/0,0,0/0,0
3,2,0/0,0,0/0,0,...,0,0/0,0,0/0,0
4,3,0/0,0,0/0,0,...,0,0/0,0,0/0,0
5,4,0/0,0,0/0,0,...,0,0/0,0,0/0,0
6,5,0/0,0,0/0,0,...,0,0/0,0,0/0,0


In [19]:
# drop even columns (min ones); their values now live in the merged odd columns
min_columns = list(range(2, len(df.columns), 2))
df.drop(columns=min_columns, inplace=True)

df.head()

Unnamed: 0,0,1,3,5,7,9
2,1,0/0,0/0,0/0,0/0,0/0
3,2,0/0,0/0,0/0,0/0,0/0
4,3,0/0,0/0,0/0,0/0,0/0
5,4,0/0,0/0,0/0,0/0,0/0
6,5,0/0,0/0,0/0,0/0,0/0


In [20]:
# unpivot from wide (one column per month) to long (one row per day and month);
# 'month' receives the former column label, 'min/max' the merged cell text
df = pd.melt(df, id_vars=[0], var_name='month', value_name='min/max')

df.head(10)

Unnamed: 0,0,month,min/max
0,1,1,0/0
1,2,1,0/0
2,3,1,0/0
3,4,1,0/0
4,5,1,0/0
5,6,1,0/0
6,7,1,0/0
7,8,1,0/0
8,9,1,0/0
9,10,1,0/0


In [21]:
# give the first column (label 0) its real name
df.rename({0: 'day'}, axis='columns', inplace=True)

df.head(10)

Unnamed: 0,day,month,min/max
0,1,1,0/0
1,2,1,0/0
2,3,1,0/0
3,4,1,0/0
4,5,1,0/0
5,6,1,0/0
6,7,1,0/0
7,8,1,0/0
8,9,1,0/0
9,10,1,0/0


In [22]:
# remove summary rows if any (rows whose 'day' holds a summary label instead of a day)
summary_labels = ['Max/min', 'Maxtháng', 'Mintháng']
df.drop(df[df['day'].isin(summary_labels)].index, errors='ignore', inplace=True)

df.tail(10)

Unnamed: 0,day,month,min/max
153,22,9,0/0
154,23,9,0/0
155,24,9,0/0
156,25,9,0/0
157,26,9,0/0
158,27,9,0/0
159,28,9,0/0
160,29,9,0/0
161,30,9,0/0
162,31,9,0/0


In [23]:
# put the station code in front as the first output column
df.insert(loc=0, column=output_columns[0], value=station)

df.head(10)

Unnamed: 0,code,day,month,min/max
0,HUNGKHANHTRUNGB,1,1,0/0
1,HUNGKHANHTRUNGB,2,1,0/0
2,HUNGKHANHTRUNGB,3,1,0/0
3,HUNGKHANHTRUNGB,4,1,0/0
4,HUNGKHANHTRUNGB,5,1,0/0
5,HUNGKHANHTRUNGB,6,1,0/0
6,HUNGKHANHTRUNGB,7,1,0/0
7,HUNGKHANHTRUNGB,8,1,0/0
8,HUNGKHANHTRUNGB,9,1,0/0
9,HUNGKHANHTRUNGB,10,1,0/0


In [24]:
# build the 'date' column row by row with calculate_date
df[output_columns[1]] = df.apply(calculate_date, axis='columns')

df.head(10)

Unnamed: 0,code,day,month,min/max,date
0,HUNGKHANHTRUNGB,1,1,0/0,2018-01-01
1,HUNGKHANHTRUNGB,2,1,0/0,2018-01-02
2,HUNGKHANHTRUNGB,3,1,0/0,2018-01-03
3,HUNGKHANHTRUNGB,4,1,0/0,2018-01-04
4,HUNGKHANHTRUNGB,5,1,0/0,2018-01-05
5,HUNGKHANHTRUNGB,6,1,0/0,2018-01-06
6,HUNGKHANHTRUNGB,7,1,0/0,2018-01-07
7,HUNGKHANHTRUNGB,8,1,0/0,2018-01-08
8,HUNGKHANHTRUNGB,9,1,0/0,2018-01-09
9,HUNGKHANHTRUNGB,10,1,0/0,2018-01-10


In [25]:
# split the merged 'min/max' text back into the min and max output columns
df[output_columns[2:4]] = df['min/max'].str.split('/', expand=True)

df.head()

Unnamed: 0,code,day,month,min/max,date,min,max
0,HUNGKHANHTRUNGB,1,1,0/0,2018-01-01,0,0
1,HUNGKHANHTRUNGB,2,1,0/0,2018-01-02,0,0
2,HUNGKHANHTRUNGB,3,1,0/0,2018-01-03,0,0
3,HUNGKHANHTRUNGB,4,1,0/0,2018-01-04,0,0
4,HUNGKHANHTRUNGB,5,1,0/0,2018-01-05,0,0


In [26]:
# drop the helper columns that are no longer needed
df.drop(columns=['day', 'month', 'min/max'], inplace=True)

df.head()

Unnamed: 0,code,date,min,max
0,HUNGKHANHTRUNGB,2018-01-01,0,0
1,HUNGKHANHTRUNGB,2018-01-02,0,0
2,HUNGKHANHTRUNGB,2018-01-03,0,0
3,HUNGKHANHTRUNGB,2018-01-04,0,0
4,HUNGKHANHTRUNGB,2018-01-05,0,0


In [27]:
# review the rows for which no valid date could be built
df.loc[df['date'].isna()]

Unnamed: 0,code,date,min,max
61,HUNGKHANHTRUNGB,NaT,,
62,HUNGKHANHTRUNGB,NaT,,
63,HUNGKHANHTRUNGB,NaT,,
129,HUNGKHANHTRUNGB,NaT,,


In [28]:
# check row counts before dropping the rows without a valid date
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 155 entries, 0 to 162
Data columns (total 4 columns):
code    155 non-null object
date    151 non-null datetime64[ns]
min     155 non-null object
max     155 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 6.1+ KB


In [29]:
# drop all rows that have invalid dates
invalid_date_rows = df.index[df['date'].isna()]
df.drop(invalid_date_rows, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151 entries, 0 to 162
Data columns (total 4 columns):
code    151 non-null object
date    151 non-null datetime64[ns]
min     151 non-null object
max     151 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 5.9+ KB


In [30]:
# review head rows after the invalid dates were removed
df.head()

Unnamed: 0,code,date,min,max
0,HUNGKHANHTRUNGB,2018-01-01,0,0
1,HUNGKHANHTRUNGB,2018-01-02,0,0
2,HUNGKHANHTRUNGB,2018-01-03,0,0
3,HUNGKHANHTRUNGB,2018-01-04,0,0
4,HUNGKHANHTRUNGB,2018-01-05,0,0


In [31]:
# review tail rows after the invalid dates were removed
df.tail()

Unnamed: 0,code,date,min,max
158,HUNGKHANHTRUNGB,2018-05-27,0,0
159,HUNGKHANHTRUNGB,2018-05-28,0,0
160,HUNGKHANHTRUNGB,2018-05-29,0,0
161,HUNGKHANHTRUNGB,2018-05-30,0,0
162,HUNGKHANHTRUNGB,2018-05-31,0,0


In [32]:
# convert min and max columns to float dtype; text that is not a number becomes NaN
for value_column in ('min', 'max'):
    df[value_column] = pd.to_numeric(df[value_column], downcast='float', errors='coerce')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151 entries, 0 to 162
Data columns (total 4 columns):
code    151 non-null object
date    151 non-null datetime64[ns]
min     151 non-null float32
max     151 non-null float32
dtypes: datetime64[ns](1), float32(2), object(1)
memory usage: 4.7+ KB


In [33]:
# keep only rows with at least 3 of the 4 columns filled out; with code and date
# present, this removes the rows where both min and max are missing
required_values = 3
df.dropna(thresh=required_values, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151 entries, 0 to 162
Data columns (total 4 columns):
code    151 non-null object
date    151 non-null datetime64[ns]
min     151 non-null float32
max     151 non-null float32
dtypes: datetime64[ns](1), float32(2), object(1)
memory usage: 4.7+ KB


In [34]:
# final look at the last rows, with min and max converted to numbers
df.tail()

Unnamed: 0,code,date,min,max
158,HUNGKHANHTRUNGB,2018-05-27,0.0,0.0
159,HUNGKHANHTRUNGB,2018-05-28,0.0,0.0
160,HUNGKHANHTRUNGB,2018-05-29,0.0,0.0
161,HUNGKHANHTRUNGB,2018-05-30,0.0,0.0
162,HUNGKHANHTRUNGB,2018-05-31,0.0,0.0


In [35]:
# save to csv file named '<year>-<station>.csv' inside the output folder, without the index
csv_file = f'{output_path}/{year}-{station}.csv'
df.to_csv(csv_file, index=False)

## Automation Code

In [36]:
# walk through every configured workbook together with its selected sheets
for file, sheet_indexes in input_files.items():
    # read excel file
    filename = f'{input_path}/{file}'
    print(f'Processing file: {filename}')

    for sheet in sheet_indexes:
        print(f'Processing sheet: {sheet}')

Processing file: ../../dataset/bentre/So Lieu Man Ben Tre 2016.xlsx
Processing sheet: 2
Processing sheet: 5
Processing file: ../../dataset/bentre/So Lieu Man Ben Tre 2017.xlsx
Processing sheet: 4
Processing file: ../../dataset/bentre/So Lieu Man Ben Tre 2018.xlsx
Processing sheet: 0
Processing sheet: 1
Processing sheet: 2
Processing sheet: 3
Processing sheet: 4
Processing sheet: 5
Processing sheet: 6
Processing sheet: 7
Processing file: ../../dataset/bentre//solieuman Do bo sung_2018/001 PHƯỚC LONG  NĂM 2018.xlsx
Processing sheet: 0
Processing file: ../../dataset/bentre//solieuman Do bo sung_2018/002 TIÊN THỦY NĂM 2018.xlsx
Processing sheet: 0
Processing file: ../../dataset/bentre//solieuman Do bo sung_2018/003 MỎ CÀY NĂM 2018.xlsx
Processing sheet: 0
Processing file: ../../dataset/bentre//solieuman Do bo sung_2018/004 TÂN THIỀNG NĂM 2018.xlsx
Processing sheet: 0
Processing file: ../../dataset/bentre//solieuman Do bo sung_2018/005 TÍCH THIỆN NĂM 2018.xlsx
Processing sheet: 0
Proces