## Run the following cells for either Experiment Code or Full Run

In [1]:
import os

import numpy as np
import pandas as pd
import unidecode

In [2]:
# Keep rendered DataFrames compact: cap the rows, columns and width shown.
display_options = {
    'display.max_rows': 10,
    'display.max_columns': 10,
    'display.width': 20,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Folder that holds the raw Excel workbooks.
input_path = '../../dataset/bentre'

# Workbook file name -> zero-based positions of the sheets to process.
input_files = dict([
    ('003. Binh Dai(2002-2015.xls', [10, 11, 12, 13]),     # 2012, 2013, 2014, 2015
    ('004. Loc Thuan (2002-2015).xls', [10, 11, 13]),      # 2012, 2013, 2015 (sheet 12 = 2014 has no data)
    ('005. An Thuan (2002-2015).xls', [10, 11, 12, 13]),   # 2012, 2013, 2014, 2015
    ('006. Son Doc (2002-2015).xls', [10, 11, 12, 13]),    # 2012, 2013, 2014, 2015
    ('007. Ben Trai (2002-2015).xls', [10, 11, 12, 13]),   # 2012, 2013, 2014, 2015
    ('So Lieu Man Ben Tre 2016.xlsx', [0, 1, 3, 4, 6, 7]),  # BDai, LThuan, AThuan, SDoc, BTrai, HgMy
])

In [4]:
# folder that receives the generated CSV files (see the "save to csv file" cell)
output_path = '../../dataset/csv/type2'

In [5]:
# Target CSV layout: station code, date, min, max, then one column per odd
# hour label (h01, h03, ..., h23).
output_columns = ['code', 'date', 'min', 'max']
output_columns += [f'h{hour:02d}' for hour in range(1, 24, 2)]

## Experiment Code

In [111]:
# Pick the workbook to experiment with: the entry at position 5 of input_files.
workbook_names = list(input_files)
file = workbook_names[5]

file

'So Lieu Man Ben Tre 2016.xlsx'

In [112]:
# sheet positions configured in input_files for the selected workbook
sheets = input_files[file]

sheets

[0, 1, 3, 4, 6, 7]

In [113]:
# read excel file
# The workbook handle is kept in `excel`; the path is echoed as the cell output.
filename = f'{input_path}/{file}'
excel = pd.ExcelFile(filename)

filename

In [114]:
# read a sheet
# sheets[5] is the sixth configured sheet position, so the selected workbook
# must list at least six sheets. dtype=str asks pandas to load cell values as
# strings instead of inferring numeric/date types.
df = pd.read_excel(excel, sheet_name=sheets[5], dtype=str)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 16 columns):
ĐÀI KTTV KHU VỰC NAM BỘ      185 non-null object
Unnamed: 1                   93 non-null object
Unnamed: 2                   93 non-null object
Unnamed: 3                   94 non-null object
Unnamed: 4                   93 non-null object
BIỂU GHI ĐỘ MẶN GIỜ (g/L)    93 non-null object
Unnamed: 6                   93 non-null object
Unnamed: 7                   93 non-null object
Unnamed: 8                   93 non-null object
Unnamed: 9                   94 non-null object
Unnamed: 10                  93 non-null object
Unnamed: 11                  93 non-null object
Unnamed: 12                  93 non-null object
Unnamed: 13                  93 non-null object
Unnamed: 14                  94 non-null object
Unnamed: 15                  93 non-null object
dtypes: object(16)
memory usage: 23.4+ KB


In [115]:
# preview the first 10 rows of the sheet as loaded
df.head(10)

Unnamed: 0,ĐÀI KTTV KHU VỰC NAM BỘ,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,...,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,ĐÀI KTTV TỈNH BẾN TRE,,,,,...,,,,,
1,,,,,,...,,,,,
2,Station: Hương Mỹ,,,River/Canal: Cổ Chiên,,...,,,,Year 2016,
3,Date/Hour,1.0,3.0,5,7.0,...,21.0,23.0,Mean,Max,Min
4,2016-01-01 00:00:00,,,,,...,,,,,
5,2016-01-02 00:00:00,2.3,2.2,2.1,2.1,...,2.2,2.1,2.0083333333333333,2.3,1.5
6,2016-01-03 00:00:00,2.6,2.6,2.5,2.6,...,2.4,2.3,2.05,2.6,1.2
7,2016-01-04 00:00:00,2.3,2.3,2.9,2.9,...,2.6,2.7,2.441666666666667,2.9,2
8,2016-01-05 00:00:00,,,,,...,,,,,
9,2016-01-06 00:00:00,,,,,...,,,,,


In [116]:
# extract marker to detect what type of sheet
# The marker is the cell at row 0, column 0. A single positional lookup is
# used instead of `df.iloc[0][0]`: the chained form indexes a Series labelled
# by column names with an integer key, which pandas has deprecated.
marker = df.iloc[0, 0]

marker

'ĐÀI KTTV TỈNH BẾN TRE'

In [117]:
# Translate the sheet's title cell into a numeric layout code.
# A missing title (NaN) identifies the 2014 layout; any other title is
# transliterated to upper-case ASCII and looked up in the table below.
if str(marker) == 'nan':  # 2014
    marker = 14
else:
    marker_text = unidecode.unidecode(marker).upper().strip()
    known_layouts = {
        'PHONG THI NGHIEM PHAN TICH MOI TRUONG KHU VUC III': 12,  # 2012, 2013
        'DAI KHI TUONG THUY VAN KHU VUC NAM BO': 15,  # 2015 (LOCTHUAN)
        'DAI KTTV TINH BEN TRE': 16,  # 2015 (BINHDAI), 2016
    }
    if marker_text not in known_layouts:
        # Fail loudly: an unknown title used to leave `marker` as a string, so
        # the sheet was silently handled by the default (non-2014) branches.
        raise ValueError(f'Unrecognized sheet marker: {marker_text!r}')
    marker = known_layouts[marker_text]

marker

16

In [118]:
# Some layouts carry extra leading rows that must go before the header block
# lines up with the other layouts.
extra_rows_by_marker = {
    12: 4,  # format 2012, 2013
    15: 2,  # format 2015 (LOCTHUAN)
}
extra_rows = extra_rows_by_marker.get(marker)
if extra_rows is not None:
    df.drop(df.index[:extra_rows], inplace=True)

df.head(10)

Unnamed: 0,ĐÀI KTTV KHU VỰC NAM BỘ,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,...,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,ĐÀI KTTV TỈNH BẾN TRE,,,,,...,,,,,
1,,,,,,...,,,,,
2,Station: Hương Mỹ,,,River/Canal: Cổ Chiên,,...,,,,Year 2016,
3,Date/Hour,1.0,3.0,5,7.0,...,21.0,23.0,Mean,Max,Min
4,2016-01-01 00:00:00,,,,,...,,,,,
5,2016-01-02 00:00:00,2.3,2.2,2.1,2.1,...,2.2,2.1,2.0083333333333333,2.3,1.5
6,2016-01-03 00:00:00,2.6,2.6,2.5,2.6,...,2.4,2.3,2.05,2.6,1.2
7,2016-01-04 00:00:00,2.3,2.3,2.9,2.9,...,2.6,2.7,2.441666666666667,2.9,2
8,2016-01-05 00:00:00,,,,,...,,,,,
9,2016-01-06 00:00:00,,,,,...,,,,,


In [119]:
# extract station and convert it to station code
# Direct (row, column) positional lookups replace `df.iloc[2][0]`, whose
# integer key on a label-indexed Series is deprecated by pandas.
if marker != 14:  # 2012, 2013, 2015, 2016: station label sits in row 2, column 0
    station = df.iloc[2, 0]
else:  # 2014: station label is the first column name
    station = df.columns[0]

# keep the text after the first ':' (the whole text when there is no ':')
station = station[station.find(':') + 1:].strip()
# transliterate to ASCII, upper-case and remove spaces
station = unidecode.unidecode(station).upper().replace(' ', '')

station

'HUONGMY'

In [120]:
# extract year
# Positional (row, column) lookups replace the chained `df.iloc[2][n]` form,
# whose integer key on a label-indexed Series is deprecated by pandas.
if marker == 14:  # 2014
    year = df.columns[13]
elif marker == 15:  # 2015 (LOCTHUAN)
    year = df.iloc[2, 11]
else:  # 2012, 2013, 2015 (BINHDAI), 2016
    year = df.iloc[2, 14]

# The year is the last four characters of the label (e.g. 'Year 2016').
# Strip first so trailing whitespace in the cell cannot shift the slice.
year = int(str(year).strip()[-4:])

year

2016

In [121]:
# Promote the header row to column names.
# 2014 sheets keep the header on the 2nd row; every other layout on the 4th.
header_position = 3 if marker != 14 else 1
df.columns = df.iloc[header_position]

df.head()

3,Date/Hour,1,3.1,5,7,...,21,23,Mean,Max,Min
0,ĐÀI KTTV TỈNH BẾN TRE,,,,,...,,,,,
1,,,,,,...,,,,,
2,Station: Hương Mỹ,,,River/Canal: Cổ Chiên,,...,,,,Year 2016,
3,Date/Hour,1.0,3.0,5,7.0,...,21.0,23.0,Mean,Max,Min
4,2016-01-01 00:00:00,,,,,...,,,,,


In [122]:
# Remove the leading rows that precede the data:
# 2 rows for the 2014 layout, 4 rows for every other layout.
non_data_rows = 4 if marker != 14 else 2
df.drop(df.index[:non_data_rows], inplace=True)

df.head()

3,Date/Hour,1,3.1,5,7,...,21,23,Mean,Max,Min
4,2016-01-01 00:00:00,,,,,...,,,,,
5,2016-01-02 00:00:00,2.3,2.2,2.1,2.1,...,2.2,2.1,2.0083333333333333,2.3,1.5
6,2016-01-03 00:00:00,2.6,2.6,2.5,2.6,...,2.4,2.3,2.05,2.6,1.2
7,2016-01-04 00:00:00,2.3,2.3,2.9,2.9,...,2.6,2.7,2.441666666666667,2.9,2.0
8,2016-01-05 00:00:00,,,,,...,,,,,


In [123]:
# drop any column named as NaN
# The column-name test is unchanged (a name whose string form is 'nan').
# The result is an explicit copy: later cells modify `df` in place (drop,
# insert, column assignment), and doing that on a frame pandas has flagged as
# a subset of another frame can emit SettingWithCopyWarning.
keep_column = [str(name) != 'nan' for name in df.columns]
df = df.loc[:, keep_column].copy()

df.head()

3,Date/Hour,1,3.1,5,7,...,21,23,Mean,Max,Min
4,2016-01-01 00:00:00,,,,,...,,,,,
5,2016-01-02 00:00:00,2.3,2.2,2.1,2.1,...,2.2,2.1,2.0083333333333333,2.3,1.5
6,2016-01-03 00:00:00,2.6,2.6,2.5,2.6,...,2.4,2.3,2.05,2.6,1.2
7,2016-01-04 00:00:00,2.3,2.3,2.9,2.9,...,2.6,2.7,2.441666666666667,2.9,2.0
8,2016-01-05 00:00:00,,,,,...,,,,,


In [124]:
# convert column names to ascii
# Names are transliterated, stripped of surrounding whitespace and lower-cased.
# The strip matches how the marker and station labels are normalised, so a
# padded header cell (e.g. 'Max ') still matches the lower-case keys used when
# columns are dropped or renamed later. str() guards against non-string labels.
df.columns = [unidecode.unidecode(str(name)).strip().lower() for name in df.columns]

df.columns

Index(['date/hour',
       '1', '3',
       '5', '7',
       '9', '11',
       '13', '15',
       '17', '19',
       '21', '23',
       'mean',
       'max',
       'min'],
      dtype='object')

In [125]:
# The average column appears under different labels ('bq', 'tb' or 'mean');
# drop whichever ones are present and ignore the rest.
mean_labels = ['bq', 'tb', 'mean']
df.drop(columns=mean_labels, errors='ignore', inplace=True)

df.head()

Unnamed: 0,date/hour,1,3,5,7,...,19,21,23,max,min
4,2016-01-01 00:00:00,,,,,...,,,,,
5,2016-01-02 00:00:00,2.3,2.2,2.1,2.1,...,1.5,2.2,2.1,2.3,1.5
6,2016-01-03 00:00:00,2.6,2.6,2.5,2.6,...,1.2,2.4,2.3,2.6,1.2
7,2016-01-04 00:00:00,2.3,2.3,2.9,2.9,...,2.4,2.6,2.7,2.9,2.0
8,2016-01-05 00:00:00,,,,,...,,,,,


In [126]:
# add station code column
# Inserts the station code as the first column, named output_columns[0];
# the same value is used for every row.
df.insert(0, output_columns[0], station)
    
df.head()

Unnamed: 0,code,date/hour,1,3,5,...,19,21,23,max,min
4,HUONGMY,2016-01-01 00:00:00,,,,...,,,,,
5,HUONGMY,2016-01-02 00:00:00,2.3,2.2,2.1,...,1.5,2.2,2.1,2.3,1.5
6,HUONGMY,2016-01-03 00:00:00,2.6,2.6,2.5,...,1.2,2.4,2.3,2.6,1.2
7,HUONGMY,2016-01-04 00:00:00,2.3,2.3,2.9,...,2.4,2.6,2.7,2.9,2.0
8,HUONGMY,2016-01-05 00:00:00,,,,...,,,,,


In [127]:
# Map the source labels onto the standard output column names.
column_renames = {
    # the date column is labelled differently depending on the layout
    'ngay/gio': output_columns[1],
    'date/hour': output_columns[1],
    'ngay': output_columns[1],  # 2015 (LOCTHUAN)
    'min': output_columns[2],
    'max': output_columns[3],
}
# hour labels '1', '3', ..., '23' map onto the remaining output columns in order
hour_labels = [str(hour) for hour in range(1, 24, 2)]
column_renames.update(zip(hour_labels, output_columns[4:]))

df.rename(columns=column_renames, errors='ignore', inplace=True)

df.head()

Unnamed: 0,code,date,h01,h03,h05,...,h19,h21,h23,max,min
4,HUONGMY,2016-01-01 00:00:00,,,,...,,,,,
5,HUONGMY,2016-01-02 00:00:00,2.3,2.2,2.1,...,1.5,2.2,2.1,2.3,1.5
6,HUONGMY,2016-01-03 00:00:00,2.6,2.6,2.5,...,1.2,2.4,2.3,2.6,1.2
7,HUONGMY,2016-01-04 00:00:00,2.3,2.3,2.9,...,2.4,2.6,2.7,2.9,2.0
8,HUONGMY,2016-01-05 00:00:00,,,,...,,,,,


In [128]:
# Discard summary and repeated-header rows, i.e. rows whose 'date' cell holds
# one of these labels instead of a real date.
non_date_labels = ['Maxtháng', 'Mintháng', 'Ngày/giờ']
non_date_rows = df.index[df['date'].isin(non_date_labels)]
df.drop(non_date_rows, errors='ignore', inplace=True)

df.loc[20:30]

Unnamed: 0,code,date,h01,h03,h05,...,h19,h21,h23,max,min
20,HUONGMY,2016-01-17 00:00:00,,,,...,,,,,
21,HUONGMY,2016-01-18 00:00:00,2.8,2.9,2,...,0.6,1.5,2.5,2.9,0.6
22,HUONGMY,2016-01-19 00:00:00,2.8,2.8,1.8,...,2.5,3.9,4.1,4.1,1.8
23,HUONGMY,2016-01-20 00:00:00,4.2,4.1,2.5,...,3.7,4.3,4.4,4.4,1.4
24,HUONGMY,2016-01-21 00:00:00,,,,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
26,HUONGMY,2016-01-23 00:00:00,3.7,3.9,2.9,...,8.6,3.4,3.2,8.6,1.1
27,HUONGMY,2016-01-24 00:00:00,4.7,4.8,5.5,...,8.8,7.4,5.3,8.8,2.1
28,HUONGMY,2016-01-25 00:00:00,4.5,4.4,6.4,...,6.9,6.7,5.6,6.9,3.4
29,HUONGMY,2016-01-26 00:00:00,,,,...,,,,,


In [129]:
# Parse the date strings, then store them back as 'YYYY-MM-DD' text
# (this drops the time-of-day part).
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates.dt.strftime('%Y-%m-%d')

df.head()

Unnamed: 0,code,date,h01,h03,h05,...,h19,h21,h23,max,min
4,HUONGMY,2016-01-01,,,,...,,,,,
5,HUONGMY,2016-01-02,2.3,2.2,2.1,...,1.5,2.2,2.1,2.3,1.5
6,HUONGMY,2016-01-03,2.6,2.6,2.5,...,1.2,2.4,2.3,2.6,1.2
7,HUONGMY,2016-01-04,2.3,2.3,2.9,...,2.4,2.6,2.7,2.9,2.0
8,HUONGMY,2016-01-05,,,,...,,,,,


In [130]:
# drop all rows that have less than 3 columns filled out
# thresh=3 keeps only rows with at least 3 non-missing values
df.dropna(thresh=3, inplace=True)

df.head()

Unnamed: 0,code,date,h01,h03,h05,...,h19,h21,h23,max,min
5,HUONGMY,2016-01-02,2.3,2.2,2.1,...,1.5,2.2,2.1,2.3,1.5
6,HUONGMY,2016-01-03,2.6,2.6,2.5,...,1.2,2.4,2.3,2.6,1.2
7,HUONGMY,2016-01-04,2.3,2.3,2.9,...,2.4,2.6,2.7,2.9,2.0
12,HUONGMY,2016-01-09,11.9,11.9,11.4,...,11.5,9.4,9.5,11.9,8.3
13,HUONGMY,2016-01-10,10.0,10.3,10.2,...,10.2,7.9,8.0,10.4,6.7


In [131]:
# reorder columns
# Select the output columns in their final order. The explicit copy makes
# `df` an independent frame; without it, pandas flags the selection as a
# subset of the previous frame, and assigning columns on it afterwards (as the
# type-correction step does) emits SettingWithCopyWarning.
df = df[output_columns].copy()

df.head()

Unnamed: 0,code,date,min,max,h01,...,h15,h17,h19,h21,h23
5,HUONGMY,2016-01-02,1.5,2.3,2.3,...,2.1,1.5,1.5,2.2,2.1
6,HUONGMY,2016-01-03,1.2,2.6,2.6,...,1.6,1.2,1.2,2.4,2.3
7,HUONGMY,2016-01-04,2.0,2.9,2.3,...,2.3,2.5,2.4,2.6,2.7
12,HUONGMY,2016-01-09,8.3,11.9,11.9,...,9.8,11.6,11.5,9.4,9.5
13,HUONGMY,2016-01-10,6.7,10.4,10.0,...,6.8,10.4,10.2,7.9,8.0


In [132]:
# Give every column its final dtype: the date column becomes datetime64[ns];
# all columns after it become floats, with blank cells ('' or ' ') first
# turned into NaN.
date_column = output_columns[1]
value_columns = output_columns[2:]

df[date_column] = df[date_column].astype('datetime64[ns]')
df[value_columns] = df[value_columns].replace(['', ' '], np.nan).astype('float')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92 entries, 5 to 184
Data columns (total 16 columns):
code    92 non-null object
date    92 non-null datetime64[ns]
min     92 non-null float64
max     92 non-null float64
h01     92 non-null float64
h03     92 non-null float64
h05     92 non-null float64
h07     92 non-null float64
h09     92 non-null float64
h11     92 non-null float64
h13     92 non-null float64
h15     92 non-null float64
h17     92 non-null float64
h19     92 non-null float64
h21     92 non-null float64
h23     92 non-null float64
dtypes: datetime64[ns](1), float64(14), object(1)
memory usage: 12.2+ KB


In [133]:
# preview the first rows of the typed frame
df.head()

Unnamed: 0,code,date,min,max,h01,...,h15,h17,h19,h21,h23
5,HUONGMY,2016-01-02,1.5,2.3,2.3,...,2.1,1.5,1.5,2.2,2.1
6,HUONGMY,2016-01-03,1.2,2.6,2.6,...,1.6,1.2,1.2,2.4,2.3
7,HUONGMY,2016-01-04,2.0,2.9,2.3,...,2.3,2.5,2.4,2.6,2.7
12,HUONGMY,2016-01-09,8.3,11.9,11.9,...,9.8,11.6,11.5,9.4,9.5
13,HUONGMY,2016-01-10,6.7,10.4,10.0,...,6.8,10.4,10.2,7.9,8.0


In [134]:
# save to csv file
# Create the output folder first: to_csv does not create missing directories
# and would otherwise fail when the folder does not exist yet.
os.makedirs(output_path, exist_ok=True)
# one file per year and station, written without the index column
df.to_csv(f'{output_path}/{year}-{station}.csv', index=False)

## Automation Code

In [135]:
# Walk every configured workbook and each of its sheet positions, reporting
# what would be processed.
for file, sheet_positions in input_files.items():
    # build the workbook path
    filename = f'{input_path}/{file}'
    print(f'Processing file: {filename}')

    for sheet in sheet_positions:
        print(f'Processing sheet: {sheet}')

Processing file: ../../dataset/bentre/003. Binh Dai(2002-2015.xls
Processing sheet: 10
Processing sheet: 11
Processing sheet: 12
Processing sheet: 13
Processing file: ../../dataset/bentre/004. Loc Thuan (2002-2015).xls
Processing sheet: 10
Processing sheet: 11
Processing sheet: 13
Processing file: ../../dataset/bentre/005. An Thuan (2002-2015).xls
Processing sheet: 10
Processing sheet: 11
Processing sheet: 12
Processing sheet: 13
Processing file: ../../dataset/bentre/006. Son Doc (2002-2015).xls
Processing sheet: 10
Processing sheet: 11
Processing sheet: 12
Processing sheet: 13
Processing file: ../../dataset/bentre/007. Ben Trai (2002-2015).xls
Processing sheet: 10
Processing sheet: 11
Processing sheet: 12
Processing sheet: 13
Processing file: ../../dataset/bentre/So Lieu Man Ben Tre 2016.xlsx
Processing sheet: 0
Processing sheet: 1
Processing sheet: 3
Processing sheet: 4
Processing sheet: 6
Processing sheet: 7
