### Header

In [None]:
# import libraries

# maths
import numpy as np
import pandas as pd
#import scipy.stats as stats
#from pandas.api.types import is_numeric_dtype

# others
import os
import re
import time
import datetime as datetime

In [None]:
# file paths (all relative, so they resolve against the current working directory)

# folder the raw train/test CSVs are read from
input_path = '../data/2_input/'
# folder the cleaned train/test CSVs are written to
clean_path = '../data/3_clean/'
# not referenced in the visible part of this notebook
output_path = '../data/4_output/'

# not referenced in the visible part of this notebook
image_path = '../images/'

### Import Data

In [None]:
# load the raw train and test sets from the input folder
train = pd.read_csv(os.path.join(input_path, 'train.csv'))
test = pd.read_csv(os.path.join(input_path, 'test.csv'))

### Inspect Data

In [None]:
# preview the first rows of the training data as loaded from CSV
train.head()

In [None]:
# preview the first rows of the test data as loaded from CSV
test.head()

### Clean Data

In [None]:
def create_yr(x):
    """Return the first '-'-separated field of ``x``.

    For a 'YYYY-MM-DD'-style string this is the year, returned as a string.
    """
    year, *_ = x.split('-')
    return year

def create_mth(x):
    """Return the second '-'-separated field of ``x``.

    For a 'YYYY-MM-DD'-style string this is the month, returned as a string
    (leading zeros are kept). Raises IndexError if ``x`` contains no '-'.
    """
    fields = x.split('-')
    return fields[1]

def create_day(x):
    """Return the third '-'-separated field of ``x``.

    For a 'YYYY-MM-DD'-style string this is the day, returned as a string
    (leading zeros are kept). Raises IndexError if ``x`` has fewer than
    three fields.
    """
    fields = x.split('-')
    return fields[2]

def rename_columns(columns):
    """Return a new list holding the lower-cased form of every name in ``columns``."""
    return [name.lower() for name in columns]

def clean_data(df):
    """Split the date into parts, drop unused columns and lower-case the headers.

    The frame is modified in place and the same object is returned.

    Steps:
      * add 'year', 'month' and 'day' columns derived from the 'Date' column
      * drop the address columns and the original 'Date' column
      * lower-case every column name
    """
    # new column name -> helper that extracts that part from a date string
    date_parts = {'year': create_yr, 'month': create_mth, 'day': create_day}
    for part_name, extract in date_parts.items():
        df[part_name] = df['Date'].apply(extract)

    unused_columns = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'Date']
    df.drop(columns=unused_columns, inplace=True)

    df.columns = rename_columns(df.columns)

    return df

# clean both data sets; clean_data modifies the frame it is given and returns it
train = clean_data(train)
test = clean_data(test)

In [None]:
# merge nummosquitos and wnvpresent for duplicated rows

# create train_2 (new copy)
# to compare train and train_2 and verify code
train_2 = train.copy()

# duplicated rows have the same values for the columns below
cols = ['species','trap','year','month', 'day','latitude','longitude']

# columns whose values are added up across each group of duplicated rows
sum_cols = ['nummosquitos', 'wnvpresent']

# Give every row the total of its duplicate group.
# Grouping on the key columns (instead of comparing each row with the one
# before it) also merges duplicates that are not adjacent, so no counts are
# lost when the extra rows are dropped below, and it does not depend on the
# index being 0..n-1.
# dropna=False keeps rows with a missing key value grouped together, which
# matches how drop_duplicates treats them.
train_2[sum_cols] = (
    train_2
    .groupby(cols, sort=False, dropna=False)[sum_cols]
    .transform('sum')
)

# remove duplicated rows (keep only last row, which holds the group total)
train_2.drop_duplicates(subset=cols, keep='last',inplace=True)

In [None]:
# check total nummosquitos and wnvpresent
# the totals printed for train and train_2 should match if merging lost nothing

for frame in (train, train_2):
    print(frame['nummosquitos'].sum())
    print(frame['wnvpresent'].sum())
    print('')

# distinct wnvpresent values after merging
print(train_2['wnvpresent'].unique())

In [None]:
# convert wnvpresent: total count to 0 or 1

# summing duplicated rows can leave wnvpresent above 1; cap it at 1 so the
# column is a presence flag again (values of 0 and 1 are left unchanged)
train_2['wnvpresent'] = train_2['wnvpresent'].clip(upper=1)

print(train_2['wnvpresent'].sum())
print(train_2['wnvpresent'].unique())

### Output Data

In [None]:
# preview the first rows of the de-duplicated training data
train_2.head()

In [None]:
# preview the first rows of the cleaned test data
test.head()

In [None]:
# write the cleaned data sets to the clean folder, without the index column
train_2.to_csv(os.path.join(clean_path, 'train_clean.csv'), index=False)
test.to_csv(os.path.join(clean_path, 'test_clean.csv'), index=False)