# L0 data clean

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import imp
import datetime
import pytz

In [2]:
#input path of new file to clean
#time period should overlap with that of previous file; no gaps.
Glacier= 'Wolverine'
Station= '990'
file= "wolverine990_2017_0911.txt"
folder=r"Q:\Project Data\GlacierData\Benchmark_Program\Data\Wolverine\2017\Wx"
pth=os.path.join(folder, file)

In [3]:
if Glacier + Station== r'Wolverine990':
    header_rws=2 #number of header rows to not read in
    col_nms=["Date", "Time", "Instrument", "Value", "Unit", "Flag"]
    
if Glacier + Station==r'wolverine1420':
    header_rws=4 #number of header rows to not read in
    #Grab correct column names (a few rows up in Campbell Logger output)
    coltable=pd.read_csv(pth, header=1)
    col_nms=coltable.columns

In [17]:
#Read in data
dat=pd.read_csv(pth, header=header_rws)
dat.columns= col_nms #Name columns

In [18]:
#Set constants
date_format='%m/%d/%Y %H:%M:%S'
timezone='America/Anchorage' #choose from pytz.all_timezones

In [19]:
dat.head()

Unnamed: 0,Date,Time,Instrument,Value,Unit,Flag
0,10/01/2016,00:00:00,WD,140.0,degrees,G
1,10/01/2016,00:00:00,WS,0.7,m/s,G
2,10/01/2016,00:00:00,BV,12.9,V,G
3,10/01/2016,00:00:00,Tinternal,3.8,C,G
4,10/01/2016,00:00:00,PC,0.2258,m,G


In [20]:
for col in list(dat):
    print ("stripping whitespace from " + col) #print column name    
    if type(dat[col].values[0]).__name__ =='str': #if column is a string, strip white space
        dat[col]=dat[col].str.strip()

stripping whitespace from Date
stripping whitespace from Time
stripping whitespace from Instrument
stripping whitespace from Value
stripping whitespace from Unit
stripping whitespace from Flag


In [21]:
#Fix time

#Create date-time column
dat['time']=dat.Date+ " " + dat.Time
dat.loc[:,'time']=pd.to_datetime(dat['time'], format=date_format) #set to date-time from string
dat['index']=dat.time.copy()
#Correct for time slips
dat['time']=dat['time'].dt.round('15min') #round time to the nearest 15 minute value
#Convert to UTC time
dat['UTC_col']=dat.time + datetime.timedelta(hours=8)
dat['UTC_col'].timezone='UTC'
#Create column for local time
local_timezone=pytz.timezone(timezone) #create local timezone object (e.g. AK time)

In [22]:
dat.head()

Unnamed: 0,Date,Time,Instrument,Value,Unit,Flag,time,index,UTC_col
0,10/01/2016,00:00:00,WD,140.0,degrees,G,2016-10-01,2016-10-01,2016-10-01 08:00:00
1,10/01/2016,00:00:00,WS,0.7,m/s,G,2016-10-01,2016-10-01,2016-10-01 08:00:00
2,10/01/2016,00:00:00,BV,12.9,V,G,2016-10-01,2016-10-01,2016-10-01 08:00:00
3,10/01/2016,00:00:00,Tinternal,3.8,C,G,2016-10-01,2016-10-01,2016-10-01 08:00:00
4,10/01/2016,00:00:00,PC,0.2258,m,G,2016-10-01,2016-10-01,2016-10-01 08:00:00


In [None]:
dat=dat.set_index('UTC_col')
#Create column for local time
local_timezone=pytz.timezone(timezone) #create local timezone object (e.g. AK time)

In [30]:
dat['local_time'] = dat.index.tz_localize('UTC').tz_convert(local_timezone)

In [31]:
dat

Unnamed: 0_level_0,Date,Time,Instrument,Value,Unit,Flag,time,index,local_time
UTC_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-10-01 08:00:00,10/01/2016,00:00:00,WD,140.0000,degrees,G,2016-10-01 00:00:00,2016-10-01 00:00:00,2016-10-01 00:00:00-08:00
2016-10-01 08:00:00,10/01/2016,00:00:00,WS,0.7000,m/s,G,2016-10-01 00:00:00,2016-10-01 00:00:00,2016-10-01 00:00:00-08:00
2016-10-01 08:00:00,10/01/2016,00:00:00,BV,12.9000,V,G,2016-10-01 00:00:00,2016-10-01 00:00:00,2016-10-01 00:00:00-08:00
2016-10-01 08:00:00,10/01/2016,00:00:00,Tinternal,3.8000,C,G,2016-10-01 00:00:00,2016-10-01 00:00:00,2016-10-01 00:00:00-08:00
2016-10-01 08:00:00,10/01/2016,00:00:00,PC,0.2258,m,G,2016-10-01 00:00:00,2016-10-01 00:00:00,2016-10-01 00:00:00-08:00
2016-10-01 08:00:00,10/01/2016,00:00:00,RH,26.0000,%,G,2016-10-01 00:00:00,2016-10-01 00:00:00,2016-10-01 00:00:00-08:00
2016-10-01 08:00:00,10/01/2016,00:00:00,AT,5.9900,C,G,2016-10-01 00:00:00,2016-10-01 00:00:00,2016-10-01 00:00:00-08:00
2016-10-01 08:00:00,10/01/2016,00:00:00,AT2,5.9000,C,G,2016-10-01 00:00:00,2016-10-01 00:00:00,2016-10-01 00:00:00-08:00
2016-10-01 08:15:00,10/01/2016,00:14:00,T,23.3000,C,G,2016-10-01 00:15:00,2016-10-01 00:14:00,2016-10-01 00:15:00-08:00
2016-10-01 08:15:00,10/01/2016,00:15:00,WSG,1.5000,m/s,G,2016-10-01 00:15:00,2016-10-01 00:15:00,2016-10-01 00:15:00-08:00


In [104]:
dat.UTC

AttributeError: 'DataFrame' object has no attribute 'UTC'

In [67]:
#Correct time slips:
for recorded_minute in pd.unique(dat.index.minute):
    print (recorded_minute)
    if recorded_minute in [0,15,30,45]:
        continue
    else:
        new_min

0
14
15
29
30
44
45
59


array([ 0, 14, 15, 29, 30, 44, 45, 59], dtype=int64)