# Imports

In [1]:
import pandas as pd
import numpy as np
import pathlib
import calendar
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Data Connection

In [None]:
from sqlalchemy import create_engine

database_filename = 'tdpdata.db'
table_name = 'tdpsheet'
engine = create_engine('sqlite:///{}'.format(database_filename)) 

In [None]:
tester = engine.execute("SELECT SITE_NUMBER, OBSERVATION_TIME, TMR_SUB_18 FROM tdpsheet WHERE SITE_NUMBER is '128' LIMIT 5")

for result in tester:
    print(result)

print(engine.table_names())

# Data Loading

In [None]:
query = "SELECT SITE_NUMBER, OBSERVATION_TIME, TMR_SUB_18 FROM tdpsheet WHERE SITE_NUMBER is '128'"

In [None]:
df_sql_test = pd.read_sql(query, engine, parse_dates='OBSERVATION_TIME')

In [None]:
df_copy_sql = df_sql_test.reset_index(drop=True)

df_copy_sql.reset_index(inplace=True, drop=True)

df_copy_sql

In [None]:
df_copy_sql.info()

# Reduce and Specify Dataframe feature/values

In [None]:
df_128_all = df_copy_sql.loc[df_copy_sql['SITE_NUMBER']=='128']

df_128_all_18 = df_128_all[['SITE_NUMBER','OBSERVATION_TIME','TMR_SUB_18']]

df_128_all_18.head()

# DateTime

In [None]:
# df_128_all_18.OBSERVATION_TIME = pd.to_datetime(df_128_all_18['OBSERVATION_TIME'], 
                                                format="%Y-%m-%d-%H-%M-%S")

In [5]:
def datetime_parse(df):

    df['year'] = df.OBSERVATION_TIME.dt.year
    df['month'] = df.OBSERVATION_TIME.dt.month
    
    df['day'] = df.OBSERVATION_TIME.dt.day
    df['hour'] = df.OBSERVATION_TIME.dt.hour
    
    return df

In [6]:
df_128 = datetime_parse(df_128)

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
df_128_all_18.dtypes

# DataFrame Pickle

In [None]:
# filename = "df_128_all_18.pkl"
filename = "df_128_all_years_18.pkl"
df_128_all_18.to_pickle(filename)

In [None]:
filename = "df_128_all_years_18.pkl"
df_128_all_18 = pd.read_pickle(filename)

In [None]:
filename = "df_128_all_years_18_clean.pkl" # this is a cleaned DF with nulls
df_clean = pd.read_pickle(filename)

In [2]:
filename = "df_128.pkl" # this is a cleaned DF with nulls
df_128 = pd.read_pickle(filename)

df_128.sort_values('OBSERVATION_TIME', inplace=True)

# drop unnecessary rows
df_128 = df_128.drop(['REF_TEMP', 'IN_PAVEMENT_TEMP', 'INTERNAL_TEMP', 'BATTERY_VOLTAGE'], axis=1)

In [None]:
df_128.isnull().sum()

# Create 3/5 Test dataframe

In [None]:
df_35 = df_128_all_18

In [None]:
df_35_test = df_35.loc[df_35.year.between(2013, 2017)]

# 24 hours rule

## less_than_24_hours

In [None]:
df_reindex.head()

In [None]:
df_index = df_reindex.set_index(['year','month','day']).sort_values('hour') 

In [None]:
df_index.head()

In [None]:
df_group = df_reindex.groupby(['year', 'month', 'day'])['hour'].count()

In [None]:
df_group

In [9]:
def less_than_24_hours(df):
    
    """
    this function determines if there is are less than 24 hours in a day of a month/year groupby object,
    then creates a hour_flag column and identifies it as flagged with a "1" value.
    
    This function involves groupby objects, masking, reindexing, and returns a new DataFrame.
    
    Argument:
    df: DataFrame with OBSERVATION_TIME, year, month, day columns
    
    Output:
    df_new: New DataFrame with similar design as original, but with addition of hours_flag column
    
    """
    print(df.OBSERVATION_TIME.isnull().sum())
    print(df.shape)
    df_group = df.groupby(['year', 'month', 'day'])['hour'].count() # gets the count of hours in the day as a series
    
    df_less = df_group == 24 # boolean test for 24 hours, produces series of boolean True/False
    
    # makes a list of values that are false and gets their index
    less_list = df_less.loc[df_less==False].index.values.tolist() 
    
    
    # creates new dataframe with a new index matching a groupby
    df_index = df.set_index(['year','month','day']).sort_values('hour') 
    
    df_index['hours_flag'] = 0 # creates new column for flagging records
    
    # any row index that matches the loc mask "less_list" gets its' hours_flag value set to "1"
    df_index.loc[less_list,'hours_flag'] = 1
    # df_index.loc[less_list,'OBSERVATION_TIME'] = np.nan # need to keep value until later
    
    print(df.OBSERVATION_TIME.isnull().sum())
    
    #create new dataframe with regular index based upon time
    df_new = df_index.reset_index().sort_values(['OBSERVATION_TIME'])
    
    print(df_new.shape)
    
    return df_new

In [10]:
df_128_24 = less_than_24_hours(df_128)

0
(110581, 23)
0
(110581, 24)


In [None]:
df_clean = df_clean.dropna(axis=0)

In [None]:
df_clean = less_than_24_hours(df_clean)

In [None]:
df_128.loc[df_128['hours_flag']==1].head()

In [None]:
df_128.loc[df_128['hours_flag']==1].count()

In [None]:
reindex_df_test = reindex_df_test[reindex_df_test.hours_flag!=1] # this is one way to drop flagged records

In [None]:
df_128 = df_128[df_128.hours_flag!=1] # this is one way to drop flagged records

In [None]:
df_clean.loc[df_clean['hours_flag']==1].count()

# 5 Total Rule

## 5 Missing Day Rule For Loop

In [None]:
print(mm)
print(yy)
print((days_month.loc[(days_month['month']==mm) & (days_month['year']==yy)]['day'].values))
print(calendar.monthlen(yy,mm))

In [11]:
## Calendar Builder
## throws a warning about truth ambiguous for empty array. i believe it's for months that don't exist in the data.
## will need to develop logic for [arrray.size > 0] to avoid generating a future error. 

def more_than_5_total(df):
    
    days_month = df.groupby(['year','month'])['day'].nunique().reset_index()
   
    df['days_5_total_flag'] = 0
    # get the values used to created calendar
    days_arrays = days_month[['year', 'month']]
    
    # for loop to create calendar for use with calendar.monthlen
    for years in range(len(days_arrays.year.unique())):
        # print(days_arrays.loc[years, 'year'])
        yy = days_arrays.loc[days_arrays.year==days_arrays.year.unique()[years], 'year'].min()
        # print(yy)
        for months in range(len(days_arrays.month.unique())):
            # print(days_arrays.loc[months, 'month'])
            mm = days_arrays.loc[months, 'month'].max()
            # print(mm)
            # print(days_arrays.monthlen(yy,mm)-5)
            if (days_month.loc[(days_month['month']==mm) & (days_month['year']==yy)]['day'].values) > (calendar.monthlen(yy,mm)-5):
                # print("good")
                df.loc[(df.month==mm) & (df.year==yy), 'days_5_total_flag'] = 0
            else:
                df.loc[(df.month==mm) & (df.year==yy), 'days_5_total_flag'] = 1
                # df.loc[(df.month==mm) & (df.year==yy), 'OBSERVATION_TIME'] = np.nan
    
    return df

In [12]:
df_128_24_5 = more_than_5_total(df_128_24)



In [None]:
df_128.loc[df_128.days_5_total_flag==1].count()

In [None]:
df_clean.shape

In [None]:
# df_tester = df_tester[df_tester.days_5_total_flag!=1] # this is one way to drop flagged records

In [None]:
df_tester.shape

# 3 consecutive days rule

In [None]:
df_tester_copy = df_tester.sort_values('OBSERVATION_TIME').copy()

https://stackoverflow.com/questions/50809594/remove-all-rows-in-pandas-dataframe-with-n-or-more-consecutive-nans

There's an issue between various influences of the state of a missed/flagged/null date, 
may need to employ this cell below to check for **ALL** missing dates 

https://stackoverflow.com/questions/52044348/check-for-any-missing-dates-in-the-index

https://stackoverflow.com/questions/19324453/add-missing-dates-to-pandas-dataframe

In [None]:
# TODO
# have to build a function to preprocess df_128_24_5 
# have to complete index date_range then reassemble the dataframe to include all dates including missing dates
# then have to change OBSERVATION_TIME to np.nan (NaT) whenever the flag columns have a value of 1
# then consecutive_3_days function needs to run for anything less than 73 of nan's in the OBSERVATION_TIME Column

# WHERE YOU LEFT OFF
# you are building the function with df_128_24_5 in mind, the cell 2 below (1 below df_128_24_5.head())
# this needs to be turned into a function which also converts the flags to np.nans

In [13]:
df_128_24_5.head()

Unnamed: 0,year,month,day,SITE_NUMBER,OBSERVATION_TIME,AMBIENT_AIR_TEMP,TMR_PAV,TMR_SUB_0,TMR_SUB_3,TMR_SUB_6,TMR_SUB_9,TMR_SUB_12,TMR_SUB_18,TMR_SUB_24,TMR_SUB_30,TMR_SUB_36,TMR_SUB_42,TMR_SUB_48,TMR_SUB_54,TMR_SUB_60,TMR_SUB_66,TMR_SUB_72,hour,hours_flag,days_5_total_flag
12382,2003,1,31,128,2003-01-31 02:00:00+00:00,31.31,31.93,31.85,31.57,31.07,31.07,31.0,30.72,30.86,31.43,31.43,31.92,33.03,33.73,34.41,34.88,35.02,2,1,1
18376,2003,1,31,128,2003-01-31 03:00:00+00:00,34.11,31.37,32.07,32.14,31.51,30.94,31.23,30.8,30.8,31.09,31.51,32.07,32.42,33.6,34.08,34.76,35.1,3,1,1
20712,2003,1,31,128,2003-01-31 04:00:00+00:00,32.83,30.87,31.79,31.58,31.3,31.16,30.87,30.52,30.66,30.94,31.51,32.14,32.21,33.74,34.22,34.63,35.3,4,1,1
27574,2003,1,31,128,2003-01-31 05:00:00+00:00,32.47,31.37,32.15,31.79,31.51,31.37,31.02,30.8,30.8,31.02,31.23,31.73,32.5,33.54,34.29,34.83,35.3,5,1,1
28052,2003,1,31,128,2003-01-31 06:00:00+00:00,33.89,31.38,32.01,31.8,31.73,31.37,30.8,30.52,30.59,30.87,31.8,32.15,32.64,33.61,34.29,34.83,35.17,6,1,1


In [15]:
idx = pd.date_range('2003-01-31', 
              '2017-12-31', freq="H", tz='UTC')

df_128_24_5.index = pd.DatetimeIndex(df_128_24_5.OBSERVATION_TIME)

df_reindex_test = df_128_24_5.reindex(idx, fill_value=np.nan)
df_reindex_test.head()

Unnamed: 0,year,month,day,SITE_NUMBER,OBSERVATION_TIME,AMBIENT_AIR_TEMP,TMR_PAV,TMR_SUB_0,TMR_SUB_3,TMR_SUB_6,TMR_SUB_9,TMR_SUB_12,TMR_SUB_18,TMR_SUB_24,TMR_SUB_30,TMR_SUB_36,TMR_SUB_42,TMR_SUB_48,TMR_SUB_54,TMR_SUB_60,TMR_SUB_66,TMR_SUB_72,hour,hours_flag,days_5_total_flag
2003-01-31 00:00:00+00:00,,,,,NaT,,,,,,,,,,,,,,,,,,,,
2003-01-31 01:00:00+00:00,,,,,NaT,,,,,,,,,,,,,,,,,,,,
2003-01-31 02:00:00+00:00,2003.0,1.0,31.0,128.0,2003-01-31 02:00:00+00:00,31.31,31.93,31.85,31.57,31.07,31.07,31.0,30.72,30.86,31.43,31.43,31.92,33.03,33.73,34.41,34.88,35.02,2.0,1.0,1.0
2003-01-31 03:00:00+00:00,2003.0,1.0,31.0,128.0,2003-01-31 03:00:00+00:00,34.11,31.37,32.07,32.14,31.51,30.94,31.23,30.8,30.8,31.09,31.51,32.07,32.42,33.6,34.08,34.76,35.1,3.0,1.0,1.0
2003-01-31 04:00:00+00:00,2003.0,1.0,31.0,128.0,2003-01-31 04:00:00+00:00,32.83,30.87,31.79,31.58,31.3,31.16,30.87,30.52,30.66,30.94,31.51,32.14,32.21,33.74,34.22,34.63,35.3,4.0,1.0,1.0


In [19]:
df_reindex_test.loc[(df_reindex_test.hours_flag==1) | (df_reindex_test.days_5_total_flag==1), 'OBSERVATION_TIME'] = np.nan

In [None]:
time_missing = pd.date_range(df_128.OBSERVATION_TIME.min(), 
              df_128.OBSERVATION_TIME.max()).difference(df_128.OBSERVATION_TIME)

In [None]:
type(time_missing)

In [None]:
time_missing = time_missing.strftime('%Y-%m-%d')

In [None]:
time_missing = pd.Series(time_missing)

In [None]:
time_missing.head()

In [None]:
df_reindex = df_128.reindex(pd.date_range('2003-01-31', 
              '2017-12-31', freq="H"))

In [None]:
df_128.groupby(['SITE_NUMBER','year','month'])['day'].nunique().reset_index()

In [None]:
df_128.groupby(['SITE_NUMBER','year','month', 'day']).count()

In [None]:
df_128.loc[(df_128['year']==2004) & (df_128['month']==2) & (df_128['day'].between(23,26))]

In [None]:
df_128.sort_values('OBSERVATION_TIME', inplace=True)

In [None]:
mask_zero = df_reindex.OBSERVATION_TIME.notna()

In [None]:
shift_sum = mask_zero.ne(mask_zero.shift()).cumsum() # everytime there is a difference in boolean value, it increments the count.

In [None]:
shift_sum.values

In [None]:
shift_sum.groupby(shift_sum.values).transform('size').unique()

In [23]:
def consecutive_3_days(df, sort_by_time=False):
    """
    Function takes a dataframe and looks for consecutive flagged days by using amask, ne(), shift(), and cumsum()
    the mask is made from non-flagged values ("O") in the "days_5_total_flag" column.
    Then used the ne() function to generate boolean values based on inequality between the following row (with shift()).
    It keeps a running count using cumsum().
    This sum of changes in the Boolean values is then grouped and filtered with transform for anything less than 3
    OR in the mask of non-flagged values.
    Records that groups of more than 3 and not in the mask are filtered out and not in the new Dataframe
    this function is preceded by the "more_than_5_total" function as it requires "days_5_total_flag" column.
    
    Arguments:
    df: DataFrame with a days_5_total_flag column
    
    Output:
    df: New DataFrame without the filtered values 
    """
    
    # pass argument to sort or pass through
    if sort_by_time is True:
        
        df = df.sort_values('OBSERVATION_TIME').copy() # dataframe must be sorted to perform
        
    print(df.shape)
    # creates a mask of non-flagged values (any row that doesn't have a 1 in the days_5_total_flag column)
    mask_zero = df.OBSERVATION_TIME.notna()
    
    # this uses the mask, and searches for a "NOT EQUAL" (ne) value below it using shift() which produces a boolean
    shift_sum = mask_zero.ne(mask_zero.shift()).cumsum() # everytime there is a difference in boolean value, it increments the count.
    
    # Filters the dataframe but grouping the cumulitve sum in "a" and getting the size.
    # if the size of a "False" group is less than 3 OR listed in mask variable, it remains and is turned into new DF
    df = df[(shift_sum.groupby(shift_sum.values).transform('size') < 72) | mask_zero]
    print(df.shape)
    
    return df

In [36]:
df = consecutive_3_days(df_reindex_test, sort_by_time=True)

(130753, 25)
(107497, 25)


In [None]:
df.head()

In [26]:
df.dropna(subset=['TMR_SUB_0', 'TMR_SUB_3', 'TMR_SUB_6', 'TMR_SUB_9', 'TMR_SUB_12',
       'TMR_SUB_18', 'TMR_SUB_24', 'TMR_SUB_30', 'TMR_SUB_36', 'TMR_SUB_42',
       'TMR_SUB_48', 'TMR_SUB_54', 'TMR_SUB_60', 'TMR_SUB_66', 'TMR_SUB_72'], inplace=True, axis=0)

In [25]:
df.isnull().sum()

year                    0
month                   0
day                     0
SITE_NUMBER             0
OBSERVATION_TIME        0
AMBIENT_AIR_TEMP     1952
TMR_PAV              4191
TMR_SUB_0              57
TMR_SUB_3             114
TMR_SUB_6             158
TMR_SUB_9             189
TMR_SUB_12            160
TMR_SUB_18            127
TMR_SUB_24            128
TMR_SUB_30             99
TMR_SUB_36            111
TMR_SUB_42             84
TMR_SUB_48             28
TMR_SUB_54             26
TMR_SUB_60            100
TMR_SUB_66            143
TMR_SUB_72              1
hour                    0
hours_flag              0
days_5_total_flag       0
dtype: int64

In [27]:
df.isnull().sum()

year                    0
month                   0
day                     0
SITE_NUMBER             0
OBSERVATION_TIME        0
AMBIENT_AIR_TEMP     1929
TMR_PAV              4137
TMR_SUB_0               0
TMR_SUB_3               0
TMR_SUB_6               0
TMR_SUB_9               0
TMR_SUB_12              0
TMR_SUB_18              0
TMR_SUB_24              0
TMR_SUB_30              0
TMR_SUB_36              0
TMR_SUB_42              0
TMR_SUB_48              0
TMR_SUB_54              0
TMR_SUB_60              0
TMR_SUB_66              0
TMR_SUB_72              0
hour                    0
hours_flag              0
days_5_total_flag       0
dtype: int64

In [48]:
df.groupby(['year','month'])['day'].nunique().index[1][0]

2003.0

In [41]:
df.loc[(df.year==2004) & (df.month==6) & (df.day.between(24,29))]

Unnamed: 0,year,month,day,SITE_NUMBER,OBSERVATION_TIME,AMBIENT_AIR_TEMP,TMR_PAV,TMR_SUB_0,TMR_SUB_3,TMR_SUB_6,TMR_SUB_9,TMR_SUB_12,TMR_SUB_18,TMR_SUB_24,TMR_SUB_30,TMR_SUB_36,TMR_SUB_42,TMR_SUB_48,TMR_SUB_54,TMR_SUB_60,TMR_SUB_66,TMR_SUB_72,hour,hours_flag,days_5_total_flag
2004-06-24 00:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 00:00:00+00:00,77.5,109.2,90.3,83.7,79.7,77.5,75.8,73.4,70.7,67.04,63.92,61.15,58.07,55.79,53.62,52.64,51.86,0.0,0.0,0.0
2004-06-24 01:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 01:00:00+00:00,75.5,109.0,92.2,85.3,80.9,78.2,75.9,73.2,70.5,67.19,64.15,61.03,57.9,55.82,53.67,52.68,51.74,1.0,0.0,0.0
2004-06-24 02:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 02:00:00+00:00,75.2,106.4,93.5,86.8,81.9,78.8,76.3,73.3,70.7,67.38,63.95,61.08,58.06,55.98,53.88,52.57,51.68,2.0,0.0,0.0
2004-06-24 03:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 03:00:00+00:00,73.4,103.1,94.0,87.9,82.9,79.5,76.8,73.5,70.6,67.43,64.15,61.19,58.06,55.93,53.88,52.68,51.57,3.0,0.0,0.0
2004-06-24 04:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 04:00:00+00:00,68.88,99.1,93.6,88.4,83.6,80.3,77.3,73.5,70.5,67.23,64.31,61.34,58.22,55.77,54.05,52.68,51.57,4.0,0.0,0.0
2004-06-24 05:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 05:00:00+00:00,67.82,93.4,92.8,88.5,84.4,80.9,77.6,73.6,70.6,67.51,64.22,61.2,58.28,56.05,53.96,53.02,51.87,5.0,0.0,0.0
2004-06-24 06:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 06:00:00+00:00,63.57,88.6,91.2,88.2,84.8,81.4,77.9,73.8,70.8,67.72,64.19,61.37,58.35,55.96,53.86,52.76,51.6,6.0,0.0,0.0
2004-06-24 07:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 07:00:00+00:00,58.71,82.7,89.3,87.8,84.8,81.6,78.5,74.1,70.9,67.65,64.02,61.2,58.27,55.95,54.16,52.62,51.63,7.0,0.0,0.0
2004-06-24 08:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 08:00:00+00:00,54.65,78.5,87.1,86.9,84.7,82.0,78.7,74.2,71.0,67.67,64.25,61.27,58.25,56.07,53.96,52.7,51.38,8.0,0.0,0.0
2004-06-24 09:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 09:00:00+00:00,51.83,75.5,84.9,85.6,84.3,81.9,78.9,74.4,70.9,67.55,64.47,61.55,58.32,56.09,53.93,52.73,51.84,9.0,0.0,0.0


In [42]:
.loc[(df_reindex_test.year==2004)
                    & (df_reindex_test.month==6)
                    & (df_reindex_test.day.between(24,29))]

Unnamed: 0,year,month,day,SITE_NUMBER,OBSERVATION_TIME,AMBIENT_AIR_TEMP,TMR_PAV,TMR_SUB_0,TMR_SUB_3,TMR_SUB_6,TMR_SUB_9,TMR_SUB_12,TMR_SUB_18,TMR_SUB_24,TMR_SUB_30,TMR_SUB_36,TMR_SUB_42,TMR_SUB_48,TMR_SUB_54,TMR_SUB_60,TMR_SUB_66,TMR_SUB_72,hour,hours_flag,days_5_total_flag
2004-06-24 00:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 00:00:00+00:00,77.5,109.2,90.3,83.7,79.7,77.5,75.8,73.4,70.7,67.04,63.92,61.15,58.07,55.79,53.62,52.64,51.86,0.0,0.0,0.0
2004-06-24 01:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 01:00:00+00:00,75.5,109.0,92.2,85.3,80.9,78.2,75.9,73.2,70.5,67.19,64.15,61.03,57.9,55.82,53.67,52.68,51.74,1.0,0.0,0.0
2004-06-24 02:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 02:00:00+00:00,75.2,106.4,93.5,86.8,81.9,78.8,76.3,73.3,70.7,67.38,63.95,61.08,58.06,55.98,53.88,52.57,51.68,2.0,0.0,0.0
2004-06-24 03:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 03:00:00+00:00,73.4,103.1,94.0,87.9,82.9,79.5,76.8,73.5,70.6,67.43,64.15,61.19,58.06,55.93,53.88,52.68,51.57,3.0,0.0,0.0
2004-06-24 04:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 04:00:00+00:00,68.88,99.1,93.6,88.4,83.6,80.3,77.3,73.5,70.5,67.23,64.31,61.34,58.22,55.77,54.05,52.68,51.57,4.0,0.0,0.0
2004-06-24 05:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 05:00:00+00:00,67.82,93.4,92.8,88.5,84.4,80.9,77.6,73.6,70.6,67.51,64.22,61.2,58.28,56.05,53.96,53.02,51.87,5.0,0.0,0.0
2004-06-24 06:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 06:00:00+00:00,63.57,88.6,91.2,88.2,84.8,81.4,77.9,73.8,70.8,67.72,64.19,61.37,58.35,55.96,53.86,52.76,51.6,6.0,0.0,0.0
2004-06-24 07:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 07:00:00+00:00,58.71,82.7,89.3,87.8,84.8,81.6,78.5,74.1,70.9,67.65,64.02,61.2,58.27,55.95,54.16,52.62,51.63,7.0,0.0,0.0
2004-06-24 08:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 08:00:00+00:00,54.65,78.5,87.1,86.9,84.7,82.0,78.7,74.2,71.0,67.67,64.25,61.27,58.25,56.07,53.96,52.7,51.38,8.0,0.0,0.0
2004-06-24 09:00:00+00:00,2004.0,6.0,24.0,128,2004-06-24 09:00:00+00:00,51.83,75.5,84.9,85.6,84.3,81.9,78.9,74.4,70.9,67.55,64.47,61.55,58.32,56.09,53.93,52.73,51.84,9.0,0.0,0.0


___

In [32]:
df_group = df.groupby(['SITE_NUMBER','year','month']).mean().reset_index()

In [33]:
df_group.pivot(index='year', columns=['month'], values='TMR_SUB_18').style.highlight_max(axis=0, color='green').highlight_min(axis=0, color='yellow')

month,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2003.0,,31.458063,28.068786,37.556828,54.594704,60.535245,69.492305,64.194831,54.693643,43.268964,32.455594,24.624168
2004.0,15.218147,25.409672,29.634543,36.636785,56.24789,64.19618,70.976798,70.338643,55.608588,42.079959,32.98,30.094252
2005.0,21.660611,21.185732,30.460412,41.443023,,64.029673,67.733014,65.520627,55.117581,41.748812,27.219711,26.564706
2006.0,19.304992,21.580075,23.607803,37.499925,51.740306,62.113449,65.811916,59.19502,52.749073,41.301184,17.649049,23.686088
2007.0,18.231909,23.877943,14.119438,35.714701,50.17251,60.321198,65.166914,62.119825,54.791537,39.329386,34.125173,27.833911
2009.0,13.905975,19.061306,24.538844,35.904986,53.841562,61.451833,68.185548,61.561687,53.79695,43.041343,28.857504,23.955105
2010.0,13.609341,28.161734,29.21307,36.678955,54.192035,61.323343,60.557493,59.868309,53.760371,39.635601,32.881524,
2012.0,10.912878,26.936697,23.589453,39.420145,49.562901,60.345503,60.611253,60.595512,50.810326,37.491023,22.452851,10.810748
2013.0,11.801241,25.141699,27.217854,33.769411,49.576464,67.256467,67.766801,63.775068,52.954718,42.509062,32.905609,19.502409
2014.0,29.073139,22.334362,28.488946,36.812507,59.118374,59.608647,65.755087,61.708839,53.773986,38.549501,33.044534,31.946758


In [None]:
df_group.pivot(index='year', columns=['month'], values='TMR_SUB_18').to_csv('temp_data_128_test.csv')

In [None]:
df_2004_6 = df.loc[(df.year==2004) & (df.month==6)]

In [None]:
df_2004_6.groupby(['year','month'])['day'].unique().values

In [None]:
df_128_all_18_group_pivot = df_128_all_18_group.pivot(index='year', columns=['month'], values='TMR_SUB_18')

In [None]:
df_128_all_18_group_pivot.style.highlight_max(axis=0, color='green').highlight_min(axis=0, color='yellow')

In [None]:
px.line(x=, y=, color=)

In [None]:
# Get data
data = pd.read_csv(r'C:\Users\ejmason\Desktop\THIGNS\Temperature Data Scripts\TDP_2017_entire.csv')

In [None]:
# data to dataframe to organize structure
data = pd.DataFrame(data, columns=['SITE_NUMBER','OBSERVATION_TIME','REF_TEMP','AMBIENT_AIR_TEMP','IN_PAVEMENT_TEMP','INTERNAL_TEMP','BATTERY_VOLTAGE','TMR_PAV','TMR_SUB_0','TMR_SUB_3','TMR_SUB_6','TMR_SUB_9','TMR_SUB_12','TMR_SUB_18','TMR_SUB_24','TMR_SUB_30','TMR_SUB_36','TMR_SUB_42','TMR_SUB_48','TMR_SUB_54','TMR_SUB_60','TMR_SUB_66','TMR_SUB_72','TIMEZONE_FLAG'])

In [None]:
# drop unnecessary rows
data = data.drop(['REF_TEMP', 'IN_PAVEMENT_TEMP', 'INTERNAL_TEMP', 'BATTERY_VOLTAGE', 'TIMEZONE_FLAG'], axis=1)

#### This next block likely won't be necessary but the steps may be useful

In [None]:
# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# diffgroup = data.drop(['IN_PAVEMENT_TEMP', 'TMR_PAV'], axis=1)

# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# diffgroup[np.logical_or(diffgroup.diff(axis=1) == 0, diffgroup.diff(axis=1, periods=-1) == 0)] = np.nan

# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# diffgroup.head(20)

# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# data.head(20)

# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# frames = data.columns.difference(diffgroup.columns)

# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# newdata = diffgroup.merge(data[frames], left_index=True, right_index=True, how='outer')

# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# cols = newdata.columns

# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# newdata = newdata[['SITE_NUMBER','OBSERVATION_TIME','AMBIENT_AIR_TEMP','IN_PAVEMENT_TEMP','TMR_PAV','TMR_SUB_0','TMR_SUB_3','TMR_SUB_6','TMR_SUB_9','TMR_SUB_12','TMR_SUB_18','TMR_SUB_24','TMR_SUB_30','TMR_SUB_36','TMR_SUB_42','TMR_SUB_48','TMR_SUB_54','TMR_SUB_60','TMR_SUB_66','TMR_SUB_72']]

# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# newdata.isnull().sum()

# Only necessary if IN_PAVEMENT_TEMP and TMR_PAV are BOTH still present in DataFrame

# data = newdata

Get the count and null values (which is 0, since the sensors won't produce a null value)

In [None]:
print(f'Total Count : {data.shape[0]}')
print(data.isnull().sum())

In [None]:
"""
replace all error values with np.nan or NaN 
(pandas/numpy's "null")
"""

data.replace(to_replace=['-9999.000000', 
                         '-9999.0', 
                         '-9999.00',
                         '-9999.000', 
                         -9999.00, 
                         -9999.000000, 
                         -9999.0,
                         -6999.0,
                         -6999.00,
                         -6999.000,
                         -6999.000000,
                         -9999.000], value=np.nan, inplace=True)

print(data.isnull().sum())

##### Many of the sensors have error codes that could be likely values, the only way to tell is that the values repeat themselves among sensor depth

In [None]:
# isolate sensor depth columns
sensors_df = data.iloc[:, 2:]

In [None]:
sensors_df.isnull().sum()

In [None]:
# isolate rows that have duplicates between columns
sensor_dup = sensors_df[np.logical_or(sensors_df.diff(axis=1) == 0, sensors_df.diff(axis=1, periods=-1) == 0)]

In [None]:
sensor_dup.shape[0]

In [None]:
sensor_dup_clean = sensor_dup.dropna()
sensor_dup_clean.shape[0]

In [None]:
sensor_dup_clean.isnull().sum()

#### replace all duplicated sensor values with nulls and merge to get matching dataframe to original dataframe

In [None]:
sensors_df[np.logical_or(sensors_df.diff(axis=1) == 0, sensors_df.diff(axis=1, periods=-1) == 0)] = np.nan

In [None]:
cols = data.iloc[:, :2]
cols

In [None]:
df = sensors_df.merge(cols, left_index=True, right_index=True)

#### Organize Columns

In [None]:
df_cols = df.columns.tolist()

df_cols = df_cols[-2:] + df_cols[:-2]

df = df[df_cols]

In [None]:
df

#### Back to the main data

In [None]:
# df.head().to_html()

#### Convert observation_time to DateTime

In [None]:
df.OBSERVATION_TIME = pd.to_datetime(df['OBSERVATION_TIME'])

In [None]:
df.isnull().sum()

### convert questionable values in each individual column to null "np.nan" or "NaN"

In [None]:
# next comment is secondary attempt that uses masking, but could not get to function
# data['TMR_PAV'] = np.where(data.loc[(data['TMR_PAV'] >= 150) | (data['TMR_PAV'] <= -50)],np.nan,data['TMR_PAV']
# TMR_PAV
df['TMR_PAV'] = np.where((df['TMR_PAV'] >= 110) | (df['TMR_PAV'] <= -30),np.nan,df['TMR_PAV'])  

# AMBIENT_AIR_TEMP
df['AMBIENT_AIR_TEMP'] = np.where((df['AMBIENT_AIR_TEMP'] >= 105) | (df['AMBIENT_AIR_TEMP'] <= -55),
                                    np.nan,df['AMBIENT_AIR_TEMP'])

# IN_PAVEMENT_TEMP (Only if included)
# data['IN_PAVEMENT_TEMP'] = np.where((data['IN_PAVEMENT_TEMP'] >= 110) | (data['IN_PAVEMENT_TEMP'] <= -30),
#                                     np.nan,data['IN_PAVEMENT_TEMP'])

# TMR_SUB_0
df['TMR_SUB_0'] = np.where((df['TMR_SUB_0'] >= 110) | (df['TMR_SUB_0'] <= -25),
                                    np.nan,df['TMR_SUB_0'])

# TMR_SUB_3
df['TMR_SUB_3'] = np.where((df['TMR_SUB_3'] >= 100) | (df['TMR_SUB_3'] <= -20),
                                    np.nan,df['TMR_SUB_3'])

# TMR_SUB_6

df['TMR_SUB_6'] = np.where((df['TMR_SUB_6'] >= 95) | (df['TMR_SUB_6'] <= -15),
                                    np.nan,df['TMR_SUB_6'])

# TMR_SUB_9 

df['TMR_SUB_9'] = np.where((df['TMR_SUB_9'] >= 90) | (df['TMR_SUB_9'] <= -15),
                                    np.nan,df['TMR_SUB_9'])

# TMR_SUB_12
df['TMR_SUB_12'] = np.where((df['TMR_SUB_12'] >= 85) | (df['TMR_SUB_12'] <= -10),
                                    np.nan,df['TMR_SUB_12'])

# TMR_SUB_18
df['TMR_SUB_18'] = np.where((df['TMR_SUB_18'] >= 85) | (df['TMR_SUB_18'] <= -10),
                                    np.nan,df['TMR_SUB_18'])

# TMR_SUB_24
df['TMR_SUB_24'] = np.where((df['TMR_SUB_24'] >= 85) | (df['TMR_SUB_24'] <= -10),
                                    np.nan,df['TMR_SUB_24'])

# TMR_SUB_30
df['TMR_SUB_30'] = np.where((df['TMR_SUB_30'] >= 80) | (df['TMR_SUB_30'] <= -5),
                                    np.nan,df['TMR_SUB_30'])

# TMR_SUB_36
df['TMR_SUB_36'] = np.where((df['TMR_SUB_36'] >= 80) | (df['TMR_SUB_36'] <= -5),
                                    np.nan,df['TMR_SUB_36'])

# TMR_SUB_42
df['TMR_SUB_42'] = np.where((df['TMR_SUB_42'] >= 75) | (df['TMR_SUB_42'] <= 0),
                                    np.nan,df['TMR_SUB_42'])

# TMR_SUB_48
df['TMR_SUB_48'] = np.where((df['TMR_SUB_48'] >= 70) | (df['TMR_SUB_48'] <= 0),
                                    np.nan,df['TMR_SUB_48'])

# TMR_SUB_54
df['TMR_SUB_54'] = np.where((df['TMR_SUB_54'] >= 70) | (df['TMR_SUB_54'] <= 5),
                                    np.nan,df['TMR_SUB_54'])

# TMR_SUB_60
df['TMR_SUB_60'] = np.where((df['TMR_SUB_60'] >= 70) | (df['TMR_SUB_60'] <= 5),
                                    np.nan,df['TMR_SUB_60'])

# TMR_SUB_66
df['TMR_SUB_66'] = np.where((df['TMR_SUB_66'] >= 65) | (df['TMR_SUB_66'] <= 10),
                                    np.nan,df['TMR_SUB_66'])

# TMR_SUB_72
df['TMR_SUB_72'] = np.where((df['TMR_SUB_72'] >= 65) | (df['TMR_SUB_72'] <= 10),
                                    np.nan,df['TMR_SUB_72'])

In [None]:
# Verify null values and compare with original dataFrame isnull().sum() values which should have been 0
df.isnull().sum()

In [None]:
df['year'] = df.OBSERVATION_TIME.dt.year

In [None]:
df['month'] = df.OBSERVATION_TIME.dt.month

In [None]:
df

In [None]:
collist = df.columns.tolist()

In [None]:
col1 = collist[:2]

In [None]:
col2 = collist[-2:]

In [None]:
col3 = collist[2:-2]

In [None]:
colcombine = col1 + col2 + col3

In [None]:
df = df[colcombine]

In [None]:
df

### Begin Grouping

#### 128	Seward Highway @ Moose Pass MP 32	MOP

In [None]:
df_128 = df.loc[df['SITE_NUMBER']==128]

In [None]:
col128 = df.columns.tolist()

In [None]:
col128_1 = col128[:4]
col128_1

In [None]:
col128_2 = col128[11:12]
col128_2

In [None]:
col128list = col128_1 + col128_2

In [None]:
df_128 = df_128[col128list]

In [None]:
df_128

In [None]:
df_128.groupby(['SITE_NUMBER', 'month']).mean()

In [None]:
df_128_mean = df_128.groupby(['SITE_NUMBER', 'month']).mean().reset_index()

In [None]:
df_128_mean.TMR_SUB_18.plot(legend=True, title='Moose Pass')

In [None]:
import seaborn as sns

In [None]:
sns.lineplot(data=df_128_mean, x="month", y="TMR_SUB_18").set_title("Moose Pass Station");

In [None]:
# Kaggle API - pip install kaggle or conda install -c conda-forge kaggle

from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

In [None]:
# download data - this will download the indicated file to the workspace in which the notebook resides

dataset = 'erikjamesmason/akdot-tdp-temperature-data-probes'
datafile = 'TDP_2002_entire.csv'
api.dataset_download_files(dataset,datafile)

In [None]:
datagroup_site = df.groupby(['SITE_NUMBER', 'OBSERVATION_TIME'])

In [None]:
dg_site_mean = datagroup_site.mean()

In [None]:
datagroup_site

In [None]:
datagroup_site.head()

In [None]:
dg_site_mean.head()

In [None]:
df.groupby(['SITE_NUMBER','month']).mean()

In [None]:
dg_site_max = datagroup_site.max()

In [None]:
dg_site_max.head()

In [None]:
dg_site_min = datagroup_site.min()

In [None]:
dg_site_min.head()

In [None]:
datagroup_site2 = data.groupby(['SITE_NUMBER', 'OBSERVATION_TIME']).mean().reset_index()

In [None]:
datagroup_site2.head()

In [None]:
data.head()

In [None]:
data.to_csv(r'C:\Users\ejmason\Desktop\THIGNS\Temperature Data Scripts\TDP_DF_Nulls_Update.csv', header=True)

In [None]:
dg_site_min.to_csv(r'U:\Erik Planning\Coding\TDP_DF_Min.csv', header=True)

In [None]:
dgs_stat = datagroup_site.agg(['min', 'max', 'mean'])

In [None]:
dgs_stat.head()

In [None]:
dgs_stat.to_csv(r'U:\Erik Planning\Coding\TDP_DF_test.csv', header=True)

In [None]:
# dgs = datagroup_site.agg([np.mean, np.min, np.max, len ])

In [None]:
# dgs

In [None]:
# cols = data.columns

In [None]:
# datagroup_site.describe()

In [None]:
data.head()

In [None]:
type(data)

In [None]:
!pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org firebase

In [None]:
import firebase
firebase = firebase.FirebaseApplication('https://temperature-data-probes-default-rtdb.firebaseio.com/', None)
result = firebase.get('/temperature-data-probes-default-rtdb', None)

In [None]:
result