# Imports

In [3]:
import pandas as pd
import numpy as np
import pathlib
import calendar
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# SQL Load Data

In [3]:
from sqlalchemy import create_engine

database_filename = 'tdpdata.db'
table_name = 'tdpsheet'
engine = create_engine('sqlite:///{}'.format(database_filename)) 

In [4]:
df_sql_test = pd.read_sql(table_name, engine, parse_dates='OBSERVATION_TIME')

In [10]:
df_sql_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5754574 entries, 0 to 5754573
Data columns (total 24 columns):
 #   Column            Dtype              
---  ------            -----              
 0   SITE_NUMBER       object             
 1   OBSERVATION_TIME  datetime64[ns, UTC]
 2   REF_TEMP          float64            
 3   AMBIENT_AIR_TEMP  float64            
 4   IN_PAVEMENT_TEMP  float64            
 5   INTERNAL_TEMP     float64            
 6   BATTERY_VOLTAGE   float64            
 7   TMR_PAV           float64            
 8   TMR_SUB_0         float64            
 9   TMR_SUB_3         float64            
 10  TMR_SUB_6         float64            
 11  TMR_SUB_9         float64            
 12  TMR_SUB_12        float64            
 13  TMR_SUB_18        float64            
 14  TMR_SUB_24        float64            
 15  TMR_SUB_30        float64            
 16  TMR_SUB_36        float64            
 17  TMR_SUB_42        float64            
 18  TMR_SUB_48        floa

In [8]:
df_sql_test.isnull().sum()

SITE_NUMBER         0
OBSERVATION_TIME    0
REF_TEMP            0
AMBIENT_AIR_TEMP    0
IN_PAVEMENT_TEMP    0
INTERNAL_TEMP       0
BATTERY_VOLTAGE     0
TMR_PAV             0
TMR_SUB_0           0
TMR_SUB_3           0
TMR_SUB_6           0
TMR_SUB_9           0
TMR_SUB_12          0
TMR_SUB_18          0
TMR_SUB_24          0
TMR_SUB_30          0
TMR_SUB_36          0
TMR_SUB_42          0
TMR_SUB_48          0
TMR_SUB_54          0
TMR_SUB_60          0
TMR_SUB_66          0
TMR_SUB_72          0
TIMEZONE_FLAG       0
dtype: int64

#create dataframe with uncleaned records
filename = "df_all.pkl"
df_sql_test.to_pickle(filename)

In [15]:
df_sql_test.to_pickle("df_all.pkl")

# Pickle DataFrame Load Data

In [2]:
df = pd.read_pickle('df_all.pkl')

In [3]:
df_128 = df.loc[df.SITE_NUMBER=='128']

In [4]:
df_128.isnull().sum()

SITE_NUMBER         0
OBSERVATION_TIME    0
REF_TEMP            0
AMBIENT_AIR_TEMP    0
IN_PAVEMENT_TEMP    0
INTERNAL_TEMP       0
BATTERY_VOLTAGE     0
TMR_PAV             0
TMR_SUB_0           0
TMR_SUB_3           0
TMR_SUB_6           0
TMR_SUB_9           0
TMR_SUB_12          0
TMR_SUB_18          0
TMR_SUB_24          0
TMR_SUB_30          0
TMR_SUB_36          0
TMR_SUB_42          0
TMR_SUB_48          0
TMR_SUB_54          0
TMR_SUB_60          0
TMR_SUB_66          0
TMR_SUB_72          0
TIMEZONE_FLAG       0
dtype: int64

In [5]:
df = df.drop(['REF_TEMP', 
                  'IN_PAVEMENT_TEMP', 
                  'INTERNAL_TEMP', 
                  'BATTERY_VOLTAGE', 
                  'TIMEZONE_FLAG'], axis=1)

In [6]:
df.isnull().sum()

SITE_NUMBER         0
OBSERVATION_TIME    0
AMBIENT_AIR_TEMP    0
TMR_PAV             0
TMR_SUB_0           0
TMR_SUB_3           0
TMR_SUB_6           0
TMR_SUB_9           0
TMR_SUB_12          0
TMR_SUB_18          0
TMR_SUB_24          0
TMR_SUB_30          0
TMR_SUB_36          0
TMR_SUB_42          0
TMR_SUB_48          0
TMR_SUB_54          0
TMR_SUB_60          0
TMR_SUB_66          0
TMR_SUB_72          0
dtype: int64

In [7]:
def replace_error_code_values(df):  
    """
    replace all error values with np.nan or NaN 
    (pandas/numpy's "null")
    """
    print(df.shape)
    print(df.isnull().sum())
    df.replace(to_replace=['-9999.000000', 
                             '-9999.0', 
                             '-9999.00',
                             '-9999.000', 
                             -9999.00, 
                             -9999.000000, 
                             -9999.0,
                             -6999.0,
                             -6999.00,
                             -6999.000,
                             -6999.000000,
                             -9999.000], value=np.nan, inplace=True)
    
    print(df.shape)
    print(df.isnull().sum())
    
    return df

In [8]:
df = replace_error_code_values(df_128)

(110581, 24)
SITE_NUMBER         0
OBSERVATION_TIME    0
REF_TEMP            0
AMBIENT_AIR_TEMP    0
IN_PAVEMENT_TEMP    0
INTERNAL_TEMP       0
BATTERY_VOLTAGE     0
TMR_PAV             0
TMR_SUB_0           0
TMR_SUB_3           0
TMR_SUB_6           0
TMR_SUB_9           0
TMR_SUB_12          0
TMR_SUB_18          0
TMR_SUB_24          0
TMR_SUB_30          0
TMR_SUB_36          0
TMR_SUB_42          0
TMR_SUB_48          0
TMR_SUB_54          0
TMR_SUB_60          0
TMR_SUB_66          0
TMR_SUB_72          0
TIMEZONE_FLAG       0
dtype: int64
(110581, 24)
SITE_NUMBER             0
OBSERVATION_TIME        0
REF_TEMP            17588
AMBIENT_AIR_TEMP     1207
IN_PAVEMENT_TEMP    20313
INTERNAL_TEMP           0
BATTERY_VOLTAGE         0
TMR_PAV                 0
TMR_SUB_0               0
TMR_SUB_3               0
TMR_SUB_6               0
TMR_SUB_9               0
TMR_SUB_12              0
TMR_SUB_18              0
TMR_SUB_24              0
TMR_SUB_30              0
TMR_SUB_36       

In [9]:
sensors_df = df.iloc[:, 2:]
sensors_df.head()

Unnamed: 0,REF_TEMP,AMBIENT_AIR_TEMP,IN_PAVEMENT_TEMP,INTERNAL_TEMP,BATTERY_VOLTAGE,TMR_PAV,TMR_SUB_0,TMR_SUB_3,TMR_SUB_6,TMR_SUB_9,TMR_SUB_12,TMR_SUB_18,TMR_SUB_24,TMR_SUB_30,TMR_SUB_36,TMR_SUB_42,TMR_SUB_48,TMR_SUB_54,TMR_SUB_60,TMR_SUB_66,TMR_SUB_72,TIMEZONE_FLAG
84994,35.2,24.2,35.442,35.247,14.02,26.99,28.79,29.3,29.81,30.17,29.95,29.95,29.95,30.38,30.96,31.45,32.09,34.16,34.78,35.72,36.91,0
84995,34.99,27.87,35.251,34.99,14.02,27.15,28.41,29.44,29.66,29.81,30.81,30.1,30.17,30.96,30.81,31.46,32.02,33.89,34.91,35.58,36.84,0
84996,34.54,28.06,34.567,34.578,14.03,27.74,28.64,29.37,29.81,30.02,29.81,29.88,30.03,30.25,31.1,31.68,32.24,34.24,34.64,35.59,36.65,0
84997,34.56,29.71,34.342,34.56,14.03,27.9,28.49,28.86,29.66,29.81,29.59,29.88,30.1,30.02,30.96,31.67,32.16,33.9,35.12,35.92,36.84,0
84998,35.01,28.94,34.644,34.99,14.03,27.9,28.49,29.15,29.37,29.66,29.95,30.1,30.1,30.53,31.18,31.67,32.31,34.1,34.91,35.86,36.58,0


In [10]:
broken_sensors_df = sensors_df.loc[(sensors_df.TMR_SUB_0==sensors_df.TMR_SUB_72)]

In [11]:
broken_sensors_df.count()

REF_TEMP            175
AMBIENT_AIR_TEMP    176
IN_PAVEMENT_TEMP    169
INTERNAL_TEMP       178
BATTERY_VOLTAGE     178
TMR_PAV             178
TMR_SUB_0           178
TMR_SUB_3           178
TMR_SUB_6           178
TMR_SUB_9           178
TMR_SUB_12          178
TMR_SUB_18          178
TMR_SUB_24          178
TMR_SUB_30          178
TMR_SUB_36          178
TMR_SUB_42          178
TMR_SUB_48          178
TMR_SUB_54          178
TMR_SUB_60          178
TMR_SUB_66          178
TMR_SUB_72          178
TIMEZONE_FLAG       178
dtype: int64

In [14]:
broken_sensors_df = df.iloc[broken_sensors_df]

ValueError: cannot convert float NaN to integer

In [None]:
## problem above: ValueError: cannot convert float NaN to integer


In [15]:
broken_sensors_df.to_csv('broken_sensors.csv', index=False, float_format='%g')

In [16]:
df_128 = df.loc[df.SITE_NUMBER=='128']

In [17]:
df_128.head()

Unnamed: 0,SITE_NUMBER,OBSERVATION_TIME,REF_TEMP,AMBIENT_AIR_TEMP,IN_PAVEMENT_TEMP,INTERNAL_TEMP,BATTERY_VOLTAGE,TMR_PAV,TMR_SUB_0,TMR_SUB_3,TMR_SUB_6,TMR_SUB_9,TMR_SUB_12,TMR_SUB_18,TMR_SUB_24,TMR_SUB_30,TMR_SUB_36,TMR_SUB_42,TMR_SUB_48,TMR_SUB_54,TMR_SUB_60,TMR_SUB_66,TMR_SUB_72,TIMEZONE_FLAG
84994,128,2003-12-14 17:00:00+00:00,35.2,24.2,35.442,35.247,14.02,26.99,28.79,29.3,29.81,30.17,29.95,29.95,29.95,30.38,30.96,31.45,32.09,34.16,34.78,35.72,36.91,0
84995,128,2003-12-14 18:00:00+00:00,34.99,27.87,35.251,34.99,14.02,27.15,28.41,29.44,29.66,29.81,30.81,30.1,30.17,30.96,30.81,31.46,32.02,33.89,34.91,35.58,36.84,0
84996,128,2003-12-14 19:00:00+00:00,34.54,28.06,34.567,34.578,14.03,27.74,28.64,29.37,29.81,30.02,29.81,29.88,30.03,30.25,31.1,31.68,32.24,34.24,34.64,35.59,36.65,0
84997,128,2003-12-14 20:00:00+00:00,34.56,29.71,34.342,34.56,14.03,27.9,28.49,28.86,29.66,29.81,29.59,29.88,30.1,30.02,30.96,31.67,32.16,33.9,35.12,35.92,36.84,0
84998,128,2003-12-14 21:00:00+00:00,35.01,28.94,34.644,34.99,14.03,27.9,28.49,29.15,29.37,29.66,29.95,30.1,30.1,30.53,31.18,31.67,32.31,34.1,34.91,35.86,36.58,0


In [119]:
df_128.loc[(df_128.TMR_SUB_0==df_128.TMR_SUB_72)].count()

SITE_NUMBER         178
OBSERVATION_TIME    178
AMBIENT_AIR_TEMP    172
TMR_PAV             178
TMR_SUB_0           178
TMR_SUB_3           178
TMR_SUB_6           178
TMR_SUB_9           178
TMR_SUB_12          178
TMR_SUB_18          178
TMR_SUB_24          178
TMR_SUB_30          178
TMR_SUB_36          178
TMR_SUB_42          178
TMR_SUB_48          178
TMR_SUB_54          178
TMR_SUB_60          178
TMR_SUB_66          178
TMR_SUB_72          178
dtype: int64

In [61]:
df.loc[df.TMR_SUB_18.isna()].shape

(100906, 19)

In [63]:
"""sensors_df[np.logical_or(sensors_df.diff(axis=1) == 0, 
                             sensors_df.diff(axis=1, periods=-1) == 0)] = np.nan"""

sensors_df[np.logical_and(sensors_df.diff(axis=1) == 0, 
                             sensors_df.diff(axis=1, periods=-1) == 0)] = np.nan



TypeError: unsupported operand type(s) for -: 'Timestamp' and 'str'

In [51]:
sensors_df.loc[sensors_df.TMR_SUB_12.isna()].head()

Unnamed: 0,AMBIENT_AIR_TEMP,TMR_PAV,TMR_SUB_0,TMR_SUB_3,TMR_SUB_6,TMR_SUB_9,TMR_SUB_12,TMR_SUB_18,TMR_SUB_24,TMR_SUB_30,TMR_SUB_36,TMR_SUB_42,TMR_SUB_48,TMR_SUB_54,TMR_SUB_60,TMR_SUB_66,TMR_SUB_72
118,49.05,48.43,49.43,49.81,49.81,49.65,,49.65,49.81,49.86,49.97,50.03,49.86,49.81,49.59,49.48,49.2
4107,48.14,51.35,51.67,51.78,51.67,51.56,,51.56,51.51,51.46,51.35,51.23,50.96,50.74,50.36,50.14,49.81
5370,19.5,21.94,24.92,24.3,23.67,23.28,,,23.28,23.36,23.52,23.6,23.99,24.38,24.99,25.83,26.73
5948,4.309,96.6,0.846,0.712,0.588,0.212,,0.212,1.084,1.701,3.61,7.42,10.24,13.39,16.99,19.94,22.33
6499,3.066,94.2,2.821,2.94,,,,2.94,3.528,3.411,4.569,6.473,8.31,10.68,13.62,16.31,18.43


In [39]:
sensors_df.isnull().sum()

AMBIENT_AIR_TEMP    435457
TMR_PAV             177945
TMR_SUB_0           306336
TMR_SUB_3           457303
TMR_SUB_6           465355
TMR_SUB_9           477913
TMR_SUB_12          467186
TMR_SUB_18          460701
TMR_SUB_24          467117
TMR_SUB_30          467519
TMR_SUB_36          459524
TMR_SUB_42          461328
TMR_SUB_48          466554
TMR_SUB_54          471898
TMR_SUB_60          471050
TMR_SUB_66          456775
TMR_SUB_72          188392
dtype: int64

In [138]:
df = df.drop('TIMEZONE_FLAG', axis=1)

In [120]:
def remove_duplicate_adjacent_values(df):
    
    print(df.shape)
    
    print(df.isnull().sum())

    # isolate sensor depth columns
    sensors_df = df.iloc[:, 2:]
    
    """# isolate rows that have duplicates between columns
    sensor_dup = sensors_df[np.logical_or(sensors_df.diff(axis=1) == 0, 
                                          sensors_df.diff(axis=1, periods=-1) == 0)]
    
    sensor_dup.shape[0]
    
    sensor_dup_clean = sensor_dup.dropna()
    sensor_dup_clean.shape[0]"""
    
    sensors_df[np.logical_and(sensors_df.diff(axis=1) == 0, 
                             sensors_df.diff(axis=1, periods=-1) == 0)] = np.nan
    
    cols = df.iloc[:, :2]
    
    df = sensors_df.merge(cols, left_index=True, right_index=True)
    
    df_cols = df.columns.tolist()
    
    df_cols = df_cols[-2:] + df_cols[:-2]
    
    df = df[df_cols]
    
    print(df.isnull().sum())
    
    return df

In [139]:
df_128 = remove_duplicate_adjacent_values(df)

(110581, 23)
SITE_NUMBER             0
OBSERVATION_TIME        0
REF_TEMP            17588
AMBIENT_AIR_TEMP     2527
IN_PAVEMENT_TEMP    20313
INTERNAL_TEMP           0
BATTERY_VOLTAGE         0
TMR_PAV              4296
TMR_SUB_0              42
TMR_SUB_3              30
TMR_SUB_6              34
TMR_SUB_9              30
TMR_SUB_12             58
TMR_SUB_18             24
TMR_SUB_24              0
TMR_SUB_30              0
TMR_SUB_36              0
TMR_SUB_42              0
TMR_SUB_48              0
TMR_SUB_54              0
TMR_SUB_60              0
TMR_SUB_66              0
TMR_SUB_72              2
dtype: int64
SITE_NUMBER             0
OBSERVATION_TIME        0
REF_TEMP            17588
AMBIENT_AIR_TEMP     2527
IN_PAVEMENT_TEMP    20323
INTERNAL_TEMP           0
BATTERY_VOLTAGE         0
TMR_PAV              4296
TMR_SUB_0              58
TMR_SUB_3             118
TMR_SUB_6             165
TMR_SUB_9             197
TMR_SUB_12            162
TMR_SUB_18            127
TMR_SUB_24  

In [79]:
def clean_extreme_values(df):
    
    # next comment is secondary attempt that uses masking, but could not get to function
    # data['TMR_PAV'] = np.where(data.loc[(data['TMR_PAV'] >= 150) | (data['TMR_PAV'] <= -50)],np.nan,data['TMR_PAV']
    # TMR_PAV
    df['TMR_PAV'] = np.where((df['TMR_PAV'] >= 110) | (df['TMR_PAV'] <= -30),np.nan,df['TMR_PAV'])  
    
    # AMBIENT_AIR_TEMP
    df['AMBIENT_AIR_TEMP'] = np.where((df['AMBIENT_AIR_TEMP'] >= 105) | (df['AMBIENT_AIR_TEMP'] <= -55),
                                        np.nan,df['AMBIENT_AIR_TEMP'])
    
    # IN_PAVEMENT_TEMP (Only if included)
    # data['IN_PAVEMENT_TEMP'] = np.where((data['IN_PAVEMENT_TEMP'] >= 110) | (data['IN_PAVEMENT_TEMP'] <= -30),
    #                                     np.nan,data['IN_PAVEMENT_TEMP'])
    
    # TMR_SUB_0
    df['TMR_SUB_0'] = np.where((df['TMR_SUB_0'] >= 110) | (df['TMR_SUB_0'] <= -25),
                                        np.nan,df['TMR_SUB_0'])
    
    # TMR_SUB_3
    df['TMR_SUB_3'] = np.where((df['TMR_SUB_3'] >= 100) | (df['TMR_SUB_3'] <= -20),
                                        np.nan,df['TMR_SUB_3'])
    
    # TMR_SUB_6
    df['TMR_SUB_6'] = np.where((df['TMR_SUB_6'] >= 95) | (df['TMR_SUB_6'] <= -15),
                                        np.nan,df['TMR_SUB_6'])
    
    # TMR_SUB_9 
    df['TMR_SUB_9'] = np.where((df['TMR_SUB_9'] >= 90) | (df['TMR_SUB_9'] <= -15),
                                        np.nan,df['TMR_SUB_9'])
    
    # TMR_SUB_12
    df['TMR_SUB_12'] = np.where((df['TMR_SUB_12'] >= 85) | (df['TMR_SUB_12'] <= -10),
                                        np.nan,df['TMR_SUB_12'])
    
    # TMR_SUB_18
    df['TMR_SUB_18'] = np.where((df['TMR_SUB_18'] >= 85) | (df['TMR_SUB_18'] <= -10),
                                        np.nan,df['TMR_SUB_18'])
    
    # TMR_SUB_24
    df['TMR_SUB_24'] = np.where((df['TMR_SUB_24'] >= 85) | (df['TMR_SUB_24'] <= -10),
                                        np.nan,df['TMR_SUB_24'])
    
    # TMR_SUB_30
    df['TMR_SUB_30'] = np.where((df['TMR_SUB_30'] >= 80) | (df['TMR_SUB_30'] <= -5),
                                        np.nan,df['TMR_SUB_30'])
    
    # TMR_SUB_36
    df['TMR_SUB_36'] = np.where((df['TMR_SUB_36'] >= 80) | (df['TMR_SUB_36'] <= -5),
                                        np.nan,df['TMR_SUB_36'])
    
    # TMR_SUB_42
    df['TMR_SUB_42'] = np.where((df['TMR_SUB_42'] >= 75) | (df['TMR_SUB_42'] <= 0),
                                        np.nan,df['TMR_SUB_42'])
    
    # TMR_SUB_48
    df['TMR_SUB_48'] = np.where((df['TMR_SUB_48'] >= 70) | (df['TMR_SUB_48'] <= 0),
                                        np.nan,df['TMR_SUB_48'])
    
    # TMR_SUB_54
    df['TMR_SUB_54'] = np.where((df['TMR_SUB_54'] >= 70) | (df['TMR_SUB_54'] <= 5),
                                        np.nan,df['TMR_SUB_54'])
    
    # TMR_SUB_60
    df['TMR_SUB_60'] = np.where((df['TMR_SUB_60'] >= 70) | (df['TMR_SUB_60'] <= 5),
                                        np.nan,df['TMR_SUB_60'])
    
    # TMR_SUB_66
    df['TMR_SUB_66'] = np.where((df['TMR_SUB_66'] >= 65) | (df['TMR_SUB_66'] <= 10),
                                        np.nan,df['TMR_SUB_66'])
    
    # TMR_SUB_72
    df['TMR_SUB_72'] = np.where((df['TMR_SUB_72'] >= 65) | (df['TMR_SUB_72'] <= 10),
                                        np.nan,df['TMR_SUB_72'])
    
    return df

In [129]:
df = clean_extreme_values(df)

In [130]:
df.isnull().sum()

SITE_NUMBER             0
OBSERVATION_TIME        0
REF_TEMP            17588
AMBIENT_AIR_TEMP     2527
IN_PAVEMENT_TEMP    20313
INTERNAL_TEMP           0
BATTERY_VOLTAGE         0
TMR_PAV              4296
TMR_SUB_0              42
TMR_SUB_3              30
TMR_SUB_6              34
TMR_SUB_9              30
TMR_SUB_12             58
TMR_SUB_18             24
TMR_SUB_24              0
TMR_SUB_30              0
TMR_SUB_36              0
TMR_SUB_42              0
TMR_SUB_48              0
TMR_SUB_54              0
TMR_SUB_60              0
TMR_SUB_66              0
TMR_SUB_72              2
TIMEZONE_FLAG           0
dtype: int64

In [None]:
# filename = "df_128_all_18.pkl"
filename = "df_all_clean.pkl"
df.to_pickle(filename)

In [21]:
df = df.loc[df['SITE_NUMBER']=='128']

df = df[['SITE_NUMBER','OBSERVATION_TIME','TMR_SUB_18']]

df.head()

Unnamed: 0,SITE_NUMBER,OBSERVATION_TIME,TMR_SUB_18
84994,128,2003-12-14 17:00:00+00:00,
84995,128,2003-12-14 18:00:00+00:00,30.1
84996,128,2003-12-14 19:00:00+00:00,29.88
84997,128,2003-12-14 20:00:00+00:00,29.88
84998,128,2003-12-14 21:00:00+00:00,


In [22]:
df.isnull().sum()

SITE_NUMBER            0
OBSERVATION_TIME       0
TMR_SUB_18          3916
dtype: int64

In [24]:
df.shape

(110581, 3)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110581 entries, 84994 to 5751771
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype              
---  ------            --------------   -----              
 0   SITE_NUMBER       110581 non-null  object             
 1   OBSERVATION_TIME  110581 non-null  datetime64[ns, UTC]
 2   TMR_SUB_18        106665 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(1), object(1)
memory usage: 3.4+ MB


In [26]:
df['year'] = df.OBSERVATION_TIME.dt.year
df['month'] = df.OBSERVATION_TIME.dt.month

df['day'] = df.OBSERVATION_TIME.dt.day
df['hour'] = df.OBSERVATION_TIME.dt.hour

In [27]:
df.head()

Unnamed: 0,SITE_NUMBER,OBSERVATION_TIME,TMR_SUB_18,year,month,day,hour
84994,128,2003-12-14 17:00:00+00:00,,2003,12,14,17
84995,128,2003-12-14 18:00:00+00:00,30.1,2003,12,14,18
84996,128,2003-12-14 19:00:00+00:00,29.88,2003,12,14,19
84997,128,2003-12-14 20:00:00+00:00,29.88,2003,12,14,20
84998,128,2003-12-14 21:00:00+00:00,,2003,12,14,21


In [29]:
# filename = "df_128_all_18.pkl"
filename = "df_128_all_years_18_clean.pkl"
df.to_pickle(filename)

In [140]:
# filename = "df_128_all_18.pkl"
filename = "df_128.pkl"
df_128.to_pickle(filename)

In [34]:
df.groupby(['year','month','day'])['TMR_SUB_18'].count() ==24

year  month  day
2003  1      31     False
      2      1      False
             2      False
             3      False
             4      False
             5      False
             6      False
             7      False
             8      False
             9      False
             10     False
             11     False
             12     False
             13     False
             14     False
             15     False
             16     False
             17     False
             18     False
             19      True
             20      True
             21      True
             22      True
             23     False
             24     False
             25     False
             26     False
             27     False
             28     False
      3      1      False
             2      False
             3      False
             4      False
             5      False
             6      False
             7      False
             8       True
             9       

In [4]:
df = pd.read_pickle('df_128.pkl')

In [None]:
print(df.head(5))