In [165]:
# create a class that houses a pandas.DataFrame and a dict 
# the df will have values for variables that have more than one unique value
# for variables that only have a single unique value, the dict will contain the label:value pair where label is column

import numpy as np
import pandas as pd

In [166]:
# are the mutable objects stored inside a tuple still mutable? yes, BUT...
# you can mutate the existing objects in place (append to them, update their values), but rebinding a name to a new object does not change what the tuple holds
# see that updating L1[1]='B' shows up in tup, whereas rebinding L1=[2,3,4] does not

# uncomment to see
#L1 = ['a','b']
#d1 = {'lab1':'val1','lab2':'val2'}
#tup = (L1,d1)
#print(tup)
#L1.append('c')
#d1['lab1'] = 'valA'
#d1['newkey']='newval'
#print(tup)
#L1[1]='B'
#print(tup)
#print(d1)
#L1 = [2,3,4]
#print(tup)

In [167]:
# uncomment to prove that it is also true of pandas.DataFrame and dict

#d1 = {'lab1':'val1','lab2':'val2'}
#index_list=[0,1,2,3,4]
#df1 = pd.DataFrame(np.random.uniform(0,10,(5,5)), index=index_list, columns=[f'column{x}' for x in index_list])
#tupdf = (d1, df1)
#print(tupdf)
#d1.update(zip(list(df1.columns),list(df1['column0'])))
#df1.loc[0,'column0'] = 0
#print(tupdf)

In [168]:
# so a tuple containing a dict of metadata and dataframe of variables can work...
# just be careful never to create a new instance of either the dict or dataframe

# which is to say, don't do this.  it will be too error prone.

# use df.columns.name to refer to the name of a dict with metadata?  
# then use a separate meta_lib_dict to store df.columns.name:meta_dict key pairs: implementing this!

# OR just keep track of the name of the dict with the metadata? nah

In [169]:
# Build a small random frame and name both of its axes; each axis name is then
# used as the key into its own "metadata library" dict.
# index_list is defined here because its only other definition sits inside a
# commented-out demo cell, which left this cell failing on a fresh kernel.
index_list = list(range(5))
df2 = pd.DataFrame(np.random.uniform(0,10,(5,5)), index=index_list, columns=[f'column{x}' for x in index_list])
df2.columns.name = 'meta_in_cols'
df2.index.name = 'meta_in_index'
meta_lib_index = {df2.index.name:{'metaind1':'A','metaind2':'B'}}
meta_lib_cols = {df2.columns.name:{'metacol1':1,'metacol2':2}}
#print(meta_lib_index[df2.index.name])
#print(meta_lib_cols[df2.columns.name])

In [170]:
# time to examine whether DataFrame.index.name or DataFrame.columns.name is lossy
# it appears that you lose columns.name when saving to csv and reopening.  that's a problem (maybe)

# you lose the columns.name
# you can keep index.name if it loads as a new non-index column with index.name as the column header when using RangeIndex
# if you use index_col=0 in read_csv, the index.name is preserved

#df2['indexnames'] = [f'row{x}' for x in range(1,6)] # works for RangeIndex and named indices, uncomment to see working with names
#df2.set_index('indexnames', inplace=True)

#df2['indexdtimes'] = pd.date_range('2017-07-01', periods=5, freq='2H') # works for datetimeindex, uncomment to see working
#df2.set_index('indexdtimes', inplace=True)

# proof that this works, uncomment below to see for yourself
#print(df2)
#df2.to_csv('df2.csv')
#df3= pd.read_csv('df2.csv', index_col=0)
#df3

In [171]:
# Overwrite column2 with a constant so df2 has a single-valued column for
# strip_meta to move into the metadata dict.
df2['column2'] = 3

In [172]:
# function to strip the metadata (columns with only 1 unique value) out of df and store it in a dict of column:unique-value pairs
# relies on using the df.index.name as the key for accessing that dict in a second dictionary...
#   which stores all of the metadata dictionaries
# set df.name = '[the name of the dataframe]' before calling

# input:            dataframe
#                     /  \
#                    /    \
#                smaller  dict with all of the not well utilized columnar data
# output:      dataframe    


def strip_meta(df, name=None):
    """Move the single-valued columns of ``df`` into a metadata dict.

    Every column that holds exactly one distinct value (NaN counts as a value)
    is removed from ``df`` and recorded as ``column: value``.  The index is
    renamed to ``<name>_meta`` so the caller can use that name as the key under
    which the returned dict is filed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to strip.  It is modified in place.
    name : str, optional
        Base name for the frame.  When omitted, the ad-hoc ``df.name``
        attribute is used, so the caller must have set it beforehand.

    Returns
    -------
    tuple
        ``(df, meta_dict)``: the same frame object that was passed in, and the
        dict of stripped values.  If the index had a name, it is kept under
        the ``'index.name'`` key.

    Raises
    ------
    AttributeError
        If ``name`` is not given and ``df.name`` is missing or not a string.
    UserWarning
        If the index is already named ``<name>_meta``; raised before anything
        is modified so a second call cannot overwrite the saved index name.
    """
    if name is None:
        # getattr with a default: a DataFrame raises AttributeError for an
        # attribute that was never set.
        name = getattr(df, 'name', None)
    if not isinstance(name, str):
        raise AttributeError(
            "strip_meta needs a string name: pass name=... or set df.name before calling")
    meta_key = name + '_meta'

    meta_dict = {}

    # if index.name has a value, store it in the meta_dict under 'index.name'
    if df.index.name is not None:
        if df.index.name == meta_key:
            # the meta data has already been written
            raise UserWarning('meta_data being overwritten by strip_meta')
        meta_dict['index.name'] = df.index.name

    df.index.name = meta_key

    # go through columns, if only 1 unique value, store in meta_dict with column_name:unique pair and remove column
    del_list = []
    for col in df.columns:
        if df[col].nunique(dropna=False) == 1:
            # Positional lookup: a label lookup on the first index label
            # returns a Series instead of a scalar when that label is duplicated.
            meta_dict[col] = df[col].iloc[0]
            del_list.append(col)
    if del_list:
        df.drop(del_list, axis='columns', inplace=True)
    return (df, meta_dict)

# Library of metadata dicts, keyed by each stripped frame's index name.
meta_meta_dict = {}
df2.name = 'df2'
# The right-hand side runs first and strip_meta renames the index, so
# df2.index.name below is already the new '<name>_meta' key.  (The key was
# previously read from `testdf`, a name that is not defined in this notebook.)
df2, meta_meta_dict[df2.index.name] = strip_meta(df2)
display(df2)
print(meta_meta_dict[df2.index.name])

meta_in_cols,column0,column1,column3,column4
df2_meta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3.202955,5.882942,0.621899,3.677148
1,4.507023,0.47793,2.536992,2.47317
2,1.637524,9.21383,3.488948,1.782326
3,3.939406,9.121812,8.476983,1.177324
4,6.091417,4.618425,2.081242,5.919084


{'index.name': 'meta_in_index', 'column2': 3}


In [173]:
# Load the PM2.5 sample file.  Each nested list in parse_dates asks read_csv to
# combine that pair of columns into a single parsed date column; the combined
# local date/time column is then used as the index.
# NOTE(review): combining columns through nested parse_dates lists is deprecated
# in recent pandas releases -- confirm against the installed version.
sr_25 = pd.read_csv('SRSep-Nov17PM2.5-12lines.csv', parse_dates=[['Date Local','24 Hour Local'],['Date GMT','24 Hour GMT']], index_col='Date Local_24 Hour Local')

In [174]:
# Show the frame as loaded, before its single-valued columns are stripped out.
sr_25

Unnamed: 0_level_0,Date GMT_24 Hour GMT,Latitude,Longitude,Datum,Horizontal Accuracy,State Code,County Code,Site Num,Parameter Code,POC,...,Day In Year GMT,Sample Measurement,Units of Measure,Sample Duration,Sample Frequency,Detection Limit,Measurement Uncertainty,Qualifier Description,Method Type,Method Description
Date Local_24 Hour Local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-09-04 23:00:00,2017-09-05 07:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,248.0,26.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...
2017-09-04 22:00:00,2017-09-05 06:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,248.0,24.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...
2017-09-04 00:00:00,2017-09-04 08:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,247.0,78.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...
2017-09-04 20:00:00,2017-09-05 04:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,248.0,22.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...
2017-09-04 19:00:00,2017-09-05 03:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,248.0,32.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...
2017-09-04 21:00:00,2017-09-05 05:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,248.0,22.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...
2017-09-04 18:00:00,2017-09-05 02:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,248.0,42.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...
2017-09-04 17:00:00,2017-09-05 01:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,248.0,43.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...
2017-09-04 15:00:00,2017-09-04 23:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,247.0,56.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...
2017-09-04 14:00:00,2017-09-04 22:00:00,38.403765,-122.818294,WGS84,2.0,6.0,97.0,4.0,88101.0,3.0,...,247.0,53.0,Micrograms/cubic meter (LC),1 HOUR,,5.0,,Wildfire-U. S.,Equivalent,Met One BAM-1020 Mass Monitor w/VSCC-Beta Atte...


In [175]:
# Name the frame (strip_meta reads this attribute), strip its single-valued
# columns, and file the resulting dict under the frame's new index name.
sr_25.name = 'sr_25'
sr_25, sr_25_meta = strip_meta(sr_25)
meta_meta_dict[sr_25.index.name] = sr_25_meta

# For this particular data set, keep only the [min, max] range of each GMT
# column in the metadata, then remove those columns from the frame.
gmt_cols = ['Date GMT_24 Hour GMT', 'Day In Year GMT']
for gmt_col in gmt_cols:
    sr_25_meta[gmt_col] = [sr_25[gmt_col].min(), sr_25[gmt_col].max()]
sr_25.drop(columns=gmt_cols, inplace=True)

# Display the results: the slimmed-down frame and the metadata collected for it.
display(sr_25)
print(sr_25_meta)

Unnamed: 0_level_0,Sample Measurement
sr_25_meta,Unnamed: 1_level_1
2017-09-04 23:00:00,26.0
2017-09-04 22:00:00,24.0
2017-09-04 00:00:00,78.0
2017-09-04 20:00:00,22.0
2017-09-04 19:00:00,32.0
2017-09-04 21:00:00,22.0
2017-09-04 18:00:00,42.0
2017-09-04 17:00:00,43.0
2017-09-04 15:00:00,56.0
2017-09-04 14:00:00,53.0


{'index.name': 'Date Local_24 Hour Local', 'Latitude': 38.403765, 'Longitude': -122.818294, 'Datum': 'WGS84', 'Horizontal Accuracy': 2.0, 'State Code': 6.0, 'County Code': 97.0, 'Site Num': 4.0, 'Parameter Code': 88101.0, 'POC': 3.0, 'AQS Parameter Desc': 'PM2.5 - Local Conditions', 'Year GMT': 2017.0, 'Units of Measure': 'Micrograms/cubic meter (LC)', 'Sample Duration': '1 HOUR', 'Sample Frequency': nan, 'Detection Limit': 5.0, 'Measurement Uncertainty': nan, 'Qualifier Description': 'Wildfire-U. S.', 'Method Type': 'Equivalent', 'Method Description': 'Met One BAM-1020 Mass Monitor w/VSCC-Beta Attenuation', 'Date GMT_24 Hour GMT': [Timestamp('2017-09-04 08:00:00'), Timestamp('2017-09-05 07:00:00')], 'Day In Year GMT': [247.0, 248.0]}
