# Create dictionaries for Streamlit app data

Use the `lewagon` pyenv environment, just in case.

In [1]:
import pandas as pd


---

# original data


In [2]:
# Load the RECS 2020 public-use CSV; a later cell reads its state_name and REGIONC columns.
raw = pd.read_csv('../raw_data/recs2020_public_v6.csv')


In [3]:
# Hand-picked subset of features (CSV exported from an .xls file kept on Google Drive).
# index_col=0: the first CSV column becomes the row index.
selected = pd.read_csv('../csv/selected_features.csv', index_col=0)


In [4]:
# Precompute two lookups keyed by variable name:
#   label_dict  -> the 'Description and Labels' text of each variable
#   values_dict -> the raw 'Response Codes' text of each variable
key_col = ['Variable']
val_cols = ['Description and Labels', 'Response Codes']
label_dict, values_dict = (
    selected.set_index(key_col)[column].to_dict() for column in val_cols
)


In [5]:
# Distinct section names of the selected features, in order of first appearance.
sections = selected['Section'].unique().tolist()

print(sections)


['GEOGRAPHY', 'ADMIN', 'YOUR HOME', 'APPLIANCES', 'ELECTRONICS', 'SPACE HEATING', 'AIR CONDITIONING', 'LIGHTING', 'ENERGY BILLS', 'HOUSEHOLD CHARACTERISTICS']


In [10]:
# %pip install openpyxl  # uncomment if openpyxl (used by pd.read_excel for .xlsx files) is not installed


In [9]:
# Read the RECS 2020 codebook workbook. header=1 takes the column labels from the
# second row of the sheet (row index 1) instead of the first.
codebook = pd.read_excel(
    '../raw_data/RECS 2020 Codebook for Public File - v6.xlsx',
    header=1,
)
codebook.head()


Unnamed: 0,Variable,Type,Description and Labels,Response Codes,Section
0,DOEID,Num,Unique identifier for each respondent,100001-118496,ADMIN
1,REGIONC,Char,Census Region,Midwest\nNortheast\nSouth\nWest,GEOGRAPHY
2,DIVISION,Char,"Census Division, Mountain Division is divided ...",East North Central\nEast South Central\nMiddle...,GEOGRAPHY
3,STATE_FIPS,Char,State Federal Information Processing System Code,state_dictionary!A1,GEOGRAPHY
4,state_postal,Char,State Postal Code,state_dictionary!A1,GEOGRAPHY


In [20]:
# Which variables belong to which section: one row per section, with that section's
# variable names gathered into a list. Despite its name, `section_dict` is a DataFrame;
# it is turned into an actual dict in a later cell.
section_dict = (
    codebook
    .groupby('Section')['Variable']
    .agg(list)
    .reset_index()
)
section_dict.head()


Unnamed: 0,Section,Variable
0,ADMIN,"[DOEID, BA_climate, IECC_climate_code, UATYP10]"
1,AIR CONDITIONING,"[AIRCOND, COOLAPT, ACEQUIPM_PUB, ACEQUIPAGE, A..."
2,APPLIANCES,"[NUMFRIG, SIZRFRI1, TYPERFR1, AGERFRI1, ICE, S..."
3,ELECTRONICS,"[TVCOLOR, TVSIZE1, TVTYPE1, TVUSE1, TVONWD1, T..."
4,ENERGY ASSISTANCE,"[SCALEB, SCALEG, SCALEE, PAYHELP, NOHEATBROKE,..."


#### Which features belong to each section

In [29]:
# Section name -> list of the variable names in that section.
dict(zip(section_dict['Section'], section_dict['Variable']))


{'ADMIN': ['DOEID', 'BA_climate', 'IECC_climate_code', 'UATYP10'],
 'AIR CONDITIONING': ['AIRCOND',
  'COOLAPT',
  'ACEQUIPM_PUB',
  'ACEQUIPAGE',
  'ACEQUIPAUXTYPE_PUB',
  'NUMDLHPAC',
  'NUMWWAC',
  'NUMPORTAC',
  'BASECOOL',
  'ATTCCOOL',
  'GARGCOOL',
  'NUMCFAN',
  'NUMFLOORFAN',
  'USECFAN',
  'HOUSEFAN',
  'ATTICFAN',
  'DEHUMTYPE',
  'NUMPORTDEHUM',
  'USEDEHUM',
  'ELCOOL',
  'ZACEQUIPAGE',
  'ZAIRCOND',
  'ZATTCCOOL',
  'ZATTICFAN',
  'ZBASECOOL',
  'ZCOOLAPT',
  'ZCOOLCNTL',
  'ZDEHUMTYPE',
  'ZGARGCOOL',
  'ZHOUSEFAN',
  'ZNUMCFAN',
  'ZNUMDLHPAC',
  'ZNUMFLOORFAN',
  'ZNUMPORTAC',
  'ZNUMPORTDEHUM',
  'ZNUMWWAC',
  'ZUSECFAN',
  'ZUSEDEHUM',
  'ZACEQUIPM_PUB',
  'ZACEQUIPAUXTYPE_PUB'],
 'APPLIANCES': ['NUMFRIG',
  'SIZRFRI1',
  'TYPERFR1',
  'AGERFRI1',
  'ICE',
  'SIZRFRI2',
  'TYPERFR2',
  'AGERFRI2',
  'LOCRFRI2',
  'WINECHILL',
  'NUMFREEZ',
  'UPRTFRZR',
  'SIZFREEZ',
  'FREEZER',
  'AGEFRZR',
  'RANGE',
  'COOKTOP',
  'OVEN',
  'RANGEFUEL',
  'RANGEINDT',
  'RCOOKUSE

---
## Define feature types

In [12]:
# Purely selectbox features: character-typed variables whose response codes span
# several lines (one option per line). The list keeps its historical name
# `checkbox_features`.
# REGIONC is left out; the notebook maps region from state in a later cell.
# Filtering REGIONC out (instead of list.remove) means the cell no longer raises
# ValueError when REGIONC is absent from the selection.
checkbox_features = (
    selected.loc[
        # na=False: a missing response code simply counts as "not multi-line".
        selected['Response Codes'].str.contains('\n', regex=False, na=False)
        & selected['Type'].eq('Char'),
        'Variable',
    ]
    .loc[lambda variables: variables != 'REGIONC']
    .to_list()
)
checkbox_features


['BA_climate', 'IECC_climate_code']

In [13]:
# Numeric selectbox features: numeric-typed variables whose response codes span
# several lines. The list keeps its historical name `num_checkbox_features`.
num_checkbox_features = selected.loc[
    selected['Response Codes'].str.contains('\n') & selected['Type'].eq('Num'),
    'Variable',
].to_list()
print(num_checkbox_features)


['TYPEHUQ', 'STORIES', 'YEARMADERANGE', 'WALLTYPE', 'ROOFTYPE', 'WINDOWS', 'SWIMPOOL', 'DISHWASH', 'CWASHER', 'DRYER', 'TELLWORK', 'TELLDAYS', 'HEATHOME', 'EQUIPM', 'NUMPORTEL', 'AIRCOND', 'NUMPORTAC', 'SMARTMETER', 'SOLAR']


In [14]:
# Purely numeric features: the response codes consist of one "<low> - <high>" range
# and nothing after it.
# TODO extract range and put reasonable defaults
numeric_features = selected.loc[
    # Raw string: keeps `\s` a regex token rather than an invalid string escape.
    # na=False: a missing response code counts as "no match", so the mask stays boolean.
    selected['Response Codes'].str.match(r'[0-9]+\s*-\s*[0-9]+$', na=False),
    'Variable',
].to_list()
print(numeric_features)


['NCOMBATH', 'NHAFBATH', 'TOTROOMS', 'NUMFRIG', 'MICRO', 'TVCOLOR', 'DESKTOP', 'NUMLAPTOP', 'LGTIN1TO4', 'LGTIN4TO8', 'LGTINMORE8', 'NHSLDMEM', 'SQFTEST']


In [15]:
# Features which are numeric but are entered through a dropdown selectbox: the
# response codes start with a "<low> - <high>" range that is followed by more lines.
numeric_features_dropdown = selected.loc[
    # Raw string: keeps `\s` a regex token rather than an invalid string escape.
    # na=False: a missing response code counts as "no match", so the mask stays boolean.
    selected['Response Codes'].str.match(r'[0-9]+\s*-\s*[0-9]+\n', na=False),
    'Variable',
].to_list()
numeric_features_dropdown


['TELLDAYS', 'NUMPORTEL', 'NUMPORTAC']

---
### create mappings for numeric selectbox features

In [16]:
# Build {feature: {display text: response code}} lookups for the selectbox inputs.
# Codes are kept as strings, exactly as they appear in the response-code text.
mapped_features = {}

# Coded features: every line of the response codes reads "<code> <label>".
# Indexing values_dict directly gives a clear KeyError for an unknown feature.
for feature in num_checkbox_features:
    code_by_label = {}
    for line in values_dict[feature].split('\n'):
        code, label = line.split(' ', 1)
        # Strip the label so stray trailing blanks in the source text do not end up
        # in the option names.
        code_by_label[label.strip()] = code
    mapped_features[feature] = code_by_label

# Range features: the first line is "<low> - <high>", the remaining lines are
# "<code> <label>" entries. These overwrite the entries written by the loop above
# for features that appear in both lists.
for feature in numeric_features_dropdown:
    str_range, *coded_lines = values_dict[feature].split('\n')
    # Splitting on the bare hyphen accepts "0 - 7" as well as "0-7"; int() ignores
    # the surrounding blanks.
    upper = int(str_range.split('-')[-1])
    # NOTE(review): options always start at 0, even if the written range starts
    # higher - confirm this is intended.
    d = {str(k): str(k) for k in range(upper + 1)}
    for line in coded_lines:
        if not line.strip():
            continue  # tolerate blank lines in the response-code text
        code, label = line.split(' ', 1)
        d[label.strip()] = code
    mapped_features[feature] = d


In [35]:
mapped_features # direction: from display text to response code (codes are stored as strings)


{'TYPEHUQ': {'Mobile home': '1',
  'Single-family house detached from any other house ': '2',
  'Single-family house attached to one or more other houses (for example: duplex, row house, or townhome)': '3',
  'Apartment in a building with 2 to 4 units': '4',
  'Apartment in a building with 5 or more units': '5'},
 'STORIES': {'One story': '1',
  'Two stories': '2',
  'Three stories': '3',
  'Four or more stories': '4',
  'Split-level': '5',
  'Not applicable': '-2'},
 'YEARMADERANGE': {'Before 1950': '1',
  '1950 to 1959': '2',
  '1960 to 1969': '3',
  '1970 to 1979': '4',
  '1980 to 1989': '5',
  '1990 to 1999': '6',
  '2000 to 2009': '7',
  '2010 to 2015': '8',
  '2016 to 2020': '9'},
 'WALLTYPE': {'Brick': '1',
  'Wood': '2',
  'Siding (aluminum, fiber cement, vinyl, or steel) ': '3',
  'Stucco': '4',
  'Shingle (composition)': '5',
  'Stone ': '6',
  'Concrete block ': '7',
  'Other': '99'},
 'ROOFTYPE': {'Ceramic or clay tiles': '1',
  'Wood shingles/shakes': '2',
  'Metal': '3',


---

## create state dictionary

In [19]:
# State lookup table; its state_postal and state_name columns are used in the cells below.
state_dictionary = pd.read_csv('../csv/state_dictionary.csv')
state_dictionary.head(1)


Unnamed: 0,STATE_FIPS,state_postal,state_name
0,1,AL,Alabama


In [20]:
# Postal code -> state name lookup, built by pairing the two columns directly.
# Alternative spelling of the same lookup: df.set_index(KEY).to_dict()[VALUE]
# Discussion of both idioms:
# https://stackoverflow.com/questions/17426292/how-to-create-a-dictionary-of-two-pandas-dataframe-columns#17426500
dict(zip(state_dictionary['state_postal'], state_dictionary['state_name']))


{'AL': 'Alabama',
 'AK': 'Alaska',
 'AZ': 'Arizona',
 'AR': 'Arkansas',
 'CA': 'California',
 'CO': 'Colorado',
 'CT': 'Connecticut',
 'DE': 'Delaware',
 'DC': 'District of Columbia',
 'FL': 'Florida',
 'GA': 'Georgia',
 'HI': 'Hawaii',
 'ID': 'Idaho',
 'IL': 'Illinois',
 'IN': 'Indiana',
 'IA': 'Iowa',
 'KS': 'Kansas',
 'KY': 'Kentucky',
 'LA': 'Louisiana',
 'ME': 'Maine',
 'MD': 'Maryland',
 'MA': 'Massachusetts',
 'MI': 'Michigan',
 'MN': 'Minnesota',
 'MS': 'Mississippi',
 'MO': 'Missouri',
 'MT': 'Montana',
 'NE': 'Nebraska',
 'NV': 'Nevada',
 'NH': 'New Hampshire',
 'NJ': 'New Jersey',
 'NM': 'New Mexico',
 'NY': 'New York',
 'NC': 'North Carolina',
 'ND': 'North Dakota',
 'OH': 'Ohio',
 'OK': 'Oklahoma',
 'OR': 'Oregon',
 'PA': 'Pennsylvania',
 'RI': 'Rhode Island',
 'SC': 'South Carolina',
 'SD': 'South Dakota',
 'TN': 'Tennessee',
 'TX': 'Texas',
 'UT': 'Utah',
 'VT': 'Vermont',
 'VA': 'Virginia',
 'WA': 'Washington',
 'WV': 'West Virginia',
 'WI': 'Wisconsin',
 'WY': 'Wyoming

In [21]:
# Postal code -> state name lookup, obtained by indexing the table on the postal code.
(
    state_dictionary
    .set_index('state_postal')
    .loc[:, 'state_name']
    .to_dict()
)


{'AL': 'Alabama',
 'AK': 'Alaska',
 'AZ': 'Arizona',
 'AR': 'Arkansas',
 'CA': 'California',
 'CO': 'Colorado',
 'CT': 'Connecticut',
 'DE': 'Delaware',
 'DC': 'District of Columbia',
 'FL': 'Florida',
 'GA': 'Georgia',
 'HI': 'Hawaii',
 'ID': 'Idaho',
 'IL': 'Illinois',
 'IN': 'Indiana',
 'IA': 'Iowa',
 'KS': 'Kansas',
 'KY': 'Kentucky',
 'LA': 'Louisiana',
 'ME': 'Maine',
 'MD': 'Maryland',
 'MA': 'Massachusetts',
 'MI': 'Michigan',
 'MN': 'Minnesota',
 'MS': 'Mississippi',
 'MO': 'Missouri',
 'MT': 'Montana',
 'NE': 'Nebraska',
 'NV': 'Nevada',
 'NH': 'New Hampshire',
 'NJ': 'New Jersey',
 'NM': 'New Mexico',
 'NY': 'New York',
 'NC': 'North Carolina',
 'ND': 'North Dakota',
 'OH': 'Ohio',
 'OK': 'Oklahoma',
 'OR': 'Oregon',
 'PA': 'Pennsylvania',
 'RI': 'Rhode Island',
 'SC': 'South Carolina',
 'SD': 'South Dakota',
 'TN': 'Tennessee',
 'TX': 'Texas',
 'UT': 'Utah',
 'VT': 'Vermont',
 'VA': 'Virginia',
 'WA': 'Washington',
 'WV': 'West Virginia',
 'WI': 'Wisconsin',
 'WY': 'Wyoming

In [22]:
# State name -> census region, taken from the distinct (state_name, REGIONC) pairs
# found in the raw survey data.
(
    raw
    .drop_duplicates(subset=['state_name', 'REGIONC'])
    .set_index('state_name')['REGIONC']
    .to_dict()
)


{'New Mexico': 'WEST',
 'Arkansas': 'SOUTH',
 'South Carolina': 'SOUTH',
 'New Jersey': 'NORTHEAST',
 'Texas': 'SOUTH',
 'Oklahoma': 'SOUTH',
 'Mississippi': 'SOUTH',
 'District of Columbia': 'SOUTH',
 'Arizona': 'WEST',
 'California': 'WEST',
 'Louisiana': 'SOUTH',
 'Minnesota': 'MIDWEST',
 'Vermont': 'NORTHEAST',
 'Rhode Island': 'NORTHEAST',
 'Illinois': 'MIDWEST',
 'Maine': 'NORTHEAST',
 'South Dakota': 'MIDWEST',
 'Massachusetts': 'NORTHEAST',
 'Florida': 'SOUTH',
 'Ohio': 'MIDWEST',
 'Nebraska': 'MIDWEST',
 'Virginia': 'SOUTH',
 'Wyoming': 'WEST',
 'Pennsylvania': 'NORTHEAST',
 'Hawaii': 'WEST',
 'New Hampshire': 'NORTHEAST',
 'Michigan': 'MIDWEST',
 'Maryland': 'SOUTH',
 'New York': 'NORTHEAST',
 'Colorado': 'WEST',
 'North Carolina': 'SOUTH',
 'Kentucky': 'SOUTH',
 'North Dakota': 'MIDWEST',
 'Georgia': 'SOUTH',
 'West Virginia': 'SOUTH',
 'Oregon': 'WEST',
 'Missouri': 'MIDWEST',
 'Utah': 'WEST',
 'Connecticut': 'NORTHEAST',
 'Tennessee': 'SOUTH',
 'Wisconsin': 'MIDWEST',
 'Id

---

## final dataframe for testing

In [None]:
# Minimal one-row frame with two placeholder feature columns, used for testing.
X_new = pd.DataFrame({'feat1': [1], 'feat2': [-2]})
X_new


Unnamed: 0,feat1,feat2
0,1,-2
