In [1]:
# Pandas gives you the ability to work with DataFrames, which are similar to Excel spreadsheets.

import pandas as pd

# Numpy gives you the ability to work with vector math and is very fast.
import numpy as np
from random import sample

# Matplotlib is for graphing
import matplotlib.pyplot as plt

# Allows you to plot inline within the IPython Notebook in the browser
%matplotlib inline

In [2]:
# Load the BEES ratings export (first row is the header) and preview the
# first rows to check that the column names were picked up correctly.
# NOTE(review): the path is absolute and machine-specific; it must be
# adjusted before the notebook can run anywhere else.

ratings_csv_path = r"C:\Users\dustin\Desktop\Indoor Air Quality\IAQ - Fairbanks Borough BEES Ratings CSV.csv"
df = pd.read_csv(ratings_csv_path, header=0)
df.head()

Unnamed: 0,ProjectID,LocationID,City,Address,RatingPoints,FloorArea,RatingType,HouseType,FileName,YearBuilt,CFM50,EnVolumeExpression,ENACH,Bedrooms,FloorArea.1,VentType,YearBuilt.1,MetadataID,RegionName,ANCSARegionId
0,10430,39332,FAIRBANKS,2545 ALLEN ADALE RD,93.2,4650,BEES,Single_Family,Mark Greer re-created 10430.hm2,2008,,64306,1.48,3,4650,HRV,2008,7894,Doyon - Fairbanks North Star Borough only,22
1,14092,38590,FAIRBANKS,1260 LINZ DR,84.6,1656,BEES,Single_Family,C:\Domains\akrebate.com\wwwroot\AkWarmFiles\FO...,2009,817.931641,21715~,2.26,4,1656,HRV,2009,14788,Doyon - Fairbanks North Star Borough only,22
2,14234,38790,NORTH POLE,669 5TH AVE,92.0,2594,BEES,Single_Family,C:\Domains\akrebate.com\wwwroot\AkWarmFiles\DR...,2009,789.975952,33616~,1.41,4,2594,HRV,2009,14562,Doyon - Fairbanks North Star Borough only,22
3,14235,39427,NORTH POLE,659 W 5TH AVE,92.5,4090,BEES,Single_Family,C:\Domains\akrebate.com\wwwroot\AkWarmFiles\DR...,2009,820.8384,44773~,1.1,5,4090,HRV,2009,13382,Doyon - Fairbanks North Star Borough only,22
4,15975,40274,FAIRBANKS,2873 ALDERBERRY TRL,93.3,2731,BEES,Single_Family,C:\Domains\akrebate.com\wwwroot\AkWarmFiles\CH...,2009,584.0,25526~,1.372718,3,2731,HRV,2009,16682,Doyon - Fairbanks North Star Borough only,22


In [16]:
# List the distinct HouseType values present in the ratings table.
df['HouseType'].unique()

array(['Single_Family'], dtype=object)

In [17]:
# List the distinct RatingType values present in the ratings table.
df['RatingType'].unique()

array(['BEES'], dtype=object)

In [21]:
# List the distinct bedroom counts present in the ratings table.
df['Bedrooms'].unique()

array([3, 4, 5, 2, 1, 7, 6], dtype=int64)

In [4]:
# Show the dtype pandas inferred for every column. FloorArea comes back as
# object (text) rather than a number, so it has to be parsed before use.
df.dtypes

ProjectID               int64
LocationID              int64
City                   object
Address                object
RatingPoints          float64
FloorArea              object
RatingType             object
HouseType              object
FileName               object
YearBuilt               int64
CFM50                 float64
EnVolumeExpression     object
ENACH                 float64
Bedrooms                int64
FloorArea.1            object
VentType               object
YearBuilt.1             int64
MetadataID              int64
RegionName             object
ANCSARegionId           int64
dtype: object

In [5]:
# (rows, columns) of the loaded ratings table.
df.shape

(728, 20)

In [6]:
# Display the RatingPoints column (the middle of the Series is elided in the output).
df['RatingPoints']

0      93.2
1      84.6
2      92.0
3      92.5
4      93.3
5      92.2
6      92.5
7      92.2
8      90.2
9      93.7
10     93.5
11     93.3
12     89.7
13     92.3
14     93.9
15     92.5
16     92.1
17     95.2
18     92.0
19     92.1
20     93.3
21     92.8
22     92.4
23     93.0
24     92.4
25     92.0
26     93.5
27     92.6
28     83.2
29     93.6
       ... 
698    96.3
699    97.5
700    91.3
701    96.0
702    93.1
703    96.4
704    92.0
705    96.0
706    94.4
707    77.4
708    92.8
709    94.4
710    92.7
711    96.7
712    97.4
713    95.5
714    96.0
715    86.8
716    98.0
717    95.4
718    95.6
719    95.8
720    96.2
721    93.3
722    83.2
723    95.7
724    96.2
725    92.7
726    95.6
727    96.7
Name: RatingPoints, dtype: float64

In [7]:
# Average RatingPoints over every row of the ratings table.
rating_mean = df['RatingPoints'].mean()
rating_mean

92.58173076923077

In [8]:
# Summary statistics for YearBuilt.
# NOTE(review): the reported max of 20115 is not a plausible year and looks
# like a data-entry error (TODO confirm against the source record). It pulls
# the mean (~2060) and std (~948) far away from the 2008-2012 quartiles, so
# YearBuilt averages should not be trusted until that value is corrected.
df['YearBuilt'].describe()

count      728.000000
mean      2060.502747
std        948.171310
min       2008.000000
25%       2009.000000
50%       2010.000000
75%       2012.000000
max      20115.000000
Name: YearBuilt, dtype: float64

In [23]:
# Keep only the columns used for the per-city / per-bedroom summaries below.
shortened_df = df[['City', 'YearBuilt', 'ENACH', 'Bedrooms']]
shortened_df.head()

Unnamed: 0,City,YearBuilt,ENACH,Bedrooms
0,FAIRBANKS,2008,1.48,3
1,FAIRBANKS,2009,2.26,4
2,NORTH POLE,2009,1.41,4
3,NORTH POLE,2009,1.1,5
4,FAIRBANKS,2009,1.372718,3


In [14]:
# Per-city column means.
# NOTE(review): the recorded output has no Bedrooms column, so it appears to
# predate the current definition of shortened_df (execution count 14 vs. 23);
# re-running should add a Bedrooms mean. YearBuilt means above the 2008-2012
# range are presumably skewed by the implausible YearBuilt value (max 20115).
shortened_df.groupby(['City']).mean()

Unnamed: 0_level_0,YearBuilt,ENACH
City,Unnamed: 1_level_1,Unnamed: 2_level_1
FAIRBANKS,2053.598592,1.463573
NORTH POLE,2071.87415,1.459774
SALCHA,2010.142857,1.626745
TWO RIVERS,2011.0,1.658187


In [15]:
# Number of non-null values per column for each city.
# NOTE(review): like the per-city means, the recorded output predates the
# addition of Bedrooms to shortened_df (execution count 15 vs. 23).
shortened_df.groupby(['City']).count()

Unnamed: 0_level_0,YearBuilt,ENACH
City,Unnamed: 1_level_1,Unnamed: 2_level_1
FAIRBANKS,426,426
NORTH POLE,294,294
SALCHA,7,7
TWO RIVERS,1,1


In [24]:
# Column means for each (City, Bedrooms) combination.
# NOTE(review): YearBuilt means far outside 2008-2012 (e.g. ~2080 and ~2681 in
# the recorded output) presumably come from mistyped YearBuilt values — TODO
# confirm and correct the source data.
shortened_df.groupby(['City', 'Bedrooms']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,YearBuilt,ENACH
City,Bedrooms,Unnamed: 2_level_1,Unnamed: 3_level_1
FAIRBANKS,1,2011.25,1.580888
FAIRBANKS,2,2010.979592,1.738857
FAIRBANKS,3,2080.030418,1.376326
FAIRBANKS,4,2010.75,1.438959
FAIRBANKS,5,2011.625,1.038192
FAIRBANKS,7,2008.0,0.51
NORTH POLE,1,2011.111111,2.361362
NORTH POLE,2,2680.888889,1.808797
NORTH POLE,3,2010.275556,1.40629
NORTH POLE,4,2010.321429,1.305329


In [26]:
# Pull out just the unique project ID and the raw floor-area text, casting
# both to strings so the area expressions can be parsed as text.
# A garage-area expression column may need to be added here, depending on the application.

area_columns = ['ProjectID', 'FloorArea']
sqft = df[area_columns].astype(str)
sqft.head()

Unnamed: 0,ProjectID,FloorArea
0,10430,4650
1,14092,1656
2,14234,2594
3,14235,4090
4,15975,2731


In [27]:
# Cleans a series of AkWarm floor area expressions so each piece can be evaluated numerically
def _clean_akwarm_term(term):
    """Normalise one '~'-separated piece of an area expression.

    Parameters
    ----------
    term : str
        Raw text of a single sub-expression.

    Returns
    -------
    str or int
        The cleaned expression text with all spaces removed, or the integer 0
        when nothing evaluable is left (empty or whitespace-only text, or text
        that contains 'nan').
    """
    import re  # local import: the notebook's import cell does not load ``re``

    # Ignore all foot indicators by turning them into separators.
    for foot_marker in ("'", 'foot', 'feet'):
        term = term.replace(foot_marker, ' ')

    # Convert inch text to the inch symbol, then close any gap left in front of
    # the symbol so that '6 inches' is recognised the same way as '6"'.
    for inch_word in ('inches', 'inch'):
        term = term.replace(inch_word, '"')
    term = re.sub(' +"', '"', term)

    # Remove labels at the beginning of expressions if they end with a ':'.
    term = re.sub('.*:', ' ', term)

    # Convert inches into decimal feet. Note: only whole and half inches are
    # supported -- other decimal inches, e.g. 1.25", are left untouched.
    # The leading space in each pattern is what separates feet from inches.
    inches = [' 1"', ' 2"', ' 3"', ' 4"', ' 5"', ' 6"', ' 7"', ' 8"', ' 9"', ' 10"', ' 11"']
    decimals = ['.083333', '.166667', '.25', '.333333', '.416667', '.5', '.583333', '.666667', '.75', '.833333', '.9166667']
    half_inches = [' 0.5"', ' 1.5"', ' 2.5"', ' 3.5"', ' 4.5"', ' 5.5"', ' 6.5"', ' 7.5"', ' 8.5"', ' 9.5"', ' 10.5"', ' 11.5"']
    half_decimals = ['.041667', '.125', '.2083333', '.291667', '.375', '.458333', '.541667', '.625', '.708333', '.791667', '.875', '.958333']
    # Whole inches first, then half inches; patterns are matched literally so
    # the '.' in a half-inch pattern cannot match an arbitrary character.
    conversions = list(zip(inches, decimals)) + list(zip(half_inches, half_decimals))
    for inch_text, decimal_text in conversions:
        term = term.replace(inch_text, decimal_text)

    term = term.replace(' ', '')

    # Replace all empty, 'nan' and whitespace-only cells with 0.
    if term == '' or 'nan' in term or term.isspace():
        return 0
    return term


def cleanExpression(expression_series):
    """Split and clean AkWarm area expressions ready for evaluation.

    Each value is split on '~' into sub-expressions, and every sub-expression
    is stripped of foot markers and leading labels, has its inch notation
    converted to decimal feet, and has its spaces removed.

    Parameters
    ----------
    expression_series : pandas.Series
        One raw expression per row. Missing values are treated as an empty
        expression; other non-string values are converted with ``str``.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with one row per input value (carrying the input's
        index, so results can be assigned back to the source frame) and one
        column per sub-expression position. Cells hold cleaned expression
        text, or the integer 0 where there is nothing to evaluate.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    cleaned_rows = []
    for raw in expression_series:
        if isinstance(raw, string_types):
            terms = raw.split('~')
        elif pd.isnull(raw):
            terms = []
        else:
            terms = str(raw).split('~')
        # A missing value still yields one (zero) cell so the row is kept.
        cleaned_rows.append([_clean_akwarm_term(term) for term in terms] or [0])

    if not cleaned_rows:
        return pd.DataFrame(index=expression_series.index)

    # Pad short rows with 0 so every row has the same number of columns.
    width = max(len(row) for row in cleaned_rows)
    padded_rows = [row + [0] * (width - len(row)) for row in cleaned_rows]
    return pd.DataFrame(padded_rows, index=expression_series.index, dtype=object)

def evaluate_expression(cleaned_expressions):
    """Evaluate every cleaned expression cell and total the results per row.

    Parameters
    ----------
    cleaned_expressions : pandas.DataFrame
        Frame whose cells are arithmetic expression strings or numeric
        placeholders (such as 0), e.g. the output of ``cleanExpression``.

    Returns
    -------
    pandas.Series
        Sum of the evaluated cells of each row, indexed like the input frame.

    Raises
    ------
    Whatever ``pd.eval`` raises for a cell that is not a valid expression.
    """
    def evaluate_cell(cell):
        # Cells are stringified first so numeric placeholders and text
        # expressions go through the same path.
        # NOTE(security): pd.eval interprets the cell text as an expression;
        # only run this on data from a trusted source.
        return pd.eval(str(cell))

    # Column-wise Series.map replaces DataFrame.applymap, which is deprecated
    # in recent pandas releases.
    evaluated_cells = cleaned_expressions.apply(
        lambda column: column.map(evaluate_cell)
    )

    # This sums the expressions for an AkWarm Rating
    evaluated_expression = evaluated_cells.sum(axis=1)

    return evaluated_expression

In [28]:
# Clean the raw floor-area text, evaluate it, and store the per-row total in a
# new EvaluatedFloorArea column on df.
# Make sure the column name passed to cleanExpression matches a column of sqft.

cleaned_sf = cleanExpression(sqft['FloorArea'])
df['EvaluatedFloorArea'] = evaluate_expression(cleaned_sf)

# Only need the below section if evaluating garage area as well.
# It expects sqft to contain a 'GarageArea' column.

#cleaned_garage_sf = cleanExpression(sqft['GarageArea'])
#df['EvaluatedGarageArea'] = evaluate_expression(cleaned_garage_sf)

In [29]:
# Preview the table again to check the new EvaluatedFloorArea column.
df.head()

Unnamed: 0,ProjectID,LocationID,City,Address,RatingPoints,FloorArea,RatingType,HouseType,FileName,YearBuilt,...,EnVolumeExpression,ENACH,Bedrooms,FloorArea.1,VentType,YearBuilt.1,MetadataID,RegionName,ANCSARegionId,EvaluatedFloorArea
0,10430,39332,FAIRBANKS,2545 ALLEN ADALE RD,93.2,4650,BEES,Single_Family,Mark Greer re-created 10430.hm2,2008,...,64306,1.48,3,4650,HRV,2008,7894,Doyon - Fairbanks North Star Borough only,22,4650
1,14092,38590,FAIRBANKS,1260 LINZ DR,84.6,1656,BEES,Single_Family,C:\Domains\akrebate.com\wwwroot\AkWarmFiles\FO...,2009,...,21715~,2.26,4,1656,HRV,2009,14788,Doyon - Fairbanks North Star Borough only,22,1656
2,14234,38790,NORTH POLE,669 5TH AVE,92.0,2594,BEES,Single_Family,C:\Domains\akrebate.com\wwwroot\AkWarmFiles\DR...,2009,...,33616~,1.41,4,2594,HRV,2009,14562,Doyon - Fairbanks North Star Borough only,22,2594
3,14235,39427,NORTH POLE,659 W 5TH AVE,92.5,4090,BEES,Single_Family,C:\Domains\akrebate.com\wwwroot\AkWarmFiles\DR...,2009,...,44773~,1.1,5,4090,HRV,2009,13382,Doyon - Fairbanks North Star Borough only,22,4090
4,15975,40274,FAIRBANKS,2873 ALDERBERRY TRL,93.3,2731,BEES,Single_Family,C:\Domains\akrebate.com\wwwroot\AkWarmFiles\CH...,2009,...,25526~,1.372718,3,2731,HRV,2009,16682,Doyon - Fairbanks North Star Borough only,22,2731


In [30]:
# Make sure to enter a new name for the file each time: an existing file with
# the same name is overwritten. The path is relative, so the file is written
# to the notebook's working directory. With to_csv's defaults the row index is
# written as an extra, unnamed first column.
df.to_csv("IAQ_FAI_BEES_Evaluated_Floor_Area.csv")