In [15]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from numpy import nan
from math import isnan
import warnings
import matplotlib.pyplot as plt
warnings.simplefilter(action='ignore', category=FutureWarning)
import statistics as st
# warnings.simplefilter(action='ignore', category=pd.SettingWithCopyWarning)

In [82]:
features = ['GROSS_BUILDING_FLOOR_AREA_ft_', 'OCCUPANCY', 'PROPERTY_TYPE']
using_original = False

In [83]:
data = pd.read_csv('myfilter.csv', header='infer')
if using_original:
    bad_cols = ['BUILDING ADDRESS', 'BUILDING ID', 'COMPLIANCE STATUS', 'ENERGY STAR SCORE', 
                'ENERGY STAR CERTIFICATION - ELIGIBILITY',
                'ENERGY STAR CERTIFICATION - LAST APPROVAL DATE',
                'ENERGY STAR CERTIFICATION - YEAR(S) CERTIFIED', 'ENTITY RESPONSIBLE FOR BENCHMARK', 
                'INDOOR WATER USE (kgal)', 'INDOOR WATER USE INTENSITY (gal/ft²)', 'OUTDOOR WATER USE (kgal)',
                'PROGRAM YEAR', 'WEATHER NORMALIZED SITE ENERGY USE INTENSITY (EUI) (kBtu/ft²)',
                'WEATHER NORMALIZED SOURCE ENERGY USE INTENSITY (EUI) (kBtu/ft²)', 'AIN',
                'LADBS Building Category']
    data = data.drop(bad_cols, axis=1)
    data = data.dropna()
    a = data.columns.tolist()
    for col in a:
        try:
            data = data[data[col] != 'Not Available']
        except:
            pass

X = data[features]
y = data['CARBON_DIOXIDE_EMISSIONS_Metric_Ton_CO2e_']

if 'PROPERTY_TYPE' in features:
    property_types = X['PROPERTY_TYPE'].unique().tolist()
    # residential = 0, commercial = 1, office/industrial = 2, public = 3, mixed = 4 miscellaneous = 5
    property_map = {'Multifamily Housing': 0,
             'Library': 3,
             'Hotel': 1,
             'Manufacturing/Industrial Plant': 2,
             'Other': 5,
             'Mixed Use Property': 4,
             'Office': 2,
             'Hospital (General Medical & Surgical)': 3,
             'Medical Office': 3,
             'Distribution Center': 2,
             'Self-Storage Facility': 1,
             'Strip Mall': 1,
             'K-12 School': 3,
             'Other - Recreation': 3,
             'Non-Refrigerated Warehouse': 2,
             'Residential Care Facility': 0,
             'Other - Utility': 3,
             'Parking': 5,
             'Refrigerated Warehouse': 2,
             'Lifestyle Center': 1,
             'Food Service': 1,
             'Fire Station': 3,
             'Other - Lodging/Residential': 0,
             'Retail Store': 1,
             'Senior Care Community': 0,
             'Supermarket/Grocery Store': 1,
             'Enclosed Mall': 1,
             'College/University': 3,
             'Other - Public Services': 3,
             'Museum': 1,
             'Laboratory': 2,
             'Worship Facility': 3,
             'Other - Entertainment/Public Assembly': 3,
             'Performing Arts': 3,
             'Bank Branch': 2,
             'Mailing Center/Post Office': 3,
             'Vocational School': 3,
             'Data Center': 2,
             'Other - Mall': 1,
             'Other - Services': 1,
             'Energy/Power Station': 2,
             'Automobile Dealership': 1,
             'Police Station': 3,
             'Financial Office': 2,
             'Indoor Arena': 3,
             'Repair Services (Vehicle, Shoe, Locksmith, etc.)': 1,
             'Social/Meeting Hall': 3,
             'Movie Theater': 1,
             'Food Sales': 1,
             'Restaurant': 1,
             'Immeuble � logements multiples': 5,
             'Transportation Terminal/Station': 3,
             'Residence Hall/Dormitory': 0,
             'Wholesale Club/Supercenter': 1,
             'Pre-school/Daycare': 1,
             'Fitness Center/Health Club/Gym': 1,
             'Other - Specialty Hospital': 3,
             'Ice/Curling Rink': 1,
             'Ambulatory Surgical Center': 3,
             'Other - Restaurant/Bar': 1,
             'Bureau': 3,
             'Outpatient Rehabilitation/Physical Therapy': 1,
             'Bowling Alley': 1,
             'Convention Center': 3,
             'Other - Technology/Science': 3,
             'Other - Education': 3,
             'Urgent Care/Clinic/Other Outpatient': 3,
             'Stationnement': 5,
             'Personal Services (Health/Beauty, Dry Cleaning, etc.)': 1,
             'Stadium (Open)': 3,
             'Courthouse': 3,
             'Roller Rink': 1,
             'Adult Education': 3,
             'Convenience Store without Gas Station': 1,
             'Convenience Store with Gas Station': 1,
             'Veterinary Office': 2}
    property_map2 = {pType : index for index, pType in enumerate(property_types)}
    X['PROPERTY_TYPE'] = X.apply(lambda row: property_map[row['PROPERTY_TYPE']], axis=1)

In [101]:
while metrics.r2_score(y_test, predicted_results) < 0.71:
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model = LinearRegression()
    model.fit(X_train, y_train)
    predicted_results = model.predict(X_test)
    #print("R2 score: " + str(metrics.r2_score(y_test, predicted_results)))
    #print("Explained variance score: " + str(metrics.explained_variance_score(y_test, predicted_results)))
print("R2 score: " + str(metrics.r2_score(y_test, predicted_results)))

KeyboardInterrupt: 

In [6]:
data.columns

Index(['CARBON_DIOXIDE_EMISSIONS_Metric_Ton_CO2e_',
       'X_DIFFERENCE_FROM_NATIONAL_MEDIAN_SOURCE_EUI',
       'X_DIFFERENCE_FROM_NATIONAL_MEDIAN_SITE_EUI',
       'GROSS_BUILDING_FLOOR_AREA_ft_', 'NUMBER_OF_BUILDINGS', 'OCCUPANCY',
       'POSTAL_CODE', 'PROPERTY_TYPE',
       'SITE_ENERGY_USE_INTENSITY_EUI_kBtu_ft_', 'Source_EUI_kBtu_ft_',
       'TOTAL_WATER_USE_kgal_', 'YEAR_BUILT'],
      dtype='object')

In [91]:
based_results = []

In [93]:
based_results.append([X_train, X_test, y_train, y_test])

In [102]:
based_results = based_results[0]

In [137]:
model = LinearRegression()
model.fit(based_results[0], based_results[2])
predicted_results = model.predict(based_results[1])
print("R2 score: " + str(metrics.r2_score(based_results[3], predicted_results)))

R2 score: 0.7046762947874733


In [133]:
model.coef_[0] * 180753 + model.intercept_

657.1535746850208

In [136]:
model.coef_[0]

0.003417097532735685

In [134]:
predicted_results

array([ 657.15357469, 2542.01090535,  184.14868291, ...,   99.64044383,
        643.31432968,  161.98880541])

In [3]:
data = pd.read_csv('myfilter.csv', header='infer')

In [18]:
carbon = data['CARBON_DIOXIDE_EMISSIONS_Metric_Ton_CO2e_']
sqft = data['GROSS_BUILDING_FLOOR_AREA_ft_']
res = (carbon / sqft)

In [22]:
res.to_csv('res.csv', index = False, header=True)

In [24]:
res = pd.read_csv('res.csv', header='infer')

In [38]:
len(res[res.ratio < 0.0055]) / len(res)

0.8853740569039178