In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Label cities by smart-meter share

In [2]:
# Energy carriers; each one names an input folder and is part of the file name.
energy_types = [
    "electricity",
    "gas",
]

# Grid operators whose yearly files are read and stacked together.
companies = [
    "enexis",
    "liander",
    "stedin",
]

# Columns kept from every raw input CSV.
features = [
    "city",
    "delivery_perc",
    "num_connections",
    "perc_of_active_connections",
    "type_conn_perc",
    "annual_consume",
    "annual_consume_lowtarif_perc",
    "smartmeter_perc",
]

# "city" plus the absolute quantities that repartition() sums per city.
real_num_features = [
    "city",
    "num_connections",
    "annual_consume",
    "active_connections",
    "annual_lowtarif_consume",
    "product_electricity",
    "p_type_conn",
    "smartmeter",
]

# Columns (in this order) of the per-city table returned by repartition().
perc_features = [
    "city",
    "active_conn_perc",
    "consume_per_conn",
    "lowtarif_perc",
    "product_perc",
    "p_type_conn_perc",
    "years",
    "smartmeter_perc",
    "true_label",
]

In [3]:
def label_data(label):
    """Map a ratio to a class between 5 (smallest values) and 1 (largest).

    Upper bounds are inclusive:
    <= 0.1 -> 5, <= 0.2 -> 4, <= 0.4 -> 3, <= 0.7 -> 2, otherwise 1.
    A value that compares false against every bound (e.g. NaN) also
    falls through to class 1.
    """
    # (inclusive upper bound, class) pairs, checked in ascending order.
    bands = ((0.1, 5), (0.2, 4), (0.4, 3), (0.7, 2))
    for upper_bound, grade in bands:
        if label <= upper_bound:
            return grade
    return 1

In [4]:
def repartition(data, year=None):
    """Aggregate raw connection rows into one labeled row per city.

    The percentage columns of ``data`` are first converted to absolute
    quantities per row, those quantities are summed per city, and the
    ratios are then recomputed from the city totals.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "city", "delivery_perc", "num_connections",
        "perc_of_active_connections", "type_conn_perc", "annual_consume",
        "annual_consume_lowtarif_perc" and "smartmeter_perc".
        The frame is not modified.
    year : optional
        Value written to the "years" column. When omitted, the module-level
        variable ``year`` is used, which keeps existing ``repartition(data)``
        calls working.

    Returns
    -------
    pandas.DataFrame
        One row per city with the columns listed in ``perc_features``.

    Raises
    ------
    NameError
        If ``year`` is not passed and no global ``year`` is defined.

    Notes
    -----
    A city whose summed ``active_connections`` is 0 yields NaN/inf ratios;
    ``label_data`` is applied to those values unchanged.
    """
    if year is None:
        # Backward compatibility: callers used to rely on a global `year`.
        year = globals().get("year")
        if year is None:
            raise NameError("repartition() needs a `year` argument or a global `year`")

    # Work on a copy so the helper columns do not leak into the caller's frame.
    rows = data.copy()

    # Percentages -> absolute quantities per row. Counts are truncated to
    # whole connections (int64 regardless of platform).
    active = (rows["num_connections"] * rows["perc_of_active_connections"] / 100).astype("int64")
    rows["active_connections"] = active
    rows["annual_lowtarif_consume"] = rows["annual_consume"] * rows["annual_consume_lowtarif_perc"] / 100
    # Share of the annual consumption that is not covered by delivery_perc.
    rows["product_electricity"] = rows["annual_consume"] * (1 - rows["delivery_perc"] / 100)
    rows["p_type_conn"] = (active * rows["type_conn_perc"] / 100).astype("int64")
    rows["smartmeter"] = (active * rows["smartmeter_perc"] / 100).astype("int64")

    # Group by city
    per_city = rows[real_num_features].groupby("city").sum().reset_index()

    # Ratios recomputed from the city totals (fractions in 0..1, not percent).
    per_city["active_conn_perc"] = per_city["active_connections"] / per_city["num_connections"]
    per_city["consume_per_conn"] = per_city["annual_consume"] / per_city["active_connections"]
    per_city["lowtarif_perc"] = per_city["annual_lowtarif_consume"] / per_city["annual_consume"]
    per_city["product_perc"] = per_city["product_electricity"] / per_city["annual_consume"]
    per_city["p_type_conn_perc"] = per_city["p_type_conn"] / per_city["active_connections"]
    per_city["smartmeter_perc"] = per_city["smartmeter"] / per_city["active_connections"]
    per_city["years"] = year
    per_city["true_label"] = per_city["smartmeter_perc"].apply(label_data)
    return per_city[perc_features]

In [6]:
# Write one labeled per-city CSV for every energy type and year (2010-2019),
# built from the raw files of all grid operators.
for energy_type in energy_types:
    for year in range(2010, 2020):
        # Collect the per-company frames and concatenate once:
        # DataFrame.append was removed in pandas 2.0 and re-copied the
        # accumulated rows on every call.
        company_frames = []
        for company in companies:
            origin = pd.read_csv("%s/%s_%s_%s.csv" % (energy_type, company, energy_type, year))[features]
            # Rows with any missing feature are discarded.
            company_frames.append(origin.dropna())
        raw_data = pd.concat(company_frames, ignore_index = True)
        # repartition() takes the current loop variable `year` from the global scope.
        output_data = repartition(raw_data)
        # Output folder is named after the first letter of the energy type ("e"/"g").
        output_data.to_csv("%s_labeled_data/%s.csv" % (energy_type[0], year), index = False, float_format = "%g")

## Add geocodes to the labeled data

In [20]:
# Geocode table for the electricity data, read from the working directory.
e_geocode = pd.read_csv(filepath_or_buffer = "e_geocode.csv")

In [21]:
# Stack the yearly electricity label files (2010-2019) into one frame.
# The frames are gathered in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and re-copied all rows per call.
# Each file keeps its own row index, as before.
yearly_frames = []
for year in range(2010, 2020):
    yearly_frames.append(pd.read_csv("e_labeled_data/%s.csv" % year))
labeled_data = pd.concat(yearly_frames)

In [24]:
# Preview the first five rows of the electricity geocode table.
e_geocode.head(n = 5)

Unnamed: 0,city,years,companies,predict_label,latitude,longtitude
0,'S GRAVENHAGE,2015,stedin,1,52.083333,4.3
1,'S GRAVENHAGE,2016,stedin,2,52.083333,4.3
2,'S GRAVENHAGE,2017,stedin,2,52.083333,4.3
3,'S GRAVENHAGE,2019,stedin,3,52.083333,4.3
4,'S-GRAVENHAGE,2010,liander,3,52.083333,4.3


In [51]:
# Attach the true label to every geocoded (city, year) row of the electricity data.
# The left merge leaves NaN where no label exists; dropna() then removes those
# rows (and any row with another missing value) before the columns are selected.
temp = (
    e_geocode
    .merge(right = labeled_data, how = "left", on = ["city", "years"])
    .dropna()
    .loc[:, ["city", "years", "companies", "latitude", "longtitude", "true_label"]]
    .reset_index(drop = True)
    .drop_duplicates()
)
# The merge turns the label into a float column; write it back as integers.
temp["true_label"] = temp["true_label"].apply(int)
# NOTE: "%g" keeps six significant digits, so float columns are rounded on write.
temp.to_csv("e_geocode_labeled.csv", index = False, float_format = "%g")

In [52]:
# Geocode table for the gas data, read from the working directory.
g_geocode = pd.read_csv(filepath_or_buffer = "g_geocode.csv")

In [55]:
# Stack the yearly gas label files (2010-2019) into one frame.
# The frames are gathered in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and re-copied all rows per call.
# Each file keeps its own row index, as before.
yearly_frames = []
for year in range(2010, 2020):
    yearly_frames.append(pd.read_csv("g_labeled_data/%s.csv" % year))
labeled_data = pd.concat(yearly_frames)

In [58]:
# Attach the true label to every geocoded (city, year) row of the gas data.
# The left merge leaves NaN where no label exists; dropna() then removes those
# rows (and any row with another missing value) before the columns are selected.
temp = (
    g_geocode
    .merge(right = labeled_data, how = "left", on = ["city", "years"])
    .dropna()
    .loc[:, ["city", "years", "companies", "latitude", "longtitude", "true_label"]]
    .reset_index(drop = True)
    .drop_duplicates()
)
# The merge turns the label into a float column; write it back as integers.
temp["true_label"] = temp["true_label"].apply(int)
# NOTE: "%g" keeps six significant digits, so float columns are rounded on write.
temp.to_csv("g_geocode_labeled.csv", index = False, float_format = "%g")