In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import time
from sklearn.linear_model import LinearRegression
import gc
import holidays
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression

# !pip install datatable
# !pip install pyspark
# from pyspark import SparkContext
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import *

# import datatable as dt

# from pyspark.ml.regression import LinearRegression,RandomForestRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ashrae-energy-prediction/building_metadata.csv
/kaggle/input/ashrae-energy-prediction/weather_train.csv
/kaggle/input/ashrae-energy-prediction/test.csv
/kaggle/input/ashrae-energy-prediction/sample_submission.csv
/kaggle/input/ashrae-energy-prediction/train.csv
/kaggle/input/ashrae-energy-prediction/weather_test.csv


In [2]:
# Single place to repoint the notebook if the competition data is mounted elsewhere.
DATA_DIR = '/kaggle/input/ashrae-energy-prediction'

# Raw competition tables; the read order is kept as in the original cell.
building_meta = pd.read_csv(f'{DATA_DIR}/building_metadata.csv')
weather_tr = pd.read_csv(f'{DATA_DIR}/weather_train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
train = pd.read_csv(f'{DATA_DIR}/train.csv')
weather_tst = pd.read_csv(f'{DATA_DIR}/weather_test.csv')

In [3]:
## Memory optimization

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe in place to reduce memory usage.

    Rules applied per column:

    * object / pandas string columns become ``category``;
    * signed integers get the narrowest signed integer type that holds
      ``[min, max]`` (bounds are inclusive, so 127 still fits ``int8``);
    * unsigned integers get the narrowest unsigned integer type;
    * floats get ``float32`` (or ``float16`` when ``use_float16`` is true)
      if their range fits;
    * every other dtype (bool, datetime, timedelta, categorical, nullable
      extension types, ...) is left untouched.

    A column is never widened: a candidate type is only considered when it is
    strictly smaller than the column's current type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object.
    use_float16 : bool, default False
        Allow float columns to become ``float16``. Only the value range is
        checked, so precision can be lost.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float16, np.float32) if use_float16 else (np.float32,)

    # Shallow measurement (no deep=True), as in the original helper.
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if not isinstance(col_type, np.dtype):
            # Extension dtypes: only plain strings are converted; categoricals,
            # tz-aware datetimes and nullable types are kept as they are.
            if isinstance(col_type, pd.StringDtype):
                df[col] = df[col].astype("category")
            continue

        if col_type.kind == "O":
            df[col] = df[col].astype("category")
            continue

        if col_type.kind == "i":
            candidates, limits = signed_ints, np.iinfo
        elif col_type.kind == "u":
            candidates, limits = unsigned_ints, np.iinfo
        elif col_type.kind == "f":
            candidates, limits = floats, np.finfo
        else:
            # bool, datetime64, timedelta64, complex, ...: nothing to downcast.
            continue

        # min()/max() skip NaN; for an empty or all-NaN column they are NaN,
        # every comparison below is False and the column is left unchanged.
        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break  # candidates are ordered by size; never widen a column
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(saved_pct))

    return df

In [4]:
# Shrink the three training-side tables, one after another, allowing float16.
train, weather_tr, building_meta = (
    reduce_mem_usage(frame, use_float16=True)
    for frame in (train, weather_tr, building_meta)
)

Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 2.65 MB
Decreased by 72.4%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%


In [8]:
# Peek at the first rows of the memory-reduced training frame.
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


### Prepare data
Join each of the train and test sets with the two auxiliary DataFrames: building metadata and weather.

In [15]:

def prepare_date(X,building,waether,test=False):
    """
    Join meter rows with building and weather data and derive model features.

    Steps: left-join ``building`` on ``building_id``, left-join ``waether`` on
    ``(site_id, timestamp)``, parse ``timestamp``, replace ``square_feet`` by
    ``log1p(square_feet)``, add ``hour``, ``weekday`` and ``is_holiday``, then
    drop ``timestamp`` and the unused weather columns. The input frames are
    not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Meter rows with ``building_id`` and ``timestamp`` plus either
        ``meter_reading`` (train) or ``row_id`` (test).
    building : pandas.DataFrame
        Building metadata with ``building_id``, ``site_id``, ``square_feet``.
    waether : pandas.DataFrame
        Weather rows with ``site_id``, ``timestamp`` and the dropped columns
        (parameter name kept as-is for existing callers).
    test : bool, default False
        If false, rows are sorted by timestamp and the target is returned.
        If true, the row order is kept and the row ids are returned.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``(features, log1p(meter_reading))`` for train, or
        ``(features, row_id)`` when ``test`` is true.
    """
    # Calendar dates flagged as holidays; named so that it does not shadow the
    # `holidays` module imported at the top of the notebook.
    holiday_dates = pd.to_datetime(
        ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
         "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
         "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
         "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
         "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
         "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
         "2019-01-01"],
        format="%Y-%m-%d",
    )
    drop_features = ["timestamp", "sea_level_pressure", "wind_direction", "wind_speed"]

    # merge() returns new frames, so the caller's X is never mutated below.
    X = X.merge(building, on='building_id', how='left')
    X = X.merge(waether, on=['site_id', 'timestamp'], how='left')
    X['timestamp'] = pd.to_datetime(X['timestamp'], format='%Y-%m-%d %H:%M:%S')
    X['square_feet'] = np.log1p(X['square_feet'])

    if not test:
        # Only training rows are put in chronological order; test rows keep
        # their incoming order so they stay aligned with row_id.
        X = X.sort_values('timestamp').reset_index(drop=True)

    X['hour'] = X['timestamp'].dt.hour
    X['weekday'] = X['timestamp'].dt.weekday
    # Compare midnight-normalised timestamps directly instead of building a
    # Python date object and a string for every row.
    X['is_holiday'] = X['timestamp'].dt.normalize().isin(holiday_dates).astype(int)

    X = X.drop(columns=drop_features)

    if test:
        row_ids = X['row_id']
        return X.drop(columns='row_id'), row_ids

    y = np.log1p(X['meter_reading'])
    return X.drop(columns='meter_reading'), y

In [16]:
# Build the training features and target, then release the raw frames.
X_train, y_train = prepare_date(X=train, building=building_meta, waether=weather_tr)
del train
del weather_tr
gc.collect()


4514

In [17]:
# Preview the first rows of the feature frame returned by prepare_date.
X_train.head()

AttributeError: 'NoneType' object has no attribute 'head'

## Naive Model 

In [None]:
from sklearn import preprocessing
# One-hot encoder created with default settings; it is not fitted in this cell.
OHE = preprocessing.OneHotEncoder()


In [None]:

# cat_cals=['meter','primary_use','site_id']
# X=train[:,'square_feet']
# for cat in cat_cals:
#     X[:,cat]=np.array(OHE.fit(train[:,cat]))

# X.head()

In [None]:
# `.names` is datatable API and `train` has been deleted after feature
# preparation; the pandas equivalent is the column index of X_train.
X_train.columns.tolist()

In [None]:
# Baseline: ordinary least squares on three features.
# pandas selects columns with df[[...]] (the `[:, [...]]` form is datatable
# syntax), and y_train is already log1p(meter_reading) as returned by
# prepare_date, so it must not be log-transformed a second time.
model = LinearRegression()
model.fit(X_train[["meter", "site_id", "square_feet"]], y_train)
model.coef_

In [None]:
# Peek at the first rows of the test frame.
test.head()

In [None]:
# Run the test set through the same preparation as the training data (joins
# with building metadata and weather, log1p of square_feet, calendar features).
# The previous datatable join referenced `dt` and `sample_meta_buliding`,
# neither of which is defined in this notebook.
# NOTE: this rebinds `test`; reload test.csv before re-running this cell.
test, test_row_ids = prepare_date(test, building_meta, weather_tst, test=True)
test.head()

In [None]:
# The model is fitted on log1p(meter_reading), so predictions come back in
# log space; expm1 maps them to the original meter_reading scale.
# pandas selects columns with df[[...]] rather than datatable's `[:, [...]]`.
preds = np.expm1(model.predict(test[["meter", "site_id", "square_feet"]]))


In [None]:
# Meter readings cannot be negative: clamp in one vectorised pass instead of a
# Python loop over every prediction. Anything not greater than 0 (including
# NaN) becomes 0, exactly as max(0, x) did.
preds = np.asarray(preds, dtype=float)
preds = np.where(preds > 0, preds, 0.0)
# Show only a short preview rather than dumping every prediction.
preds[:10]

In [None]:
# Build the submission frame: the positional index becomes the row_id column.
sample_output = (
    pd.DataFrame(preds, columns=['meter_reading'])
    .reset_index()
    .rename(columns={'index': 'row_id'})
)
sample_output.to_csv('sample_submission.csv', index=False)

sample_output.head()

In [None]:
# List the column names of the submission frame.
sample_output.columns