In [61]:
#%pylab inline
#%matplotlib inline
    
import pywt
import numpy as np
import seaborn
from statsmodels.robust import mad
import matplotlib.pyplot as plt  
import sys
import pandas as pd
import datetime
from dateutil.parser import parse
from tqdm import tqdm

In [None]:
def waveletSmooth( x, wavelet="db4", level=1, title=None ):
    ''' smooth denoise'''
    # calculate the wavelet coefficients
    coeff = pywt.wavedec( x, wavelet, mode="per" )
    # calculate a threshold
    sigma = mad( coeff[-level] )
    # changing this threshold also changes the behavior,
    # but I have not played with this very much
    uthresh = sigma * np.sqrt( 2*np.log( len( x ) ) )
    coeff[1:] = ( pywt.threshold( i, value=uthresh, mode="soft" ) for i in coeff[1:] )
    # reconstruct the signal using the thresholded coefficients
    y = pywt.waverec( coeff, wavelet, mode="per" )
    return y

In [106]:

def read(filePath):
    '''
    read data 2 dict
    '''
    #!!!
    data = pd.read_csv(filePath)
    data['record_date'] = data['record_date'].apply(lambda x:parse(str(x)).strftime('%Y-%m-%d'))
    data['week'] = data['record_date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d').weekday())
    data['month'] = data['record_date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d').month)
    data['day'] = data['record_date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d').day)
    data['year'] = data['record_date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d').year)
    data.to_csv("./data/dataRead.csv",index=False)
    
    return data
data.head()

Unnamed: 0,record_date,user_id,power_consumption,week,month,day,year
0,2015-01-01,1,1135,3,1,1,2015
1,2015-01-02,1,570,4,1,2,2015
2,2015-01-03,1,3418,5,1,3,2015
3,2015-01-04,1,3968,6,1,4,2015
4,2015-01-05,1,3986,0,1,5,2015


In [65]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

pdf = PdfPages('./data/originyearly.pdf')

def draw(data):
    '''
    read data 2 dict
    '''
    data2 = data.groupby('user_id')
    for key,name in tqdm(data2):
        fig = plt.figure(1,figsize=(20,10))
        timeSerie = pd.Series(data=name['power_consumption'].values,index=name['record_date'])
        ax = plt.plot(timeSerie.values[:365],'b')
        ax = plt.plot(timeSerie.values[365:],'r')
        pdf.savefig()
        plt.close()
    pdf.close()
    return data

data = draw(data)

100%|██████████| 1454/1454 [02:49<00:00,  9.08it/s]


In [108]:
# 归一化保存
from sklearn.preprocessing import StandardScaler, RobustScaler

scalerDic={} # store scaler class
def scaleProcess(key,data):
    robust_scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(10.0, 90.0), copy=True)
    data['power_consumption_scale'] = robust_scaler.fit_transform(data['power_consumption'].reshape(-1,1))
    scalerDic[key] = robust_scaler
    return data

scaleList=[] # userid scaled matrix
def scaleData(data):
    '''
    read data 2 dict
    '''
    pdf = PdfPages('./data/scale.pdf')
    
    #draw
    data2 = data.groupby('user_id')
    for key,truck in tqdm(data2): 
        scaleData = scaleProcess(key,truck)
        scaleList.append(scaleData)
        
        fig = plt.figure(1,figsize=(20,10))
        timeSerie2 = pd.Series(data=scaleData['power_consumption_scale'].values,index=scaleData['record_date'])
        ax = plt.plot(timeSerie2.values,'r')
        pdf.savefig()
        plt.close()
    pdf.close()
 
scaleData(data)

# scaleList  save 
import pickle
with open('./data/scaleList', 'wb') as fp:
    pickle.dump(scaleList, fp)
    
with open('./data/scalerDic', 'wb') as fp:
    pickle.dump(scalerDic, fp)   

#check
#datax = scalerDic[2].inverse_transform(scaleList[1]['power_consumption_scale'])
#plt.plot(datax)
#plt.plot(scaleList[1]['power_consumption'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 1454/1454 [34:33<00:00,  2.58s/it] 


In [109]:
#read scale
with open ('./data/scaleList', 'rb') as fp:
    scaleList = pickle.load(fp)
    
with open ('./data/scalerDic', 'rb') as fp:
    scalerDic = pickle.load(fp)   

In [111]:
scaleList[0].head()

Unnamed: 0,record_date,user_id,power_consumption,week,month,day,year,power_consumption_scale
0,2015-01-01,1,1135,3,1,1,2015,1.057134
1,2015-01-02,1,570,4,1,2,2015,0.194012
2,2015-01-03,1,3418,5,1,3,2015,4.54476
3,2015-01-04,1,3968,6,1,4,2015,5.384968
4,2015-01-05,1,3986,0,1,5,2015,5.412466


In [110]:
# 用去年的预测今年的数据 按照日期对应 1-7月训练 8月validation 
def getTrain(scaleList=scaleList,testMonth=8):
    originTrainXList = []
    originTrainYList = []
    originTestXList=[]
    originTestYList=[]
    for userData in tqdm(scaleList):
        trainX = userData[userData.month < testMonth & userData.year == 2015][['power_consumption'],['week']]
        trainY = userData[userData.month < testMonth & userData.year == 2016][['power_consumption'],['week']]
        testX = userData[userData.month == testMonth & userData.year == 2015][['power_consumption'],['week']]
        testY = userData[userData.month == testMonth & userData.year == 2016][['power_consumption'],['week']]
        originTrainXList.append(trainX)
        originTrainYList.append(trainY)
        originTestXList.append(testX)
        originTestYList.append(testY)
        
    with open('./data/originTrainXList', 'wb') as fp:
        pickle.dump(originTrainXList, fp)
    
    with open('./data/originTrainYList', 'wb') as fp:
        pickle.dump(originTrainYList, fp)
        
    with open('./data/originTestXList', 'wb') as fp: 
        pickle.dump(originTestXList, fp)
        
    with open('./data/originTestYList', 'wb') as fp:
        pickle.dump(originTestYList, fp)
 
        
    return originTrainXList,originTrainYList,originTestXList,originTestYList
originTrainXList,originTrainYList,originTestXList,originTestYList = getTrain()

  0%|          | 0/1454 [00:00<?, ?it/s]


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# LSTM for international airline passengers problem with window regression framing
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
	dataX, dataY = [], []
	for i in range(len(dataset)-look_back-1):
		a = dataset[i:(i+look_back), 0]
		dataX.append(a)
		dataY.append(dataset[i + look_back, 0])
	return numpy.array(dataX), numpy.array(dataY)
# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset
dataframe = read_csv('international-airline-passengers.csv', usecols=[1], engine='python', skipfooter=3)
dataset = dataframe.values
dataset = dataset.astype('float32')
# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
# split into train and test sets
train_size = int(len(dataset) * 0.67)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
# reshape into X=t and Y=t+1
look_back = 3
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)
# reshape input to be [samples, time steps, features]
trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
# create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
# make predictions
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))
# shift train predictions for plotting
trainPredictPlot = numpy.empty_like(dataset)
trainPredictPlot[:, :] = numpy.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
# shift test predictions for plotting
testPredictPlot = numpy.empty_like(dataset)
testPredictPlot[:, :] = numpy.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
# plot baseline and predictions
plt.plot(scaler.inverse_transform(dataset))
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()

In [None]:
def sumAll(data):
    '''sum by day'''
    dataSum = pd.DataFrame()
     
  
    dataSum['sum_consumption'] = data.groupby('record_date')['smooth'].sum() 

    dataSum['record_date'] = dataSum.index
    dataSum['week'] = dataSum['record_date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d').weekday())
    dataSum['month'] = dataSum['record_date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d').month)
    dataSum['day'] = dataSum['record_date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d').day)
  
    return dataSum
    
#  1.read and processdata   
dataRead = read("./data/Tianchi_power.csv")
dataRead.to_csv("./data/dataRead.csv",index=False)
plt.plot(dataRead[dataRead==1000]['power_consumption'])
print dataRead.head() 