In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from  sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score,mean_squared_error,mean_absolute_percentage_error

In [2]:
def load_data():
    raw_df = []
    dict_df = pd.read_excel(r'C:\Users\Chandan\Desktop\Analog_Reports\Solar data\GridTieInverterData.xlsx',skiprows=5,parse_dates={'Time':[2]},index_col='Time',skipfooter=1,sheet_name=None)
    for key,dframe in dict_df.items():
        dframe.drop(dframe.tail(1).index,inplace=True)
        raw_df.append(dframe[['PDC(W)']])
    raw_df=pd.concat(raw_df)
    raw_df.index=pd.to_datetime(raw_df.index,format='%Y-%m-%d %H:%M:%S')
    raw_df.sort_index(inplace=True,ascending=False)
    return raw_df

def load_weather():
    weather = pd.read_csv(r'C:\Users\Chandan\Desktop\Analog_Reports\Solar data\kanpur_weather_solar.csv',
                             header=25)
    LIST=[]
    for i in range(weather.shape[0]):
        if weather.HR.values[i] < 10:
            string = f"{weather.YEAR.values[i]}-{weather.MO.values[i]}-{weather.DY.values[i]} {0}{weather.HR.values[i]}"
        else:
            string = f"{weather.YEAR.values[i]}-{weather.MO.values[i]}-{weather.DY.values[i]} {weather.HR.values[i]}"

        LIST.append(pd.to_datetime(string,format='%Y-%m-%d %H'))

    weather['time']=LIST
    weather=weather.set_index(weather['time'],drop=False)
    weather.drop(['YEAR','MO','DY','HR','time'],axis=1,inplace=True)
    return weather

def clean_data(df):
    df['datetime'] = [str(index)[0:13] for index in df.index.values]
    df = df.groupby('datetime').mean()
    df.index=pd.to_datetime(df.index,format='%Y-%m-%d %H')
    return df

def fill_zero_pow(df,period):
    start,end=df.index[0],df.index[-1]
    index=pd.date_range(start=start,end=end,freq='1H')
    t_df=pd.DataFrame({'val':np.NaN},index=index)
    result = df.join(t_df, how="outer").drop(['val'],axis=1)
    for index in result.index:
        if index.hour in period:
            result.loc[index]=0
    return result

def scaling(df):
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler = scaler.fit(df)
    return scaler

def add_weather(df):
    weather_df=load_weather()
    combine_df = pd.merge(df, weather_df, left_index=True, right_index=True)
    return combine_df,weather_df

def apply_pca(X):
    pca = PCA(0.95)
    pca = pca.fit(X)
    return pca

def create_model(train_input, train_target, model_config):
    model = RandomForestRegressor(n_jobs=-1, random_state=42)
    model.fit(train_input, train_target.ravel())
    return model

def fill_missing(df):
    missing_index=df[df['PDC(W)'].isna()].index.to_list()
    dff = df.dropna(axis=0,inplace = False)
    combine_df,weather_df = add_weather(dff)
    
    target = ['PDC(W)']
    input_data = combine_df.drop(columns=target,axis=1,inplace=False).values
    target = combine_df[target].values
    
    scaler = scaling(input_data)
    scaled = scaler.transform(input_data)
    
    pca= apply_pca(scaled)
    pca_input = pca.transform(scaled)
    
    train_input,test_input,train_target,test_target = train_test_split(pca_input,target,test_size=0.1)
    model_config = []
    model = create_model(train_input,train_target,model_config=model_config)
    
    # predicting missing....
    missing_inp = weather_df.loc[missing_index].values
    missing_inp = pca.transform(scaler.transform(missing_inp))
    missing_val = model.predict(missing_inp)
    df.loc[missing_index] = missing_val.reshape(-1,1)
    return df

In [3]:
#loadind data....
raw_df= load_data()
# making time period hourly wise.....
clean_df = clean_data(raw_df)
#filling black_period.......
zero_pow_period = [19,20,21,22,23,0,1,2,3,4]
result = fill_zero_pow(clean_df,period=zero_pow_period)
df_filled = fill_missing(result)
combine_df,_ = add_weather(df_filled)

target = ['PDC(W)']
input_data = combine_df.drop(columns=target,axis=1,inplace=False).values
target = combine_df[target].values

# dimention reduction.....
scaler = scaling(input_data)
scaled = scaler.transform(input_data)

pca= apply_pca(scaled)
pca_input = pca.transform(scaled)

train_input,test_input,train_target,test_target = train_test_split(pca_input,target,test_size=0.3)
model_config = []
model = create_model(train_input,train_target,model_config=model_config)

In [5]:
test_result = model.predict(test_input)
train_result = model.predict(train_input)
print(f"Training set model score....")
model.score(train_input, train_target)

Training set model score....


0.9784516130284252

In [7]:
print(f"Test set model score....")
model.score(test_input, test_target)

Test set model score....


0.8281603169306806

In [8]:
def print_summary():
    print(f"test...summary")
    print(f"explained_variance_score = {explained_variance_score(test_result,test_target)}")
    print(f"mean_squared_error = {mean_squared_error(test_result,test_target)}")
    print(f"mean_absolute_percentage_error = {mean_absolute_percentage_error(test_result,test_target)}")
print_summary()

test...summary
explained_variance_score = 0.7970311236327194
mean_squared_error = 206482.12050862072
mean_absolute_percentage_error = 685069323993029.6


# Neural Network

In [None]:
def series_to_supervised(df_as_np,n_lags,n_out=1):
    X,y = [],[]
    for i in range(len(df_as_np)-n_lags):
        row = [ a for a in df_as_np[i:i+n_lags]]
        X.append(row)
        label = df_as_np[i+n_lags:i+n_lags+n_out,0]
        y.append(label)
    return np.array(X), np.array(y)