# Random Forest and regression to overcome obstacles

## Import the relevant libraries

In [86]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

In [87]:
# Raw dataset, downloaded from the CSV file hosted on GitHub.
DATA_URL = "https://raw.githubusercontent.com/charlesp1996/Pythonproject/main/data.csv"
df = pd.read_csv(DATA_URL)

In [88]:
df

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Vitesse du vent Axa0 100m (m/s),Production éolienne (GWh),Rayonnement solaire global (W/m2),Production solaire (GWh)
0,0,0,0,2,2014,,6.518965,,2.931509
1,0,0,1,3,2014,,220.644746,,116.568735
2,0,0,1,4,2014,,189.563606,,125.730294
3,0,0,1,4,2014,,9.671714,,98.951709
4,1,0,0,5,2014,,363.412141,,58.899353
...,...,...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,0.0,,5.82,
1249,0,0,1,3,2021,0.0,,6.74,
1250,0,1,0,3,2021,0.0,,7.15,
1251,0,0,1,3,2021,0.0,,6.04,


In [89]:
# Keep only the rows that have a solar irradiance measurement.
has_irradiance = df['Rayonnement solaire global (W/m2)'].notna()
df = df.loc[has_irradiance]

In [91]:
# Column labels as a plain Python list so they can be sliced by position below.
cols = list(df.columns)
cols

['region_nord',
 'region_centre',
 'region_sud',
 'Month Value',
 'Year Value',
 'Vitesse du vent Axa0 100m (m/s)',
 'Production éolienne (GWh)',
 'Rayonnement solaire global (W/m2)',
 'Production solaire (GWh)']

### Solaire

In [106]:
# Solar subset: the first five columns (region dummies, month, year) followed by
# the columns at positions 7 and 8 (solar irradiance and solar production).
colss = [*cols[:5], *cols[7:9]]

In [107]:
colss

['region_nord',
 'region_centre',
 'region_sud',
 'Month Value',
 'Year Value',
 'Rayonnement solaire global (W/m2)',
 'Production solaire (GWh)']

In [108]:
# Solar-only view of the data: all rows, restricted to the columns in colss.
dfs = df.loc[:, colss]

In [109]:
dfs

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
47,0,1,0,1,2016,7.466964,7.040643
48,1,0,0,1,2016,8.295263,5.596140
49,0,1,0,2,2016,9.242543,23.337292
50,0,1,0,2,2016,9.946336,9.640264
51,0,0,1,2,2016,6.508233,116.805294
...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,
1249,0,0,1,3,2021,6.740000,
1250,0,1,0,3,2021,7.150000,
1251,0,0,1,3,2021,6.040000,


In [112]:
# Rows with a known solar production form the supervised set; rows where it is
# missing are filled in later with the model's predictions.
notnans = dfs['Production solaire (GWh)'].notnull()
df_notnans = dfs[notnans]

# Features are the first six columns of colss, the target is the seventh.
# The target is kept as a one-column DataFrame (2-D), which is the shape the
# multi-output regressor fitted in the next cell is given.
X_train, X_test, y_train, y_test = train_test_split(
    df_notnans[colss[:6]],
    df_notnans[colss[6:7]],
    train_size=0.75,
    random_state=1000,
)

In [113]:
# Random forest wrapped in a multi-output regressor; the fixed seed keeps the
# forest reproducible between runs.
base_forest = RandomForestRegressor(max_depth=30, random_state=0)
regr_multirf = MultiOutputRegressor(base_forest)

# Train on the training split.
regr_multirf.fit(X_train, y_train)

# Evaluate on the held-out split and report the score scaled by 100.
score = regr_multirf.score(X_test, y_test)
print("The prediction score on the test data is {:.2f}%".format(score * 100))

The prediction score on the test data is 49.60%


In [114]:
# Work on a copy of the rows whose solar production is missing, then fill the
# missing target with the random forest's predictions.
missing_mask = ~notnans
df_nans = dfs.loc[missing_mask].copy()
predicted_production = regr_multirf.predict(df_nans[colss[:6]])
df_nans[colss[6:7]] = predicted_production
df_nans

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
1092,1,0,0,1,2016,6.130850,10.600893
1093,1,0,0,1,2016,7.882551,4.928253
1094,1,0,0,2,2016,6.926638,7.970764
1095,1,0,0,2,2016,8.113793,7.361584
1096,1,0,0,3,2016,5.711842,15.387454
...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,212.833877
1249,0,0,1,3,2021,6.740000,211.701197
1250,0,1,0,3,2021,7.150000,31.504386
1251,0,0,1,3,2021,6.040000,175.097647


# Create the targets

In [None]:
# Binary label: 1 where the solar production is strictly above its median, else 0.
solar_production = df_nans['Production solaire (GWh)']
targets = np.where(solar_production > solar_production.median(), 1, 0)

In [None]:
targets.shape

In [None]:
# Store the binary labels in df_nans under the 'Execessive Solar' column
# (this modifies df_nans in place).
df_nans['Execessive Solar'] = targets

# A comment on the targets

In [None]:
targets.sum() / targets.shape[0]

In [None]:
# Plain assignment: data_with_targets is a second name for df_nans, not a copy.
data_with_targets = df_nans

In [None]:
data_with_targets

# Select the inputs for the regression

In [None]:
data_with_targets.shape

In [None]:
# Inputs for the classifier: drop the label column (last position) and also the
# column the label was computed from. `targets` is a threshold on
# 'Production solaire (GWh)', so keeping that column as a feature would leak
# the label into the model and make the reported accuracy meaningless.
unscaled_inputs = data_with_targets.iloc[:, :-1].drop(columns=['Production solaire (GWh)'])
unscaled_inputs

# Standardize the data

In [None]:
# StandardScaler must be imported before it is used here; the only other import
# of it in this notebook lives in a later cell.
from sklearn.preprocessing import StandardScaler

# Plain scaler with default settings spelled out explicitly.
solar_scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is fitted on, and applied to, only the columns named
    in ``columns``. Every other column is returned unchanged. The column order
    and the row index of the input DataFrame are preserved in the output.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to standardize.
    copy : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.
    with_mean : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.
    with_std : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler that does the actual work.
    mean_ : pandas.Series or None
        Per-column mean of the scaled columns; ``None`` until ``fit`` is called.
    var_ : pandas.Series or None
        Per-column population variance (``ddof=0``) of the scaled columns;
        ``None`` until ``fit`` is called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep every constructor argument under an attribute of the same name:
        # BaseEstimator.get_params() (used by repr() and clone()) reads them
        # back with getattr and fails if one is missing.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's parameters are keyword-only in current scikit-learn,
        # so they must not be passed positionally.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and record mean/variance.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every label listed in ``self.columns``.
        y : ignored
            Passed through to ``StandardScaler.fit``.

        Returns
        -------
        self
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column statistics; ddof=0 matches numpy's default variance.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every label listed in ``self.columns``.
        y, copy : ignored
            Accepted for signature compatibility only.

        Returns
        -------
        pandas.DataFrame
            Same index and column order as ``X``.
        """
        init_col_order = X.columns
        # Reuse X's index for the scaled block. A default RangeIndex here would
        # make the column-wise concat below align on mismatching labels and
        # produce NaN-filled extra rows whenever X is not indexed 0..n-1.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [None]:
unscaled_inputs.columns.values

In [None]:
# Hand-written list of columns to standardize. The next cell reassigns
# columns_to_scale from unscaled_inputs and columns_to_omit, so this literal
# list is not the one the scaler ends up using.
columns_to_scale = ['region_sud', 'region_nord', 'region_centre', 'Month Value',
       'Year Value', 'Rayonnement solaire global (W/m2)', 'Production solaire (GWh)']
# Columns to leave unscaled; empty here, so nothing is excluded.
columns_to_omit = []

In [None]:
# Every input column gets scaled unless it is listed in columns_to_omit.
columns_to_scale = [
    col
    for col in unscaled_inputs.columns.values
    if col not in columns_to_omit
]

In [None]:
solar_scaler = CustomScaler(columns_to_scale)

In [None]:
solar_scaler.fit(unscaled_inputs)

In [None]:
scaled_inputs = solar_scaler.transform(unscaled_inputs)

In [None]:
solar_scaler.fit(unscaled_inputs)

In [None]:
scaled_inputs = solar_scaler.transform(unscaled_inputs)

In [None]:
scaled_inputs[:161][:]

In [None]:
scaled_inputs[:161][:].shape

# Split the data into train & test and shuffle

## Import the relevant module 

In [None]:
from sklearn.model_selection import train_test_split

## Split

In [None]:
train_test_split(scaled_inputs[:161][:], targets)

In [None]:
# Take exactly one input row per label instead of hard-coding the row count, so
# the split keeps working if the dataset size changes. 80% of the rows go to
# training; the fixed seed makes the shuffle reproducible.
n_samples = len(targets)
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs[:n_samples], targets, train_size=0.8, random_state=20)

In [None]:
print(x_train.shape, y_train.shape)

In [None]:
print(x_test.shape, y_test.shape)

# Logistic regression with sklearn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Training the model

In [None]:
reg = LogisticRegression()

In [None]:
reg.fit(x_train, y_train)

In [None]:
reg.score(x_train, y_train)

### Manually check accuracy

In [None]:
model_outputs = reg.predict(x_train)
model_outputs

In [None]:
y_train

In [None]:
model_outputs == y_train

In [None]:
np.sum(model_outputs==y_train)

In [None]:
model_outputs.shape[0]

In [None]:
# Training accuracy by hand: number of matching predictions over number of predictions.
n_correct = np.sum(model_outputs == y_train)
n_predictions = model_outputs.shape[0]
n_correct / n_predictions

### Finding the intercept and coefficients

In [None]:
reg.intercept_

In [None]:
reg.coef_

In [None]:
unscaled_inputs.columns.values

In [None]:
feature_name = unscaled_inputs.columns.values

In [None]:
# One row per input feature, paired with the classifier's fitted coefficient.
summary_table = pd.DataFrame({'feature_name': feature_name})
summary_table['Coefficient'] = reg.coef_.T
summary_table

In [None]:
# Shift every row label up by one so that label 0 is free for the intercept row.
# Not idempotent: running this cell again shifts the labels a second time.
summary_table.index = summary_table.index+1

In [None]:
# Insert the intercept under label 0, then sort by label so it becomes the first row.
intercept_row = ['Intercept', reg.intercept_[0]]
summary_table.loc[0] = intercept_row
summary_table = summary_table.sort_index()
summary_table

## Interpreting the coefficient

In [None]:
# Exponentiate each coefficient and store the result in the 'Odd_ratio' column.
coefficients = summary_table['Coefficient']
summary_table['Odd_ratio'] = np.exp(coefficients)

In [None]:
summary_table

In [None]:
summary_table.sort_values('Odd_ratio', ascending=False)

# Testing the model

In [None]:
reg.score(x_test, y_test)

In [None]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

In [None]:
predicted_proba.shape

In [None]:
predicted_proba[:,1]

# Saving the model

In [None]:
import pickle

In [None]:
# Serialize the fitted classifier to the binary file 'model' in the working directory.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [None]:
# Serialize the fitted scaler to the binary file 'scaler' in the working directory.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(solar_scaler, scaler_file)