# Random Forest and regression to overcome obstacles

## Import the relevant libraries

In [15]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

In [16]:
df = pd.read_csv("https://raw.githubusercontent.com/charlesp1996/Pythonproject/main/data.csv")

In [17]:
df

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Vitesse du vent Axa0 100m (m/s),Production éolienne (GWh),Rayonnement solaire global (W/m2),Production solaire (GWh)
0,0,0,0,2,2014,,6.518965,,2.931509
1,0,0,1,3,2014,,220.644746,,116.568735
2,0,0,1,4,2014,,189.563606,,125.730294
3,0,0,1,4,2014,,9.671714,,98.951709
4,1,0,0,5,2014,,363.412141,,58.899353
...,...,...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,0.0,,5.82,
1249,0,0,1,3,2021,0.0,,6.74,
1250,0,1,0,3,2021,0.0,,7.15,
1251,0,0,1,3,2021,0.0,,6.04,


In [18]:
df = df[df['Rayonnement solaire global (W/m2)'].notna()]

In [19]:
cols = df.columns.tolist()
cols

['region_nord',
 'region_centre',
 'region_sud',
 'Month Value',
 'Year Value',
 'Vitesse du vent Axa0 100m (m/s)',
 'Production éolienne (GWh)',
 'Rayonnement solaire global (W/m2)',
 'Production solaire (GWh)']

### Solaire

In [20]:
colss = cols[:5] + cols[7:9]

In [21]:
colss

['region_nord',
 'region_centre',
 'region_sud',
 'Month Value',
 'Year Value',
 'Rayonnement solaire global (W/m2)',
 'Production solaire (GWh)']

In [22]:
dfs = df[colss]

In [23]:
dfs

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
47,0,1,0,1,2016,7.466964,7.040643
48,1,0,0,1,2016,8.295263,5.596140
49,0,1,0,2,2016,9.242543,23.337292
50,0,1,0,2,2016,9.946336,9.640264
51,0,0,1,2,2016,6.508233,116.805294
...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,
1249,0,0,1,3,2021,6.740000,
1250,0,1,0,3,2021,7.150000,
1251,0,0,1,3,2021,6.040000,


In [24]:
X,y = make_regression(n_samples=10000, n_features=6,
                                 n_informative=3, n_targets=1,  
                                 tail_strength=0.5,  
                                 shuffle=True, coef=False, random_state=0)
notnans = dfs['Production solaire (GWh)'].notnull()
df_notnans = df[notnans]
X_train, X_test, y_train, y_test = train_test_split(df_notnans[colss[:6]], df_notnans[colss[6:7]],
                                                    train_size=0.75,
                                                    random_state=1000)

In [25]:
regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=30,
                                                          random_state=0))

# Fit on the train data
regr_multirf.fit(X_train, y_train)

# Check the prediction score
score = regr_multirf.score(X_test, y_test)
print("The prediction score on the test data is {:.2f}%".format(score*100))

The prediction score on the test data is 49.60%


In [26]:
df_nans = dfs.loc[~notnans].copy()
df_nans[colss[6:7]] = regr_multirf.predict(df_nans[colss[0:6]])
df_nans

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
1092,1,0,0,1,2016,6.130850,10.600893
1093,1,0,0,1,2016,7.882551,4.928253
1094,1,0,0,2,2016,6.926638,7.970764
1095,1,0,0,2,2016,8.113793,7.361584
1096,1,0,0,3,2016,5.711842,15.387454
...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,212.833877
1249,0,0,1,3,2021,6.740000,211.701197
1250,0,1,0,3,2021,7.150000,31.504386
1251,0,0,1,3,2021,6.040000,175.097647


# Create the targets

In [27]:
targets = np.where(df_nans['Production solaire (GWh)'] > df_nans['Production solaire (GWh)'].median(), 1, 0)

In [28]:
targets.shape

(161,)

In [29]:
df_nans['Execessive Solar'] = targets

# A comment on the targets

In [30]:
targets.sum() / targets.shape[0]

0.4968944099378882

In [31]:
data_with_targets = df_nans

In [32]:
data_with_targets

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh),Execessive Solar
1092,1,0,0,1,2016,6.130850,10.600893,0
1093,1,0,0,1,2016,7.882551,4.928253,0
1094,1,0,0,2,2016,6.926638,7.970764,0
1095,1,0,0,2,2016,8.113793,7.361584,0
1096,1,0,0,3,2016,5.711842,15.387454,0
...,...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,212.833877,1
1249,0,0,1,3,2021,6.740000,211.701197,1
1250,0,1,0,3,2021,7.150000,31.504386,1
1251,0,0,1,3,2021,6.040000,175.097647,1


# Select the inputs for the regression

In [33]:
data_with_targets.shape

(161, 8)

In [34]:
unscaled_inputs = data_with_targets.iloc[:,:-1]
unscaled_inputs

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
1092,1,0,0,1,2016,6.130850,10.600893
1093,1,0,0,1,2016,7.882551,4.928253
1094,1,0,0,2,2016,6.926638,7.970764
1095,1,0,0,2,2016,8.113793,7.361584
1096,1,0,0,3,2016,5.711842,15.387454
...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,212.833877
1249,0,0,1,3,2021,6.740000,211.701197
1250,0,1,0,3,2021,7.150000,31.504386
1251,0,0,1,3,2021,6.040000,175.097647


# Standardize the data

In [35]:
from sklearn.preprocessing import StandardScaler

solar_scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [36]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.vr_ = None
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled= X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled,X_scaled], axis=1)[init_col_order]

In [37]:
unscaled_inputs.columns.values

array(['region_nord', 'region_centre', 'region_sud', 'Month Value',
       'Year Value', 'Rayonnement solaire global (W/m2)',
       'Production solaire (GWh)'], dtype=object)

In [38]:
columns_to_scale = ['region_sud', 'region_nord', 'region_centre', 'Month Value',
       'Year Value', 'Rayonnement solaire global (W/m2)', 'Production solaire (GWh)']
columns_to_omit = []

In [39]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [40]:
solar_scaler = CustomScaler(columns_to_scale)

In [41]:
solar_scaler.fit(unscaled_inputs)



CustomScaler(columns=['region_nord', 'region_centre', 'region_sud',
                      'Month Value', 'Year Value',
                      'Rayonnement solaire global (W/m2)',
                      'Production solaire (GWh)'],
             copy=None, with_mean=None, with_std=None)

In [42]:
scaled_inputs = solar_scaler.transform(unscaled_inputs)

In [43]:
solar_scaler.fit(unscaled_inputs)



CustomScaler(columns=['region_nord', 'region_centre', 'region_sud',
                      'Month Value', 'Year Value',
                      'Rayonnement solaire global (W/m2)',
                      'Production solaire (GWh)'],
             copy=None, with_mean=None, with_std=None)

In [44]:
scaled_inputs = solar_scaler.transform(unscaled_inputs)

In [45]:
scaled_inputs[:161][:]

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
0,0.478547,-0.283790,-0.320530,-1.204950,-1.545362,0.262034,-0.441509
1,0.478547,-0.283790,-0.320530,-1.204950,-1.545362,1.672103,-0.601185
2,0.478547,-0.283790,-0.320530,-0.927415,-1.545362,0.902621,-0.515543
3,0.478547,-0.283790,-0.320530,-0.927415,-1.545362,1.858247,-0.532691
4,0.478547,-0.283790,-0.320530,-0.649880,-1.545362,-0.075255,-0.306774
...,...,...,...,...,...,...,...
156,-2.089657,-0.283790,3.119829,-0.649880,1.250181,0.011809,5.251052
157,-2.089657,-0.283790,3.119829,-0.649880,1.250181,0.752383,5.219169
158,-2.089657,3.523729,-0.320530,-0.649880,1.250181,1.082421,0.146894
159,-2.089657,-0.283790,3.119829,-0.649880,1.250181,0.188903,4.188833


In [46]:
scaled_inputs[:161][:].shape

(161, 7)

# Split the data into train & test and shuffle

## Import the relevant module 

In [47]:
from sklearn.model_selection import train_test_split

## Split

In [48]:
train_test_split(scaled_inputs[:161][:], targets)

[     region_nord  region_centre  region_sud  Month Value  Year Value  \
 20      0.478547      -0.283790   -0.320530     1.570400   -1.545362   
 64      0.478547      -0.283790   -0.320530     1.015330   -0.427145   
 62      0.478547      -0.283790   -0.320530     0.737795   -0.427145   
 32      0.478547      -0.283790   -0.320530    -0.094810   -0.986254   
 131    -2.089657      -0.283790    3.119829    -1.204950    1.250181   
 ..           ...            ...         ...          ...         ...   
 117     0.478547      -0.283790   -0.320530     1.570400    0.691072   
 125     0.478547      -0.283790   -0.320530    -1.204950    1.250181   
 137    -2.089657       3.523729   -0.320530    -0.927415    1.250181   
 12      0.478547      -0.283790   -0.320530     0.460260   -1.545362   
 116     0.478547      -0.283790   -0.320530     1.570400    0.691072   
 
      Rayonnement solaire global (W/m2)  Production solaire (GWh)  
 20                           -0.588733               

In [49]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs[:161][:], targets, train_size = 0.8, random_state = 20)

In [50]:
print(x_train.shape, y_train.shape)

(128, 7) (128,)


In [51]:
print(x_test.shape, y_test.shape)

(33, 7) (33,)


# Logisitc regression with sklearn

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Training the model

In [53]:
reg = LogisticRegression()

In [54]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
reg.score(x_train, y_train)

0.9140625

### Manually check accuracy

In [56]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0])

In [57]:
y_train

array([0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [58]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True, False,  True,
       False,  True]

In [59]:
np.sum(model_outputs==y_train)

117

In [60]:
model_outputs.shape[0]

128

In [61]:
np.sum(model_outputs==y_train)/model_outputs.shape[0]

0.9140625

### Finding the intercept and coefficients

In [62]:
reg.intercept_

array([0.11835126])

In [63]:
reg.coef_

array([[ 0.56942464,  0.88654267,  0.14576746,  0.15917285,  2.0297981 ,
        -2.08159284,  2.03285389]])

In [64]:
unscaled_inputs.columns.values

array(['region_nord', 'region_centre', 'region_sud', 'Month Value',
       'Year Value', 'Rayonnement solaire global (W/m2)',
       'Production solaire (GWh)'], dtype=object)

In [65]:
feature_name = unscaled_inputs.columns.values

In [66]:
summary_table = pd.DataFrame (columns=['feature_name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,feature_name,Coefficient
0,region_nord,0.569425
1,region_centre,0.886543
2,region_sud,0.145767
3,Month Value,0.159173
4,Year Value,2.029798
5,Rayonnement solaire global (W/m2),-2.081593
6,Production solaire (GWh),2.032854


In [67]:
summary_table.index = summary_table.index+1

In [68]:
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,feature_name,Coefficient
0,Intercept,0.118351
1,region_nord,0.569425
2,region_centre,0.886543
3,region_sud,0.145767
4,Month Value,0.159173
5,Year Value,2.029798
6,Rayonnement solaire global (W/m2),-2.081593
7,Production solaire (GWh),2.032854


## Interpreting the coefficient

In [69]:
summary_table['Odd_ratio'] = np.exp(summary_table.Coefficient)

In [70]:
summary_table

Unnamed: 0,feature_name,Coefficient,Odd_ratio
0,Intercept,0.118351,1.125639
1,region_nord,0.569425,1.76725
2,region_centre,0.886543,2.426725
3,region_sud,0.145767,1.156927
4,Month Value,0.159173,1.172541
5,Year Value,2.029798,7.612549
6,Rayonnement solaire global (W/m2),-2.081593,0.124731
7,Production solaire (GWh),2.032854,7.635847


In [71]:
summary_table.sort_values('Odd_ratio', ascending=False)

Unnamed: 0,feature_name,Coefficient,Odd_ratio
7,Production solaire (GWh),2.032854,7.635847
5,Year Value,2.029798,7.612549
2,region_centre,0.886543,2.426725
1,region_nord,0.569425,1.76725
4,Month Value,0.159173,1.172541
3,region_sud,0.145767,1.156927
0,Intercept,0.118351,1.125639
6,Rayonnement solaire global (W/m2),-2.081593,0.124731


# Testing the model

In [72]:
reg.score(x_test, y_test)

0.7878787878787878

In [73]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[1.90589293e-03, 9.98094107e-01],
       [8.94562180e-01, 1.05437820e-01],
       [3.85306007e-01, 6.14693993e-01],
       [4.97734230e-02, 9.50226577e-01],
       [8.51349377e-01, 1.48650623e-01],
       [7.95645468e-01, 2.04354532e-01],
       [9.30165084e-01, 6.98349164e-02],
       [2.18347003e-01, 7.81652997e-01],
       [9.82862419e-01, 1.71375808e-02],
       [2.28701131e-02, 9.77129887e-01],
       [2.18631659e-01, 7.81368341e-01],
       [4.13667219e-01, 5.86332781e-01],
       [6.96914977e-02, 9.30308502e-01],
       [2.46810102e-05, 9.99975319e-01],
       [9.86293082e-01, 1.37069176e-02],
       [9.77179744e-01, 2.28202562e-02],
       [4.03815949e-04, 9.99596184e-01],
       [1.21478501e-02, 9.87852150e-01],
       [3.51086622e-03, 9.96489134e-01],
       [8.69143173e-01, 1.30856827e-01],
       [2.44265860e-01, 7.55734140e-01],
       [9.26563481e-01, 7.34365192e-02],
       [9.93093500e-01, 6.90650026e-03],
       [7.86750131e-01, 2.13249869e-01],
       [5.191878

In [74]:
predicted_proba.shape

(33, 2)

In [75]:
predicted_proba[:,1]

array([0.99809411, 0.10543782, 0.61469399, 0.95022658, 0.14865062,
       0.20435453, 0.06983492, 0.781653  , 0.01713758, 0.97712989,
       0.78136834, 0.58633278, 0.9303085 , 0.99997532, 0.01370692,
       0.02282026, 0.99959618, 0.98785215, 0.99648913, 0.13085683,
       0.75573414, 0.07343652, 0.0069065 , 0.21324987, 0.4808122 ,
       0.57731939, 0.01409327, 0.23134944, 0.14586312, 0.53313718,
       0.36231268, 0.44203791, 0.7324163 ])

# Saving the model

In [76]:
import pickle

In [77]:
with open('model','wb') as file:
    pickle.dump(reg, file)

In [78]:
with open('scaler','wb') as file:
    pickle.dump(solar_scaler, file)