# Random Forest and regression to overcome obstacles

## Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/charlesp1996/Pythonproject/main/data.csv")

In [3]:
df

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Vitesse du vent Axa0 100m (m/s),Production éolienne (GWh),Rayonnement solaire global (W/m2),Production solaire (GWh)
0,1,0,0,2,2014,,6.518965,,2.931509
1,0,0,1,3,2014,,220.644746,,116.568735
2,0,0,1,4,2014,,189.563606,,125.730294
3,0,0,1,4,2014,,9.671714,,98.951709
4,1,0,0,5,2014,,363.412141,,58.899353
...,...,...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,0.0,,5.82,
1249,0,0,1,3,2021,0.0,,6.74,
1250,0,1,0,3,2021,0.0,,7.15,
1251,0,0,1,3,2021,0.0,,6.04,


In [4]:
df = df[df['Rayonnement solaire global (W/m2)'].notna()]

In [5]:
cols = df.columns.tolist()
cols

['region_nord',
 'region_centre',
 'region_sud',
 'Month Value',
 'Year Value',
 'Vitesse du vent Axa0 100m (m/s)',
 'Production éolienne (GWh)',
 'Rayonnement solaire global (W/m2)',
 'Production solaire (GWh)']

### Solaire

In [6]:
colss = cols[:5] + cols[7:9]

In [7]:
colss

['region_nord',
 'region_centre',
 'region_sud',
 'Month Value',
 'Year Value',
 'Rayonnement solaire global (W/m2)',
 'Production solaire (GWh)']

In [8]:
dfs = df[colss]

In [9]:
dfs

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
47,0,1,0,1,2016,7.466964,7.040643
48,1,0,0,1,2016,8.295263,5.596140
49,0,1,0,2,2016,9.242543,23.337292
50,0,1,0,2,2016,9.946336,9.640264
51,0,0,1,2,2016,6.508233,116.805294
...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,
1249,0,0,1,3,2021,6.740000,
1250,0,1,0,3,2021,7.150000,
1251,0,0,1,3,2021,6.040000,


In [10]:
X,y = make_regression(n_samples=821, n_features=6,
                                 n_informative=5, n_targets=1,  
                                 tail_strength=0.5,  
                                 shuffle=True, coef=False, random_state=0)
notnans = dfs['Production solaire (GWh)'].notnull()
df_notnans = df[notnans]
X_train, X_test, y_train, y_test = train_test_split(df_notnans[colss[:6]], df_notnans[colss[6:7]],
                                                    train_size=0.75,
                                                    random_state=250)

In [11]:
regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=30,
                                                          random_state=0))

# Fit on the train data
regr_multirf.fit(X_train, y_train)

# Check the prediction score
score = regr_multirf.score(X_test, y_test)
print("The prediction score on the test data is {:.2f}%".format(score*100))

The prediction score on the test data is 64.75%


In [12]:
df_nans = dfs.loc[~notnans].copy()
df_nans[colss[6:7]] = regr_multirf.predict(df_nans[colss[0:6]])
df_nans

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
1092,1,0,0,1,2016,6.130850,10.805883
1093,0,0,0,1,2016,7.882551,7.651051
1094,1,0,0,2,2016,6.926638,7.326095
1095,0,0,0,2,2016,8.113793,12.297672
1096,1,0,0,3,2016,5.711842,16.241297
...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,236.481544
1249,0,0,1,3,2021,6.740000,214.822227
1250,0,1,0,3,2021,7.150000,28.393754
1251,0,0,1,3,2021,6.040000,187.794577


# Create the targets

In [13]:
targets = np.where(df_nans['Production solaire (GWh)'] > df_nans['Production solaire (GWh)'].median(), 1, 0)

In [14]:
targets.shape

(161,)

In [15]:
df_nans['Execessive Solar'] = targets

# A comment on the targets

In [16]:
targets.sum() / targets.shape[0]

0.4968944099378882

In [17]:
data_with_targets = df_nans

In [18]:
data_with_targets

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh),Execessive Solar
1092,1,0,0,1,2016,6.130850,10.805883,0
1093,0,0,0,1,2016,7.882551,7.651051,0
1094,1,0,0,2,2016,6.926638,7.326095,0
1095,0,0,0,2,2016,8.113793,12.297672,0
1096,1,0,0,3,2016,5.711842,16.241297,0
...,...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,236.481544,1
1249,0,0,1,3,2021,6.740000,214.822227,1
1250,0,1,0,3,2021,7.150000,28.393754,1
1251,0,0,1,3,2021,6.040000,187.794577,1


# Select the inputs for the regression

In [19]:
data_with_targets.shape

(161, 8)

In [20]:
unscaled_inputs = data_with_targets.iloc[:,:-1]
unscaled_inputs

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
1092,1,0,0,1,2016,6.130850,10.805883
1093,0,0,0,1,2016,7.882551,7.651051
1094,1,0,0,2,2016,6.926638,7.326095
1095,0,0,0,2,2016,8.113793,12.297672
1096,1,0,0,3,2016,5.711842,16.241297
...,...,...,...,...,...,...,...
1248,0,0,1,3,2021,5.820000,236.481544
1249,0,0,1,3,2021,6.740000,214.822227
1250,0,1,0,3,2021,7.150000,28.393754
1251,0,0,1,3,2021,6.040000,187.794577


# Standardize the data

In [21]:
from sklearn.preprocessing import StandardScaler

solar_scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.vr_ = None
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled= X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled,X_scaled], axis=1)[init_col_order]

In [23]:
unscaled_inputs.columns.values

array(['region_nord', 'region_centre', 'region_sud', 'Month Value',
       'Year Value', 'Rayonnement solaire global (W/m2)',
       'Production solaire (GWh)'], dtype=object)

In [24]:
columns_to_scale = ['region_sud', 'region_nord', 'region_centre', 'Month Value',
       'Year Value', 'Rayonnement solaire global (W/m2)', 'Production solaire (GWh)']
columns_to_omit = []

In [25]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [26]:
solar_scaler = CustomScaler(columns_to_scale)

In [27]:
solar_scaler.fit(unscaled_inputs)



CustomScaler(columns=['region_nord', 'region_centre', 'region_sud',
                      'Month Value', 'Year Value',
                      'Rayonnement solaire global (W/m2)',
                      'Production solaire (GWh)'],
             copy=None, with_mean=None, with_std=None)

In [28]:
scaled_inputs = solar_scaler.transform(unscaled_inputs)

In [29]:
solar_scaler.fit(unscaled_inputs)



CustomScaler(columns=['region_nord', 'region_centre', 'region_sud',
                      'Month Value', 'Year Value',
                      'Rayonnement solaire global (W/m2)',
                      'Production solaire (GWh)'],
             copy=None, with_mean=None, with_std=None)

In [30]:
scaled_inputs = solar_scaler.transform(unscaled_inputs)

In [31]:
scaled_inputs[:161][:]

Unnamed: 0,region_nord,region_centre,region_sud,Month Value,Year Value,Rayonnement solaire global (W/m2),Production solaire (GWh)
0,1.097943,-0.283790,-0.320530,-1.204950,-1.545362,0.262034,-0.502448
1,-0.910794,-0.283790,-0.320530,-1.204950,-1.545362,1.672103,-0.590000
2,1.097943,-0.283790,-0.320530,-0.927415,-1.545362,0.902621,-0.599018
3,-0.910794,-0.283790,-0.320530,-0.927415,-1.545362,1.858247,-0.461049
4,1.097943,-0.283790,-0.320530,-0.649880,-1.545362,-0.075255,-0.351606
...,...,...,...,...,...,...,...
156,-0.910794,-0.283790,3.119829,-0.649880,1.250181,0.011809,5.760445
157,-0.910794,-0.283790,3.119829,-0.649880,1.250181,0.752383,5.159361
158,-0.910794,3.523729,-0.320530,-0.649880,1.250181,1.082421,-0.014354
159,-0.910794,-0.283790,3.119829,-0.649880,1.250181,0.188903,4.409296


In [32]:
scaled_inputs[:161][:].shape

(161, 7)

# Split the data into train & test and shuffle

## Import the relevant module 

In [33]:
from sklearn.model_selection import train_test_split

## Split

In [34]:
train_test_split(scaled_inputs[:161][:], targets)

[     region_nord  region_centre  region_sud  Month Value  Year Value  \
 105    -0.910794       -0.28379    -0.32053    -0.094810    0.691072   
 21     -0.910794       -0.28379    -0.32053     1.570400   -1.545362   
 36      1.097943       -0.28379    -0.32053     0.460260   -0.986254   
 43     -0.910794       -0.28379    -0.32053     1.292865   -0.986254   
 125     1.097943       -0.28379    -0.32053    -1.204950    1.250181   
 ..           ...            ...         ...          ...         ...   
 155     1.097943       -0.28379    -0.32053    -0.649880    1.250181   
 5      -0.910794       -0.28379    -0.32053    -0.649880   -1.545362   
 65     -0.910794       -0.28379    -0.32053     1.015330   -0.427145   
 66      1.097943       -0.28379    -0.32053     1.292865   -0.427145   
 61     -0.910794       -0.28379    -0.32053     0.460260   -0.427145   
 
      Rayonnement solaire global (W/m2)  Production solaire (GWh)  
 105                          -0.226566               

In [35]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs[:161][:], targets, train_size = 0.8, random_state = 20)

In [36]:
print(x_train.shape, y_train.shape)

(128, 7) (128,)


In [37]:
print(x_test.shape, y_test.shape)

(33, 7) (33,)


# Logisitc regression with sklearn

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Training the model

In [39]:
reg = LogisticRegression()

In [40]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
reg.score(x_train, y_train)

0.9140625

### Manually check accuracy

In [42]:
model_outputs = reg.predict(x_train)
model_outputs

array([1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0])

In [43]:
y_train

array([1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0])

In [44]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True, False,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True]

In [45]:
np.sum(model_outputs==y_train)

117

In [46]:
model_outputs.shape[0]

128

In [47]:
np.sum(model_outputs==y_train)/model_outputs.shape[0]

0.9140625

### Finding the intercept and coefficients

In [48]:
reg.intercept_

array([0.61444096])

In [49]:
reg.coef_

array([[-1.30095379, -0.01652767,  0.00705287, -0.21375656,  1.1642379 ,
        -1.87690145,  2.66049331]])

In [50]:
unscaled_inputs.columns.values

array(['region_nord', 'region_centre', 'region_sud', 'Month Value',
       'Year Value', 'Rayonnement solaire global (W/m2)',
       'Production solaire (GWh)'], dtype=object)

In [51]:
feature_name = unscaled_inputs.columns.values

In [52]:
summary_table = pd.DataFrame (columns=['feature_name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,feature_name,Coefficient
0,region_nord,-1.300954
1,region_centre,-0.016528
2,region_sud,0.007053
3,Month Value,-0.213757
4,Year Value,1.164238
5,Rayonnement solaire global (W/m2),-1.876901
6,Production solaire (GWh),2.660493


In [53]:
summary_table.index = summary_table.index+1

In [54]:
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,feature_name,Coefficient
0,Intercept,0.614441
1,region_nord,-1.300954
2,region_centre,-0.016528
3,region_sud,0.007053
4,Month Value,-0.213757
5,Year Value,1.164238
6,Rayonnement solaire global (W/m2),-1.876901
7,Production solaire (GWh),2.660493


## Interpreting the coefficient

In [55]:
summary_table['Odd_ratio'] = np.exp(summary_table.Coefficient)

In [56]:
summary_table

Unnamed: 0,feature_name,Coefficient,Odd_ratio
0,Intercept,0.614441,1.848623
1,region_nord,-1.300954,0.272272
2,region_centre,-0.016528,0.983608
3,region_sud,0.007053,1.007078
4,Month Value,-0.213757,0.807545
5,Year Value,1.164238,3.203481
6,Rayonnement solaire global (W/m2),-1.876901,0.153064
7,Production solaire (GWh),2.660493,14.303343


In [57]:
summary_table.sort_values('Odd_ratio', ascending=False)

Unnamed: 0,feature_name,Coefficient,Odd_ratio
7,Production solaire (GWh),2.660493,14.303343
5,Year Value,1.164238,3.203481
0,Intercept,0.614441,1.848623
3,region_sud,0.007053,1.007078
2,region_centre,-0.016528,0.983608
4,Month Value,-0.213757,0.807545
1,region_nord,-1.300954,0.272272
6,Rayonnement solaire global (W/m2),-1.876901,0.153064


# Testing the model

In [58]:
reg.score(x_test, y_test)

0.8181818181818182

In [59]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[7.85428868e-03, 9.92145711e-01],
       [8.44640576e-01, 1.55359424e-01],
       [6.34556301e-01, 3.65443699e-01],
       [1.29401618e-01, 8.70598382e-01],
       [7.07085992e-01, 2.92914008e-01],
       [2.07656263e-01, 7.92343737e-01],
       [7.92980546e-01, 2.07019454e-01],
       [4.71923348e-01, 5.28076652e-01],
       [8.12977100e-01, 1.87022900e-01],
       [7.02900358e-02, 9.29709964e-01],
       [5.25460898e-01, 4.74539102e-01],
       [5.54238277e-01, 4.45761723e-01],
       [3.35874137e-01, 6.64125863e-01],
       [1.46717436e-07, 9.99999853e-01],
       [9.95505077e-01, 4.49492316e-03],
       [7.80045763e-01, 2.19954237e-01],
       [1.98899867e-06, 9.99998011e-01],
       [4.88558541e-02, 9.51144146e-01],
       [3.45550240e-04, 9.99654450e-01],
       [6.08104175e-01, 3.91895825e-01],
       [6.62252089e-01, 3.37747911e-01],
       [9.63714141e-01, 3.62858588e-02],
       [9.46985533e-01, 5.30144666e-02],
       [2.12544975e-01, 7.87455025e-01],
       [7.059575

In [60]:
predicted_proba.shape

(33, 2)

In [61]:
predicted_proba[:,1]

array([0.99214571, 0.15535942, 0.3654437 , 0.87059838, 0.29291401,
       0.79234374, 0.20701945, 0.52807665, 0.1870229 , 0.92970996,
       0.4745391 , 0.44576172, 0.66412586, 0.99999985, 0.00449492,
       0.21995424, 0.99999801, 0.95114415, 0.99965445, 0.39189582,
       0.33774791, 0.03628586, 0.05301447, 0.78745502, 0.92940424,
       0.95208106, 0.00416206, 0.8661949 , 0.11716368, 0.25676862,
       0.2008964 , 0.26400703, 0.383717  ])

# Saving the model

In [62]:
import pickle

In [63]:
with open('model','wb') as file:
    pickle.dump(reg, file)

In [64]:
with open('scaler','wb') as file:
    pickle.dump(solar_scaler, file)