## Análise em um conjunto de dados do Kaggle
Dataset com dados sobre a produção de energia solar, levando em consideração diversas variáveis.
* O objetivo é prever a geração de energia solar (`Power Generated`) usando regressão linear, primeiro com o Statsmodels e depois com o Scikit-learn

In [None]:
import pandas as pd
import numpy as np

In [4]:
# Load the solar power generation dataset; the path is relative to the working directory.
dados = pd.read_csv(filepath_or_buffer='Geração de energia solar.csv')

In [5]:
# Preview the first five rows of the loaded frame.
dados.head(n=5)

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Power Generated
0,245,2008,9,1,1,False,0.859897,69,28,7.5,0,10.0,75,8.0,29.82,0
1,245,2008,9,1,4,False,0.628535,69,28,7.5,0,10.0,77,5.0,29.85,0
2,245,2008,9,1,7,True,0.397172,69,28,7.5,0,10.0,70,0.0,29.89,5418
3,245,2008,9,1,10,True,0.16581,69,28,7.5,0,10.0,33,0.0,29.91,25477
4,245,2008,9,1,13,True,0.065553,69,28,7.5,0,10.0,21,3.0,29.89,30069


In [6]:
# Check the dtype and non-null count of every column
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2920 entries, 0 to 2919
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Day of Year                           2920 non-null   int64  
 1   Year                                  2920 non-null   int64  
 2   Month                                 2920 non-null   int64  
 3   Day                                   2920 non-null   int64  
 4   First Hour of Period                  2920 non-null   int64  
 5   Is Daylight                           2920 non-null   bool   
 6   Distance to Solar Noon                2920 non-null   float64
 7   Average Temperature (Day)             2920 non-null   int64  
 8   Average Wind Direction (Day)          2920 non-null   int64  
 9   Average Wind Speed (Day)              2920 non-null   float64
 10  Sky Cover                             2920 non-null   int64  
 11  Visibility       

### Transformando a coluna de valor booleano em 0 ou 1

In [7]:
# Encode the boolean 'Is Daylight' flag as 0/1 so it can be used as a numeric regressor.
# The former `dados.replace('True', 1)` call was dropped: its result was never assigned
# and, once the column is cast, there are no 'True' strings left to replace.
dados['Is Daylight'] = dados['Is Daylight'].astype('int')
dados.head()

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Power Generated
0,245,2008,9,1,1,0,0.859897,69,28,7.5,0,10.0,75,8.0,29.82,0
1,245,2008,9,1,4,0,0.628535,69,28,7.5,0,10.0,77,5.0,29.85,0
2,245,2008,9,1,7,1,0.397172,69,28,7.5,0,10.0,70,0.0,29.89,5418
3,245,2008,9,1,10,1,0.16581,69,28,7.5,0,10.0,33,0.0,29.91,25477
4,245,2008,9,1,13,1,0.065553,69,28,7.5,0,10.0,21,3.0,29.89,30069


## Statsmodels

In [8]:
import statsmodels.api as sm

In [9]:
# Name of the target column, i.e. the variable the models try to predict.
y = 'Power Generated'
# Show the feature columns only; the result is not stored, so `dados` is unchanged.
dados.drop(columns=[y])

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period)
0,245,2008,9,1,1,0,0.859897,69,28,7.5,0,10.0,75,8.0,29.82
1,245,2008,9,1,4,0,0.628535,69,28,7.5,0,10.0,77,5.0,29.85
2,245,2008,9,1,7,1,0.397172,69,28,7.5,0,10.0,70,0.0,29.89
3,245,2008,9,1,10,1,0.165810,69,28,7.5,0,10.0,33,0.0,29.91
4,245,2008,9,1,13,1,0.065553,69,28,7.5,0,10.0,21,3.0,29.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,243,2009,8,31,10,1,0.166453,63,27,13.9,4,10.0,75,10.0,29.93
2916,243,2009,8,31,13,1,0.064020,63,27,13.9,1,10.0,66,15.0,29.91
2917,243,2009,8,31,16,1,0.294494,63,27,13.9,2,10.0,68,21.0,29.88
2918,243,2009,8,31,19,1,0.524968,63,27,13.9,2,10.0,81,17.0,29.87


### Tratando valores ausentes

In [10]:
# Count the missing values in each column.
dados.isna().sum(axis=0)

Day of Year                             0
Year                                    0
Month                                   0
Day                                     0
First Hour of Period                    0
Is Daylight                             0
Distance to Solar Noon                  0
Average Temperature (Day)               0
Average Wind Direction (Day)            0
Average Wind Speed (Day)                0
Sky Cover                               0
Visibility                              0
Relative Humidity                       0
Average Wind Speed (Period)             1
Average Barometric Pressure (Period)    0
Power Generated                         0
dtype: int64

In [11]:
# Fill the missing 'Average Wind Speed (Period)' reading with 0.
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: that chained in-place form operates on a temporary Series and, under pandas
# Copy-on-Write, leaves `dados` untouched, so the NaN would reach the models below.
dados['Average Wind Speed (Period)'] = dados['Average Wind Speed (Period)'].fillna(0)

In [12]:
# Baseline OLS of the target on the raw feature columns.
# NOTE: no constant column is added here, so this model is fitted without an intercept.
modelo = sm.OLS(endog=dados[y], exog=dados.drop(columns=[y]))
res = modelo.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:        Power Generated   R-squared (uncentered):                   0.780
Model:                            OLS   Adj. R-squared (uncentered):              0.779
Method:                 Least Squares   F-statistic:                              686.2
Date:                Wed, 27 Apr 2022   Prob (F-statistic):                        0.00
Time:                        21:08:19   Log-Likelihood:                         -29468.
No. Observations:                2920   AIC:                                  5.897e+04
Df Residuals:                    2905   BIC:                                  5.906e+04
Df Model:                          15                                                  
Covariance Type:            nonrobust                                                  
                                           coef    std err          t      P>|t|      [0.025      0.975]
---------------

In [13]:
# StandardScaler computes the mean and standard deviation on a training set
from sklearn.preprocessing import StandardScaler 

In [14]:
# Feature matrix: every column except the target.
X = dados.drop(columns=[y])

# Standardize the features and wrap the resulting array back into a DataFrame
# so the regression summary keeps the original column names.
scaler = StandardScaler()
normalized_X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Explicit column of ones so the OLS fit includes an intercept term.
normalized_X['intercept'] = 1

modelo = sm.OLS(endog=dados[y], exog=normalized_X)
res = modelo.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:        Power Generated   R-squared:                       0.681
Model:                            OLS   Adj. R-squared:                  0.679
Method:                 Least Squares   F-statistic:                     413.3
Date:                Wed, 27 Apr 2022   Prob (F-statistic):               0.00
Time:                        21:08:20   Log-Likelihood:                -29459.
No. Observations:                2920   AIC:                         5.895e+04
Df Residuals:                    2904   BIC:                         5.905e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


## Scikit-learn

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
Xtrain, Xval, ytrain, yval = train_test_split(X, dados[y], test_size=0.3, random_state=0)
print(Xtrain.shape, Xval.shape, ytrain.shape, yval.shape)

# The `normalize` keyword was deprecated in scikit-learn 1.0 and removed in 1.2, so
# passing it raises TypeError on current versions. For ordinary least squares it did
# not change the predictions and coef_ was reported on the original feature scale,
# so the default estimator is the equivalent fit.
modelo = LinearRegression()
modelo.fit(Xtrain, ytrain)

# Predictions for the validation rows.
p = modelo.predict(Xval)

(2044, 15) (876, 15) (2044,) (876,)


In [16]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the predictions against the validation targets.
np.sqrt(mean_squared_error(y_true=yval, y_pred=p))

5892.91348405022

In [17]:
# Fitted coefficients of the current `modelo` (trained on the unscaled Xtrain)
modelo.coef_

array([ 2.54468940e+02,  2.34825333e+03, -7.48359120e+03, -2.58240914e+02,
       -1.14476598e+01, -6.16610587e+03, -3.11856825e+04, -1.10988992e+02,
        5.31163058e+01, -1.25064356e+02, -8.06855824e+02,  1.32835491e+02,
       -1.57403024e+02,  1.64178091e+02,  1.50412940e+03])

## Scikit-learn standard

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# 70/30 train/validation split with a fixed seed for reproducibility.
Xtrain, Xval, ytrain, yval = train_test_split(X, dados[y], test_size=0.3, random_state=0)
print(Xtrain.shape, Xval.shape, ytrain.shape, yval.shape)

# Fit the scaler on the training rows only and reuse its statistics for the
# validation rows, so no information from the validation set leaks into training.
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xval_scaled = scaler.transform(Xval)

# `normalize=False` was dropped: the keyword was removed in scikit-learn 1.2 and
# passing it (even as False) raises TypeError on current versions.
modelo = LinearRegression(fit_intercept=True)
modelo.fit(Xtrain_scaled, ytrain)

# Predictions for the standardized validation rows.
p = modelo.predict(Xval_scaled)

(2044, 15) (876, 15) (2044,) (876,)


In [19]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the predictions against the validation targets.
np.sqrt(mean_squared_error(y_true=yval, y_pred=p))

5892.913484050154

In [20]:
# Fitted coefficients of the current `modelo` (trained on the standardized Xtrain_scaled)
modelo.coef_

array([ 26670.95125392,   1102.70815076, -25656.39169562,  -2246.38921879,
          -79.35100406,  -3006.04078954,  -9250.39327397,   -760.31971022,
          361.10720944,   -599.45903285,  -1141.95830667,    181.48424505,
        -2373.32479015,   1200.72577047,    213.01399324])

### Observação:

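A regressão com StandardScaler obteve o mesmo RMSE (≈ 5892,91) que a regressão sem padronização: em mínimos quadrados ordinários, padronizar as variáveis não altera as previsões, apenas a escala dos coeficientes, que passam a ser comparáveis entre si.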