In [1]:
# Import Libraries

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm 
from statsmodels.iolib.summary2 import summary_col 

In [2]:
# Start a Spark session for this notebook under the application name "LRM"
# (getOrCreate hands back an already-running session when there is one)
spark = (
    SparkSession.builder
    .appName("LRM")
    .getOrCreate()
)

In [3]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('weather_august.csv')
# Build a PySpark DataFrame from the Pandas DataFrame (uses the `spark` session)
# NOTE(review): the data is converted straight back to Pandas on the next line and
# no Spark transformation is applied in between -- confirm the round trip is needed
aug_df = spark.createDataFrame(df)
# Convert the PySpark DataFrame back to a Pandas DataFrame; this is the frame
# the regression formulas in the following cells are fitted on
aug_df_pd = aug_df.toPandas()
# Display the DataFrame as the cell output
aug_df_pd

Unnamed: 0,Precipitation,Global_Radiation,Avarage_Atmospheric_Pressure,Avarage_Temperature,Avarage_Dew_Temperature,Avarage_Relative_Humidity,Avarage_Wind_Speed
0,0.0,0.1,929.30,12.15,12.10,100.0,2.70
1,0.0,5.0,929.75,12.35,12.35,100.0,2.45
2,0.0,135.8,930.20,12.55,12.55,100.0,2.50
3,0.0,488.0,930.40,13.35,13.35,100.0,2.90
4,0.0,1631.1,930.60,15.50,13.65,89.5,3.05
...,...,...,...,...,...,...,...
370,0.0,2859.9,933.70,17.75,9.65,59.5,4.85
371,0.0,2399.6,932.75,17.60,10.10,62.0,5.60
372,0.0,1711.2,932.35,16.30,11.00,71.0,6.20
373,0.0,896.8,932.15,14.80,11.25,79.5,6.10


In [4]:
# Estimate a multiple linear regression (OLS) of Avarage_Temperature on all
# six candidate predictors.
# The formula is assembled from adjacent string literals instead of backslash
# continuations inside one literal: the latter embeds the source indentation in
# the formula text and breaks if any whitespace follows a backslash.
lrm_formula = (
    "Avarage_Temperature ~ Precipitation + "
    "Global_Radiation + "
    "Avarage_Atmospheric_Pressure + "
    "Avarage_Dew_Temperature + "
    "Avarage_Relative_Humidity + "
    "Avarage_Wind_Speed"
)
model_lrm = sm.OLS.from_formula(lrm_formula, aug_df_pd).fit()

# Get the R² (collected later in the model-comparison table)
lrm_R2 = model_lrm.rsquared

# Model parameters -- kept as the last expression so the summary is rendered
model_lrm.summary()

0,1,2,3
Dep. Variable:,Avarage_Temperature,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.95
Method:,Least Squares,F-statistic:,1177.0
Date:,"Mon, 18 Sep 2023",Prob (F-statistic):,1.13e-236
Time:,16:06:22,Log-Likelihood:,-559.76
No. Observations:,375,AIC:,1134.0
Df Residuals:,368,BIC:,1161.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,127.1026,19.158,6.634,0.000,89.430,164.775
Precipitation,0.2259,0.200,1.128,0.260,-0.168,0.620
Global_Radiation,-0.0003,0.000,-2.498,0.013,-0.000,-5.41e-05
Avarage_Atmospheric_Pressure,-0.1076,0.021,-5.188,0.000,-0.148,-0.067
Avarage_Dew_Temperature,0.8171,0.032,25.771,0.000,0.755,0.879
Avarage_Relative_Humidity,-0.2465,0.006,-40.919,0.000,-0.258,-0.235
Avarage_Wind_Speed,-0.1073,0.034,-3.173,0.002,-0.174,-0.041

0,1,2,3
Omnibus:,41.461,Durbin-Watson:,0.427
Prob(Omnibus):,0.0,Jarque-Bera (JB):,88.902
Skew:,0.594,Prob(JB):,4.96e-20
Kurtosis:,5.069,Cond. No.,528000.0


In [5]:
# Stepwise selection applied to the linear model 'model_lrm'
# (the dependent variable is not Box-Cox transformed in this cell)

# Stepwise Procedure

# Installation and loading of the package's 'stepwise' function
#'statstests.process'
# Package authors: Helder Prado Santos and Luiz Paulo Fávero
# https://stats-tests.github.io/statstests/
# pip install statstests
from statstests.process import stepwise

# Model estimation using the Stepwise procedure with a 0.05 p-value limit
# NOTE: the variable name is misspelled ('setpwise'); it is kept because the
# normality-test cell reads the residuals through this exact name
model_setpwise = stepwise(model_lrm, pvalue_limit=0.05)

# Get the R² (collected later in the model-comparison table)
stepwise_R2 = model_setpwise.rsquared

Regression type: OLS 

Estimating model...: 
 Avarage_Temperature ~ Q('Precipitation') + Q('Global_Radiation') + Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed')

 Discarding atribute "Q('Precipitation')" with p-value equal to 0.2601333649746338 

Estimating model...: 
 Avarage_Temperature ~ Q('Global_Radiation') + Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed')

 No more atributes with p-value higher than 0.05

 Atributes discarded on the process...: 

{'atribute': "Q('Precipitation')", 'p-value': 0.2601333649746338}

 Model after stepwise process...: 
 Avarage_Temperature ~ Q('Global_Radiation') + Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed') 

                             OLS Regression Results                            
Dep. Variable:     Avarage_Temperatu

In [6]:
# Test whether the residuals of the stepwise model adhere to normality

# Shapiro-Wilk test (n < 30) -- not used here, shown for reference only:
# from scipy.stats import shapiro
# shapiro(model_setpwise.resid)

# Shapiro-Francia test (n >= 30)
# Installation and loading of the package 'shapiro_francia' function
#'statstests.tests'
# Package authors: Luiz Paulo Fávero and Helder Prado Santos
# https://stats-tests.github.io/statstests/
# pip install statstests
from statstests.tests import shapiro_francia

# Run the test exactly once: shapiro_francia prints its own report on every
# call, so calling it a second time only duplicated that printed output.
sf_result = shapiro_francia(model_setpwise.resid)

# Interpretation
teste_sf = sf_result.items()  # (key, value) pairs of the result dictionary
# Each unpacked name is a (key, value) tuple, hence the [1] indexing below
method, statistics_W, statistics_z, p = teste_sf
print('Statistics W=%.5f, p-value=%.6f' % (statistics_W[1], p[1]))
alpha = 0.05  # level of significance
if p[1] > alpha:
    print('H0 is not rejected - Distribution in line with normality')
else:
    print('H0 is rejected - Distribution not adhering to normality')


method  :  Shapiro-Francia normality test
statistics W  :  0.9644255600889945
statistics z  :  4.952920495261633
p-value  :  3.655392519830028e-07
method  :  Shapiro-Francia normality test
statistics W  :  0.9644255600889945
statistics z  :  4.952920495261633
p-value  :  3.655392519830028e-07
Statistics W=0.96443, p-value=0.000000
H0 is rejected - Distribution not adhering to normality


In [7]:
# Box-Cox transformation of the dependent variable and re-estimation of the model
from scipy.stats import boxcox

# With no lambda supplied, boxcox returns the transformed values together with
# the lambda it fitted. Note that boxcox only accepts strictly positive data.
x, lmbda = boxcox(aug_df_pd['Avarage_Temperature'])
print("Lambda: ",lmbda)

# Copy the original DataFrame so the source frame is left untouched
new_df = aug_df_pd.copy()

# Insert the Box-Cox transformed temperature (not the lambda itself) as the
# new dependent variable
new_df['bc_Avarage_Temperature'] = x

# Estimate a new multiple model with the dependent variable transformed by Box-Cox.
# The formula is built from adjacent string literals so that no source
# indentation ends up inside the formula text.
boxcox_formula = (
    "bc_Avarage_Temperature ~ Precipitation + "
    "Global_Radiation + "
    "Avarage_Atmospheric_Pressure + "
    "Avarage_Dew_Temperature + "
    "Avarage_Relative_Humidity + "
    "Avarage_Wind_Speed"
)
model_boxcox = sm.OLS.from_formula(boxcox_formula, new_df).fit()

# Get the R² (collected later in the model-comparison table)
model_boxcox_R2 = model_boxcox.rsquared

# Model parameters -- must be the LAST expression of the cell: when it sat in
# the middle of the cell its value was discarded and the summary never rendered
model_boxcox.summary()

Lambda:  0.238363980285637


In [8]:
# Apply the Stepwise procedure to 'model_boxcox'
model_step_boxcox = stepwise(model_boxcox, pvalue_limit=0.05)

# Get the R² (collected later in the model-comparison table)
model_step_boxcox_R2 = model_step_boxcox.rsquared

# Check the normality of the residuals of 'model_step_boxcox'

# Shapiro-Francia test -- run exactly once: shapiro_francia prints its own
# report on every call, so a second call only duplicated that printed output.
sf_result_boxcox = shapiro_francia(model_step_boxcox.resid)

# Interpretation
teste_sf = sf_result_boxcox.items()  # (key, value) pairs of the result dictionary
# Each unpacked name is a (key, value) tuple, hence the [1] indexing below
method, statistics_W, statistics_z, p = teste_sf
print('Statistics W=%.5f, p-value=%.6f' % (statistics_W[1], p[1]))
alpha = 0.05  # level of significance
if p[1] > alpha:
    print('H0 is not rejected - Distribution in line with normality')
else:
    print('H0 is rejected - Distribution not adhering to normality')


Regression type: OLS 

Estimating model...: 
 bc_Avarage_Temperature ~ Q('Precipitation') + Q('Global_Radiation') + Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed')

 Discarding atribute "Q('Global_Radiation')" with p-value equal to 0.7983631522691781 

Estimating model...: 
 bc_Avarage_Temperature ~ Q('Precipitation') + Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed')

 Discarding atribute "Q('Precipitation')" with p-value equal to 0.46837865818917135 

Estimating model...: 
 bc_Avarage_Temperature ~ Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed')

 No more atributes with p-value higher than 0.05

 Atributes discarded on the process...: 

{'atribute': "Q('Global_Radiation')", 'p-value': 0.7983631522691781}
{'atribute': "Q('Precipitation')", 'p-value': 0.468378658

In [9]:
# Gather the R² of the four fitted models into a single comparison table
model_labels = ["Linear Regression", "Stepwise Regression", "BoxCox", "Stepwise Boxcox"]
r2_values = [lrm_R2, stepwise_R2, model_boxcox_R2, model_step_boxcox_R2]
R2_data = {"Model": model_labels, "R-squared": r2_values}

# One row per model; the frame is the cell output
R2_df = pd.DataFrame(R2_data)
R2_df

Unnamed: 0,Model,R-squared
0,Linear Regression,0.950467
1,Stepwise Regression,0.950295
2,BoxCox,0.968649
3,Stepwise Boxcox,0.968598
