In [1]:
# Import Libraries

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm 
from statsmodels.iolib.summary2 import summary_col 

In [2]:
# Start a Spark session for this notebook under the application name "LRM"
# (getOrCreate hands back an already-active session instead of building a new one)
spark = (
    SparkSession.builder
    .appName("LRM")
    .getOrCreate()
)

In [3]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('weather_august.csv')
# Create a PySpark DataFrame from the Pandas DataFrame
aug_df = spark.createDataFrame(df)
# Convert the PySpark DataFrame back to a Pandas DataFrame; this Pandas copy is
# the one the regression models in the following cells are fitted on
# NOTE(review): the data goes Pandas -> Spark -> Pandas without any Spark
# transformation in between — confirm whether the round trip is needed
aug_df_pd = aug_df.toPandas()
# Show the first rows of the DataFrame
aug_df_pd.head()

Unnamed: 0,Precipitation,Avarage_Atmospheric_Pressure,Avarage_Temperature,Avarage_Dew_Temperature,Avarage_Relative_Humidity,Avarage_Wind_Speed
0,0.0,930.45,12.5,12.45,100.0,2.8
1,0.0,930.7,12.6,12.6,100.0,2.25
2,0.0,930.85,12.2,12.2,100.0,1.4
3,0.0,931.05,12.0,12.0,100.0,1.25
4,0.0,930.9,11.8,11.8,100.0,1.25


In [4]:
# Multiple linear regression: temperature explained by every other weather variable.
# The formula is kept as a single string literal (backslash line continuations).
lrm_formula = "Avarage_Temperature ~ Precipitation +\
                                Avarage_Atmospheric_Pressure +\
                                Avarage_Dew_Temperature +\
                                Avarage_Relative_Humidity +\
                                Avarage_Wind_Speed"
model_lrm = sm.OLS.from_formula(formula=lrm_formula, data=aug_df_pd).fit()

# Store the coefficient of determination for the final comparison table
lrm_R2 = model_lrm.rsquared

# Last expression of the cell: renders the full regression summary
model_lrm.summary()

0,1,2,3
Dep. Variable:,Avarage_Temperature,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,2696.0
Date:,"Sat, 23 Sep 2023",Prob (F-statistic):,0.0
Time:,14:45:21,Log-Likelihood:,-1037.3
No. Observations:,744,AIC:,2087.0
Df Residuals:,738,BIC:,2114.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,90.2761,10.358,8.716,0.000,69.941,110.611
Precipitation,0.2166,0.123,1.767,0.078,-0.024,0.457
Avarage_Atmospheric_Pressure,-0.0705,0.011,-6.309,0.000,-0.092,-0.049
Avarage_Dew_Temperature,0.8727,0.019,46.308,0.000,0.836,0.910
Avarage_Relative_Humidity,-0.2324,0.003,-86.756,0.000,-0.238,-0.227
Avarage_Wind_Speed,-0.0443,0.021,-2.127,0.034,-0.085,-0.003

0,1,2,3
Omnibus:,245.488,Durbin-Watson:,0.253
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1267.532
Skew:,1.404,Prob(JB):,5.7400000000000005e-276
Kurtosis:,8.745,Cond. No.,270000.0


In [5]:
# Refining the full linear model ('model_lrm') with the Stepwise procedure
# (the dependent variable is NOT transformed in this cell)

# Stepwise Procedure

# Installation and loading of the 'stepwise' function from the package
# 'statstests.process'
# Package authors: Helder Prado Santos and Luiz Paulo Fávero
# https://stats-tests.github.io/statstests/
# pip install statstests
from statstests.process import stepwise

# Model estimation using the Stepwise procedure: predictors whose p-value is
# above pvalue_limit are discarded and the model is re-estimated (see the log
# printed by the call).
# NOTE: the variable name 'model_setpwise' (sic) is kept because the
# normality-test cell further down reads it.
model_setpwise = stepwise(model_lrm, pvalue_limit=0.05)

# Get the R² of the stepwise model for the final comparison table
stepwise_R2 = model_setpwise.rsquared

Regression type: OLS 

Estimating model...: 
 Avarage_Temperature ~ Q('Precipitation') + Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed')

 Discarding atribute "Q('Precipitation')" with p-value equal to 0.07767336832357906 

Estimating model...: 
 Avarage_Temperature ~ Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed')

 No more atributes with p-value higher than 0.05

 Atributes discarded on the process...: 

{'atribute': "Q('Precipitation')", 'p-value': 0.07767336832357906}

 Model after stepwise process...: 
 Avarage_Temperature ~ Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed') 

                             OLS Regression Results                            
Dep. Variable:     Avarage_Temperature   R-squared:                       0.948
Model:                    

In [6]:
# Test to verify the adherence of the residuals of the stepwise model to normality

# Choice of test: Shapiro-Wilk (scipy.stats.shapiro) is meant for n < 30;
# this model was fitted on 744 observations, so Shapiro-Francia (n >= 30) is used.

# Installation and loading of the 'shapiro_francia' function from the package
# 'statstests.tests'
# Package authors: Luiz Paulo Fávero and Helder Prado Santos
# https://stats-tests.github.io/statstests/
# pip install statstests
from statstests.tests import shapiro_francia

# Run the test exactly once: every call to shapiro_francia prints its own
# report, so a second call only repeats the computation and duplicates the output.
teste_sf = shapiro_francia(model_setpwise.resid)  # creation of the 'teste_sf' object

# Interpretation
teste_sf = teste_sf.items()  # returns the group of key-value pairs in the dictionary
# Each unpacked name is a (key, value) tuple, hence the [1] lookups below
method, statistics_W, statistics_z, p = teste_sf
print('Statistics W=%.5f, p-value=%.6f' % (statistics_W[1], p[1]))
alpha = 0.05  # level of significance
if p[1] > alpha:
    print('H0 is not rejected - Distribution in line with normality')
else:
    print('H0 is rejected - Distribution not adhering to normality')


method  :  Shapiro-Francia normality test
statistics W  :  0.8882378251067146
statistics z  :  9.113505733123752
p-value  :  3.9881001958568025e-20
method  :  Shapiro-Francia normality test
statistics W  :  0.8882378251067146
statistics z  :  9.113505733123752
p-value  :  3.9881001958568025e-20
Statistics W=0.88824, p-value=0.000000
H0 is rejected - Distribution not adhering to normality


In [7]:
# Box-Cox transformation of the dependent variable and re-estimation of the model
from scipy.stats import boxcox

# With no lambda supplied, boxcox returns the transformed values together with
# the lambda it estimated. scipy requires the input to be strictly positive.
x, lmbda = boxcox(aug_df_pd['Avarage_Temperature'])
print("Lambda: ",lmbda)

# Copy the original DataFrame so that 'aug_df_pd' itself is left untouched
new_df = aug_df_pd.copy()

# Insert the Box-Cox transformed temperature (not the lambda) as a new column
new_df['bc_Avarage_Temperature'] = x

# Estimating a new multiple model with dependent variable transformed by Box-Cox
model_boxcox = sm.OLS.from_formula("bc_Avarage_Temperature ~ Precipitation +\
                                Avarage_Atmospheric_Pressure +\
                                Avarage_Dew_Temperature +\
                                Avarage_Relative_Humidity +\
                                Avarage_Wind_Speed", new_df).fit()

# Get the R² for the final comparison table
model_boxcox_R2 = model_boxcox.rsquared

# Model parameters: kept as the LAST expression of the cell so the notebook
# actually renders the summary (a trailing assignment would suppress it)
model_boxcox.summary()

Lambda:  -0.11337365811208733


In [8]:
# Applying the Stepwise procedure to 'model_boxcox': predictors whose p-value
# is above pvalue_limit are discarded and the model is re-estimated
model_step_boxcox = stepwise(model_boxcox, pvalue_limit=0.05)

# Get the R² for the final comparison table
model_step_boxcox_R2 = model_step_boxcox.rsquared

# Checking the normality of the residuals of 'model_step_boxcox'

# Shapiro-Francia test, run exactly once: every call to shapiro_francia prints
# its own report, so a second call only repeats the computation and duplicates
# the output.
teste_sf = shapiro_francia(model_step_boxcox.resid)  # creation of the 'teste_sf' object

# Interpretation
teste_sf = teste_sf.items()  # returns the group of key-value pairs in the dictionary
# Each unpacked name is a (key, value) tuple, hence the [1] lookups below
method, statistics_W, statistics_z, p = teste_sf
print('Statistics W=%.5f, p-value=%.6f' % (statistics_W[1], p[1]))
alpha = 0.05  # level of significance
if p[1] > alpha:
    print('H0 is not rejected - Distribution in line with normality')
else:
    print('H0 is rejected - Distribution not adhering to normality')


Regression type: OLS 

Estimating model...: 
 bc_Avarage_Temperature ~ Q('Precipitation') + Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed')

 Discarding atribute "Q('Precipitation')" with p-value equal to 0.2836463674080907 

Estimating model...: 
 bc_Avarage_Temperature ~ Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed')

 No more atributes with p-value higher than 0.05

 Atributes discarded on the process...: 

{'atribute': "Q('Precipitation')", 'p-value': 0.2836463674080907}

 Model after stepwise process...: 
 bc_Avarage_Temperature ~ Q('Avarage_Atmospheric_Pressure') + Q('Avarage_Dew_Temperature') + Q('Avarage_Relative_Humidity') + Q('Avarage_Wind_Speed') 

                              OLS Regression Results                              
Dep. Variable:     bc_Avarage_Temperature   R-squared:                       0.967
Model:       

In [9]:
# Gather the R² of the four estimated models into one comparison table.
# NOTE(review): the two Box-Cox models are fitted on the transformed response
# ('bc_Avarage_Temperature'), so their R² is measured on a different scale than
# the first two rows — confirm that a side-by-side reading is intended.
model_labels = ["Linear Regression", "Stepwise Regression", "BoxCox", "Stepwise Boxcox"]
r2_values = [lrm_R2, stepwise_R2, model_boxcox_R2, model_step_boxcox_R2]

# Column-oriented mapping: one key per table column
R2_data = {"Model": model_labels, "R-squared": r2_values}

R2_df = pd.DataFrame(R2_data)
R2_df

Unnamed: 0,Model,R-squared
0,Linear Regression,0.9481
1,Stepwise Regression,0.94788
2,BoxCox,0.967283
3,Stepwise Boxcox,0.967232
