# Weather

In [1]:
import helpers as hp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import boxcox
from scipy.stats import normaltest
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
import statsmodels.api as sm

import warnings
warnings.filterwarnings(action="ignore")

from config import usr, pwd, url, port, db

%matplotlib inline

## Load Data
Load the dataset and inspect it's contents.

In [2]:
engine = create_engine(f"postgresql+psycopg2://{usr}:{pwd}@{url}:{port}/{db}")

In [3]:
query = f'''
SELECT
    *
FROM
    {db};
'''

In [4]:
df = pd.read_sql_query(query, con=engine)

In [5]:
engine.dispose()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
date                   96453 non-null datetime64[ns, UTC]
summary                96453 non-null object
preciptype             96453 non-null object
temperature            96453 non-null float64
apparenttemperature    96453 non-null float64
humidity               96453 non-null float64
windspeed              96453 non-null float64
windbearing            96453 non-null float64
visibility             96453 non-null float64
loudcover              96453 non-null float64
pressure               96453 non-null float64
dailysummary           96453 non-null object
dtypes: datetime64[ns, UTC](1), float64(8), object(3)
memory usage: 8.8+ MB


### Target Variable
The target variable is the difference betweem _temperature_ and _apparenttemperature_.

In [7]:
df['temp_diff'] = df['apparenttemperature'] - df['temperature']

## Model 1
Only consider the following columns for this analysis.

In [8]:
columns = ['humidity', 'windspeed', 'temp_diff']

In [9]:
df_subset_1 = df.loc[:, columns].copy()

In [10]:
df_subset_1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
humidity,96453.0,0.734899,0.195473,0.0,0.6,0.78,0.89,1.0
windspeed,96453.0,10.81064,6.913571,0.0,5.8282,9.9659,14.1358,63.8526
temp_diff,96453.0,-1.07765,1.678694,-10.183333,-2.216667,0.0,0.0,4.811111


In [11]:
df_subset_1.corr()

Unnamed: 0,humidity,windspeed,temp_diff
humidity,1.0,-0.224951,-0.242212
windspeed,-0.224951,1.0,-0.411943
temp_diff,-0.242212,-0.411943,1.0


### Modeling

In [12]:
features = ['humidity', 'windspeed']
X = df_subset_1.loc[:, features]
y = df_subset_1['temp_diff']

X = sm.add_constant(X)

results = sm.OLS(y, X).fit()

In [13]:
results.summary()

0,1,2,3
Dep. Variable:,temp_diff,R-squared:,0.288
Model:,OLS,Adj. R-squared:,0.288
Method:,Least Squares,F-statistic:,19490.0
Date:,"Wed, 14 Aug 2019",Prob (F-statistic):,0.0
Time:,12:00:51,Log-Likelihood:,-170460.0
No. Observations:,96453,AIC:,340900.0
Df Residuals:,96450,BIC:,340900.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.4381,0.021,115.948,0.000,2.397,2.479
humidity,-3.0292,0.024,-126.479,0.000,-3.076,-2.982
windspeed,-0.1193,0.001,-176.164,0.000,-0.121,-0.118

0,1,2,3
Omnibus:,3935.747,Durbin-Watson:,0.267
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4613.311
Skew:,-0.478,Prob(JB):,0.0
Kurtosis:,3.484,Cond. No.,88.1


**Observations:**

R<sup>2</sup> and adjusted R<sup>2</sup> are the same and are small, below 0.5. The model is doing a poor job of explaining the target variable.

## Model 2
Create an interaction term between _humidity_ and _windspeed_, re-run the model, and interpret the results.

In [14]:
df_subset_2 = df_subset_1.copy()

In [15]:
df_subset_2['humid_wind'] = df_subset_2['humidity']*df_subset_2['windspeed']

In [16]:
df_subset_2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
humidity,96453.0,0.734899,0.195473,0.0,0.6,0.78,0.89,1.0
windspeed,96453.0,10.81064,6.913571,0.0,5.8282,9.9659,14.1358,63.8526
temp_diff,96453.0,-1.07765,1.678694,-10.183333,-2.216667,0.0,0.0,4.811111
humid_wind,96453.0,7.640729,5.034842,0.0,3.820852,6.701464,10.21384,43.346835


### Modeling

In [17]:
features = ['humidity', 'windspeed', 'humid_wind']
X = df_subset_2.loc[:, features]
y = df_subset_2['temp_diff']

X = sm.add_constant(X)

results = sm.OLS(y, X).fit()

In [18]:
results.summary()

0,1,2,3
Dep. Variable:,temp_diff,R-squared:,0.341
Model:,OLS,Adj. R-squared:,0.341
Method:,Least Squares,F-statistic:,16660.0
Date:,"Wed, 14 Aug 2019",Prob (F-statistic):,0.0
Time:,12:00:51,Log-Likelihood:,-166690.0
No. Observations:,96453,AIC:,333400.0
Df Residuals:,96449,BIC:,333400.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0839,0.033,2.511,0.012,0.018,0.149
humidity,0.1775,0.043,4.133,0.000,0.093,0.262
windspeed,0.0905,0.002,36.797,0.000,0.086,0.095
humid_wind,-0.2971,0.003,-88.470,0.000,-0.304,-0.291

0,1,2,3
Omnibus:,4849.937,Durbin-Watson:,0.265
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9295.404
Skew:,-0.378,Prob(JB):,0.0
Kurtosis:,4.32,Cond. No.,193.0


**Observations:**
After adding the interaction term, _humid_wind_, both R<sup>2</sup> values have increased from 0.28 to 0.34. The model is performing better, but still not very well.

## Model 3
Add _visibility_ as an additional feature and evalutate the model.

In [19]:
df_subset_3 = df_subset_2.copy()
df_subset_3['visibility'] = df['visibility']

In [21]:
df_subset_3.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
humidity,96453.0,0.734899,0.195473,0.0,0.6,0.78,0.89,1.0
windspeed,96453.0,10.81064,6.913571,0.0,5.8282,9.9659,14.1358,63.8526
temp_diff,96453.0,-1.07765,1.678694,-10.183333,-2.216667,0.0,0.0,4.811111
humid_wind,96453.0,7.640729,5.034842,0.0,3.820852,6.701464,10.21384,43.346835
visibility,96453.0,10.347325,4.192123,0.0,8.3398,10.0464,14.812,16.1


In [22]:
features = ['humidity', 'windspeed', 'humid_wind', 'visibility']
X = df_subset_3.loc[:, features]
y = df_subset_3['temp_diff']

X = sm.add_constant(X)

results = sm.OLS(y, X).fit()

In [23]:
results.summary()

0,1,2,3
Dep. Variable:,temp_diff,R-squared:,0.364
Model:,OLS,Adj. R-squared:,0.363
Method:,Least Squares,F-statistic:,13770.0
Date:,"Wed, 14 Aug 2019",Prob (F-statistic):,0.0
Time:,12:04:31,Log-Likelihood:,-165040.0
No. Observations:,96453,AIC:,330100.0
Df Residuals:,96448,BIC:,330100.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.1006,0.039,-28.459,0.000,-1.176,-1.025
humidity,0.8909,0.044,20.263,0.000,0.805,0.977
windspeed,0.1033,0.002,42.579,0.000,0.099,0.108
humid_wind,-0.3164,0.003,-95.355,0.000,-0.323,-0.310
visibility,0.0646,0.001,58.051,0.000,0.062,0.067

0,1,2,3
Omnibus:,5328.364,Durbin-Watson:,0.288
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11525.074
Skew:,-0.373,Prob(JB):,0.0
Kurtosis:,4.52,Cond. No.,246.0


**Observations:** Add the additional feature _visibility_, both R<sup>2</sup> values increased slightly to 0.36. The model is still performing poorly.

## Model Evaluation

In [32]:
model_name = ['model 1', 'model 2', 'model 3']
aic_scores = [3.409e+05, 3.334e+05, 3.301e+05]
bic_scores = [3.409e+05, 3.334e+05, 3.301e+05]

evaluation_dict = {'model': model_name,
                   'aic_score': aic_scores,
                   'bic_score': bic_scores}

df_evaluate = pd.DataFrame(evaluation_dict)

In [33]:
df_evaluate

Unnamed: 0,model,aic_score,bic_score
0,model 1,340900.0,340900.0
1,model 2,333400.0,333400.0
2,model 3,330100.0,330100.0


All AIC and BIC scores are really high. This is means none of the models are performing well. Model 3 has the lowest AIC and BIC scores, meaning this model is performing the best. Maybe adding additional features could improve the model.