## Imports and Data Creation



In [10]:
#Standard library
import os

#Arrays and Dataframe
import numpy as np
import pandas as pd

#SQL
from sqlalchemy import create_engine

#Visualization
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

#Data Exploration
from scipy import stats
from scipy.stats import bartlett
from scipy.stats import jarque_bera
from scipy.stats import levene
from scipy.stats import normaltest

#Data Modeling
from sklearn import linear_model
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from statsmodels.tsa.stattools import acf

In [11]:
# Connection settings for the PostgreSQL server. Each value is read from an
# environment variable when one is set, so credentials can be supplied without
# editing the notebook; the literals are the notebook's original values and
# remain the fallback so the cell still runs unconfigured.
postgres_user = os.environ.get('WEATHER_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('WEATHER_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('WEATHER_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('WEATHER_DB_PORT', '5432')
postgres_db = os.environ.get('WEATHER_DB_NAME', 'weatherinszeged')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Only a single query is needed, so release the engine's connections as soon
# as it finishes -- including when the query raises.
try:
    df = pd.read_sql_query('select * from weatherinszeged', con=engine)
finally:
    engine.dispose()

## Linear Regression Model

In [13]:
# Target: the temperature column. Features: the four weather measurements
# used as regressors in this notebook.
y = df['temperature']
X = df[['humidity', 'windspeed', 'windbearing',  'pressure']]

# Split the data. A fixed random_state makes the partition identical on every
# Restart & Run All; the train/test proportions are left at the library default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
# Ordinary least squares regression object from scikit-learn's
# linear_model module.
lrm = linear_model.LinearRegression()

# Estimate the coefficients from the feature matrix X and the target y.
# The call is left as the cell's last expression so its result is displayed.
lrm.fit(X=X, y=y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
# Show the fitted parameters: the coefficient array first, then the intercept.
for heading, value in (
    ('\nCoefficients: \n', lrm.coef_),
    ('\nIntercept: \n', lrm.intercept_),
):
    print(heading, value)


Coefficients: 
 [-3.24962454e+01 -2.01413742e-01  4.04836936e-03 -6.91759209e-04]

Intercept: 
 37.9264381852854


In [16]:
# Build the statsmodels design matrix under its own name. Reassigning X here
# would leave X with an extra constant column, and lrm (fitted on the four
# feature columns only) could then no longer predict from X in later cells.
X_const = sm.add_constant(X)

# Fit the same regression with statsmodels OLS to obtain the summary table.
results = sm.OLS(y, X_const).fit()

results.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,temperature,R-squared:,0.421
Model:,OLS,Adj. R-squared:,0.421
Method:,Least Squares,F-statistic:,17500.0
Date:,"Tue, 05 Nov 2019",Prob (F-statistic):,0.0
Time:,08:04:25,Log-Likelihood:,-328210.0
No. Observations:,96453,AIC:,656400.0
Df Residuals:,96448,BIC:,656500.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,37.9264,0.233,162.709,0.000,37.470,38.383
humidity,-32.4962,0.123,-264.288,0.000,-32.737,-32.255
windspeed,-0.2014,0.003,-57.557,0.000,-0.208,-0.195
windbearing,0.0040,0.000,18.463,0.000,0.004,0.004
pressure,-0.0007,0.000,-3.452,0.001,-0.001,-0.000

0,1,2,3
Omnibus:,3375.432,Durbin-Watson:,0.059
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3793.297
Skew:,-0.455,Prob(JB):,0.0
Kurtosis:,3.339,Cond. No.,10600.0


## Check Gauss-Markov Conditions

### Assumption one: linearity of the model in its coefficients

In [None]:
# Compute the fitted values in this cell so that it runs on a fresh kernel.
# The feature columns are selected explicitly so the input always matches
# the four columns lrm was fitted on.
predictions = lrm.predict(df[['humidity', 'windspeed', 'windbearing', 'pressure']])

# Actual temperature on the x axis against the model's prediction on the y axis.
plt.scatter(df["temperature"], predictions)
plt.xlabel("Actual temperature")
plt.ylabel("Predicted temperature")
plt.title('Actual vs. Predicted Temperature')
plt.show()

**Conclusion:** This assumption is met since it's a fairly linear relationship. However, there are some outliers in the data. 

### Assumption two: the error term should be zero on average

In [None]:
# Predict from the four feature columns the model was fitted on. X cannot be
# used here because other cells reassign it to a matrix that also contains
# a constant column.
predictions = lrm.predict(df[['humidity', 'windspeed', 'windbearing', 'pressure']])

# Residuals: observed target minus fitted value.
errors = y - predictions

print("Mean of the errors in the temperature model is: {}".format(np.mean(errors)))

**Conclusion:**  This assumption is met since the average error is very near zero. 

### Assumption three: homoscedasticity

In [None]:
# Residuals against fitted values, with a horizontal reference line at zero.
fig, ax = plt.subplots()
ax.scatter(predictions, errors)
ax.axhline(y=0)
ax.set_xlabel('Predicted')
ax.set_ylabel('Residual')
ax.set_title('Residual vs. Predicted')
plt.show()

In [None]:
# bartlett and levene are imported in the notebook's import cell.
# Both tests are applied to the predictions and the errors; each result is
# indexed as [0] = test statistic, [1] = p value.
bart_stats = bartlett(predictions, errors)
lev_stats = levene(predictions, errors)

# Use the same three-significant-digit format for the statistic and the p value.
print("Bartlett test statistic value is {0:.3g} and p value is {1:.3g}".format(bart_stats[0], bart_stats[1]))
print("Levene test statistic value is {0:.3g} and p value is {1:.3g}".format(lev_stats[0], lev_stats[1]))

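**Conclusion:**  This assumption is not met: the p-values of both the Bartlett and Levene tests are below .05, so we reject the null hypothesis of equal variances (homoscedasticity).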

### Assumption four: low multicollinearity

In [None]:
# Design matrix for the multicollinearity check: the four features plus the
# column added by add_constant.
X = add_constant(df[['humidity', 'windspeed', 'windbearing',  'pressure']])

# One variance inflation factor per design-matrix column, labelled by column name.
vif_values = [
    variance_inflation_factor(X.values, col_idx)
    for col_idx, _ in enumerate(X.columns)
]
pd.Series(vif_values, index=X.columns)

**Conclusion:**  This assumption is met since the variance inflation factors (VIFs) of all the features are below 5.

### Assumption five: error terms should be uncorrelated with one another

In [None]:
# Residuals in dataset order; labelled so the figure can be read on its own.
plt.plot(errors)
plt.xlabel('Observation index')
plt.ylabel('Residual')
plt.title('Residuals in dataset order')
plt.show()

In [None]:
#autocorrelation features
acf_data = acf(errors)

# The first element is dropped, as in the original plot, and every remaining
# coefficient is plotted against its own lag number so that lag 1 sits at
# x=1 instead of x=0.
lags = np.arange(1, len(acf_data))
plt.plot(lags, acf_data[1:])
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.title('Autocorrelation of the errors')
plt.show()

**Conclusion:** This assumption is not met because there is a strong correlation between the error terms, ranging from ~ .67 to .97.

### Assumption six: features shouldn't be correlated with the errors

In [None]:
# Reference sample drawn from a normal distribution with the same mean and
# standard deviation as the errors. A locally seeded generator keeps the
# QQ plot identical on every run without touching NumPy's global random state.
rand_nums = np.random.RandomState(42).normal(np.mean(errors), np.std(errors), len(errors))

plt.figure(figsize=(12,5))

# Left panel: sorted reference sample against sorted errors.
plt.subplot(1,2,1)
plt.scatter(np.sort(rand_nums), np.sort(errors)) # we sort the arrays
plt.xlabel("the normally distributed random variable")
plt.ylabel("errors of the model")
plt.title("QQ plot")

# Right panel: distribution of the errors.
plt.subplot(1,2,2)
plt.hist(errors)
plt.xlabel("errors")
plt.title("Histogram of the errors")

plt.tight_layout()
plt.show()

In [None]:
# Run both normality tests on the errors; each result is indexed as
# [0] = test statistic, [1] = p value.
jb_stats, norm_stats = jarque_bera(errors), normaltest(errors)

for template, outcome in (
    ("Jarque-Bera test statistics is {0} and p value is {1}", jb_stats),
    ("Normality test statistics is {0} and p value is {1}", norm_stats),
):
    print(template.format(outcome[0], outcome[1]))

**Conclusion:** This assumption is not met because the Jarque-Bera and normality tests both have p-values below .05, which means we can reject the null hypothesis that the errors are normally distributed.