# Predicting temperatures

In [2]:
import os
import warnings
from getpass import getpass
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.stats.mstats import winsorize
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy import create_engine

# Suppress all warnings and apply the seaborn "whitegrid" theme for the notebook.
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

# Connection settings are read from the environment so that the password is
# not stored in the notebook. Non-secret settings fall back to built-in
# defaults; the password is prompted for when POSTGRES_PW is not set.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW') or getpass('Postgres password: ')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'weatherinszeged')

# Percent-encode the credentials: characters such as '@', ':' or '/' in a
# password would otherwise corrupt the connection URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

try:
    df = pd.read_sql_query('select * from weatherinszeged', con=engine)
finally:
    # no need for an open connection, as we're only doing a single query;
    # `finally` releases the pool even when the query fails
    engine.dispose()

In [4]:
# Preview the first rows of the frame returned by the query.
df.head()

Unnamed: 0,date,summary,preciptype,temperature,apparenttemperature,humidity,windspeed,windbearing,visibility,loudcover,pressure,dailysummary
0,2006-03-31 22:00:00+00:00,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-03-31 23:00:00+00:00,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 00:00:00+00:00,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 01:00:00+00:00,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 02:00:00+00:00,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [7]:
# Show row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
date                   96453 non-null datetime64[ns, UTC]
summary                96453 non-null object
preciptype             96453 non-null object
temperature            96453 non-null float64
apparenttemperature    96453 non-null float64
humidity               96453 non-null float64
windspeed              96453 non-null float64
windbearing            96453 non-null float64
visibility             96453 non-null float64
loudcover              96453 non-null float64
pressure               96453 non-null float64
dailysummary           96453 non-null object
dtypes: datetime64[ns, UTC](1), float64(8), object(3)
memory usage: 8.8+ MB


## Check the data

In [3]:
# Column labels of the loaded frame.
column_names = df.columns
print(column_names)
# Column dtypes. NOTE: this bare expression sits mid-cell, so the notebook
# does not display its value.
df.dtypes
# Report, for every column, whether all of its values are distinct.
for name in column_names:
    print('{} is unique: {}'.format(name, df[name].is_unique))

Index(['date', 'summary', 'preciptype', 'temperature', 'apparenttemperature',
       'humidity', 'windspeed', 'windbearing', 'visibility', 'loudcover',
       'pressure', 'dailysummary'],
      dtype='object')
date is unique: False
summary is unique: False
preciptype is unique: False
temperature is unique: False
apparenttemperature is unique: False
humidity is unique: False
windspeed is unique: False
windbearing is unique: False
visibility is unique: False
loudcover is unique: False
pressure is unique: False
dailysummary is unique: False


In [5]:
# Summarise missing values per column.
# 'Total' is the number of nulls; 'Percent' is nulls / rows, i.e. a fraction
# between 0 and 1 rather than a value out of 100.
null_mask = df.isnull()

total_missing = null_mask.sum().sort_values(ascending=False)
percent_missing = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total_missing, percent_missing],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(20)

Unnamed: 0,Total,Percent
dailysummary,0,0.0
pressure,0,0.0
loudcover,0,0.0
visibility,0,0.0
windbearing,0,0.0
windspeed,0,0.0
humidity,0,0.0
apparenttemperature,0,0.0
temperature,0,0.0
preciptype,0,0.0


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,temperature,apparenttemperature,humidity,windspeed,windbearing,visibility,loudcover,pressure
count,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0
mean,11.932678,10.855029,0.734899,10.81064,187.509232,10.347325,0.0,1003.235956
std,9.551546,10.696847,0.195473,6.913571,107.383428,4.192123,0.0,116.969906
min,-21.822222,-27.716667,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.688889,2.311111,0.6,5.8282,116.0,8.3398,0.0,1011.9
50%,12.0,12.0,0.78,9.9659,180.0,10.0464,0.0,1016.45
75%,18.838889,18.838889,0.89,14.1358,290.0,14.812,0.0,1021.09
max,39.905556,39.344444,1.0,63.8526,359.0,16.1,0.0,1046.38


In [17]:
# Drop the columns that are not used further on. The step must actually run:
# with it commented out, a fresh kernel keeps these columns in `df`.
# errors='ignore' makes the cell safe to re-run once the columns are gone.
df = df.drop(
    columns=['loudcover', 'summary', 'date', 'preciptype', 'apparenttemperature'],
    errors='ignore',
)

In [18]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,temperature,humidity,windspeed,windbearing,visibility,pressure,dailysummary
0,9.472222,0.89,14.1197,251.0,15.8263,1015.13,Partly cloudy throughout the day.
1,9.355556,0.86,14.2646,259.0,15.8263,1015.63,Partly cloudy throughout the day.
2,9.377778,0.89,3.9284,204.0,14.9569,1015.94,Partly cloudy throughout the day.
3,8.288889,0.83,14.1036,269.0,15.8263,1016.41,Partly cloudy throughout the day.
4,8.755556,0.83,11.0446,259.0,15.8263,1016.51,Partly cloudy throughout the day.


In [19]:
# Remove the free-text `dailysummary` column. Because the cell rebinds `df`,
# a plain drop raises KeyError on a second run; errors='ignore' keeps it idempotent.
df = df.drop(columns=['dailysummary'], errors='ignore')

In [20]:
# Preview the frame after `dailysummary` has been dropped.
df.head()

Unnamed: 0,temperature,humidity,windspeed,windbearing,visibility,pressure
0,9.472222,0.89,14.1197,251.0,15.8263,1015.13
1,9.355556,0.86,14.2646,259.0,15.8263,1015.63
2,9.377778,0.89,3.9284,204.0,14.9569,1015.94
3,8.288889,0.83,14.1036,269.0,15.8263,1016.41
4,8.755556,0.83,11.0446,259.0,15.8263,1016.51


In [23]:
# Separate the predictor columns from the column being predicted.
feature_cols = ['humidity', 'windspeed', 'windbearing', 'visibility', 'pressure']
X = df.loc[:, feature_cols]  # predictors
y = df['temperature']  # regression target

In [28]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

In [29]:
# Create the decision-tree regressor. random_state is fixed so that ties
# between equally good splits are broken the same way on every run.
clf = tree.DecisionTreeRegressor(random_state=1)

# Fit the regressor on the training split.
clf = clf.fit(X_train, y_train)

# Predict the target for the held-out test split.
y_pred = clf.predict(X_test)

In [47]:
# 10-fold cross-validated scores of the decision tree on the full X / y.
cross_val_score(estimator=clf, X=X, y=y, cv=10)

array([ 0.23374347,  0.32022787,  0.26759709,  0.32100628,  0.23483397,
        0.42781708,  0.27773068, -0.01810782,  0.19492838,  0.15415602])

In [35]:
# Hold-out metrics for the current `y_pred`. explained_variance_score is a
# regression metric, not an accuracy, so it is labelled by its actual name.
print("Explained variance:", metrics.explained_variance_score(y_test, y_pred))
print("r2 Score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.36455117626653133
r2 Score: 0.3645081307216319


## Random forest

In [49]:
# Random-forest regressor. The forest is a stochastic model, so random_state
# is fixed to make the scores below reproducible between runs.
rfc = ensemble.RandomForestRegressor(random_state=1)

# 10-fold cross-validated scores of the forest on the full X / y.
cross_val_score(rfc, X, y, cv=10)

array([0.53211428, 0.66892263, 0.57578191, 0.59423326, 0.53288853,
       0.63690222, 0.58430926, 0.47091026, 0.52803584, 0.4992774 ])

In [50]:
# Fit the random forest on the training split (fit() returns the estimator itself).
rfc.fit(X_train, y_train)

# Predict the target for the held-out test split; this replaces the earlier `y_pred`.
y_pred = rfc.predict(X_test)

In [51]:
# Hold-out metrics for the current `y_pred`. explained_variance_score is a
# regression metric, not an accuracy, so it is labelled by its actual name.
print("Explained variance:", metrics.explained_variance_score(y_test, y_pred))
print("r2 Score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.6457528879290544
r2 Score: 0.6456763739125815
