In [1]:
# ==============================================================================
# File: NYC_Temp_Classification.ipynb
# Author: Vraj Shah
# Revisions by: Francisco Cornejo-Garcia
# Description: Classify date and time
# ==============================================================================

# Load libraries
import pandas as pd
import re
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

# Load additional libraries
from datetime import datetime 
from dateutil.parser import parse

# Load scikit-learn libraries
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
from sklearn.svm import LinearSVC,LinearSVR
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

# Load models
from downstream_models import LogRegClassifier, RandForestClassifier, LinearRegression, RandForestRegressor, MLPRegressorr, MLPClassifierr
from custom_functions import validate, findSeason

In [21]:
# Load data
nyc = pd.read_csv('dataset/nyc.csv')

# Convert 'Calm' to '0' for Wind SpeedMPH Column
for index, value in enumerate(nyc['Wind SpeedMPH']):
    if value == 'Calm':
        nyc['Wind SpeedMPH'][index] = '0'

# Convert NA to '0'
nyc = nyc.fillna('0')
nyc

Unnamed: 0,date,TimeEST,TemperatureF,Dew PointF,Humidity,Wind SpeedMPH,Conditions
0,2016-01-01,12:51 AM,42.1,25.0,51,4.6,Overcast
1,2016-01-01,1:51 AM,41.0,25.0,53,3.5,Overcast
2,2016-01-01,2:51 AM,41.0,26.1,55,4.6,Overcast
3,2016-01-01,3:51 AM,41.0,26.1,55,9.2,Overcast
4,2016-01-01,4:51 AM,39.9,26.1,58,10.4,Overcast
...,...,...,...,...,...,...,...
5136,2016-06-30,7:51 PM,78.1,60.1,54,5.8,Partly Cloudy
5137,2016-07-01,8:51 PM,77.0,61.0,58,3.5,Clear
5138,2016-07-01,9:51 PM,75.9,61.0,60,0,Clear
5139,2016-07-01,10:51 PM,75.9,61.0,60,4.6,Clear


In [20]:
# Create datetime dataframe
org_datetime_df = pd.DataFrame()
org_datetime_df['datetime'] = nyc['date'] + ' ' + nyc['TimeEST']
datetime_col = nyc['date'] + ' ' + nyc['TimeEST']

# Create arrays to be added to dataframe
season_arr = []
date_arr = []
year_arr = []
month_arr = []
day_arr = []
dayofyear_arr = []
time_arr = []
hour_arr = []
minute_arr = []

# Extract features from datetime values in column
for value in datetime_col:
    datetime = validate(value)
    date = datetime.date()
    time = datetime.time()
    
    season_arr.append(findSeason(date.month, date.day))
    date_arr.append(date)
    year_arr.append(date.year)
    month_arr.append(date.month)
    day_arr.append(date.day)
    dayofyear_arr.append(date.timetuple().tm_yday)
    time_arr.append(time)
    hour_arr.append(time.hour)
    minute_arr.append(time.minute)

# Combine arrays as dataframes into one dataframe
season_df = pd.DataFrame(season_arr)
date_df = pd.DataFrame(date_arr)
year_df = pd.DataFrame(year_arr)
month_df = pd.DataFrame(month_arr)
day_df = pd.DataFrame(day_arr)
dayofyear_df = pd.DataFrame(dayofyear_arr)
time_df = pd.DataFrame(time_arr)
hour_df = pd.DataFrame(hour_arr)
minute_df = pd.DataFrame(minute_arr)
datetime_df = pd.concat([season_df, date_df, year_df, month_df, day_df, dayofyear_df, time_df, hour_df, minute_df], axis = 1, sort = False)
datetime_df.columns = ['season', 'date', 'year', 'month', 'day', 'day_of_year', 'time', 'hour', 'minute']
datetime_df = datetime_df.fillna('0')
datetime_df

Unnamed: 0,season,date,year,month,day,day_of_year,time,hour,minute
0,winter,2016-01-01,2016,1,1,1,00:51:00,0,51
1,winter,2016-01-01,2016,1,1,1,01:51:00,1,51
2,winter,2016-01-01,2016,1,1,1,02:51:00,2,51
3,winter,2016-01-01,2016,1,1,1,03:51:00,3,51
4,winter,2016-01-01,2016,1,1,1,04:51:00,4,51
...,...,...,...,...,...,...,...,...,...
5136,summer,2016-06-30,2016,6,30,182,19:51:00,19,51
5137,summer,2016-07-01,2016,7,1,183,20:51:00,20,51
5138,summer,2016-07-01,2016,7,1,183,21:51:00,21,51
5139,summer,2016-07-01,2016,7,1,183,22:51:00,22,51


In [4]:
# Benchmark 1
# Create separate columns
target = nyc['Conditions']
numeric_cols = nyc[['TemperatureF', 'Dew PointF', 'Humidity', 'Wind SpeedMPH']]
ngram1_cols = nyc[['date', 'TimeEST']]

# Create ngram vectorizer
vectorizer = CountVectorizer(ngram_range = (2, 2), analyzer = 'char')

# Create dataframe
benchmark1_df = pd.DataFrame()
benchmark1_df = pd.concat([benchmark1_df, numeric_cols], axis = 1, sort = False)

# Apply ngram vectorizer to data
for col in ngram1_cols.columns:
    array = ngram1_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)
    vectorizer_df = pd.DataFrame(X.toarray())
    benchmark1_df = pd.concat([benchmark1_df, vectorizer_df], axis = 1, sort = False)

In [5]:
# Log Regression Benchmark 1
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = LogRegClassifier(benchmark1_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.5019762845849802, 0.50653694131955, 0.5127659574468085, 0.5130699088145897, 0.5106382978723404]
[0.5078979343863913, 0.503037667071689, 0.5097323600973236, 0.5182481751824818, 0.524330900243309]
[0.5121477162293488, 0.5238095238095238, 0.5228377065111759, 0.5170068027210885, 0.521865889212828]
Avg Train, Validation, and Test Accuracies:
0.5089974780076537
0.5126494073962389
0.519533527696793


In [6]:
# Random Forest Classifier Benchmark 1
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = RandForestClassifier(benchmark1_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.9245971419884463, 0.9276375798114929, 0.9249240121580548, 0.9246200607902736, 0.9194528875379939]
[0.6780072904009721, 0.6658566221142163, 0.6715328467153284, 0.6605839416058394, 0.683698296836983]
[0.6958211856171039, 0.6997084548104956, 0.6890184645286687, 0.6831875607385811, 0.6861030126336248]
Avg Train, Validation, and Test Accuracies:
0.9242463364572522
0.6719357995346679
0.6907677356656949


In [7]:
# MLP Classifier Benchmark 1
avgsc_train_lst, avgsc_lst, avgsc_hld_lst = MLPClassifierr(benchmark1_df, target)

5-fold Train, Validation, and Test Accuracies:
[0.5880206749771967, 0.582547886895713, 0.5863221884498481, 0.5869300911854103, 0.5805471124620061]
[0.5722964763061968, 0.5941676792223572, 0.5790754257907542, 0.5766423357664233, 0.6021897810218978]
[0.5519922254616132, 0.5519922254616132, 0.5519922254616132, 0.5519922254616132, 0.5519922254616132]
Avg Train, Validation, and Test Accuracies:
0.5848735907940348
0.5848743396215259
0.5519922254616132


In [8]:
# Benchmark 2
# Create separate columns
numeric_cols = nyc[['TemperatureF', 'Dew PointF', 'Humidity', 'Wind SpeedMPH']] + datetime_df[['year', 'day', 'day_of_year', 'hour', 'minute']]
ngram2_cols = nyc[['date', 'TimeEST']]

# Create dataframe
benchmark2_df = pd.DataFrame()
benchmark2_df = pd.concat([benchmark2_df, numeric_cols], axis = 1, sort = False)

# Apply ngram vectorizer to data
for col in ngram2_cols.columns:
    array = ngram2_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)
    vectorizer_df = pd.DataFrame(X.toarray())
    benchmark2_df = pd.concat([benchmark2_df, vectorizer_df], axis = 1, sort = False)
benchmark2_df = benchmark2_df.fillna('0')

In [9]:
# Log Regression Benchmark 2
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = LogRegClassifier(benchmark2_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.5408938887199757, 0.541501976284585, 0.5355623100303951, 0.5355623100303951, 0.5583586626139818]
[0.5236938031591738, 0.5164034021871203, 0.5279805352798054, 0.5206812652068127, 0.5413625304136253]
[0.5451895043731778, 0.5354713313896987, 0.5413022351797862, 0.5374149659863946, 0.5422740524781341]
Avg Train, Validation, and Test Accuracies:
0.5423758295358665
0.5260243072493075
0.5403304178814383


In [10]:
# Random Forest Classifier Benchmark 2
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = RandForestClassifier(benchmark2_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.9075706901793859, 0.9148677409546975, 0.9069908814589666, 0.9109422492401216, 0.9072948328267477]
[0.606318347509113, 0.6002430133657352, 0.610705596107056, 0.6228710462287105, 0.6277372262773723]
[0.6316812439261419, 0.6258503401360545, 0.6209912536443148, 0.6268221574344023, 0.6112730806608357]
Avg Train, Validation, and Test Accuracies:
0.9095332789319837
0.6135750458975974
0.6233236151603497


In [11]:
# MLP Classifier Benchmark 2
avgsc_train_lst, avgsc_lst, avgsc_hld_lst = MLPClassifierr(benchmark2_df, target)

5-fold Train, Validation, and Test Accuracies:
[0.9990878686530861, 0.9996959562176954, 0.9990881458966565, 0.9990881458966565, 0.9993920972644377]
[1.0, 0.9975698663426489, 1.0, 1.0, 0.9987834549878345]
[0.6248785228377065, 0.6248785228377065, 0.6248785228377065, 0.6248785228377065, 0.6248785228377065]
Avg Train, Validation, and Test Accuracies:
0.9992704427857063
0.9992706642660967
0.6248785228377065


In [16]:
# Benchmark 3
# Create separate columns
numeric_cols = nyc[['TemperatureF', 'Dew PointF', 'Humidity', 'Wind SpeedMPH']]
# Categ
ngram3_cols = datetime_df[['date', 'year', 'day', 'day_of_year', 'time', 'hour', 'minute']]

# Create dataframe
benchmark3_df = pd.DataFrame()
benchmark3_df = pd.concat([benchmark3_df, numeric_cols], axis = 1, sort = False)

# Apply ngram vectorizer to data
for col in ngram3_cols.columns:
    array = ngram3_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)
    vectorizer_df = pd.DataFrame(X.toarray())
    benchmark3_df = pd.concat([benchmark3_df, vectorizer_df], axis = 1, sort = False)
benchmark3_df = benchmark3_df.fillna('0')

In [17]:
# Log Regression Benchmark 3
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = LogRegClassifier(benchmark3_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.5016722408026756, 0.5038005472788082, 0.5036474164133738, 0.5206686930091186, 0.5069908814589665]
[0.5066828675577156, 0.4933171324422843, 0.5, 0.5170316301703163, 0.5182481751824818]
[0.5199222546161322, 0.5199222546161322, 0.5102040816326531, 0.5296404275996113, 0.5150631681243926]
Avg Train, Validation, and Test Accuracies:
0.5073559557925886
0.5070559610705596
0.5189504373177842


In [18]:
# Random Forest Classifier Benchmark 3
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = RandForestClassifier(benchmark3_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.9236850106415324, 0.9267254484645789, 0.9240121580547113, 0.9240121580547113, 0.9176291793313069]
[0.6852976913730255, 0.6767922235722965, 0.683698296836983, 0.6630170316301703, 0.694647201946472]
[0.6909620991253644, 0.6958211856171039, 0.6938775510204082, 0.6686103012633625, 0.6793002915451894]
Avg Train, Validation, and Test Accuracies:
0.9232127909093683
0.6806904890717895
0.6857142857142857


In [19]:
# MLP Classifier Benchmark 3
avgsc_train_lst, avgsc_lst, avgsc_hld_lst = MLPClassifierr(benchmark3_df, target)

5-fold Train, Validation, and Test Accuracies:
[0.773791425965339, 0.7722712070538158, 0.7683890577507598, 0.7747720364741641, 0.7689969604863222]
[0.7630619684082625, 0.7691373025516404, 0.7846715328467153, 0.7591240875912408, 0.7822384428223844]
[0.6656948493683188, 0.6656948493683188, 0.6656948493683188, 0.6656948493683188, 0.6656948493683188]
Avg Train, Validation, and Test Accuracies:
0.7716441375460802
0.7716466668440487
0.6656948493683188
