In [1]:
# ==============================================================================
# File: NYC_Temp_Classification.ipynb
# Author: Vraj Shah
# Revisions by: Francisco Cornejo-Garcia
# Description: Classify NYC weather conditions from temperature plus character-bigram features of the date and time columns
# ==============================================================================

# Load models
from downstream_models import LogRegClassifier, RandForestClassifier, LinearRegression, RandForestRegressor, MLPRegressorr, MLPClassifierr

# Load libraries
import pandas as pd
import re
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

# Load scikit-learn libraries
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing
# Module-level label encoder instance; it is not referenced by the cells shown in this section.
le = preprocessing.LabelEncoder()
from sklearn.svm import LinearSVC,LinearSVR
# NOTE(review): CountVectorizer is already imported earlier in this cell; this repeat is redundant.
from sklearn.feature_extraction.text import CountVectorizer
import warnings
# Installs a process-wide filter that ignores every warning category.
# NOTE(review): this would also hide any warnings raised while the models below are fit -- confirm this is intended.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw observations from the CSV stored next to the notebook.
nyc = pd.read_csv('dataset/nyc.csv')

# Overwrite the header positionally with short, consistent column names.
nyc.columns = [
    'date',
    'est_time',
    'tempF',
    'dewpointF',
    'humidity',
    'windspeedmph',
    'conditions',
]

In [3]:
# Split the loaded frame into the numeric feature, the text-like columns that
# are expanded into character-bigram counts, and the label to predict.
numeric_cols = nyc[['tempF']]
ngram_cols = nyc[['date', 'est_time']]
target = nyc['conditions']

# Character-level bigram counter; fit_transform refits it for every column below.
vectorizer = CountVectorizer(ngram_range = (2,2),analyzer = 'char')

# Collect every feature block first and concatenate once at the end instead of
# growing the table with repeated concat calls.
feature_frames = [numeric_cols]

# Expand each text-like column into its own block of bigram-count columns.
for col in ngram_cols.columns:
    # Cast to str so the character analyzer always receives plain strings.
    array = ngram_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)

    # vocabulary_ maps each bigram to its column position in X; ordering the
    # bigrams by that position yields one label per count column.
    ngrams = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

    # Prefix labels with the source column so they are unique strings across
    # blocks. Default integer labels would repeat 0..k for every text column and
    # mix int with str names, which newer scikit-learn versions refuse to fit on.
    # Reusing the source index keeps rows aligned even if nyc was filtered.
    dataframe = pd.DataFrame(
        X.toarray(),
        index=ngram_cols.index,
        columns=[col + '_' + ngram for ngram in ngrams],
    )
    feature_frames.append(dataframe)

# Final design matrix: tempF followed by the bigram counts of each text column.
table = pd.concat(feature_frames, axis = 1, sort = False)

In [4]:
# Logistic regression on the assembled feature table.
# Presumably the three lists hold the per-fold train / validation / test
# accuracies printed below and cnf_matrix is a confusion matrix -- confirm
# against downstream_models.LogRegClassifier.
(
    avgsc_train_lst,
    avgsc_lst,
    avgsc_hld_lst,
    cnf_matrix,
) = LogRegClassifier(table, target)

(2466, 133)
(823, 133)
0.5010641532380663
0.4896719319562576
0.5043731778425656
(2466, 133)
(823, 133)
0.492550927333536
0.47023086269744835
0.48882410106899904
(2467, 133)
(823, 133)
0.5306990881458966
0.51338199513382
0.5286686103012633
(2467, 133)
(823, 133)
0.506079027355623
0.4975669099756691
0.5092322643343051
(2467, 133)
(823, 133)
0.4987841945288754
0.5121654501216545
0.5063168124392614
5 fold Train, Validation, and Test Accuracies:
[0.5010641532380663, 0.492550927333536, 0.5306990881458966, 0.506079027355623, 0.4987841945288754]
[0.4896719319562576, 0.47023086269744835, 0.51338199513382, 0.4975669099756691, 0.5121654501216545]
[0.5043731778425656, 0.48882410106899904, 0.5286686103012633, 0.5092322643343051, 0.5063168124392614]
Avg Train, Validation, and Test Accuracies:
0.5058354781203995
0.49660342997696993
0.5074829931972789


In [5]:
# Random forest classifier on the same feature table and target.
# The result names are the same ones used by the logistic regression cell, so
# running this cell replaces those earlier values.
(
    avgsc_train_lst,
    avgsc_lst,
    avgsc_hld_lst,
    cnf_matrix,
) = RandForestClassifier(table, target)

(2466, 133)
(823, 133)
0.9106111280024324
0.6452004860267315
0.6550048590864918
(2466, 133)
(823, 133)
0.9206445728184859
0.6342648845686513
0.6647230320699709
(2467, 133)
(823, 133)
0.9145896656534954
0.648418491484185
0.6462585034013606
(2467, 133)
(823, 133)
0.9176291793313069
0.6520681265206812
0.6656948493683188
(2467, 133)
(823, 133)
0.9151975683890577
0.6532846715328468
0.652089407191448
5 fold Train, Validation, and Test Accuracies:
[0.9106111280024324, 0.9206445728184859, 0.9145896656534954, 0.9176291793313069, 0.9151975683890577]
[0.6452004860267315, 0.6342648845686513, 0.648418491484185, 0.6520681265206812, 0.6532846715328468]
[0.6550048590864918, 0.6647230320699709, 0.6462585034013606, 0.6656948493683188, 0.652089407191448]
Avg Train, Validation, and Test Accuracies:
0.9157344228389557
0.6466473320266192
0.6567541302235179


In [6]:
# MLP classifier on the same feature table and target (only the classifier is
# run here; three values are returned, with no confusion matrix).
# NOTE(review): the recorded output shows the identical test accuracy
# (0.6550048590864918) for all five folds while train/validation scores vary --
# verify how MLPClassifierr evaluates the held-out set.
(
    avgsc_train_lst,
    avgsc_lst,
    avgsc_hld_lst,
) = MLPClassifierr(table, target)

(2466, 133)
(823, 133)
0.8528428093645485
0.850546780072904
0.6550048590864918
(2466, 133)
(823, 133)
0.8534508969291578
0.8481166464155528
0.6550048590864918
(2467, 133)
(823, 133)
0.8519756838905775
0.8540145985401459
0.6550048590864918
(2467, 133)
(823, 133)
0.8516717325227964
0.8552311435523114
0.6550048590864918
(2467, 133)
(823, 133)
0.8519756838905775
0.8540145985401459
0.6550048590864918
5-fold Train, Validation, and Test Accuracies:
[0.8528428093645485, 0.8534508969291578, 0.8519756838905775, 0.8516717325227964, 0.8519756838905775]
[0.850546780072904, 0.8481166464155528, 0.8540145985401459, 0.8552311435523114, 0.8540145985401459]
[0.6550048590864918, 0.6550048590864918, 0.6550048590864918, 0.6550048590864918, 0.6550048590864918]
Avg Train, Validation, and Test Accuracies:
0.8523833613195315
0.8523847534242119
0.6550048590864918
