In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
trainData = pd.read_csv("trainData.csv")
testData = pd.read_csv("testData.csv")
valData = pd.read_csv("valData.csv")

In [3]:
print(trainData.shape, testData.shape, valData.shape)
print(trainData.shape[0]/168,testData.shape[0]/168, valData.shape[0]/168)
print(trainData.shape[0]/24,testData.shape[0]/24, valData.shape[0]/24)

(1344000, 8) (1008000, 8) (360192, 8)
8000.0 6000.0 2144.0
56000.0 42000.0 15008.0


In [4]:
trainFeatures, trainLabels = np.array(trainData.iloc[:,:7]).astype(int), np.array(trainData.iloc[:,7])
testFeatures, testLabels = np.array(testData.iloc[:,:7]).astype(int), np.array(testData.iloc[:,7])
valFeatures, valLabels = np.array(valData.iloc[:,:7]).astype(int), np.array(valData.iloc[:,7])

In [5]:
def nulltozero(labels):
    for i, label in enumerate(labels):
        if label=="null":
            labels[i] = 0
    return labels

In [6]:
trainLabels = nulltozero(trainLabels).astype(int)
testLabels = nulltozero(testLabels).astype(int)
valLabels = nulltozero(valLabels).astype(int)

In [7]:
def groupFeatures(features, n):
    
    featureSize = features.shape
    trainFeaturesGroup = np.zeros((int(featureSize[0]/n), featureSize[1]*n))
    
    for i in range(0, len(features), n):
        if (i+n < len(features)):
            ind = int(i/n)
            trainFeaturesGroup[ind] = np.concatenate(features[i:i+n])

    return trainFeaturesGroup

In [8]:
def groupLabels(labels, n):

    trainLabelsGroup = np.zeros(int(len(labels)/n))

    for i in range(0, len(labels), n):
        if (i+n < len(labels)):
            subsetLabels = labels[i:i+n]
            ind = int(i/n)
            if (sum(subsetLabels) > 0):
                trainLabelsGroup[ind] = 1
            else:
                trainLabelsGroup[ind] = 0

    return trainLabelsGroup

In [9]:
def errorCalc(groundTruth, predProb):
    mse = np.square(groundTruth - predProb).mean()
    ssd = np.sum(np.square(groundTruth - predProb))
    return(mse, ssd)

In [61]:
trainDayFeatures, trainDayLabels = groupFeatures(trainFeatures, 24), groupLabels(trainLabels, 24)
testDayFeatures, testDayLabels = groupFeatures(testFeatures, 24), groupLabels(testLabels, 24)
valDayFeatures, valDayLabels = groupFeatures(valFeatures, 24), groupLabels(valLabels, 24)

In [62]:
print(trainDayFeatures.shape, testDayFeatures.shape, valDayFeatures.shape)
print(trainDayLabels.shape, testDayLabels.shape, valDayLabels.shape)

(56000, 168) (42000, 168) (15008, 168)
(56000,) (42000,) (15008,)


In [65]:
clf = LogisticRegression()
mlp = MLPClassifier()
rf = RandomForestClassifier()

for n in range(12, 60, 12):
    print("\n\nN: ", n)
    trainDayFeatures, trainDayLabels = groupFeatures(trainFeatures, n), groupLabels(trainLabels, n)
    testDayFeatures, testDayLabels = groupFeatures(testFeatures, n), groupLabels(testLabels, n)
    valDayFeatures, valDayLabels = groupFeatures(valFeatures, n), groupLabels(valLabels, n)

    clf.fit(trainDayFeatures, trainDayLabels)
    mlp.fit(trainDayFeatures, trainDayLabels)
    rf.fit(trainDayFeatures, trainDayLabels)
    print("\nValidation Scores")
    print("Logistic Regression Score: ", clf.score(valDayFeatures, valDayLabels))
    print("MLP Score: ", mlp.score(valDayFeatures, valDayLabels))
    print("Random Forest Score: ", rf.score(valDayFeatures, valDayLabels))

    print("\nTesting Scores")
    print("Logistic Regression Score: ", clf.score(testDayFeatures, testDayLabels))
    print("MLP Score: ", mlp.score(testDayFeatures, testDayLabels))
    print("Random Forest Score: ", rf.score(testDayFeatures, testDayLabels))



N:  12

Validation Scores
Logistic Regression Score:  0.836054104477612
MLP Score:  0.6677105543710021
Random Forest Score:  0.8791644456289979

Testing Scores
Logistic Regression Score:  0.843297619047619
MLP Score:  0.6688214285714286
Random Forest Score:  0.8855833333333333


N:  24

Validation Scores
Logistic Regression Score:  0.7993070362473348
MLP Score:  0.707022921108742
Random Forest Score:  0.8488805970149254

Testing Scores
Logistic Regression Score:  0.8064285714285714
MLP Score:  0.7039761904761904
Random Forest Score:  0.8539285714285715


N:  36

Validation Scores
Logistic Regression Score:  0.7687156421789105
MLP Score:  0.6881559220389805
Random Forest Score:  0.8154922538730635

Testing Scores
Logistic Regression Score:  0.7730714285714285
MLP Score:  0.6774642857142857
Random Forest Score:  0.818


N:  48

Validation Scores
Logistic Regression Score:  0.7507995735607675
MLP Score:  0.7260127931769723
Random Forest Score:  0.8010394456289979

Testing Scores
Logisti

# Test n = 24

In [10]:
clf = LogisticRegression()
mlp = MLPClassifier()
rf = RandomForestClassifier()
n = 24

trainDayFeatures, trainDayLabels = groupFeatures(trainFeatures, n), groupLabels(trainLabels, n)
testDayFeatures, testDayLabels = groupFeatures(testFeatures, n), groupLabels(testLabels, n)
valDayFeatures, valDayLabels = groupFeatures(valFeatures, n), groupLabels(valLabels, n)

clf.fit(trainDayFeatures, trainDayLabels)
mlp.fit(trainDayFeatures, trainDayLabels)
rf.fit(trainDayFeatures, trainDayLabels)
print("\nTesting Scores")
print("Logistic Regression Score: ", clf.score(testDayFeatures, testDayLabels))
print("MLP Score: ", mlp.score(testDayFeatures, testDayLabels))
print("Random Forest Score: ", rf.score(testDayFeatures, testDayLabels))
print("MSE and SSE: ", errorCalc(testDayLabels,rf.predict_proba(testDayFeatures)[:,1]))


Testing Scores
Logistic Regression Score:  0.8064285714285714
MLP Score:  0.6709047619047619
Random Forest Score:  0.8531904761904762
MSE and SSE:  (0.11044263148148147, 4638.590522222222)


In [11]:
# Test Precipitation Probability vs. Actual
for i in range(0,100):
    print("Predicted: ", rf.predict_proba(testDayFeatures)[i,1], "Actual :", testDayLabels[i])

Predicted:  0.0 Actual : 0.0
Predicted:  0.3 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  1.0 Actual : 0.0
Predicted:  1.0 Actual : 0.0
Predicted:  0.1 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.1 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  1.0 Actual : 1.0
Predicted:  0.2 Actual : 0.0
Predicted:  0.6 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.2 Actual : 0.0
Predicted:  0.7 Actual : 1.0
Predicted:  0.2 Actual : 0.0
Predicted:  0.6 Actual : 1.0
Predicted:  0.4 Actual : 1.0
Predicted:  0.5 Actual : 0.0
Predicted:  0.5 Actual : 0.0
Predicted:  0.2 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.0 Actual : 0.0
Predicted:  0.

# Precip Prob Distribution Feature

In [88]:
trainDayFeatures.shape

(28000, 336)

In [110]:
test = trainDayFeatures[1]
probFeature = test[4::7]
print(probFeature)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [90]:
trainData

Unnamed: 0.1,Unnamed: 0,temperature,dewPoint,precipRate,precipProbability,pressure,cloudCover,didPrecip
0,0,41.00,41.00,0.0000,0.00,1014.92,1.00,0
1,1,41.35,38.79,0.0000,0.00,1015.98,1.00,0
2,2,42.75,37.51,0.0000,0.00,1017.12,1.00,0
3,3,43.29,36.40,0.0000,0.00,1018.35,1.00,0
4,4,42.39,35.45,0.0003,0.03,1019.75,0.93,0
5,5,41.03,34.83,0.0002,0.02,1021.23,0.84,0
6,6,40.08,34.14,0.0001,0.01,1022.42,0.75,0
7,7,39.49,33.25,0.0000,0.00,1023.10,0.65,0
8,8,38.84,32.30,0.0000,0.00,1023.48,0.51,0
9,9,37.92,31.51,0.0000,0.00,1023.84,0.48,0


# Sanity Check Area

In [149]:
print(trainDayFeatures.shape, trainDayLabels.shape)
print(trainDayFeatures, trainDayLabels)
print(np.count_nonzero(trainDayLabels))

(56000, 168) (56000,)
[[   0.   41.   41. ...    0. 1024.    0.]
 [  24.   58.   32. ...    0. 1026.    0.]
 [  48.   55.   33. ...    0. 1026.    0.]
 ...
 [  96.   31.   21. ...    0. 1029.    0.]
 [ 120.   35.   27. ...    0. 1029.    0.]
 [ 144.   35.   29. ...    0. 1019.    0.]] [0. 0. 0. ... 0. 0. 0.]
14742


In [150]:
print(testDayFeatures.shape, testDayLabels.shape)
print(testDayFeatures, testDayLabels)
print(np.count_nonzero(testDayLabels))

(42000, 168) (42000,)
[[   0.   17.   10. ...    0. 1026.    0.]
 [  24.   10.    5. ...    0. 1029.    0.]
 [  48.    7.    0. ...    0. 1040.    0.]
 ...
 [  96.   32.   11. ...    0. 1020.    0.]
 [ 120.   49.   12. ...    0. 1027.    0.]
 [ 144.   42.   15. ...    0. 1016.    0.]] [0. 0. 0. ... 0. 0. 0.]
10774


In [151]:
print(valDayFeatures.shape, valDayLabels.shape)
print(valDayFeatures, valDayLabels)
print(np.count_nonzero(valDayLabels))

(15008, 168) (15008,)
[[ 0.000e+00  3.000e+00 -8.000e+00 ...  0.000e+00  1.022e+03  0.000e+00]
 [ 2.400e+01 -1.600e+01 -2.500e+01 ...  0.000e+00  1.025e+03  0.000e+00]
 [ 4.800e+01 -2.000e+01 -2.800e+01 ...  0.000e+00  1.029e+03  0.000e+00]
 ...
 [ 9.600e+01  2.300e+01  6.000e+00 ...  0.000e+00  1.033e+03  0.000e+00]
 [ 1.200e+02  1.800e+01  1.000e+00 ...  0.000e+00  1.033e+03  0.000e+00]
 [ 1.440e+02  1.600e+01  2.000e+00 ...  0.000e+00  1.023e+03  0.000e+00]] [0. 0. 0. ... 0. 0. 0.]
3990
