In [143]:
import pandas as pd
dataAddress = "weatherAUS.csv"

# Let pandas' CSV parser read the file instead of splitting each line on ','
# by hand: a plain split breaks any row that contains a quoted field with an
# embedded comma. dtype=str together with na_filter=False keeps every cell as
# the exact text found in the file (including literal "NA" markers), so type
# conversion and missing-value handling stay explicit in the later cells.
data = pd.read_csv(dataAddress, dtype=str, na_filter=False)
columnLabels = data.columns.tolist()
data

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44,W,...,22,1007.7,1007.1,8,,16.9,21.8,No,0,No
1,2008-12-02,Albury,7.4,25.1,0,,,WNW,44,NNW,...,25,1010.6,1007.8,,,17.2,24.3,No,0,No
2,2008-12-03,Albury,12.9,25.7,0,,,WSW,46,W,...,30,1007.6,1008.7,,2,21,23.2,No,0,No
3,2008-12-04,Albury,9.2,28,0,,,NE,24,SE,...,16,1017.6,1012.8,,,18.1,26.5,No,1,No
4,2008-12-05,Albury,17.5,32.3,1,,,W,41,ENE,...,33,1010.8,1006,7,8,17.8,29.7,No,0.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0,,,E,31,SE,...,24,1024.6,1020.3,,,10.1,22.4,No,0,No
145456,2017-06-22,Uluru,3.6,25.3,0,,,NNW,22,SE,...,21,1023.5,1019.1,,,10.9,24.5,No,0,No
145457,2017-06-23,Uluru,5.4,26.9,0,,,N,37,SE,...,24,1021,1016.8,,,12.5,26.1,No,0,No
145458,2017-06-24,Uluru,7.8,27,0,,,SE,28,SSE,...,24,1019.4,1016.5,3,2,15.1,26,No,0,No


In [144]:
# Remove the RISK_MM column from the frame in place.
data.drop(columns=['RISK_MM'], inplace=True)

def toMonth(x):
    """Return the month part of a '-'-separated date string.

    The month is taken to be the second '-'-separated field and is returned
    unchanged as a string, e.g. '2008-12-01' -> '12'.
    """
    dateParts = x.split('-')
    return dateParts[1]
# Replace each full date with just its month field.
data['Date'] = data['Date'].map(toMonth)

The first step is to remove the RISK_MM column, because it correlates directly with the labels. Then I split out the month from the date, partly so that this value can later be converted into an integer and partly because month seems likelier to correlate with rain than either year or day. 

In [145]:
import numpy as np
# Turn every literal "NA" string in the frame into a real missing value.
data = data.replace("NA", np.nan)

numericColumns = [
    "Date", "MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm", "Humidity9am",
    "Humidity3pm", "Pressure9am", "Pressure3pm", "Cloud9am", "Cloud3pm",
    "Temp9am", "Temp3pm",
]

# Parse the numeric columns, then fill their gaps with the per-column mean.
numerics = data[numericColumns].apply(pd.to_numeric)
numerics = numerics.fillna(numerics.mean())

# Write the parsed, filled values back into the main frame.
data.update(numerics)

# Drop every row that still has a missing value in any of the other columns.
data = data.dropna()

We first replace all "NA" strings with NaN values everywhere in the dataframe. Then we format the numeric columns correctly, turning them from columns of strings into numeric columns and replacing the NaN values in them with the means of those columns. After this, since the only remaining NaN values are in non-numeric columns, we drop all rows containing NaN values, since there are no "average" values for these non-numeric features, and we have enough data to do so (there are still 123k data points even after this operation, which is more than enough). 

In [146]:
toNormalize = ["Pressure9am", "Pressure3pm"]


def minMaxScale(column):
    """Linearly rescale a column so its minimum maps to 0 and its maximum to 1."""
    low, high = column.min(), column.max()
    return (column - low) / (high - low)


data[toNormalize] = data[toNormalize].apply(minMaxScale)

# Move the target column out of the feature frame and keep it as a
# one-column DataFrame.
labels = data.pop("RainTomorrow").to_frame()
data

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,12,Albury,13.4,22.9,0.6,5.46823,7.61118,W,44,W,...,24,71,22,0.449587,0.48,8,4.50993,16.9,21.8,No
1,12,Albury,7.4,25.1,0,5.46823,7.61118,WNW,44,NNW,...,22,44,25,0.497521,0.4912,4.44746,4.50993,17.2,24.3,No
2,12,Albury,12.9,25.7,0,5.46823,7.61118,WSW,46,W,...,26,38,30,0.447934,0.5056,4.44746,2,21,23.2,No
3,12,Albury,9.2,28,0,5.46823,7.61118,NE,24,SE,...,9,45,16,0.613223,0.5712,4.44746,4.50993,18.1,26.5,No
4,12,Albury,17.5,32.3,1,5.46823,7.61118,W,41,ENE,...,20,82,33,0.500826,0.4624,7,8,17.8,29.7,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,6,Uluru,3.5,21.8,0,5.46823,7.61118,E,31,ESE,...,13,59,27,0.730579,0.7056,4.44746,4.50993,9.4,20.9,No
145455,6,Uluru,2.8,23.4,0,5.46823,7.61118,E,31,SE,...,11,51,24,0.728926,0.6912,4.44746,4.50993,10.1,22.4,No
145456,6,Uluru,3.6,25.3,0,5.46823,7.61118,NNW,22,SE,...,9,56,21,0.710744,0.672,4.44746,4.50993,10.9,24.5,No
145457,6,Uluru,5.4,26.9,0,5.46823,7.61118,N,37,SE,...,9,53,24,0.669421,0.6352,4.44746,4.50993,12.5,26.1,No


I also decided to normalize the values for the pressure columns, since they were fairly high numbers. Now we are ready to extract the labels from the dataset, which is the last column (RainTomorrow). 

In [147]:
# One-hot encode all categorical features in one pass; the indicator columns
# are appended in the order the source columns are listed here.
categoricalColumns = ["Location", "WindGustDir", "WindDir9am", "WindDir3pm", "RainToday"]
data = pd.get_dummies(data, columns=categoricalColumns)

# Make sure every remaining column has a numeric dtype.
data = data.apply(pd.to_numeric)

# Flatten the one-column label frame into a 1-D array.
labels = labels.to_numpy().ravel()
data

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,...,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainToday_No,RainToday_Yes
0,12,13.4,22.9,0.6,5.468232,7.611178,44.0,20.0,24.0,71.0,...,0,0,0,0,0,0,1,0,1,0
1,12,7.4,25.1,0.0,5.468232,7.611178,44.0,4.0,22.0,44.0,...,0,0,0,0,0,0,0,1,1,0
2,12,12.9,25.7,0.0,5.468232,7.611178,46.0,19.0,26.0,38.0,...,0,0,0,0,0,0,0,1,1,0
3,12,9.2,28.0,0.0,5.468232,7.611178,24.0,11.0,9.0,45.0,...,0,0,0,0,0,0,0,0,1,0
4,12,17.5,32.3,1.0,5.468232,7.611178,41.0,7.0,20.0,82.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,6,3.5,21.8,0.0,5.468232,7.611178,31.0,15.0,13.0,59.0,...,0,0,0,0,0,0,0,0,1,0
145455,6,2.8,23.4,0.0,5.468232,7.611178,31.0,13.0,11.0,51.0,...,0,0,0,0,0,0,0,0,1,0
145456,6,3.6,25.3,0.0,5.468232,7.611178,22.0,13.0,9.0,56.0,...,0,0,0,0,0,0,0,0,1,0
145457,6,5.4,26.9,0.0,5.468232,7.611178,37.0,9.0,9.0,53.0,...,0,0,0,0,0,0,1,0,1,0


This rather hideous looking step converts the categorical values into one hot encodings, which is necessary in some form for them to be interpreted. It is possible that the number of columns could be reduced by simplifying NNW to N, WNW to NW, etc, but I do not know that this would help anything other than readability, and would be a loss of data. 

In [159]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
# Hyperparameters for the gradient-boosted ensemble and its evaluation.
n_estimators = 500
learning_rate = 0.25
max_depth = 3
cvfolds = 2
random_state = 0  # fixed seed so repeated runs report the same scores

classifier = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    random_state=random_state,
)

# cross_val_score returns one score per fold; report their mean and spread.
scores = cross_val_score(classifier, data, labels, cv=cvfolds)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.83 (+/- 0.00)


Dozens of trials went into choosing these hyperparameters, and 83% accuracy was the best that could be obtained with this division of data. It is worth noting that k-fold cross-validation actually reports lower accuracy here - 86% accuracy is possible if only one train/test split of the data is used, and the accuracy goes down substantially as more folds are used. This is perplexing - my best explanation is that k-fold cross-validation may be giving a more accurate representation of the accuracy and level of overfitting of the model, and that this simply reflects that the "actual" accuracy of the model on real data will be lower than 83%. Another possible factor is that `cross_val_score` with an integer `cv` does not shuffle the rows, and the dataset is ordered by location and date, so each fold may be tested on locations the model saw little or none of during training. 

Regardless, it is also interesting that the ideal max depth of the tree is 3, and not 5. Perhaps this indicates that there are certain categories that have relatively powerful predictiveness on their own (like month or whether it rained yesterday), and that adding additional tree levels does nothing more to help predictiveness, even causing overfitting. Regularization is included in the default implementation of the classifier, so adding it explicitly is not necessary. 