# Bike Sharing Demand Kaggle project

W207 Final Project

Chris Murray, Rahul Ragunathan, Rajagopalan Mahadevan

https://www.kaggle.com/c/bike-sharing-demand/


In [34]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

## Define a function to read data file

Note that we delete the **holiday** and **temp** columns and instead use **workingday** and **atemp**.

In [35]:
# really simple function that returns the hour as an integer
getHour = lambda x: int(x[11:13])

# convert string to float and round to integer
getRounded = lambda x: int(round(float(x)))

def readdata(filename):
    # open the training file and extract column names
    datafile = open(filename)
    headers = datafile.readline()

    # read data file and convert everything to integers
    data = np.genfromtxt(datafile, skip_header=1, delimiter=",", dtype=int, converters=
                    {0: getHour, 5: getRounded, 6: getRounded, 8: getRounded})

    # delete the holiday column because it is correlated with workingday
    data = np.delete(data, 2, 1)

    # delete the temp column because it is correlated with atemp
    data = np.delete(data, 4, 1)

    return data


## Read training data

In [36]:
train_filename = "data/train.csv"

train = readdata(train_filename)

# extract all but the last 3 columns into train_data
train_data = train[:, :-3]

# extract the last 3 columns (casual + registered = count)
train_casual = train[:, 7]
train_registered = train[:, 8]
train_count = train[:, 9]

print "Sample training data:"
print train_data[0:5]

print "\nSample training labels (count):"
print train_count[0:5]


Sample training data:
[[ 1  1  0  1 14 80  0]
 [ 2  1  0  1 14 80  0]
 [ 3  1  0  1 14 75  0]
 [ 4  1  0  1 14 75  0]
 [ 5  1  0  2 13 75  6]]

Sample training labels (count):
[40 32 13  1  1]


## Read test data


In [21]:
test_filename = "data/test.csv"

test = readdata(test_filename)

# re-read file to get just the datetime field
test_file = open(test_filename)
column_names_test = test_file.readline()

test_datetimes = []
for line in test_file.readlines():
    datetime = line.split(',')[0]
    test_datetimes.append(datetime)

test_data = test
    
# print a sample of the test data
print test_data[0:5]



Test data headers:
datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed

[[ 0  1  1  1 11 56 26]
 [ 1  1  1  1 14 56  0]
 [ 2  1  1  1 14 56  0]
 [ 3  1  1  1 13 56 11]
 [ 4  1  1  1 13 56 11]]


## Define CSV output function

In [22]:
def writecsv(filename, predictions):
    
    # open output file and write header row
    outfile = open(filename, "w")
    outfile.write("datetime,count\n")

    # output all the predictions to the file
    for i in range(len(predictions)):
        outfile.write("{},{}\n".format(test_datetimes[i], preds[i]))

    print "{} results written to {}".format(i, output_filename)


## Generate predictions

### K Nearest Neighbors

In [23]:
for k in [1,3,5,7,9]:
    output_filename = 'submission_knn_{}.csv'.format(k)

    # train a model and generate predictions
    kn = KNeighborsClassifier(n_neighbors=k)
    kn.fit(train_data, train_count)
    preds = kn.predict(test_data)
    
    writecsv(output_filename, preds)


6492 results written to submission_knn_1.csv
6492 results written to submission_knn_3.csv
6492 results written to submission_knn_5.csv
6492 results written to submission_knn_7.csv
6492 results written to submission_knn_9.csv


Kaggle submission results for KNN (k=1):  0.96793

### Decision Tree

In [27]:
output_filename = 'submission_dtree.csv'
    
# train a model and generate predictions
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_count)
preds = clf.predict(test_data)

writecsv(output_filename, preds)


6492 results written to submission_dtree.csv


Predict **casual** users and **registered** users separately and then add together to get total **count** of users.

Kaggle submission results for Decision Tree: 0.62940

In [28]:
output_filename = 'submission_dtree_separate.csv'

# train a model and generate predictions
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_casual)
preds_casual = clf.predict(test_data)

clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_registered)
preds_registered = clf.predict(test_data)

preds = preds_casual + preds_registered

writecsv(output_filename, preds)


6492 results written to submission_dtree_separate.csv


Kaggle submission results for Decision Tree with separate predictions: 0.58618

### Random Forest

In [29]:
for n in [10, 50, 100]:
    output_filename = 'submission_rforest_{}.csv'.format(n)

    # train a model and generate predictions
    rfc = RandomForestClassifier(n_estimators=n)
    rfc.fit(train_data, train_casual)
    preds_casual = rfc.predict(test_data)

    rfc = RandomForestClassifier(n_estimators=n)
    rfc.fit(train_data, train_registered)
    preds_registered = rfc.predict(test_data)

    preds = preds_casual + preds_registered

    writecsv(output_filename, preds)


6492 results written to submission_rforest_10.csv
6492 results written to submission_rforest_50.csv
6492 results written to submission_rforest_100.csv


Kaggle submission results for Random Forest (n=100): 0.65084