# Bike Sharing Demand Kaggle project

W207 Final Project

Chris Murray, Rahul Ragunathan, Rajagopalan Mahadevan

https://www.kaggle.com/c/bike-sharing-demand/


In [57]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

## Read training data

Data files have been downloaded into `data/` .

Fields are described here: https://www.kaggle.com/c/bike-sharing-demand/data

The last field **count** is what we are trying to predict.  It is equal to **casual + registered**.

In [75]:
train_filename = "data/train.csv"
test_filename = "data/test.csv"

# open the training file and extract column names
train_file = open(train_filename)
column_names_train = train_file.readline()

# really simple function that returns the hour as an integer
getHour = lambda x: int(x[11:13])

# convert string to float and round to integer
getRounded = lambda x: int(round(float(x)))

# read data file and convert everything to integers
train = np.genfromtxt(train_file, delimiter=",", dtype=int, converters=
                {0: getHour, 5: getRounded, 6: getRounded, 8: getRounded})

# print a sample of the train data
print "Train data headers:"
print column_names_train
print train[0:5]
print

# extract all but the last 3 columns into train_data
train_data = train[:, :-3]

# extract the last 3 columns (casual + registered = count)
train_casual = train[:, 9]
train_registered = train[:, 10]
train_count = train[:, 11]

print "Sample training data:"
print train_data[0:5]

print "\nSample training labels (count):"
print train_count[0:5]


Train data headers:
datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count

[[ 0  1  0  0  1 10 14 81  0  3 13 16]
 [ 1  1  0  0  1  9 14 80  0  8 32 40]
 [ 2  1  0  0  1  9 14 80  0  5 27 32]
 [ 3  1  0  0  1 10 14 75  0  3 10 13]
 [ 4  1  0  0  1 10 14 75  0  0  1  1]]

Sample training data:
[[ 0  1  0  0  1 10 14 81  0]
 [ 1  1  0  0  1  9 14 80  0]
 [ 2  1  0  0  1  9 14 80  0]
 [ 3  1  0  0  1 10 14 75  0]
 [ 4  1  0  0  1 10 14 75  0]]

Sample training labels (count):
[16 40 32 13  1]


## Read test data


In [53]:
# open the test file and extract column names
test_file = open(test_filename)
column_names_test = test_file.readline()

# read data file and convert everything to integers
test_data = np.genfromtxt(test_file, delimiter=",", dtype=int, converters=
                {0: getHour, 5: getRounded, 6: getRounded, 8: getRounded})
    
# print a sample of the test data
print "\nTest data headers:"
print column_names_test
print test_data[0:5]

# re-read file to get just the datetime field
test_file.seek(0)
column_names_test = test_file.readline()

test_datetimes = []
for line in test_file.readlines():
    datetime = line.split(',')[0]
    test_datetimes.append(datetime)



Test data headers:
datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed

[[ 0  1  0  1  1 11 11 56 26]
 [ 1  1  0  1  1 11 14 56  0]
 [ 2  1  0  1  1 11 14 56  0]
 [ 3  1  0  1  1 11 13 56 11]
 [ 4  1  0  1  1 11 13 56 11]]


## Define CSV output function

In [70]:
def writecsv(filename, predictions):
    
    # open output file and write header row
    outfile = open(filename, "w")
    outfile.write("datetime,count\n")

    # output all the predictions to the file
    for i in range(len(predictions)):
        outfile.write("{},{}\n".format(test_datetimes[i], preds[i]))

    print "{} results written to {}".format(i, output_filename)


## Generate predictions

### K Nearest Neighbors

In [72]:
for k in [1,3,5,7,9]:
    output_filename = 'submission_knn_{}.csv'.format(k)

    # train a model and generate predictions
    kn = KNeighborsClassifier(n_neighbors=k)
    kn.fit(train_data, train_count)
    preds = kn.predict(test_data)
    
    writecsv(output_filename, preds)


6492 results written to submission_knn_1.csv
6492 results written to submission_knn_3.csv
6492 results written to submission_knn_5.csv
6492 results written to submission_knn_7.csv
6492 results written to submission_knn_9.csv


### Decision Tree

In [77]:
output_filename = 'submission_dtree_1.csv'
    
# train a model and generate predictions
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_count)
preds = clf.predict(test_data)

writecsv(output_filename, preds)


6492 results written to submission_dtree_1.csv


Predict **casual** and **registered** separately and then add together to get **count**

In [78]:
output_filename = 'submission_dtree_2.csv'

# train a model and generate predictions
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_casual)
preds_casual = clf.predict(test_data)

clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_registered)
preds_registered = clf.predict(test_data)

preds = preds_casual + preds_registered

writecsv(output_filename, preds)


6492 results written to submission_dtree_2.csv
