# Bike Sharing Demand Kaggle project

W207 Final Project

Chris Murray, Rahul Ragunathan, Rajagopalan Mahadevan

https://www.kaggle.com/c/bike-sharing-demand/


In [20]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import datetime

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

## Define a function to read data file

In [21]:
def transform_data(data, is_test):
    day_of_week=[]
    months=[]
    hours=[]
    for i in range(data.shape[0]):
       day_of_week.append(datetime.datetime.weekday(datetime.datetime.strptime(data[i][0], "%Y-%m-%d %H:%M:%S")))
       #months.append(datetime.datetime.strptime(data[i][0], "%Y-%m-%d %H:%M:%S").month)
       hours.append(datetime.datetime.strptime(data[i][0], "%Y-%m-%d %H:%M:%S").hour)
    if not is_test:
      data1 = np.empty([data.shape[0], 11], dtype=int)
    else:
      data1 = np.empty([data.shape[0], 8], dtype=int)
    times = np.empty([data.shape[0]], dtype="S19")
    for i in range(data.shape[0]):
       data1[i][0] = data[i][7]      # humidity
       data1[i][1] = data[i][8]      # windspeed
       data1[i][2] = hours[i]        # hour of the day
       data1[i][3] = data[i][6]      # atemp
       data1[i][4] = day_of_week[i]  # day of the week
       data1[i][5] =  data[i][1]     # season    
           
       # Snow conditions
       if data[i][4]==4:             # Weather
         data1[i][6] = 3
       else:
         data1[i][6] = data[i][4]
       data1[i][7] =  data[i][3]      # Is it a working day
       
       # For test data the casual rental count, reg rental count, and total count 
       if not is_test:
         data1[i][8] = data[i][9]      # casual rentals
         data1[i][9] = data[i][10]     # registered rentals
         data1[i][10]= data[i][11]     # Total rental count
                               
       times[i] = data[i][0]         # datetime field
    
    return data1,times


def readdata(filename, is_test):
    # open the training file and extract column names
    datafile = open(filename)
    headers = datafile.readline()
    print headers

    # read data file and convert everything to integers
    if is_test:
        data1 = np.genfromtxt(datafile, delimiter=",", dtype=("|S19", int, int, int, int, int, int, int, int))
    else:
        data1 = np.genfromtxt(datafile, delimiter=",", dtype=("|S19", int, int, int, int, int, int, int, int, int, int, int))
    
    
    
    data, times = transform_data(data1, is_test)
    
    return times, headers, data


## Read training data

In [22]:
train_filename = "data/train.csv"
test_filename = "data/test.csv"

times, column_names_train, train = readdata(train_filename, False)

# extract all but the last 3 columns into train_data
train_data = train[:, :-3]

# extract the last 3 columns (casual + registered = count)
train_casual = train[:, 8]
train_registered = train[:, 9]
train_count = train[:, 10]

print "Train data headers:"
print column_names_train

print "Sample training data:"
print train_data[0:5]

print "\nSample training labels (count):"
print train_casual[0:5]
print train_registered[0:5]
print train_count[0:5]
print times


datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count

Train data headers:
datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count

Sample training data:
[[81  0  0 14  5  1  1  0]
 [80  0  1 13  5  1  1  0]
 [80  0  2 13  5  1  1  0]
 [75  0  3 14  5  1  1  0]
 [75  0  4 14  5  1  1  0]]

Sample training labels (count):
[3 8 5 3 0]
[13 32 27 10  1]
[16 40 32 13  1]
['2011-01-01 00:00:00' '2011-01-01 01:00:00' '2011-01-01 02:00:00' ...,
 '2012-12-19 21:00:00' '2012-12-19 22:00:00' '2012-12-19 23:00:00']


## Read test data


In [23]:
test_datetimes,column_names_test, test_data = readdata(test_filename,True )

# print a sample of the test data
print "\nTest data headers:"
print column_names_test
print test_data[0:5]


datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed


Test data headers:
datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed

[[56 26  0 11  3  1  1  1]
 [56  0  1 13  3  1  1  1]
 [56  0  2 13  3  1  1  1]
 [56 11  3 12  3  1  1  1]
 [56 11  4 12  3  1  1  1]]


## Define CSV output function

In [24]:
def writecsv(filename, predictions):
    
    # open output file and write header row
    outfile = open(filename, "w")
    outfile.write("datetime,count\n")

    # output all the predictions to the file
    for i in range(len(predictions)):
        outfile.write("{},{}\n".format(test_datetimes[i], predictions[i]))

    print "{} results written to {}".format(i, filename)


## Generate predictions

### K Nearest Neighbors

In [122]:
for k in [1,3,5,7,9]:
    output_filename = 'submission_knn_{}.csv'.format(k)

    # train a model and generate predictions
    kn = KNeighborsClassifier(n_neighbors=k)
    kn.fit(train_data, train_count)
    preds = kn.predict(test_data)
    
    writecsv(output_filename, preds)


6492 results written to submission_knn_1.csv
6492 results written to submission_knn_3.csv
6492 results written to submission_knn_5.csv
6492 results written to submission_knn_7.csv
6492 results written to submission_knn_9.csv


### Decision Tree

In [123]:
output_filename = 'submission_dtree_1.csv'
    
# train a model and generate predictions
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_count)
preds = clf.predict(test_data)

writecsv(output_filename, preds)


6492 results written to submission_dtree_1.csv


Predict **casual** and **registered** separately and then add together to get **count**

In [124]:
output_filename = 'submission_dtree_2.csv'

# train a model and generate predictions
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_casual)
preds_casual = clf.predict(test_data)

clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_registered)
preds_registered = clf.predict(test_data)

preds = preds_casual + preds_registered

writecsv(output_filename, preds)


6492 results written to submission_dtree_2.csv


Linear Regression

In [28]:
output_filename = 'submission_lm_2.csv'
# train GLM model
lm = LinearRegression()
lm.fit(train_data, train_casual)

# make casual prediction
preds_lm_casual = np.round(lm.predict(test_data))
preds_lm_casual[preds_lm_casual < 0] = 0
print preds_lm_casual

#cmake registered prediction
lm2 = LinearRegression()
lm2.fit(train_data, train_registered)
preds_lm_reg = np.round(lm2.predict(test_data))
preds_lm_reg[preds_lm_reg < 0] = 0
print preds_lm_reg
preds = preds_lm_casual + preds_lm_reg
print preds
writecsv(output_filename, preds)

[ 0.  0.  0. ...,  3.  9.  3.]
[  31.   35.   41. ...,  157.  172.  166.]
[  31.   35.   41. ...,  160.  181.  169.]
6492 results written to submission_lm_2.csv


In [19]:
# Instantiate the classifier

clf =   RandomForestClassifier(n_estimators=100, max_features=8)

# Fit the training data
clf.fit(train_data, train_casual)
print clf.feature_importances_


# Make predictions
preds1= clf.predict(test_data)

clf2 = RandomForestClassifier(n_estimators=100, max_features=8)
clf2.fit(train_data, train_registered)
print clf2.feature_importances_

preds2= clf2.predict(test_data)

preds = preds1 + preds2

            
print preds
# Write the predictions to a csv file
writecsv('RF.csv', preds)

[ 0.21978756  0.19465538  0.19444223  0.15889514  0.10758317  0.06409945
  0.04836203  0.01217505]
[ 0.2413638   0.19827992  0.12020111  0.1949944   0.11421827  0.05924704
  0.05614994  0.01554552]
[ 13   4   4 ...,  89 102  41]
6492 results written to RF.csv
