### Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import pydot

In [3]:
data_pathname = "/Users/danielbrand/Documents/Uni/Master/1. Semester/Big Data Management/Final Project/final_full_station70_cyclic_<5_yesno.csv"

In [4]:
data = pd.read_csv(data_pathname)

#### Clean data to get rid of empty instance features

In [5]:
data = data.dropna(axis=0)

### Calculate class distribution of target variable

In [6]:
data.iloc[:,53].value_counts()

yes    884692
no     143756
Name: enough_bikes, dtype: int64

In [7]:
class_distribution = 884692/(143756+884692)
print(class_distribution)

0.8602204486760634


### Assign y as target variable

In [8]:
y = data.enough_bikes

### Assign X as features used to predict y

In [9]:
data_features = ['daytime_cyclic', 'day_is_wednesday', 'day_is_saturday',
       'day_is_sunday', 'is_weekend', 'month_is_march', 'month_is_june',
       'month_is_december', 'max_visibility_miles']

In [10]:
X = data[data_features]

In [11]:
clf = LogisticRegression(solver="liblinear").fit(X,y)

In [12]:
def cross_validate():
    val_scores = cross_val_score(clf, X, y, cv=5)
    mean_accuracy = val_scores.mean()
    standard_dev = val_scores.std() * 2
    print("Mean Accuracy: {} \t\t (+/- {})".format(mean_accuracy, standard_dev))

In [13]:
cross_validate()

Mean Accuracy: 0.8729872582535488 		 (+/- 0.017713874505093985)


### Predict values using a training and a testing split and the model from above

In [14]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

In [15]:
clf2 = LogisticRegression(solver="liblinear").fit(train_X, train_y)

In [16]:
val_predictions = clf2.predict(val_X)

### Creating a confusion matrix to calculate expected value

In [17]:
y_true = val_y
y_pred = val_predictions

In [18]:
conf_matr = confusion_matrix(y_true, y_pred, labels=["yes", "no"])

In [19]:
df = pd.DataFrame({'Positve':conf_matr[:,0],'Negative':conf_matr[:,1]}, index=["Yes", "No"])

In [20]:
df

Unnamed: 0,Positve,Negative
Yes,211190,10027
No,22429,13466


In [21]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

In [22]:
positive_rate = (tp + fp) / val_y.count()
negative_rate = (tn + fn) / val_y.count()
print(positive_rate)

0.9086273686175674


In [23]:
tp_rate = tp/ (tp + fp)
fp_rate = fp/ (tp + fp)
tn_rate = tn/ (tn + fn)
fn_rate = fn/ (tn + fn) 

###### After looking at the distribution of departures per hour over the course of a day we found that in the morning and afternoon there are two peaks with 6-16 departures per hour. We will focus on the morning as there is clearly the highest business value here. We expect that it takes an employee 15 minutes to restock the station, indicating up to 4 departures during that time. with 3 dollar per trip that makes 12 dollar potential revenue. With an average wage of 21 dollar/hour in the US, expecting that the employees are rather cheap with a wage of 12 dollar/hour. Therefore a restock costs 3 dollar. 

In [24]:
tp_benefit = 12 # our service is running and makes 12 dollar in 15 minutes
fp_benefit = 9  # our service is running and makes 12 dollar in 15 minutes, but we have to pay our employee 3 for the restock
tn_benefit = 9  # our service is running and makes 12 dollar in 15 minutes because our employee (3 dollar) restocked it
fn_benefit = 0  # our service is not running and makes no money. We also do not pay our employee

In [25]:
expected_value = positive_rate * ((tp_rate * tp_benefit) + (fp_rate * fp_benefit)) + negative_rate * ((tn_rate * tn_benefit) + (fn_rate * fn_benefit))

##### With a positive_rate of 86% we have a baseline average revenue of 10.32 dollar

In [26]:
expected_value

11.113191916363295

##### Our model raises the average revenue to 11.11 dollar