In [13]:
import pandas as pd
import numpy as np
from IPython.display import display
from os import listdir
sessions_df = pd.read_csv("data_toolkit/data/sessions.csv")
practices_df = pd.read_csv("data_toolkit/data/practices.csv")
ccgs_df = pd.read_csv("data_toolkit/data/ccgs.csv")

In [14]:
sessions_df.count()

id                      128618
practice_id             128618
posted_datetime         128618
start_datetime          128618
end_datetime            128618
hourly_rate             128618
original_hourly_rate     87316
status                  128618
locum_id                 47732
dtype: int64

In [15]:
print(sessions_df.status.unique())

['completed' 'withdrawn' 'expired' 'system_invalidated' 'posted' 'filled']


In [16]:
print("Completed: {}".format(sessions_df[sessions_df.status == 'completed'].count()[0]))
print("Filled: {}".format(sessions_df[sessions_df.status == 'filled'].count()[0]))
print("Expired: {}".format(sessions_df[sessions_df.status == 'expired'].count()[0]))

Completed: 40451
Filled: 3444
Expired: 27978


In [45]:
import datetime

length_seconds = lambda row: (row['end_datetime'] - row['start_datetime']).total_seconds()
one_hour = 60 * 60
is_short = lambda row: row['length_seconds'] <= 2 * one_hour
is_medium = lambda row: 2 * one_hour < row['length_seconds'] <= 4 * one_hour
is_long = lambda row: 4 * one_hour < row['length_seconds']


# These methods may exclude sessions that are very very long, but those
# appear to be outliers. Keeping it simple. 
def at_night(row): 
    return row['start_datetime'].time() >= datetime.time(18) or \
           row['end_datetime'].time() <= datetime.time(8)
   
   
def at_morning(row):
    start_in_morning = datetime.time(11) >= row['start_datetime'].time() >= datetime.time(6)
    end_in_morning = datetime.time(8) >= row['end_datetime'].time() >= datetime.time(12)
    if start_in_morning or end_in_morning:
        return True
    else:
        return False
    

def at_afternoon(row):
    start_in_aft = datetime.time(16) >= row['start_datetime'].time() > datetime.time(12)
    end_in_aft = datetime.time(20) >= row['end_datetime'].time() >= datetime.time(15)
    if start_in_aft or end_in_aft:
        return True
    else:
        return False

In [46]:
# Transform data
# Add new features that come from the session time
sessions_df['start_datetime'] = pd.to_datetime(sessions_df['start_datetime'])
sessions_df['end_datetime'] = pd.to_datetime(sessions_df['start_datetime'])

sessions_df['length_seconds'] = sessions_df[['start_datetime', 'end_datetime']].apply(
    length_seconds, axis=1
)
sessions_df['is_short'] = sessions_df[['length_seconds']].apply(
    is_short, axis=1
)
sessions_df['is_medium'] = sessions_df[['length_seconds']].apply(
    is_short, axis=1
)
sessions_df['is_long'] = sessions_df[['length_seconds']].apply(
    is_short, axis=1
)

sessions_df['at_night'] = sessions_df[['start_datetime', 'end_datetime']].apply(
    at_night, axis=1
)
sessions_df['at_afternoon'] = sessions_df[['start_datetime', 'end_datetime']].apply(
    at_afternoon, axis=1
)
sessions_df['at_morning'] = sessions_df[['start_datetime', 'end_datetime']].apply(
    at_morning, axis=1
)

In [50]:
# Clean data
# We care about Completed / Filled / Expired sessions
clean_df = sessions_df[
    (sessions_df.status == 'completed') |
    (sessions_df.status == 'expired') |
    (sessions_df.status == 'filled')
]
clean_df['filled'] = clean_df[['status']].apply(
    lambda row: row['status'] in ['completed', 'expired'], axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [51]:
display(clean_df.describe())

Unnamed: 0,id,practice_id,hourly_rate,original_hourly_rate,locum_id,length_seconds
count,71873.0,71873.0,71873.0,56544.0,43915.0,71873.0
mean,368640000.0,2139361.0,81.944086,82.027766,15224470.0,0.0
std,103349800.0,1279175.0,7.199402,6.891379,50011060.0,0.0
min,40166880.0,30112.0,60.0,1.85,20110.0,0.0
25%,401121200.0,301917.0,80.0,80.0,2012139.0,0.0
50%,401155000.0,3011153.0,80.0,80.0,2013710.0,0.0
75%,401185000.0,3011767.0,85.0,85.0,2015509.0,0.0
max,401220300.0,3012808.0,150.0,150.0,201711100.0,0.0


In [None]:
print("Dependency Scores")
for column in sessions_df.columns:
    
    temp_df = sessions_df.drop(column, 1)
    
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        temp_df, sessions_df[''], 
        train_size=int(sessions_df[column].count() * 0.25), 
        random_state=1
    )
    
    # TODO: Create a decision tree regressor and fit it to the training se
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(random_state=1)
    regressor.fit(X_train, y_train)
    score = regressor.score(X_test, y_test)
    print("{} score: {}".format(str(column), score))