In [105]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import bokeh
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go

In [117]:
train_df = pd.read_csv('train.csv')

In [122]:
train_df['Graduation.Date'].min()

'01/01/1900'

In [7]:
train_df['License.Type']

array(['Dentist License'], dtype=object)

Title                   52
Address.1st.Line         0
Address.2nd.Line      2877
City                     0
State                    0
Zip.Code                 0
License.Type             0
License.Issue.Date       0
Dental.School            1
Graduation.Date          0
broker.ID                0
approach.date            0
convert                  0
dtype: int64

In [27]:
train_df['License.Issue.Date'] = pd.to_datetime(train_df.loc[:, 'License.Issue.Date'], )

In [28]:
train_df['approach.date'] = pd.to_datetime(train_df.loc[:, 'approach.date'])

In [29]:
train_df['Graduation.Date'] = pd.to_datetime(train_df.loc[:, 'Graduation.Date'])

In [30]:
date_today = dt.datetime.now()

In [31]:
train_df['time_since_license'] = train_df['License.Issue.Date'].apply(lambda x: date_today - x)

In [32]:
def get_days(timedelta_obj):
    obj = str(timedelta_obj)
    return int(obj.split(' ')[0])#get just the days

In [33]:
train_df.time_since_license = train_df.time_since_license.apply(lambda x: get_days(x))

In [35]:
train_df['approach_license_diff'] = train_df['approach.date'] - train_df['License.Issue.Date']

In [43]:
train_df['approach_license_diff'] = train_df['approach_license_diff'].apply(lambda x: get_days(x))

In [50]:
train_df['Graduation.Date'] = train_df['Graduation.Date'].apply(lambda x: x.toordinal())

In [51]:
train_df['License.Issue.Date'] = train_df['License.Issue.Date'].apply(lambda x: x.toordinal())

In [91]:
def make_dummies(test_col, train_unique_vals, col_name):

    """
    Return a df containing len(train_unique_vals) columns for 
    each unique value in train_unique_vals. If the test_col has more 
    unique values that are not seen in train_unique_vals, value
    will be 0
    """

    dummies = {}
    for val in train_unique_vals:
        dummies[col_name + '_' + val] = (test_col == val).astype(int)
    return pd.DataFrame(dummies, index = test_col.index)

def make_dummies_dataframe(data, categories):

    """
    creates dummy variables for multiple categories
    ex categories = ['city', 'phone'], make_dummies_dataframe(data, categories)
    """
    dummy_dfs = []
    for category in categories:
        temp_df = make_dummies(data[category], data[category].unique(), category)
        dummy_dfs.append(temp_df)
    for i in dummy_dfs:
        data_transformed = pd.concat([data, i], axis=1)
        data = data_transformed
    return data_transformed


def datetime_cleans(df):
    df['License.Issue.Date'] = pd.to_datetime(df.loc[:, 'License.Issue.Date'], )
    df['Graduation.Date'] = pd.to_datetime(df.loc[:, 'Graduation.Date'], )
    df['approach.date'] = pd.to_datetime(df.loc[:, 'approach.date'], )

    date_today = dt.datetime.now()
    df['years_in_state'] = df['License.Issue.Date'].apply(lambda x: date_today - x)
    df['years_in_state'] = round(df['years_in_state'].apply(lambda x: get_days(x)) / 365, 2)

    df['years_in_practice'] = df['Graduation.Date'].apply(lambda x: date_today - x)
    df['years_in_practice'] = round(df['years_in_practice'].apply(lambda x: get_days(x)) / 365, 2)

    categories = ['broker.ID', 'City']
    df = make_dummies_dataframe(df, categories)
    df.drop(columns = categories, inplace=True)

    return df


In [92]:
train_df = datetime_cleans(train_df)

In [112]:
train_df['years_in_practice'].head()

0    49.1
1    49.1
2    49.1
3    49.1
4    49.1
Name: years_in_practice, dtype: float64

In [109]:
# Create a trace
trace = go.Scatter(
    x = train_df.loc[:, 'years_in_practice'],
    y = train_df.loc[:,'convert'],
    mode = 'markers'
)



In [110]:
data = [trace]

# Plot and embed in ipython notebook!
py.iplot(data, filename='basic-scatter')

In [93]:
train_df_chris = train_df[train_df['broker.ID'] == 'chris']

KeyError: 'broker.ID'

In [94]:
train_df_karl = train_df[train_df['broker.ID'] == 'karl']

KeyError: 'broker.ID'

In [54]:
train_df_sarah = train_df[train_df['broker.ID'] == 'sarah']

In [55]:
train_df_sarah

Unnamed: 0,Title,Address.1st.Line,Address.2nd.Line,City,State,Zip.Code,License.Type,License.Issue.Date,Dental.School,Graduation.Date,broker.ID,approach.date,convert,time_since_license,approach_license_diff
4,DDS,6938 E. Gary,,Scottsdale,AZ,85254,Dentist License,722796,U of Detroit,717337,sarah,2017-08-27,0,14289,13772
7,DMD,"5406 W. Glenn Drive, Suite #4",,Glendale,AZ,85301,Dentist License,721728,U of OR,721354,sarah,2017-06-08,0,15357,14760
12,DDS,11875 N 110th Way,,Scottsdale,AZ,85259,Dentist License,733811,Marquette,733180,sarah,2018-05-01,0,3274,3004
21,DDS,8625 Riverside Dr,Unit 12,Parker,AZ,85344,Dentist License,729796,Marquette,726243,sarah,2017-06-04,0,7289,6688
22,DDS,4230 N 21st #14,,Phoenix,AZ,85016,Dentist License,731642,U of MN,731569,sarah,2017-06-12,0,5443,4850
23,DDS,13430 N Scottsdale Rd #100,,Scottsdale,AZ,85254,Dentist License,721888,Loyola/Chicago,721506,sarah,2018-12-04,0,15197,15144
29,DDS,1550 N Stapley Dr Unit 55,,Mesa,AZ,85203,Dentist License,717502,Georgetown,716971,sarah,2018-04-27,0,19583,19309
31,DDS,9172 W Black Hill Rd.,,Peoria,AZ,85383,Dentist License,732594,U of Detroit,731716,sarah,2016-06-23,0,4491,3544
35,DDS,11861 E Desert Trail Rd,,Scottsdale,AZ,85259,Dentist License,732103,U of MI,729873,sarah,2016-06-26,0,4982,4038
38,DMD,NARCH,PO Box 490,Littlefield,AZ,86432,Dentist License,732880,Case Western Reserve,732797,sarah,2016-04-26,0,4205,3200


In [58]:
X = train_df_sarah.loc[:, ['License.Issue.Date', 'Graduation.Date', 'time_since_license', 'approach_license_diff']]

In [59]:
y = train_df_sarah.loc[:, ['convert']]

In [64]:
train_X, test_X, train_y, test_y = train_test_split(X, y)

In [82]:
model = RandomForestClassifier(n_estimators=1000, max_depth=5)

In [83]:
model.fit(train_X, train_y)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [84]:
preds = model.predict_proba(test_X)[:,1]

In [85]:
log_loss(test_y, preds)

0.3013507054889863

In [88]:
list(zip(X.columns, model.feature_importances_))

[('License.Issue.Date', 0.23579891829159286),
 ('Graduation.Date', 0.2561720150311513),
 ('time_since_license', 0.22912205300820881),
 ('approach_license_diff', 0.27890701366904747)]