In [40]:
import graphlab as gl

In [54]:
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 48)

# Baseline: Matrix Factorization with Explicit Data 2015Y

In [42]:
baseline_mf_url = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/physician-shared-patient-patterns-2015-days180_withHeader.csv'

In [46]:
usecols = ['Initial Physician NPI', 'Secondary Physician NPI', 'Number Unique Beneficiaries']

In [48]:
baseline_mf_sf = gl.SFrame.read_csv(baseline_mf_url, usecols=usecols, column_type_hints=[int, int, float])

In [50]:
baseline_mf_sf_train, baseline_mf_sf_test = baseline_mf_sf.random_split(0.9)

In [51]:
baseline_mf_sf_train

Initial Physician NPI,Secondary Physician NPI,Number Unique Beneficiaries ...
1000000004,1790775229,12.0
1000026017,1598773715,23.0
1000310429,1144645573,12.0
1003000126,1003951625,68.0
1003000126,1003975400,31.0
1003000126,1013051119,28.0
1003000126,1013902600,25.0
1003000126,1023027109,25.0
1003000126,1023029964,15.0
1003000126,1053306746,44.0


In [52]:
baseline_mf_model = gl.recommender.factorization_recommender.create(baseline_mf_sf_train, user_id='Initial Physician NPI', item_id='Secondary Physician NPI', target='Number Unique Beneficiaries')

In [57]:
baseline_mf_model.save('../model/baseline')

In [53]:
baseline_mf_eval = baseline_mf_model.evaluate(baseline_mf_sf_test)


Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    | 9.71178839727e-05 | 1.24423364948e-05 |
|   2    | 9.99742923248e-05 | 4.12946639805e-05 |
|   3    | 0.000100450360384 | 5.97533651377e-05 |
|   4    | 9.49755777086e-05 | 7.28070967445e-05 |
|   5    | 9.54040389614e-05 | 9.36263915548e-05 |
|   6    | 9.49755777086e-05 | 0.000113503319945 |
|   7    | 9.26292422765e-05 | 0.000126485596392 |
|   8    | 9.17621183124e-05 | 0.000138157724347 |
|   9    | 9.33093395032e-05 | 0.000158517520599 |
|   10   | 9.39758347853e-05 |  0.00017562861298 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]



('\nOverall RMSE: ', 87.56388030132392)

Per User RMSE (best)
+-----------------------+-------+-------------------+
| Initial Physician NPI | count |        rmse       |
+-----------------------+-------+-------------------+
|       1235204959      |   1   | 1.16958084675e-05 |
+-----------------------+-------+-------------------+
[1 rows x 3 columns]


Per User RMSE (worst)
+-----------------------+-------+---------------+
| Initial Physician NPI | count |      rmse     |
+-----------------------+-------+---------------+
|       1073610754      |  973  | 1702.35499043 |
+-----------------------+-------+---------------+
[1 rows x 3 columns]


Per Item RMSE (best)
+-------------------------+-------+-------------------+
| Secondary Physician NPI | count |        rmse       |
+-------------------------+-------+-------------------+
|        1174541114       |   1   | 0.000212417655895 |
+-------------------------+-------+-------------------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+---

In [58]:
baseline_mf_sf_train['Number Unique Beneficiaries'].mean()

37.58873342137935

In [66]:
baseline_mf_sf_train['Number Unique Beneficiaries'].std()

In [74]:
baseline_mf_sf.show()

Canvas is updated and available in a tab in the default browser.


In [75]:
baseline_mf_model.predict(baseline_mf_sf_test)

dtype: float
Rows: 6577167
[29.89374183278651, 6.769066558891247, 28.270759330802868, 31.22428153615565, 39.448966728263805, 29.883739219718883, 57.55620621305079, 37.61472819905848, 56.37817548375697, 33.46594892602534, 35.4931151733932, 33.92295645337672, 40.94651387792201, 29.3443616495666, 46.379675613456676, 42.820578323417614, 52.037629829460094, 45.45086072545619, 40.774424539619396, 49.46977256398768, 58.98856423955531, 21.995775924735973, 37.312797175460766, 15.488871322685192, 11.567042575889538, 28.614205585533092, 7.9660899506148795, 23.35399983983607, 24.0933987961349, 16.517703758293102, 25.212088810020397, 9.476412044578503, 13.562885509544323, 33.519629941993664, 38.11576865773768, 9.150790439659069, 36.156507717185924, 37.44723384242625, 13.963498102241466, 16.908394561820934, 42.616669879966686, 13.672273622566173, 33.983730541282604, 25.243554340415905, 99.47665427785486, 83.17562888722986, 16.740295635276745, 37.95246862035365, 13.278234706932018, 13.112536655479381

In [3]:
import numpy as np

In [4]:
edges_test_sf = np.array([10, 7])
mf2_predictions = np.array([12, 5])

In [7]:
mf2_rmse = (sum(((edges_test_sf - mf2_predictions) ** 2)) / len(edges_test_sf)) ** (1/2)

In [8]:
mf2_rmse

1