In [1]:
import graphlab as gl

In [2]:
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 48)

This non-commercial license of GraphLab Create for academic use is assigned to gpinho@gmail.com and will expire on April 26, 2019.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1525983348.log


# MF1: Matrix Factorization with Explicit Data 2009-2015Y

In [12]:
url_2015 = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/physician-shared-patient-patterns-2015-days180_withHeader.csv'
url_2014 = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/physician-shared-patient-patterns-2014-days180_withHeader.csv'
url_2013 = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/physician-shared-patient-patterns-2013-days180_withHeader.csv'
url_2012 = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/physician-shared-patient-patterns-2012-days180_withHeader.csv'
url_2011 = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/physician-shared-patient-patterns-2011-days180_withHeader.csv'
url_2010 = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/physician-shared-patient-patterns-2010-days180_withHeader.csv'
url_2009 = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/physician-shared-patient-patterns-2009-days180_withHeader.csv'

In [13]:
usecols = ['Initial Physician NPI', 'Secondary Physician NPI', 'Number Unique Beneficiaries']
column_type_hints = [int, int, float]

In [14]:
datasets = [('2009', url_2009), ('2010', url_2010), ('2011', url_2011), ('2012', url_2012), ('2013', url_2013), ('2014', url_2014), ('2015', url_2015)]

In [25]:
def combine_datasets(datasets, usecols=usecols, column_type_hints=column_type_hints):
    
    left_year, left_file = datasets[0]
    left_sf = gl.SFrame.read_csv(left_file, usecols=usecols, column_type_hints=column_type_hints)
    left_sf.rename({'Number Unique Beneficiaries': left_year})
    
    for idx in range(1, len(datasets)):
        right_year, right_file = datasets[idx]
        right_sf = gl.SFrame.read_csv(right_file, usecols=usecols, column_type_hints=column_type_hints)
        right_sf.rename({'Number Unique Beneficiaries': right_year})
        left_sf = left_sf.join(right_sf, on=['Initial Physician NPI', 'Secondary Physician NPI'], how='outer')
        
    return left_sf

In [26]:
combined_sf = combine_datasets(datasets)

In [36]:
cols = combined_sf.column_names()
for idx in range(2, len(cols)):
    combined_sf = combined_sf.fillna(cols[idx], 0.0)

In [41]:
combined_sf.save('../data/physician-shared-patient-patterns-unique-beneficiaries-2009-2015')

In [44]:
combined_sf

Initial Physician NPI,Secondary Physician NPI,2009,2010,2011,2012,2013,2014,2015
1841205325,1881656536,18.0,0.0,0.0,0.0,0.0,0.0,0.0
1982790325,1992703540,1152.0,0.0,0.0,0.0,0.0,0.0,0.0
1841205283,1518076280,12.0,0.0,0.0,0.0,0.0,0.0,0.0
1841205283,1225175714,11.0,0.0,0.0,12.0,13.0,0.0,0.0
1841205275,1699717124,17.0,0.0,26.0,23.0,76.0,36.0,0.0
1841205275,1689641433,13.0,0.0,0.0,0.0,0.0,0.0,0.0
1437187473,1588702872,52.0,0.0,0.0,0.0,0.0,0.0,0.0
1538184569,1669443495,50.0,0.0,0.0,0.0,0.0,0.0,0.0
1841205275,1336134360,14.0,0.0,0.0,0.0,0.0,0.0,0.0
1841205275,1225038441,15.0,0.0,0.0,0.0,12.0,18.0,0.0


In [58]:
sum_col = combined_sf['2009']
for idx in range(3, len(cols)):
    col = cols[idx]
    sum_col += combined_sf[col]

In [60]:
average_col = sum_col / (len(cols) - 2)

In [63]:
edges_sf = gl.SFrame(combined_sf)

In [67]:
edges_sf.remove_columns(cols[2:])

Initial Physician NPI,Secondary Physician NPI
1841205325,1881656536
1982790325,1992703540
1841205283,1518076280
1841205283,1225175714
1841205275,1699717124
1841205275,1689641433
1437187473,1588702872
1538184569,1669443495
1841205275,1336134360
1841205275,1225038441


In [68]:
edges_sf.add_column(average_col, name='Referrals')

Initial Physician NPI,Secondary Physician NPI,Referrals
1841205325,1881656536,2.57142857143
1982790325,1992703540,164.571428571
1841205283,1518076280,1.71428571429
1841205283,1225175714,5.14285714286
1841205275,1699717124,25.4285714286
1841205275,1689641433,1.85714285714
1437187473,1588702872,7.42857142857
1538184569,1669443495,7.14285714286
1841205275,1336134360,2.0
1841205275,1225038441,6.42857142857


In [69]:
edges_sf.save('../data/edges_full')

In [72]:
edges_train_sf, edges_test_sf = edges_sf.random_split(0.9)

In [83]:
edges_train_sf.save('../data/edges_train')
edges_test_sf.save('../data/edges_test')

In [77]:
mf1_model = gl.recommender.factorization_recommender.create(edges_train_sf, user_id='Initial Physician NPI', item_id='Secondary Physician NPI', target='Referrals')

In [78]:
mf1_model.save('../model/mf1')

# MF2: Non Negative Matrix Factorization (NMF) with Explicit Data 2009-2015Y

In [79]:
mf2_model = gl.recommender.factorization_recommender.create(edges_train_sf, user_id='Initial Physician NPI', item_id='Secondary Physician NPI', target='Referrals', nmf=True)

In [82]:
mf2_model.save('../model/mf2')

In [84]:
edges_sf.show()

Canvas is accessible via web browser at the URL: http://localhost:46676/index.html
Opening Canvas in default web browser.


# MF3: Matrix Factorization with Explicit Data [Referrals 2009-2015Y] and with Side Data [Location]

In [86]:
physician_url = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/npidata_pfile_20050523-20180408_withHeader.csv'

In [87]:
mf3_physician_cols = ['NPI', 'Provider Business Practice Location Address City Name', 'Provider Business Practice Location Address State Name', 'Provider Business Practice Location Address Country Code (If outside US)']

In [88]:
mf3_physician_cols_types = [int, str, str, str]

In [102]:
physician_mf3_sf = gl.SFrame.read_csv(physician_url, usecols=mf3_physician_cols, column_type_hints=mf3_physician_cols_types)

In [103]:
physician_mf3_sf

NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2
1679576722,KEARNEY,NE,US
1588667638,JACKSONVILLE,FL,US
1497758544,FAYETTEVILLE,NC,US
1306849450,ATHENS,TX,US
1215930367,HOUSTON,TX,US
1023011178,NAPA,CA,US
1932102084,TOLEDO,OH,US
1841293990,NEW YORK,NY,US
1750384806,LUBBOCK,TX,US
1669475711,SUGAR LAND,TX,US


In [104]:
initial_physician_mf3_sf = gl.SFrame(physician_mf3_sf)
initial_physician_mf3_sf.rename({'NPI': 'Initial Physician NPI'})

Initial Physician NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2
1679576722,KEARNEY,NE,US
1588667638,JACKSONVILLE,FL,US
1497758544,FAYETTEVILLE,NC,US
1306849450,ATHENS,TX,US
1215930367,HOUSTON,TX,US
1023011178,NAPA,CA,US
1932102084,TOLEDO,OH,US
1841293990,NEW YORK,NY,US
1750384806,LUBBOCK,TX,US
1669475711,SUGAR LAND,TX,US


In [107]:
secondary_physician_mf3_sf = gl.SFrame(physician_mf3_sf)
secondary_physician_mf3_sf.rename({'NPI': 'Secondary Physician NPI'})

Secondary Physician NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2
1679576722,KEARNEY,NE,US
1588667638,JACKSONVILLE,FL,US
1497758544,FAYETTEVILLE,NC,US
1306849450,ATHENS,TX,US
1215930367,HOUSTON,TX,US
1023011178,NAPA,CA,US
1932102084,TOLEDO,OH,US
1841293990,NEW YORK,NY,US
1750384806,LUBBOCK,TX,US
1669475711,SUGAR LAND,TX,US


In [109]:
mf3_model = gl.recommender.factorization_recommender.create(edges_train_sf, user_id='Initial Physician NPI', item_id='Secondary Physician NPI', target='Referrals', user_data=initial_physician_mf3_sf, item_data=secondary_physician_mf3_sf)

In [110]:
mf3_model.save('../model/mf3')

# MF4: Matrix Factorization with Explicit Data [Referrals 2009-2015Y] and with Side Data [Location, Organization]

In [124]:
physician_url = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/npidata_pfile_20050523-20180408_withHeader.csv'

In [125]:
mf4_physician_cols = mf3_physician_cols + ['Entity Type Code', 'Provider Gender Code', 'Is Sole Proprietor', 'Is Organization Subpart', 'Healthcare Provider Taxonomy Group_1']

In [126]:
mf4_physician_cols_types = mf3_physician_cols_types + [int, str, str, str, str]

In [127]:
physician_mf4_sf = gl.SFrame.read_csv(physician_url, usecols=mf4_physician_cols, column_type_hints=mf4_physician_cols_types)

In [123]:
physician_mf4_sf

NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2,Entity Type Code,Provider Gender Code
1679576722,KEARNEY,NE,US,1,M
1588667638,JACKSONVILLE,FL,US,1,M
1497758544,FAYETTEVILLE,NC,US,2,
1306849450,ATHENS,TX,US,1,M
1215930367,HOUSTON,TX,US,1,M
1023011178,NAPA,CA,US,2,
1932102084,TOLEDO,OH,US,1,M
1841293990,NEW YORK,NY,US,1,F
1750384806,LUBBOCK,TX,US,1,M
1669475711,SUGAR LAND,TX,US,1,F

Is Sole Proprietor,Is Organization Subpart,Healthcare Provider Taxonomy Group_1 ...
X,,
N,,
,N,
N,,
N,,
,N,
N,,
N,,
N,,
Y,,


In [137]:
entity_col = physician_mf4_sf.apply(lambda x: 'Individual' if x['Entity Type Code'] == 1 else ('Organization' if x['Entity Type Code'] == 2 else ''), dtype=str)

KeyError: 'Entity Type Code'

In [134]:
physician_mf4_sf.add_column(entity_col, 'Entity')

RuntimeError: Runtime Exception. Column Entity already exists.

In [135]:
physician_mf4_sf.remove_column('Entity Type Code')

NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2,Provider Gender Code
1679576722,KEARNEY,NE,US,M
1588667638,JACKSONVILLE,FL,US,M
1497758544,FAYETTEVILLE,NC,US,
1306849450,ATHENS,TX,US,M
1215930367,HOUSTON,TX,US,M
1023011178,NAPA,CA,US,
1932102084,TOLEDO,OH,US,M
1841293990,NEW YORK,NY,US,F
1750384806,LUBBOCK,TX,US,M
1669475711,SUGAR LAND,TX,US,F

Is Sole Proprietor,Is Organization Subpart,Healthcare Provider Taxonomy Group_1 ...,Entity
X,,,Individual
N,,,Individual
,N,,Organization
N,,,Individual
N,,,Individual
,N,,Organization
N,,,Individual
N,,,Individual
N,,,Individual
Y,,,Individual


In [152]:
ownership_col = physician_mf4_sf.apply(lambda x: 'Yes' if x['Is Sole Proprietor'] == 'X' else ('Yes' if x['Is Sole Proprietor'] == 'Y' else 'No'), dtype=str)

In [154]:
physician_mf4_sf.add_column(ownership_col, 'Sole Proprietor')

NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2,Provider Gender Code
1679576722,KEARNEY,NE,US,M
1588667638,JACKSONVILLE,FL,US,M
1497758544,FAYETTEVILLE,NC,US,
1306849450,ATHENS,TX,US,M
1215930367,HOUSTON,TX,US,M
1023011178,NAPA,CA,US,
1932102084,TOLEDO,OH,US,M
1841293990,NEW YORK,NY,US,F
1750384806,LUBBOCK,TX,US,M
1669475711,SUGAR LAND,TX,US,F

Is Sole Proprietor,Is Organization Subpart,Healthcare Provider Taxonomy Group_1 ...,Entity,Sole Proprietor
X,,,Individual,Yes
N,,,Individual,No
,N,,Organization,No
N,,,Individual,No
N,,,Individual,No
,N,,Organization,No
N,,,Individual,No
N,,,Individual,No
N,,,Individual,No
Y,,,Individual,Yes


In [155]:
physician_mf4_sf.remove_columns(['Is Sole Proprietor','Is Organization Subpart'])

NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2,Provider Gender Code
1679576722,KEARNEY,NE,US,M
1588667638,JACKSONVILLE,FL,US,M
1497758544,FAYETTEVILLE,NC,US,
1306849450,ATHENS,TX,US,M
1215930367,HOUSTON,TX,US,M
1023011178,NAPA,CA,US,
1932102084,TOLEDO,OH,US,M
1841293990,NEW YORK,NY,US,F
1750384806,LUBBOCK,TX,US,M
1669475711,SUGAR LAND,TX,US,F

Healthcare Provider Taxonomy Group_1 ...,Entity,Sole Proprietor
,Individual,Yes
,Individual,No
,Organization,No
,Individual,No
,Individual,No
,Organization,No
,Individual,No
,Individual,No
,Individual,No
,Individual,Yes


In [169]:
group_col = physician_mf4_sf.apply(lambda x: '' if x['Healthcare Provider Taxonomy Group_1'] == '' else ('Multi Specialty' if x['Healthcare Provider Taxonomy Group_1'][3] ==  '2' else ('Single Specialty' if x['Healthcare Provider Taxonomy Group_1'][3] == '4' else '')), dtype=str)                                   

In [170]:
physician_mf4_sf.add_column(group_col, 'Group Specialty')

NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2,Provider Gender Code
1679576722,KEARNEY,NE,US,M
1588667638,JACKSONVILLE,FL,US,M
1497758544,FAYETTEVILLE,NC,US,
1306849450,ATHENS,TX,US,M
1215930367,HOUSTON,TX,US,M
1023011178,NAPA,CA,US,
1932102084,TOLEDO,OH,US,M
1841293990,NEW YORK,NY,US,F
1750384806,LUBBOCK,TX,US,M
1669475711,SUGAR LAND,TX,US,F

Healthcare Provider Taxonomy Group_1 ...,Entity,Sole Proprietor,Group Specialty
,Individual,Yes,
,Individual,No,
,Organization,No,
,Individual,No,
,Individual,No,
,Organization,No,
,Individual,No,
,Individual,No,
,Individual,No,
,Individual,Yes,


In [172]:
physician_mf4_sf.remove_column('Healthcare Provider Taxonomy Group_1')

NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2,Provider Gender Code
1679576722,KEARNEY,NE,US,M
1588667638,JACKSONVILLE,FL,US,M
1497758544,FAYETTEVILLE,NC,US,
1306849450,ATHENS,TX,US,M
1215930367,HOUSTON,TX,US,M
1023011178,NAPA,CA,US,
1932102084,TOLEDO,OH,US,M
1841293990,NEW YORK,NY,US,F
1750384806,LUBBOCK,TX,US,M
1669475711,SUGAR LAND,TX,US,F

Entity,Sole Proprietor,Group Specialty
Individual,Yes,
Individual,No,
Organization,No,
Individual,No,
Individual,No,
Organization,No,
Individual,No,
Individual,No,
Individual,No,
Individual,Yes,


In [173]:
initial_physician_mf4_sf = gl.SFrame(physician_mf4_sf)
initial_physician_mf4_sf.rename({'NPI': 'Initial Physician NPI'})

Initial Physician NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2,Provider Gender Code
1679576722,KEARNEY,NE,US,M
1588667638,JACKSONVILLE,FL,US,M
1497758544,FAYETTEVILLE,NC,US,
1306849450,ATHENS,TX,US,M
1215930367,HOUSTON,TX,US,M
1023011178,NAPA,CA,US,
1932102084,TOLEDO,OH,US,M
1841293990,NEW YORK,NY,US,F
1750384806,LUBBOCK,TX,US,M
1669475711,SUGAR LAND,TX,US,F

Entity,Sole Proprietor,Group Specialty
Individual,Yes,
Individual,No,
Organization,No,
Individual,No,
Individual,No,
Organization,No,
Individual,No,
Individual,No,
Individual,No,
Individual,Yes,


In [174]:
secondary_physician_mf4_sf = gl.SFrame(physician_mf4_sf)
secondary_physician_mf4_sf.rename({'NPI': 'Secondary Physician NPI'})

Secondary Physician NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Business Practice Location Add ....2,Provider Gender Code
1679576722,KEARNEY,NE,US,M
1588667638,JACKSONVILLE,FL,US,M
1497758544,FAYETTEVILLE,NC,US,
1306849450,ATHENS,TX,US,M
1215930367,HOUSTON,TX,US,M
1023011178,NAPA,CA,US,
1932102084,TOLEDO,OH,US,M
1841293990,NEW YORK,NY,US,F
1750384806,LUBBOCK,TX,US,M
1669475711,SUGAR LAND,TX,US,F

Entity,Sole Proprietor,Group Specialty
Individual,Yes,
Individual,No,
Organization,No,
Individual,No,
Individual,No,
Organization,No,
Individual,No,
Individual,No,
Individual,No,
Individual,Yes,


In [175]:
mf4_model = gl.recommender.factorization_recommender.create(edges_train_sf, user_id='Initial Physician NPI', item_id='Secondary Physician NPI', target='Referrals', user_data=initial_physician_mf4_sf, item_data=secondary_physician_mf4_sf)

In [176]:
mf4_model.save('../model/mf4')

# MF5: Matrix Factorization with Explicit Data [Referrals 2009-2015Y] and with Side Data [Gender, City, State ]

In [178]:
physician_url = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/npidata_pfile_20050523-20180408_withHeader.csv'

In [179]:
mf5_physician_cols = ['NPI', 'Provider Business Practice Location Address City Name', 'Provider Business Practice Location Address State Name', 'Provider Gender Code']

In [182]:
mf5_physician_cols_types = [int, str, str, str]

In [183]:
physician_mf5_sf = gl.SFrame.read_csv(physician_url, usecols=mf5_physician_cols, column_type_hints=mf5_physician_cols_types)

In [184]:
initial_physician_mf5_sf = gl.SFrame(physician_mf5_sf)
initial_physician_mf5_sf.rename({'NPI': 'Initial Physician NPI'})

Initial Physician NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Gender Code
1679576722,KEARNEY,NE,M
1588667638,JACKSONVILLE,FL,M
1497758544,FAYETTEVILLE,NC,
1306849450,ATHENS,TX,M
1215930367,HOUSTON,TX,M
1023011178,NAPA,CA,
1932102084,TOLEDO,OH,M
1841293990,NEW YORK,NY,F
1750384806,LUBBOCK,TX,M
1669475711,SUGAR LAND,TX,F


In [185]:
secondary_physician_mf5_sf = gl.SFrame(physician_mf5_sf)
secondary_physician_mf5_sf.rename({'NPI': 'Secondary Physician NPI'})

Secondary Physician NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Gender Code
1679576722,KEARNEY,NE,M
1588667638,JACKSONVILLE,FL,M
1497758544,FAYETTEVILLE,NC,
1306849450,ATHENS,TX,M
1215930367,HOUSTON,TX,M
1023011178,NAPA,CA,
1932102084,TOLEDO,OH,M
1841293990,NEW YORK,NY,F
1750384806,LUBBOCK,TX,M
1669475711,SUGAR LAND,TX,F


In [187]:
mf5_model = gl.recommender.factorization_recommender.create(edges_train_sf, user_id='Initial Physician NPI', item_id='Secondary Physician NPI', target='Referrals', user_data=initial_physician_mf5_sf, item_data=secondary_physician_mf5_sf)

In [188]:
mf5_model.save('../model/mf5')

# MF6: Matrix Factorization with Explicit Data [Referrals 2009-2015Y] and with Side Data [Specialties, Gender, City, State]

In [216]:
specialty_url = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/nucc_taxonomy_180.csv'

In [217]:
specialty_cols = ['Code', 'Grouping', 'Classification', 'Specialization']

In [218]:
specialty_cols_types = [str, str, str, str]

In [219]:
specialty_mf6_sf = gl.SFrame.read_csv(specialty_url, usecols=specialty_cols, column_type_hints=specialty_cols_types)

In [220]:
specialty_mf6_sf.rename({'Code': 'Healthcare Provider Taxonomy Code_1'})

Healthcare Provider Taxonomy Code_1 ...,Grouping,Classification,Specialization
101Y00000X,Behavioral Health & Social Service Providers ...,Counselor,
101YA0400X,Behavioral Health & Social Service Providers ...,Counselor,Addiction (Substance Use Disorder) ...
101YM0800X,Behavioral Health & Social Service Providers ...,Counselor,Mental Health
101YP1600X,Behavioral Health & Social Service Providers ...,Counselor,Pastoral
101YP2500X,Behavioral Health & Social Service Providers ...,Counselor,Professional
101YS0200X,Behavioral Health & Social Service Providers ...,Counselor,School
102L00000X,Behavioral Health & Social Service Providers ...,Psychoanalyst,
102X00000X,Behavioral Health & Social Service Providers ...,Poetry Therapist,
103G00000X,Behavioral Health & Social Service Providers ...,Clinical Neuropsychologist ...,
103GC0700X,Behavioral Health & Social Service Providers ...,Clinical Neuropsychologist ...,Clinical


In [221]:
physician_url = 'https://s3-us-west-1.amazonaws.com/physician-referral-graph/npidata_pfile_20050523-20180408_withHeader.csv'

In [222]:
mf6_physician_cols = ['NPI', 'Provider Business Practice Location Address City Name', 'Provider Business Practice Location Address State Name', 'Provider Gender Code', 'Healthcare Provider Taxonomy Code_1']

In [223]:
mf6_physician_cols_types = [int, str, str, str, str]

In [224]:
physician_mf6_sf = gl.SFrame.read_csv(physician_url, usecols=mf6_physician_cols, column_type_hints=mf6_physician_cols_types)

In [226]:
physician_mf6_sf = physician_mf6_sf.join(specialty_mf6_sf, on='Healthcare Provider Taxonomy Code_1', how='left')

In [227]:
physician_mf6_sf.remove_column('Healthcare Provider Taxonomy Code_1')

NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Gender Code,Grouping
1679576722,KEARNEY,NE,M,Allopathic & Osteopathic Physicians ...
1588667638,JACKSONVILLE,FL,M,Allopathic & Osteopathic Physicians ...
1497758544,FAYETTEVILLE,NC,,Agencies
1306849450,ATHENS,TX,M,Allopathic & Osteopathic Physicians ...
1215930367,HOUSTON,TX,M,Other Service Providers
1023011178,NAPA,CA,,Agencies
1932102084,TOLEDO,OH,M,Allopathic & Osteopathic Physicians ...
1841293990,NEW YORK,NY,F,"Speech, Language and Hearing Service Provi ..."
1750384806,LUBBOCK,TX,M,Allopathic & Osteopathic Physicians ...
1669475711,SUGAR LAND,TX,F,Allopathic & Osteopathic Physicians ...

Classification,Specialization
Orthopaedic Surgery,
Internal Medicine,Cardiovascular Disease
"Hospice Care, Community Based ...",
Radiology,Diagnostic Radiology
Specialist,
"Hospice Care, Community Based ...",
Internal Medicine,Cardiovascular Disease
Audiologist,
Internal Medicine,
Pediatrics,


In [228]:
initial_physician_mf6_sf = gl.SFrame(physician_mf6_sf)
initial_physician_mf6_sf.rename({'NPI': 'Initial Physician NPI'})

Initial Physician NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Gender Code
1679576722,KEARNEY,NE,M
1588667638,JACKSONVILLE,FL,M
1497758544,FAYETTEVILLE,NC,
1306849450,ATHENS,TX,M
1215930367,HOUSTON,TX,M
1023011178,NAPA,CA,
1932102084,TOLEDO,OH,M
1841293990,NEW YORK,NY,F
1750384806,LUBBOCK,TX,M
1669475711,SUGAR LAND,TX,F

Grouping,Classification,Specialization
Allopathic & Osteopathic Physicians ...,Orthopaedic Surgery,
Allopathic & Osteopathic Physicians ...,Internal Medicine,Cardiovascular Disease
Agencies,"Hospice Care, Community Based ...",
Allopathic & Osteopathic Physicians ...,Radiology,Diagnostic Radiology
Other Service Providers,Specialist,
Agencies,"Hospice Care, Community Based ...",
Allopathic & Osteopathic Physicians ...,Internal Medicine,Cardiovascular Disease
"Speech, Language and Hearing Service Provi ...",Audiologist,
Allopathic & Osteopathic Physicians ...,Internal Medicine,
Allopathic & Osteopathic Physicians ...,Pediatrics,


In [229]:
secondary_physician_mf6_sf = gl.SFrame(physician_mf6_sf)
secondary_physician_mf6_sf.rename({'NPI': 'Secondary Physician NPI'})

Secondary Physician NPI,Provider Business Practice Location Add ...,Provider Business Practice Location Add ....1,Provider Gender Code
1679576722,KEARNEY,NE,M
1588667638,JACKSONVILLE,FL,M
1497758544,FAYETTEVILLE,NC,
1306849450,ATHENS,TX,M
1215930367,HOUSTON,TX,M
1023011178,NAPA,CA,
1932102084,TOLEDO,OH,M
1841293990,NEW YORK,NY,F
1750384806,LUBBOCK,TX,M
1669475711,SUGAR LAND,TX,F

Grouping,Classification,Specialization
Allopathic & Osteopathic Physicians ...,Orthopaedic Surgery,
Allopathic & Osteopathic Physicians ...,Internal Medicine,Cardiovascular Disease
Agencies,"Hospice Care, Community Based ...",
Allopathic & Osteopathic Physicians ...,Radiology,Diagnostic Radiology
Other Service Providers,Specialist,
Agencies,"Hospice Care, Community Based ...",
Allopathic & Osteopathic Physicians ...,Internal Medicine,Cardiovascular Disease
"Speech, Language and Hearing Service Provi ...",Audiologist,
Allopathic & Osteopathic Physicians ...,Internal Medicine,
Allopathic & Osteopathic Physicians ...,Pediatrics,


In [230]:
mf6_model = gl.recommender.factorization_recommender.create(edges_train_sf, user_id='Initial Physician NPI', item_id='Secondary Physician NPI', target='Referrals', user_data=initial_physician_mf6_sf, item_data=secondary_physician_mf6_sf)

In [None]:
mf6_model.save('../model/mf6')

# Final Test on MF2

In [3]:
mf2_model = gl.load_model('../model/mf2')

In [4]:
edges_test_sf = gl.SFrame('../data/edges_test')

In [6]:
mf2_predictions = mf2_model.predict(edges_test_sf)

In [9]:
from sklearn import metrics

In [16]:
y_true = edges_test_sf['Referrals'].to_numpy()

In [14]:
y_pred = mf2_predictions.to_numpy()

In [17]:
mf2_rmse = metrics.mean_squared_error(y_true, y_pred)

In [18]:
mf2_r2 = metrics.r2_score(y_true, y_pred)

In [19]:
mf2_rmse

4252.012493466882

In [20]:
mf2_r2

0.12916815831150474

# Final Test on MF6

In [22]:
mf6_model = gl.load_model('../model/mf6')

In [23]:
mf6_predictions = mf6_model.predict(edges_test_sf)

In [24]:
y_true = edges_test_sf['Referrals'].to_numpy()

In [25]:
y_m6_pred = mf6_predictions.to_numpy()

In [53]:
mf6_rmse = np.sqrt(metrics.mean_squared_error(y_true, y_m6_pred))

In [54]:
mf6_rmse

55.71300671087123

In [29]:
mf2_r6 = metrics.r2_score(y_true, y_m6_pred)

In [30]:
mf2_r6

0.36429890041596924

# Don't forget to compare these metrics to the baseline, mean model

In [33]:
edges_train_sf = gl.SFrame('../data/edges_train')

In [36]:
import numpy as np

In [40]:
y_pred_average = np.full(len(edges_test_sf), edges_train_sf['Referrals'].mean())

In [51]:
average_rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred_average))

In [44]:
average_r2 = metrics.r2_score(y_true, y_pred_average)

In [48]:
int(average_r2)

0

In [47]:
y_true.mean()

19.22259303226491

In [52]:
average_rmse

69.87633480013113

In [50]:
y_pred_average

array([19.21719408, 19.21719408, 19.21719408, ..., 19.21719408,
       19.21719408, 19.21719408])