In [1]:
import pandas as pd
import numpy as np

In [20]:
# Practice codes (with their `pct`) that appear in the prescribing data, restricted
# to practices whose `setting` is 4 in the practices table.
# The join is INNER rather than LEFT: in a LEFT JOIN the `setting = 4` condition in
# the ON clause only decides which right-hand rows match, so every prescribing
# practice was still returned and the filter had no effect.
# NOTE(review): setting 4 is presumably the code for standard GP practices - confirm.
# NOTE(review): DISTINCT is over (practice, pct), so a practice recorded under more
# than one pct yields more than one row - verify this is acceptable downstream.
q = '''
SELECT
  DISTINCT practice,
  pct
FROM
  ebmdatalab.hscic.normalised_prescribing_standard
INNER JOIN
  ebmdatalab.hscic.practices
ON
  practice = code
  AND setting = 4
ORDER BY
  practice
'''
practices = pd.read_gbq(q, 'ebmdatalab', verbose=False, dialect='standard')


# Indicator extract read from a local CSV; the code below relies on the columns
# 'Indicator Name', 'Area Type', 'Time period', 'Area Code' and 'Value'.
phe = pd.read_csv('indicators-GP.data.csv')


def _phe_indicator(data, indicator_name, time_period=None):
    """Return one indicator's values for GP-level rows, indexed by area code.

    Parameters
    ----------
    data : DataFrame
        The indicator table (see the columns listed where `phe` is read).
    indicator_name : str
        Exact value to match in the 'Indicator Name' column.
    time_period : optional
        When given, only rows whose 'Time period' equals this value are kept;
        when None, no time-period filter is applied.

    Returns
    -------
    DataFrame
        A single 'Value' column indexed by 'Area Code'.
    """
    mask = (data['Indicator Name'] == indicator_name) & (data['Area Type'] == 'GP')
    if time_period is not None:
        mask = mask & (data['Time period'] == time_period)
    return data.loc[mask, ['Area Code', 'Value']].set_index('Area Code')


# Deprivation is the only indicator taken without a time-period filter.
# NOTE(review): if this indicator has rows for several periods, each practice will
# appear more than once here - confirm against the file.
imd = _phe_indicator(phe, 'Deprivation score (IMD 2015)')

# The remaining indicators are restricted to the 2018 time period.
satisfaction = _phe_indicator(phe, '% who have a positive experience of their GP practice', 2018)
long_term_health = _phe_indicator(phe, '% with a long-standing health condition', 2018)
over_65 = _phe_indicator(phe, '% aged 65+ years', 2018)
under_18 = _phe_indicator(phe, '% aged under 18 years', 2018)



# QOF achievement rows, each tagged with the domain of its indicator.
qof = pd.read_csv('ACHIEVEMENT_EXCEPTIONS.csv')
qof_map = pd.read_csv('INDICATOR_MAPPINGS.csv')
qof = qof.merge(qof_map[['INDICATOR_CODE','DOMAIN_CODE']], how='inner', on='INDICATOR_CODE')

# Sum the achieved points per practice and domain. numeric_only=True keeps the
# aggregation to numeric columns; without it the remaining string columns
# (e.g. INDICATOR_CODE, MEASURE) are aggregated as well on current pandas, which
# leaves several column levels and breaks the droplevel/TOTAL steps below.
# NOTE(review): this assumes the file has exactly one numeric column - confirm.
achieved_points = qof.loc[qof.MEASURE == 'ACHIEVED_POINTS']
qof_all = achieved_points.groupby(['PRACTICE_CODE','DOMAIN_CODE']).sum(numeric_only=True)

# One row per practice, one column per domain code.
qof_all = qof_all.unstack(level=1)
qof_all.columns = qof_all.columns.droplevel(0)

# Plain addition on purpose: a practice missing any one domain gets NaN for TOTAL
# rather than a partial sum.
qof_all['TOTAL'] = qof_all['CL'] + qof_all['PH'] + qof_all['PHAS']



# Dispensing-patient counts per practice code, read from the
# dispensing_practices_jan2017 table and ordered by code.
q = '''
SELECT
  code,
  dispensing_patients
FROM
  ebmdatalab.bsa.dispensing_practices_jan2017
ORDER BY
  code ASC
'''
dispensing = pd.read_gbq(
    query=q,
    project_id='ebmdatalab',
    dialect='standard',
    verbose=False,
)



# Total prescription items per practice for months from 2018-01-01 to 2018-12-01
# inclusive.
q = '''
SELECT
  practice,
  SUM(items) AS total_items
FROM
  ebmdatalab.hscic.normalised_prescribing_standard
WHERE
  month >= TIMESTAMP("2018-01-01")
  AND month <= TIMESTAMP("2018-12-01")
GROUP BY
  practice
'''
prescribing_volume = pd.read_gbq(
    query=q,
    project_id='ebmdatalab',
    dialect='standard',
    verbose=False,
)



# Rural-urban classification per practice: the classification table is joined to the
# postcode-to-LSOA map on the LSOA code, and that map is joined to the practices
# table on `pcds = postcode`. `ruc11cd` is the first character of RUC11CD.
# NOTE(review): a postcode that appears more than once in the map would repeat the
# practice row - confirm the map is unique per postcode.
q = '''
SELECT
  code AS practice, ru.LSOA11NM, SUBSTR(RUC11CD,1,1) AS ruc11cd, RUC11
FROM
  ebmdatalab.ONS.small_area_rural_urban ru
INNER JOIN
  ebmdatalab.ONS.postcode_to_lsoa_map m
ON
  ru.LSOA11CD = m.lsoa11cd
INNER JOIN
  ebmdatalab.hscic.practices
ON
  pcds = postcode
'''
urban_rural = pd.read_gbq(
    query=q,
    project_id='ebmdatalab',
    dialect='standard',
    verbose=False,
)


# Mean of total_list_size per practice across months from 2018-01-01 to 2018-12-01
# inclusive, ordered by practice.
q = '''
SELECT
  practice,
  AVG(total_list_size) as list_size
FROM
  ebmdatalab.hscic.practice_statistics
WHERE
  month >= TIMESTAMP("2018-01-01")
  AND month <= TIMESTAMP("2018-12-01")
GROUP BY
  practice
ORDER BY
  practice
'''
list_size = pd.read_gbq(
    query=q,
    project_id='ebmdatalab',
    dialect='standard',
    verbose=False,
)



# Per-practice GP counts from a local CSV; it is later joined on its `practice` column.
gps_per_practice = pd.read_csv(filepath_or_buffer='gps_per_practice_sept_2017.csv')



# Measure id -> human-readable title. Each key is used as the suffix of a
# `practice_data_<id>` table name when the measure data is queried, so keys must
# match those table names exactly. Commented-out entries are measures deliberately
# left out of the set.
# NOTE(review): the individual `lp*` measures are presumably excluded because
# `lpzomnibus` already covers them - confirm.
measures = {
    'ktt9_cephalosporins': 'Antibiotic stewardship: co-amoxiclav, cephalosporins & quinolones (KTT9)',
    'ktt9_uti_antibiotics': 'Antibiotic stewardship: three-day courses for uncomplicated UTIs (KTT9)',
    'ktt9_antibiotics': 'Antibiotic stewardship: volume of antibiotic prescribing (KTT9)',
    'ciclosporin': 'Ciclosporin and tacrolimus oral preparations prescribed generically',
    'coproxamol': 'Co-proxamol',
    'desogestrel': 'Desogestrel prescribed as a branded product',
    'diltiazem': 'Diltiazem preparations (>60mg) prescribed generically',
    #'doacs':'Direct Oral Anticoagulants (DOACs)',
    'quetiapine': 'Extended-release quetiapine',
    'glaucoma': 'Glaucoma eye drops prescribed by brand',
    'ace': 'High-cost ACE inhibitors',
    'arb': 'High-cost ARBs',
    'sildenafil': 'High-cost drugs for erectile dysfunction',
    'ppi': 'High-cost PPIs',
    'statins': 'High-cost statins',
    'icsdose': 'High dose inhaled corticosteroids',
    'opioidspercent': 'High dose opioids as percentage regular opioids',
    'opioidper1000': 'High dose opioids per 1000 patients',
    'ppidose': 'Higher dose Proton Pump Inhibitors (PPIs)',
    'keppra': 'Keppra vs. levetiracetam',
    'ktt12_diabetes_insulin': 'Long-acting insulin analogues (KTT12)',
    'statinintensity': 'Low and medium intensity statins',
    'methotrexate': 'Methotrexate 10 mg tablets',
    'nebivolol': 'Nebivolol 2.5mg tablets',
    'lpzomnibus': 'NHS England Low Priority Treatment - All Low Priority Treatments',
    #'lpcoprox':'NHS England Low Priority Treatment - co-proxamol',
    #'lpdosulepin':'NHS England Low Priority Treatment - dosulepin',
    #'lpdoxazosin':'NHS England Low Priority Treatment - doxazosin modified release',
    #'lpfentanylir':'NHS England Low Priority Treatment - fentanyl immediate release',
    #'lpglucosamine':'NHS England Low Priority Treatment - glucosamine and chondroitin',
    #'lphomeopathy':'NHS England Low Priority Treatment - homeopathy',
    #'lplidocaine':'NHS England Low Priority Treatment - lidocaine plasters',
    #'lpliothyronine':'NHS England Low Priority Treatment - liothyronine',
    #'lplutein':'NHS England Low Priority Treatment - lutein and antioxidants',
    #'lpomega3':'NHS England Low Priority Treatment - omega-3 fatty acid compounds',
    #'lpoxycodone':'NHS England Low Priority Treatment - oxycodone and naloxone combination product',
    #'lptramadolpara':'NHS England Low Priority Treatment - paracetamol and tramadol combination',
    #'lpperindopril':'NHS England Low Priority Treatment - perindopril arginine',
    #'lprubefacients':'NHS England Low Priority Treatment - rubefacients',
    #'lptadalafil':'NHS England Low Priority Treatment - tadalafil once daily',
    #'lptravelvacs':'NHS England Low Priority Treatment - travel vaccines',
    #'lptrimipramine':'NHS England Low Priority Treatment - trimipramine',
    #'glutenfree':'Prescribing of gluten free products',
    'ktt13_nsaids_ibuprofen': 'Non-preferred NSAIDs and COX-2 inhibitors (KTT13)',
    'other_lipid_modifying_drugs': 'Other lipid-modifying drugs',
    'lyrica': 'Pregabalin prescribed as Lyrica',
    'dipyridamole': 'Prescribing of dipyridamole',
    'tramadol': 'Prescribing of high cost tramadol preparations ',
    #'pregabalin':'Prescribing of pregabalin',
    #'pregabalinmg':'Prescribing of pregabalin (total mg)',
    'trimethoprim': 'Prescribing of trimethoprim vs nitrofurantoin',
    'saba': 'Short acting beta agonist inhalers',
    'silver': 'Silver dressings',
    'solublepara': 'Soluble/effervescent forms of paracetamol and co-codamol',
    'fungal': 'Topical treatment of fungal nail infections',
    'vitb': 'Vitamin B complex',
    'bdzper1000': 'Anxiolytics and Hypnotics: Average Daily Quantity per 1000 patients',
    'bdzadq': 'Anxiolytics and Hypnotics: Average Daily Quantity per item',
    'opioidome': 'Prescribing of opioids (total oral morphine equivalence)',
}
# For every measure id, fetch each practice's mean percentile over months from
# 2018-01-01 to 2018-12-01 inclusive. The measure id is interpolated into the table
# name; this is only acceptable because the ids come from the hard-coded `measures`
# dict, never from external input.
measure_frames = []
for measure in measures:
    q = '''
    SELECT
      practice_id,
      AVG(percentile) as mean_percentile
    FROM
      ebmdatalab.measures.practice_data_%s
    WHERE
      month >= "2018-01-01"
      AND month <= "2018-12-01"
    GROUP BY
      practice_id
    ''' % (measure)

    df = pd.read_gbq(q, 'ebmdatalab', verbose=False, dialect='standard')
    df['measure'] = measure  # tag rows with the measure they came from
    measure_frames.append(df)

# Long table: one row per (practice, measure).
all_measures = pd.concat(measure_frames)

# One summary score per practice: the mean of its per-measure mean percentiles.
# The numeric column is selected explicitly because the frame also carries the string
# column `measure`, which a bare groupby().mean() cannot average on current pandas.
summ_measures = all_measures.groupby('practice_id')[['mean_percentile']].mean()


# Clinical-system market-share file: only the practice identifier (`ODS`) and the
# two supplier/system columns are loaded.
software = pd.read_csv(
    'GPSoC Marketshare - Jan 2018b.csv',
    usecols=['ODS', 'Principal Supplier', 'Principal System'],
)

In [21]:
# Preview the rows behind the patient-satisfaction indicator: GP-level rows for the
# 2018 time period, showing only the area code and value.
is_satisfaction = phe['Indicator Name'] == '% who have a positive experience of their GP practice'
is_gp_row = phe['Area Type'] == 'GP'
is_2018 = phe['Time period'] == 2018
mask = is_satisfaction & is_gp_row & is_2018
phe.loc[mask, ['Area Code', 'Value']]

Unnamed: 0,Area Code,Value
375872,A81001,83.627562
375873,A81002,80.971049
375874,A81004,87.585723
375875,A81005,84.936370
375876,A81006,87.409959
375877,A81007,85.123423
375878,A81009,71.234806
375879,A81011,85.690537
375880,A81012,80.630308
375881,A81013,96.751233


In [22]:
# List the distinct indicator names available in the indicator table.
phe.loc[:, 'Indicator Name'].unique()

array(['% aged 0 to 4 years', '% aged 5 to 14 years',
       '% aged under 18 years', '% aged 65+ years', '% aged 75+ years',
       '% aged 85+ years', 'Deprivation score (IMD 2015)',
       '% who have a positive experience of their GP practice',
       'IDACI (Income Depr. - Children)', '% satisfied with phone access',
       'IDAOPI (Income Depr. - Older People)',
       '% satisfied with practice appointment times',
       '% reporting good overall experience of making appointment',
       '% with a long-standing health condition',
       '% with caring responsibility',
       'Working status - Paid work or full-time education',
       'Working status - Unemployed', 'Total QOF points',
       'Nursing home patients', 'Life expectancy - MSOA based'],
      dtype=object)

In [23]:
# Assemble one practice-level table by left-joining every source onto `practices`,
# so a practice missing from a source keeps its row with NaN in that source's columns.
# NOTE: `all` shadows the Python builtin of the same name; the name is kept because
# the export cell refers to it.
# NOTE(review): a left merge repeats a practice row for every matching right-hand
# row, so any source with duplicate practice codes inflates the row count - worth
# asserting uniqueness of each source's key.
all = practices.merge(urban_rural, how='left', on='practice')

# summ_measures and qof_all are keyed by practice code in their index.
all = all.merge(summ_measures, how='left', left_on='practice', right_index=True)
all = all.merge(gps_per_practice, how='left', on='practice')
all = all.merge(qof_all, how='left', left_on='practice', right_index=True)

# Each indicator frame has a single column called 'Value', indexed by area code.
# Rename that column to its final name before merging so the names never collide.
phe_columns = [
    ('Value_imd', imd),
    ('Value_satisfaction', satisfaction),
    ('Value_long_term_health', long_term_health),
    ('Value_over_65', over_65),
    ('Value_under_18', under_18),
]
for column_name, indicator in phe_columns:
    all = all.merge(indicator.rename(columns={'Value': column_name}),
                    how='left',
                    left_on='practice',
                    right_index=True)

# The right-hand key `code` stays in the result as a separate column.
all = all.merge(dispensing, how='left', left_on='practice', right_on='code')
all = all.merge(prescribing_volume, how='left', on='practice')
all = all.merge(list_size, how='left', on='practice')
all = all.merge(software, how='left', left_on='practice', right_on='ODS')

#all.to_csv('methotrexate_for_analysis.csv')
# `ODS` only served as the join key for the software table.
all = all.drop(columns=['ODS'])
all.head()

Unnamed: 0,practice,pct,LSOA11NM,ruc11cd,RUC11,mean_percentile,num_gps_june,num_gps_sept,CL,PH,...,Value_satisfaction,Value_long_term_health,Value_over_65,Value_under_18,code,dispensing_patients,total_items,list_size,Principal Supplier,Principal System
0,A81001,00K,Stockton-on-Tees 014G,C,Urban city and town,0.43603,2.0,,425.69,97.0,...,83.627562,49.450467,23.323544,20.264317,A81001,0.0,94138.0,4097.083333,TPP,SystmOne
1,A81002,00K,Stockton-on-Tees 014F,C,Urban city and town,0.484001,12.0,12.0,435.0,97.0,...,80.971049,58.535567,21.194615,18.908872,A81002,0.0,451497.0,19829.75,TPP,SystmOne
2,A81003,00K,Hartlepool 007D,C,Urban city and town,,,,425.34,97.0,...,,,,,,,,,,
3,A81004,00M,Middlesbrough 015C,C,Urban city and town,0.514046,2.0,3.0,427.63,97.0,...,87.585723,50.589328,18.819479,18.937531,A81004,0.0,214788.0,10307.333333,TPP,SystmOne
4,A81005,00M,Redcar and Cleveland 018A,C,Urban city and town,0.333274,8.0,9.0,429.72,91.76,...,84.93637,63.431023,27.956587,17.851796,A81005,0.0,195485.0,8020.416667,TPP,SystmOne


In [24]:
# Write the merged practice-level table to CSV without the row index.
all.to_csv(path_or_buf='data_for_regression_2019_05.csv', index=False)