In [1]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

from glob import glob
import functools
import os

import numpy as np
import pandas as pd

from pandas_linker import get_linker, run_linker

# Raise pandas' row display cap to 999 so the spot-check tables below are shown in full.
pd.options.display.max_rows = 999

from eurosfordoctors import utils
from eurosfordoctors import fixers
from eurosfordoctors import checks
from eurosfordoctors import dedupe
from eurosfordoctors import geocode
from eurosfordoctors import export

# NOTE(review): presumably registers `DataFrame.progress_apply`, which the geocoding
# cells call later — confirm in eurosfordoctors.utils.
utils.progress_pandas_df()

In [2]:
# load_dataframe() writes one cleaned CSV per input file into data/cleaned/,
# so make sure the directory exists before anything is loaded.
os.makedirs('data/cleaned', exist_ok=True)

In [3]:
# Country code written to `base_country`/`origin`, used as the fallback for `country`
# and (lower-cased) passed to the geocoder.
DEFAULT_COUNTRY = 'PL'
# Reporting year written to the `year` column of every loaded row.
DEFAULT_YEAR = 2015

def parse_address_location_country_comma(s):
    """Split an ``"address, location, country"`` string into its parts.

    The string is split from the right on at most two commas, so any extra
    commas stay inside the ``address`` part. Parts that are absent (fewer
    than two commas in ``s``) are returned as empty strings rather than
    raising ``IndexError``.

    Returns a dict with the keys ``'country'``, ``'location'`` and
    ``'address'``; every value is stripped of surrounding whitespace.
    """
    parts = [part.strip() for part in s.rsplit(',', 2)]
    # Pad on the right so a bare address without commas still yields all keys.
    parts += [''] * (3 - len(parts))
    address, location, country = parts
    return {
        'country': country,
        'location': location,
        'address': address
    }

# Address parsers keyed by a tuple of company names that share the same parser.
# flatten_parsers() expands this to one entry per company for
# COMPANY_SETTINGS['address_rules'].
ADDRESS_PARSERS = {
    ('abbvie',): parse_address_location_country_comma,
}

def flatten_parsers(d):
    """Expand a mapping with tuple keys into ``(item, value)`` pairs.

    Every element of each tuple key is yielded together with that key's
    value, in the mapping's iteration order.
    """
    for names, parser in d.items():
        yield from ((name, parser) for name in names)

# Per-company cleaning configuration handed to fixers.fix_name, fixers.split_name
# and fixers.fix_address in load_dataframe(). Apart from 'address_rules', each entry
# is a list of company names (as derived from the raw CSV file names); how each
# flag is interpreted is decided inside `fixers`.
COMPANY_SETTINGS = {
    'bad_name_order': [],
    'comma_split_title': [],
    'comma_split_title_name': [],
    'semicolon_name_split': [],
    'last_name_capitals': [],
    'no_postcode': ['abbvie', 'bayer'],
    'proper_postcode': [],
    'no_pdf': ['bayer'],
    'hcp_company_in_address': [],
    # company name -> address parser, expanded from ADDRESS_PARSERS
    'address_rules': dict(flatten_parsers(ADDRESS_PARSERS))
}

def load_dataframe(filename, force_clean=False):
    """Load one raw company CSV, clean it and write the cleaned copy to disk.

    The company name is the part of the file's base name before the first
    underscore. Rows without a ``name`` are dropped, ``type`` is lower-cased
    and must be ``'hcp'`` or ``'hco'`` on every remaining row. Constant
    columns (country, origin, year, company) are filled in, and the frame is
    passed through ``fixers.make_money`` and then, row by row, through
    ``fixers.fix_name``, ``fixers.split_name`` and ``fixers.fix_address``.

    Parameters
    ----------
    filename : str
        Path of the raw CSV file.
    force_clean : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. It is also written to
        ``./data/cleaned/<basename>_cleaned.csv``.

    Raises
    ------
    AssertionError
        If any row has a ``type`` other than ``'hcp'`` / ``'hco'``.
    """
    print(filename)
    basename = os.path.basename(filename)
    basename = basename.split('.')[0]
    company = basename.split('_')[0]
    df = pd.read_csv(filename, encoding='utf-8', escapechar='\\', converters={'postcode': str, 'uci': str},
                     na_values=['-'])

    # Drop everything with empty name -> useless
    null_names = df['name'].isnull().sum()
    if null_names != 0:
        print('%d null names!' % null_names)
    # Take an explicit copy: the column assignments below would otherwise be made
    # on a filtered slice and trigger pandas' SettingWithCopyWarning.
    df = df[df['name'].notnull()].copy()

    df['type'] = df['type'].str.lower()
    valid_type = df['type'].isin(['hcp', 'hco'])
    assert valid_type.all(), 'unexpected recipient types: %r' % list(df.loc[~valid_type, 'type'].unique())

    # TODO: fix country
    df['recipient_detail'] = None
    df['base_country'] = DEFAULT_COUNTRY
    df['origin'] = DEFAULT_COUNTRY
    df['year'] = DEFAULT_YEAR
    df['company'] = company
    df = fixers.make_money(df)
    df = df.apply(lambda x: fixers.fix_name(x, COMPANY_SETTINGS), 1)
    df = df.apply(lambda x: fixers.split_name(x, COMPANY_SETTINGS), 1)
    df = df.apply(lambda x: fixers.fix_address(x, COMPANY_SETTINGS), 1)
    df['country'] = df['country'].apply(lambda x: fixers.fix_country(x, default=DEFAULT_COUNTRY))
    if 'postcode' in df:
        # Empty (falsy but non-null) postcodes become NaN; everything else is kept.
        df['postcode'] = df['postcode'].apply(lambda x: np.nan if pd.notnull(x) and not x else x)
    else:
        df['postcode'] = np.nan

    # Filled in later by the duplicate linker.
    df['uid'] = None

    if 'gender' not in df:
        df['gender'] = np.nan

    clean_name = './data/cleaned/%s_cleaned.csv' % basename
    df.to_csv(clean_name, index=False, encoding='utf-8')
    return df

In [4]:
# glob() returns paths in arbitrary (filesystem-dependent) order; sort them so the row
# order is the same on every run. reset_index() keeps each file's own row number in an
# 'index' column.
df = pd.concat([load_dataframe(filename) for filename in sorted(glob('data/pl/raw_csv/*.csv'))]).reset_index()
df.head()

data/pl/raw_csv/abbvie_pl_2015.csv
data/pl/raw_csv/bayer_pl_2015.csv


Unnamed: 0,index,address,base_country,clean_name,company,company_doc_id,country,currency,donations_grants,donations_grants_dirty,...,sponsorship_dirty,title,total,total_dirty,travel_accommodation,travel_accommodation_dirty,type,uci,uid,year
0,0,al.Wojska Polskiego 30,PL,tomasz-aleksiejczyk,abbvie,,PL,,,,...,,,2600.0,2600.0,,,hcp,,,2015
1,1,ul.Szpitalna 27/33,PL,alicja-bartkowska-sniatkowska,abbvie,,PL,,,,...,,,1849.0,1849.0,,,hcp,,,2015
2,2,ul.Roentgena 5,PL,witold-bartnik,abbvie,,PL,,,,...,,,4030.0,4030.0,,,hcp,,,2015
3,3,ul.Zjednoczenia 10,PL,marek-beniowski,abbvie,,PL,,,,...,,,13272.0,13272.0,9517.0,9517.0,hcp,,,2015
4,4,ul. 3 Maja 13/15,PL,szymon-bialka,abbvie,,PL,,,,...,,,154.0,154.0,154.0,154.0,hcp,,,2015


In [5]:
# Check the column types after loading. `postcode` presumably shows as float64 because
# it holds only NaN until the geocoding step fills it in.
df.dtypes

index                           int64
address                        object
base_country                   object
clean_name                     object
company                        object
company_doc_id                 object
country                        object
currency                       object
donations_grants              float64
donations_grants_dirty        float64
fees                          float64
fees_dirty                    float64
first_name                     object
gender                        float64
last_name                      object
location                       object
name                           object
origin                         object
postcode                      float64
recipient_detail               object
registration_fees             float64
registration_fees_dirty       float64
related_expenses              float64
related_expenses_dirty        float64
sponsorship                   float64
sponsorship_dirty             float64
title       

### Spot check addresses

In [6]:
# Address-related columns for a sample of rows from each company.
address_cols = ['company', 'title', 'name', 'address', 'location', 'postcode', 'country']
utils.company_sample(df)[address_cols]

Unnamed: 0,company,title,name,address,location,postcode,country
178,bayer,,Joanna Kubiak,ul.Ujejskiego 75,Bydgoszcz,,PL
277,bayer,,Comarch S.A,Ul. Aleja Jana Pawła Ii 39A,Kraków,,PL
309,bayer,,Mazowiecki Szpital Specjalistyczny Sp. z o.o,Ul. Aleksandrowicza 5,Radom,,PL
108,abbvie,,Polskie Towarzystwo Neonatologiczne,ul.Karowa 2,Warszawa,,PL
21,abbvie,,Maciej Gonciarz,pl.Medyków 1,Sosnowiec,,PL
116,abbvie,,SP Wojewódzki Szpital Zespolony,ul.Arkońska 4,Szczecin,,PL


### Spot check names

In [7]:
# Name-related columns for a sample of individual (hcp) recipients.
cname = ['company', 'gender', 'title', 'clean_name', 'name', 'first_name', 'last_name', 'type']
hcp_rows = df.loc[df['type'] == 'hcp']
utils.company_sample(hcp_rows)[cname]

Unnamed: 0,company,gender,title,clean_name,name,first_name,last_name,type
37,abbvie,,,przemyslaw-laniewski-wollk,Przemysław Łaniewski-Wołłk,Przemysław,Łaniewski-Wołłk,hcp
53,abbvie,,,malgorzata-przygodzka,Małgorzata Przygodzka,Małgorzata,Przygodzka,hcp
69,abbvie,,,jadwiga-swikszcz-gniadek,Jadwiga Świkszcz-Gniadek,Jadwiga,Świkszcz-Gniadek,hcp
192,bayer,,,ewa-wywial,Ewa Wywiał,Ewa,Wywiał,hcp
163,bayer,,Prof. Dr.,konrad-rejdak,Konrad Rejdak,Konrad,Rejdak,hcp
178,bayer,,,joanna-kubiak,Joanna Kubiak,Joanna,Kubiak,hcp


In [8]:
# Name columns for a sample of organisation (hco) recipients.
cname = ['company', 'recipient_detail', 'name']
hco_rows = df.loc[df['type'] == 'hco']
utils.company_sample(hco_rows)[cname]

Unnamed: 0,company,recipient_detail,name
251,bayer,,Wojewódzkie Centrum Szpitalne
245,bayer,,BCO Szpital Miejski-Poradnie
307,bayer,,Powiatowy Szpital Specjalistyczny W Stalowej Woli
90,abbvie,,Fundacja Rozwoju Neonatologii w Wielkopolsce
107,abbvie,,Polskie Towarzystwo Naukowe AIDS
93,abbvie,,Fundacja Watch Health Care


In [9]:
# Snapshot of all companies' cleaned rows, taken before duplicate detection and geocoding.
df.to_csv('./data/combined_cleaned.csv', index=False, encoding='utf-8')

### Find Dirty Totals

In [10]:
# NOTE(review): check_computed_total presumably returns the rows whose reported total
# disagrees with the sum of the payment categories — confirm in eurosfordoctors.checks.
# The result is saved for manual inspection.
dirty_total = checks.check_computed_total(df)
dirty_total.to_csv('data/dirty_totals.csv', index=False, encoding='utf-8')
dirty_total.head()

Unnamed: 0,company,index,name,address,total,total_dirty,computed_total,registration_fees,travel_accommodation,fees,donations_grants,sponsorship,related_expenses


### Find Duplicates

In [11]:
# The comparison window is three rows per company present in the data.
companies = df['company'].value_counts().index.tolist()
WINDOW_SIZE = len(companies) * 3

compare_rows = dedupe.compare_rows

# Run the linker once per sort order; it works on the 'uid' field of df.
with get_linker(df, field='uid') as linker:

    print('Comparing sort by name')
    linker(
        sort_cols=['name', 'address', 'location'],
        window_size=WINDOW_SIZE,
        cmp=compare_rows,
    )

    print('Comparing hcp sort by last name')
    linker(
        sort_cols=['last_name', 'address', 'location'],
        condition=(df['type'] == 'hcp'),
        window_size=WINDOW_SIZE,
        cmp=compare_rows,
    )

    print('Comparing sort by address')
    linker(
        sort_cols=['address', 'location', 'name'],
        window_size=WINDOW_SIZE,
        cmp=compare_rows,
    )

    print('Comparing sort by location,address')
    linker(
        sort_cols=['location', 'address', 'name'],
        window_size=WINDOW_SIZE,
        cmp=compare_rows,
    )


[ 22 %] Time elapsed: 00:00:00 | ETA: 00:00:00

Comparing sort by name


[ 17 %] Time elapsed: 00:00:00 | ETA: 00:00:00

Comparing hcp sort by last name


[ 28 %] Time elapsed: 00:00:00 | ETA: 00:00:00

Comparing sort by address


[ 31 %] Time elapsed: 00:00:00 | ETA: 00:00:00

Comparing sort by location,address


[ 98 %] Time elapsed: 00:00:00 | ETA: 00:00:00

In [12]:
# Number of rows vs. number of distinct uids; the difference is the linked duplicates.
print(len(df), df['uid'].nunique())

394 387


In [13]:
# Flag uids that occur on more than one row and keep only the flagged ones.
duplicates = df.groupby('uid').size().gt(1)
duplicates = duplicates[duplicates]
print('duplicates: ', len(duplicates))
(
    df[df['uid'].isin(duplicates.index)]
    .sort_values('uid')
    .loc[:, ['company', 'name', 'address', 'postcode', 'location', 'type']]
    .head(50)
)

duplicates:  7


Unnamed: 0,company,name,address,postcode,location,type
128,abbvie,Szpital Specjalistyczny im. J. Dietla,Ul.Skarbowa 4,,Kraków,hco
264,bayer,Szpital Specjalistyczny im. J. Dietla w Krakowie,Ul. Skarbowa 4,,Kraków,hco
130,abbvie,Uniwersytecki Szpital Kliniczny,Ul.Skłodowskiej-Curie 24A,,Białystok,hco
215,bayer,Uniwersytecki Szpital Kliniczny W Białymstoku,UL. M. SKŁODOWSKIEJ-CURIE 24 a,,Białystok,hco
146,abbvie,ZOZ Szpital w Suchej Beskidzkiej,Ul.Szpitalna 22,,Sucha Beskidzka,hco
322,bayer,ZOZ Szpital w Suchej Beskidzkiej,ul.Szpitalna 22,,Sucha Beskidzka,hco
106,abbvie,Polskie Towarzystwo Kardiologiczne,ul.Stawki 3A/1-2,,Warszawa,hco
341,bayer,Polskie Towarzystwo Kardiologiczne,Ul. Stawki 3A Lok. 1-2,,Warszawa,hco
116,abbvie,SP Wojewódzki Szpital Zespolony,ul.Arkońska 4,,Szczecin,hco
320,bayer,SP Wojewódzki Szpital Zespolony,ul.Arkońska 4,,Szczecin,hco


### Run Geocoding

In [14]:
# Geocode every row, with the country fixed to the lower-cased default country code.
geocode_row = functools.partial(geocode.run_geocoding, country=DEFAULT_COUNTRY.lower())
df = df.progress_apply(geocode_row, 1)

[100 %] Time elapsed: 00:00:02 | ETA: 00:00:00
Total time elapsed: 00:00:02


In [15]:
# Overwrite `postcode` with the row-wise result of geocode.get_postcode.
df['postcode'] = df.progress_apply(geocode.get_postcode, 1)

[ 99 %] Time elapsed: 00:00:01 | ETA: 00:00:00

In [16]:
# Keep the uids from the text-based passes so the effect of the geocoded pass can be
# inspected in the next cell.
df['uid_original'] = df['uid'].copy()
print('Comparing sort by lat,lng')

# NOTE(review): the partial binds no arguments, so it only wraps
# dedupe.compare_geocoded_rows — confirm whether the wrapper is needed.
compare_rows = functools.partial(dedupe.compare_geocoded_rows)
run_linker(df, sort_cols=['lat', 'lng', 'name'], window_size=WINDOW_SIZE, cmp=compare_rows)

[ 27 %] Time elapsed: 00:00:00 | ETA: 00:00:00

Comparing sort by lat,lng


[ 98 %] Time elapsed: 00:00:00 | ETA: 00:00:00

In [17]:
# Rows whose uid was changed by the geocoded linker pass.
df.loc[
    df['uid'] != df['uid_original'],
    ['name', 'address', 'lat', 'lng', 'location', 'uid', 'uid_original', 'company'],
]

Unnamed: 0,name,address,lat,lng,location,uid,uid_original,company
268,Sp Zoz Szpital Uniwersytecki W Krakowie,Ul. Kopernika 36,50.063024,19.951229,Kraków,506ebd61-6e6c-4750-91cc-384667bb2d13,d34ec775-189e-44f3-afff-3948fc52783d,bayer


In [18]:
# Recompute the shared uids now that the geocoded pass has run.
duplicates = df.groupby('uid').size().gt(1)
duplicates = duplicates[duplicates]
print('duplicates: ', len(duplicates))
(
    df[df['uid'].isin(duplicates.index)]
    .sort_values('uid')
    .loc[:, ['company', 'name', 'address', 'postcode', 'location', 'uid', 'uid_original']]
    .head(50)
)

duplicates:  8


Unnamed: 0,company,name,address,postcode,location,uid,uid_original
128,abbvie,Szpital Specjalistyczny im. J. Dietla,Ul.Skarbowa 4,33-332,Kraków,170d7600-7066-4430-8930-302e4f2a974d,170d7600-7066-4430-8930-302e4f2a974d
264,bayer,Szpital Specjalistyczny im. J. Dietla w Krakowie,Ul. Skarbowa 4,33-332,Kraków,170d7600-7066-4430-8930-302e4f2a974d,170d7600-7066-4430-8930-302e4f2a974d
118,abbvie,SP ZOZ Szpital Uniwersytecki,ul.Kopernika 36,,Kraków,506ebd61-6e6c-4750-91cc-384667bb2d13,506ebd61-6e6c-4750-91cc-384667bb2d13
268,bayer,Sp Zoz Szpital Uniwersytecki W Krakowie,Ul. Kopernika 36,,Kraków,506ebd61-6e6c-4750-91cc-384667bb2d13,d34ec775-189e-44f3-afff-3948fc52783d
130,abbvie,Uniwersytecki Szpital Kliniczny,Ul.Skłodowskiej-Curie 24A,,Białystok,51b6be13-f941-4e0b-b8f3-6f84e8599c29,51b6be13-f941-4e0b-b8f3-6f84e8599c29
215,bayer,Uniwersytecki Szpital Kliniczny W Białymstoku,UL. M. SKŁODOWSKIEJ-CURIE 24 a,,Białystok,51b6be13-f941-4e0b-b8f3-6f84e8599c29,51b6be13-f941-4e0b-b8f3-6f84e8599c29
146,abbvie,ZOZ Szpital w Suchej Beskidzkiej,Ul.Szpitalna 22,34-200,Sucha Beskidzka,908e72ab-c72d-49b4-a17f-975a1a609b99,908e72ab-c72d-49b4-a17f-975a1a609b99
322,bayer,ZOZ Szpital w Suchej Beskidzkiej,ul.Szpitalna 22,34-200,Sucha Beskidzka,908e72ab-c72d-49b4-a17f-975a1a609b99,908e72ab-c72d-49b4-a17f-975a1a609b99
106,abbvie,Polskie Towarzystwo Kardiologiczne,ul.Stawki 3A/1-2,,Warszawa,920db4f3-f61e-4ffb-af00-e16df18e1fe1,920db4f3-f61e-4ffb-af00-e16df18e1fe1
341,bayer,Polskie Towarzystwo Kardiologiczne,Ul. Stawki 3A Lok. 1-2,,Warszawa,920db4f3-f61e-4ffb-af00-e16df18e1fe1,920db4f3-f61e-4ffb-af00-e16df18e1fe1


In [19]:
# Save the row-level data with geocoding results and final uids.
df.to_csv('data/geocoded.csv', encoding='utf-8', index=False)

In [20]:
# NOTE(review): presumably collapses the rows sharing a uid into one entity row with a
# `payments` column — confirm in eurosfordoctors.export.
final_df = export.make_entities_df(df)
final_df.head()

[100 %] Time elapsed: 00:00:10 | ETA: 00:00:00
Total time elapsed: 00:00:10


Unnamed: 0,address,base_country,clean_name,company_doc_id,computed_total,country,donations_grants_dirty,fees_dirty,first_name,gender,...,registration_fees_dirty,related_expenses_dirty,sponsorship_dirty,title,total,total_dirty,travel_accommodation_dirty,type,uci,uid_original
0,Ul. Prosta 2/14 Lok 5,PL,,BAYER-2015-61549,14000,PL,14000.0,,,,...,,,,,,,,hco,,005f8e35-0e26-4078-85b6-236e4a399588
1,Ul.Dojazd 34,PL,barbara-sobczak-moryson,BAYER-2015-61696,4298,PL,,,Barbara,,...,1802.0,,,,,,2496.0,hcp,,022a0287-1d01-4c93-ad4b-d86f391b84da
2,Strzelców Bytomskich 11,PL,,BAYER-2015-61589,600,PL,600.0,,,,...,,,,,,,,hco,,024f0148-814a-4d82-820e-bc11afb3b4b1
3,Ul.Traugutta 112 E,PL,,,15598,PL,15598.0,,,,...,,,,,15598.0,15598.0,,hco,,02bb48f1-b8d5-42b1-bf31-da45b88319c7
4,ul.Długa 1/2,PL,tomasz-losy,,2000,PL,,2000.0,Tomasz,,...,,,,,2000.0,2000.0,,hcp,,03ff4073-6e55-4368-a6c4-7d7f28645d96


In [21]:
# Adds the `slug_raw` and `slug` columns to the entity frame.
final_df = export.make_slugs(final_df)
final_df.head()

Unnamed: 0,address,base_country,clean_name,company_doc_id,computed_total,country,donations_grants_dirty,fees_dirty,first_name,gender,...,sponsorship_dirty,title,total,total_dirty,travel_accommodation_dirty,type,uci,uid_original,slug_raw,slug
0,Ul. Prosta 2/14 Lok 5,PL,,BAYER-2015-61549,14000,PL,14000.0,,,,...,,,,,,hco,,005f8e35-0e26-4078-85b6-236e4a399588,Fundacja-Walki-Ze-Slepota-I-Rehabilitacji-Slab...,Fundacja-Walki-Ze-Slepota-I-Rehabilitacji-Slab...
1,Ul.Dojazd 34,PL,barbara-sobczak-moryson,BAYER-2015-61696,4298,PL,,,Barbara,,...,,,,,2496.0,hcp,,022a0287-1d01-4c93-ad4b-d86f391b84da,Barbara-Sobczak-Moryson-Poznan-pl,Barbara-Sobczak-Moryson-Poznan-pl
2,Strzelców Bytomskich 11,PL,,BAYER-2015-61589,600,PL,600.0,,,,...,,,,,,hco,,024f0148-814a-4d82-820e-bc11afb3b4b1,Polskie-Towarzystwo-Lekarskie-Zarzad-O-W-Katow...,Polskie-Towarzystwo-Lekarskie-Zarzad-O-W-Katow...
3,Ul.Traugutta 112 E,PL,,,15598,PL,15598.0,,,,...,,,15598.0,15598.0,,hco,,02bb48f1-b8d5-42b1-bf31-da45b88319c7,Fundacja-Rozwoju-Ochrony-Zdrowia-Wroclaw-pl,Fundacja-Rozwoju-Ochrony-Zdrowia-Wroclaw-pl
4,ul.Długa 1/2,PL,tomasz-losy,,2000,PL,,2000.0,Tomasz,,...,,,2000.0,2000.0,,hcp,,03ff4073-6e55-4368-a6c4-7d7f28645d96,Tomasz-Losy-Poznan-pl,Tomasz-Losy-Poznan-pl


In [22]:
# List the columns that will be written to the final export.
final_df.columns

Index(['address', 'base_country', 'clean_name', 'company_doc_id',
       'computed_total', 'country', 'donations_grants_dirty', 'fees_dirty',
       'first_name', 'gender', 'index', 'last_name', 'lat', 'lng', 'location',
       'name', 'origin', 'payments', 'postcode', 'registration_fees_dirty',
       'related_expenses_dirty', 'sponsorship_dirty', 'title', 'total',
       'total_dirty', 'travel_accommodation_dirty', 'type', 'uci',
       'uid_original', 'slug_raw', 'slug'],
      dtype='object')

In [23]:
# Final export: the entity frame with slugs, written as UTF-8 CSV without the index.
final_df.to_csv('data/pl_final_geocoded.csv', index=False, encoding='utf-8')