In [1]:
import pandas as pd
import numpy as np
import datetime
from neo4j import GraphDatabase
import requests
import json
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Set constants

# Credentials come from the environment (load_dotenv() above reads the .env file).
# Each constant is None when its variable is not set.
UK_API_KEY = os.environ.get('UK_COMPANYHOUSE_API_KEY')
# Read by search_nl_companies(); these were previously only present as comments,
# which left both names undefined.
NL_USER_NAME = os.environ.get('NL_COMPANYINFO_USER_NAME')
NL_PASSWORD = os.environ.get('NL_COMPANYINFO_PASSWORD')

# Steps

1. Search companies and get list
2. Discard irrelevant companies
3. Get officers from those companies
4. Get other companies that are linked to officers
5. Get persons with significant control
6. Download all filings

## TODO
- refactor functions - functions with the same purpose can be merged
- better error handling

# Define functions for UK Companies House

In [3]:
def search_uk_companies(query):
    '''Search Companies House for companies whose name includes ``query``.

    Every hit is printed as ``<index> <company name>``; the index is the
    position that ``filter_uk_companies`` expects in ``filter_out``.

    Parameters
    ----------
    query : str
        Text the company name must include.

    Returns
    -------
    list of dict
        The entries of the response's ``items`` list, in response order.
        Empty when the request fails or the response has no ``items``.
    '''
    url = 'https://api.company-information.service.gov.uk/advanced-search/companies'

    # Passing the query through ``params`` lets requests URL-encode it, so
    # names containing spaces, '&' or '#' are sent intact.
    response = requests.get(url,
                            params={'company_name_includes': query, 'size': 100},
                            auth=(UK_API_KEY, ''))
    if not response.ok:
        print(f'search for {query!r} failed with HTTP status {response.status_code}')
        return []

    companies = list(response.json().get('items', []))

    for i, company in enumerate(companies):
        print(i, company.get('company_name'))

    return companies

def filter_uk_companies(companies, filter_out):
    '''Drop the companies at the given positions and flatten the rest.

    Parameters
    ----------
    companies : list of dict
        Search hits as returned by ``search_uk_companies``.
    filter_out : iterable of int
        Positions in ``companies`` to discard; each one is reported with a
        ``filtered out <i>`` line.

    Returns
    -------
    pandas.DataFrame
        One row per kept company, nested fields flattened into dotted column
        names (``pd.json_normalize``), indexed 0..n-1. Empty when nothing is
        kept.
    '''
    excluded = set(filter_out)
    kept = []

    for i, company in enumerate(companies):
        if i in excluded:
            print(f'filtered out {i}')
        else:
            kept.append(company)

    # Normalising the whole list at once yields a unique row index and returns
    # an empty frame for an empty list, where pd.concat([]) would raise.
    return pd.json_normalize(kept)


def get_uk_officers(companies):
    '''Fetch every officer (past and present) of the given companies.

    Parameters
    ----------
    companies : pandas.DataFrame
        Must have a ``company_number`` column.

    Returns
    -------
    pandas.DataFrame
        One row per officer entry with dict columns unpacked by
        ``result_to_df``, plus an ``appointments`` column holding the
        officer's appointments link (None when the entry has none). A frame
        with only the ``appointments`` column and no rows when nothing was
        found.
    '''
    base_url = 'https://api.company-information.service.gov.uk/company'
    page_size = 50

    dfs_officers = []

    for company in companies['company_number']:
        start_index = 0

        while True:
            response = requests.get(f'{base_url}/{company}/officers',
                                    params={'start_index': start_index,
                                            'items_per_page': page_size},
                                    auth=(UK_API_KEY, ''))
            if not response.ok:
                print(f'could not get officers for {company} (HTTP status {response.status_code})')
                break

            result = response.json()
            items = result.get('items') or []
            if not items:
                break

            dfs_officers.append(result_to_df(items))

            # Advance by what was actually returned so that pages neither
            # overlap nor skip entries, whatever page size the server applies.
            start_index += len(items)
            total = result.get('total_results')
            if total is not None and start_index >= total:
                break

    if not dfs_officers:
        return pd.DataFrame(columns=['appointments'])

    df_officers = pd.concat(dfs_officers)

    if 'officer' in df_officers.columns:
        # Rows coming from pages without an officer link hold NaN here.
        df_officers['appointments'] = df_officers['officer'].apply(
            lambda x: x.get('appointments') if isinstance(x, dict) else None)
    else:
        df_officers['appointments'] = None

    return df_officers


def get_uk_officer_history(officers):
    '''Fetch all known appointments (past and present) of the given officers.

    Parameters
    ----------
    officers : pandas.DataFrame
        Must have an ``appointments`` column holding each officer's
        appointments link (a path such as ``/officers/<id>/appointments``).
        Missing links (None / NaN) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per appointment with dict columns unpacked by
        ``result_to_df``; an empty frame when nothing was found.
    '''
    base_url = 'https://api.company-information.service.gov.uk'
    page_size = 50

    dfs_officer_history = []

    for appointments in officers['appointments']:
        if not isinstance(appointments, str) or not appointments:
            continue

        start_index = 0

        while True:
            response = requests.get(f'{base_url}{appointments}',
                                    params={'start_index': start_index,
                                            'items_per_page': page_size},
                                    auth=(UK_API_KEY, ''))
            if not response.ok:
                # Give up on this officer; retrying the same page without
                # advancing would never terminate.
                print(f'could not find {appointments}')
                break

            result = response.json()
            items = result.get('items') or []
            if not items:
                break

            dfs_officer_history.append(result_to_df(items))

            # Advance by what was actually returned so pages do not overlap.
            start_index += len(items)
            total = result.get('total_results')
            if total is not None and start_index >= total:
                break

    if not dfs_officer_history:
        return pd.DataFrame()

    return pd.concat(dfs_officer_history)


def get_uk_ubos(companies):
    '''Fetch the persons with significant control of the given companies.

    Parameters
    ----------
    companies : pandas.DataFrame
        Must have a ``company_number`` column.

    Returns
    -------
    pandas.DataFrame
        One row per entry of each response's ``items`` list, with a
        ``company_number`` column added; an empty frame when nothing was
        found. Nested dicts are left as they are.

    Notes
    -----
    A single request without paging parameters is made per company.
    NOTE(review): if the endpoint pages its results, later pages are not
    read - confirm against the API documentation.
    '''
    url = 'https://api.company-information.service.gov.uk/company/'

    dfs_ubos = []

    for company_number in companies['company_number']:

        try:
            result = requests.get(f'{url}{str(company_number)}/persons-with-significant-control',
                                  auth=(UK_API_KEY, '')).json()
        except (requests.RequestException, ValueError):
            # ValueError covers a body that is not JSON (e.g. an empty one).
            print(f'could not parse {company_number}')
            continue

        items = result.get('items')
        if result.get('errors') or not items:
            continue

        df = pd.DataFrame.from_records(items)
        df['company_number'] = company_number
        dfs_ubos.append(df)

    if not dfs_ubos:
        return pd.DataFrame()

    return pd.concat(dfs_ubos)


def result_to_df(result):
    '''Turn a list of records into a DataFrame and unpack dict columns.

    A column is unpacked only when its value in the first row is a dict; the
    column is then replaced by one column per key, appended on the right.
    Dicts that only appear in later rows, or that emerge from an unpacked
    column, are left untouched.
    '''
    frame = pd.DataFrame.from_records(result)

    # Decide up front which columns to expand, based on the first row only.
    nested = []
    for column in frame.columns:
        if isinstance(frame[column][0], dict):
            nested.append(column)

    for column in nested:
        remainder = frame.drop([column], axis=1)
        expanded = frame[column].apply(pd.Series)
        frame = pd.concat([remainder, expanded], axis=1)

    return frame

In [7]:
# Create lists of dataframes

# One accumulator per pipeline stage; each run appends its result to these.
search_total, companies_total, officers_total = [], [], []
officers_history_total, ubos_total = [], []

In [66]:
# Search companies

company_name = 'BRITOIL'

companies = search_uk_companies(company_name)
search_total.append(companies)

0 BRITOIL SOLUTIONS LIMITED
1 BRITOIL OFFSHORE (UK) LIMITED
2 BRITOIL LIMITED
3 BRITOIL (TRUSTEES) LIMITED


In [67]:
# Filter companies with a list of index numbers

filter_out = []

# Get a dataframe of all filtered companies

df_companies = filter_uk_companies(companies, filter_out)

# Append dataframe to list of company dataframes

companies_total.append(df_companies)

In [68]:
df_companies.head()

Unnamed: 0,company_name,company_number,company_status,company_type,kind,date_of_creation,sic_codes,links.company_profile,registered_office_address.address_line_1,registered_office_address.address_line_2,registered_office_address.locality,registered_office_address.postal_code,registered_office_address.region,registered_office_address.country,date_of_cessation
0,BRITOIL SOLUTIONS LIMITED,07817029,active,ltd,search-results#company,2011-10-20,[82990],/company/07817029,Elizabeth House,13-19 London Road,Newbury,RG14 1JL,Berkshire,United Kingdom,
0,BRITOIL OFFSHORE (UK) LIMITED,08279974,dissolved,ltd,search-results#company,2012-11-05,[09100],/company/08279974,27 Old Gloucester Street,,London,WC1N 3AX,,United Kingdom,2020-09-22
0,BRITOIL LIMITED,SC077750,active,ltd,search-results#company,1982-03-04,[06100],/company/SC077750,1 Wellheads Avenue,Dyce,Aberdeen,AB21 7PB,,,
0,BRITOIL (TRUSTEES) LIMITED,SC061272,dissolved,ltd,search-results#company,1976-12-07,[7487],/company/SC061272,1 Wellheads Avenue,Dyce,Aberdeen,AB21 7PB,,,2011-09-27


In [69]:
# Create registration no column

df_companies['registration_no'] = df_companies['links.company_profile'].str.replace('/company/', '', regex=False)

In [70]:
# Write to csv

df_companies.to_csv(f'../data/company_info/registries/uk_companies_{company_name}.csv', index=False)

In [71]:
# Create dataframe of uk officers of these companies

df_officers = get_uk_officers(df_companies)

# Append dataframe to list of officer dataframes

officers_total.append(df_officers)

In [72]:
df_officers.head()

Unnamed: 0,officer_role,country_of_residence,name,occupation,nationality,appointed_on,locality,country,region,address_line_1,...,self,officer,resigned_on,date_of_birth,identification_type,registration_number,0,address_line_2,place_registered,appointments
0,director,United Kingdom,"BRITTAN, David Kenneth",Director,British,2011-10-20,Newbury,United Kingdom,Berkshire,13-19 London Road,...,/company/07817029/appointments/BcuqpMCC5ZQoTdN...,{'appointments': '/officers/TLmTgLyW-uYM5ik5rd...,,,,,,,,/officers/TLmTgLyW-uYM5ik5rdYNi9y6qoI/appointm...
1,director,England,"BRITTAN, Paula Joy",Director,British,2011-10-20,Newbury,United Kingdom,Berkshire,13-19 London Road,...,/company/07817029/appointments/7X4aciBtKFJUCtN...,{'appointments': '/officers/q0-pNFrw6G3K4-zs-s...,,,,,,,,/officers/q0-pNFrw6G3K4-zs-sNONLx-XBU/appointm...
0,director,United Kingdom,"MOLTONI, Peter",Company Director,Italian,2016-03-03,London,United Kingdom,,Old Gloucester Street,...,/company/08279974/appointments/Uccb39urF-nJHz0...,{'appointments': '/officers/s-_zJ3eAM08h2UX8dH...,,,,,,,,/officers/s-_zJ3eAM08h2UX8dHYAMeI2gUA/appointm...
1,director,England,"BAIN, Katie May",Company Director,British,2012-11-05,London,United Kingdom,,2 Exchange Court,...,/company/08279974/appointments/r7j2x_CUJ_U2cVS...,{'appointments': '/officers/nLgQFcQU0QmPjPQ7Lb...,2016-03-03,,,,,,,/officers/nLgQFcQU0QmPjPQ7LbTZRmqE-Dk/appointm...
2,director,Singapore,"HILL, David John",Managing Director,British,2012-11-05,London,United Kingdom,,Old Gloucester Street,...,/company/08279974/appointments/H20By3I69mbrw74...,{'appointments': '/officers/WWG14T9I5w1xmbGian...,2020-01-13,,,,,,,/officers/WWG14T9I5w1xmbGianm1jEU2FMU/appointm...


In [73]:
# Create a copy

df = df_officers.copy()

In [74]:
# Unpack a few columns that still contain dictionaries (refactor later)

dict_columns = ['date_of_birth', 'self']

for dict_col in dict_columns:
    df = pd.concat([df.drop([dict_col], axis=1), df[dict_col].apply(pd.Series)], axis=1)

In [75]:
df.to_csv(f'../data/company_info/registries/uk_officers_{company_name}.csv', index=False)

In [76]:
df_officers.head()

Unnamed: 0,officer_role,country_of_residence,name,occupation,nationality,appointed_on,locality,country,region,address_line_1,...,self,officer,resigned_on,date_of_birth,identification_type,registration_number,0,address_line_2,place_registered,appointments
0,director,United Kingdom,"BRITTAN, David Kenneth",Director,British,2011-10-20,Newbury,United Kingdom,Berkshire,13-19 London Road,...,/company/07817029/appointments/BcuqpMCC5ZQoTdN...,{'appointments': '/officers/TLmTgLyW-uYM5ik5rd...,,,,,,,,/officers/TLmTgLyW-uYM5ik5rdYNi9y6qoI/appointm...
1,director,England,"BRITTAN, Paula Joy",Director,British,2011-10-20,Newbury,United Kingdom,Berkshire,13-19 London Road,...,/company/07817029/appointments/7X4aciBtKFJUCtN...,{'appointments': '/officers/q0-pNFrw6G3K4-zs-s...,,,,,,,,/officers/q0-pNFrw6G3K4-zs-sNONLx-XBU/appointm...
0,director,United Kingdom,"MOLTONI, Peter",Company Director,Italian,2016-03-03,London,United Kingdom,,Old Gloucester Street,...,/company/08279974/appointments/Uccb39urF-nJHz0...,{'appointments': '/officers/s-_zJ3eAM08h2UX8dH...,,,,,,,,/officers/s-_zJ3eAM08h2UX8dHYAMeI2gUA/appointm...
1,director,England,"BAIN, Katie May",Company Director,British,2012-11-05,London,United Kingdom,,2 Exchange Court,...,/company/08279974/appointments/r7j2x_CUJ_U2cVS...,{'appointments': '/officers/nLgQFcQU0QmPjPQ7Lb...,2016-03-03,,,,,,,/officers/nLgQFcQU0QmPjPQ7LbTZRmqE-Dk/appointm...
2,director,Singapore,"HILL, David John",Managing Director,British,2012-11-05,London,United Kingdom,,Old Gloucester Street,...,/company/08279974/appointments/H20By3I69mbrw74...,{'appointments': '/officers/WWG14T9I5w1xmbGian...,2020-01-13,,,,,,,/officers/WWG14T9I5w1xmbGianm1jEU2FMU/appointm...


In [77]:
# Dedup officers

df_officers = df.drop_duplicates(subset=['appointments']).copy()

# Create dataframe of uk officer history of found officers

df_officer_history = get_uk_officer_history(df_officers)

# Append dataframe to list of officer history dataframes

officers_history_total.append(df_officer_history)

In [78]:
len(df_officer_history)

2280

In [79]:
df_officer_history.to_csv(f'../data/company_info/registries/uk_officer_history_{company_name}.csv', index=False)

In [80]:
# Create dataframe of uk ubos of companies in dataframe

df_ubos = get_uk_ubos(df_companies)

# Append dataframe to list of ubo dataframes

ubos_total.append(df_ubos)

In [81]:
# Clean up dictionaries in columns

df = df_ubos.copy()

dict_columns = ['identification', 'address', 'links', 'date_of_birth', 'name_elements']

for dict_col in dict_columns:
    df = pd.concat([df.drop([dict_col], axis=1), df[dict_col].apply(pd.Series)], axis=1)

In [82]:
# Clean up columns

# Columns kept for the UBO export, in output order.
ubo_columns = [
    'natures_of_control',
    'kind',
    'registration_number',
    'place_registered',
    'name',
    'title',
    'forename',
    #'middle_name',
    'surname',
    'month',
    'year',
    'nationality',
    'country_of_residence',
    'country_registered',
    'address_line_1',
    'premises',
    'postal_code',
    'locality',
    'country',
    'self',
    'notified_on',
    'ceased_on',
    'etag',
    'company_number',
]

# Registry field names mapped to the names used in the export.
ubo_column_names = {
    'self': 'url',
    'notified_on': 'start_date',
    'ceased_on': 'end_date',
    'name': 'full_name',
    'locality': 'city',
    'address_line_1': 'address',
    'month': 'dob_month',
    'year': 'dob_year',
}

# Selecting raises KeyError when any listed column is absent from df.
df = df.loc[:, ubo_columns].rename(columns=ubo_column_names)

In [83]:
df.head()

Unnamed: 0,natures_of_control,kind,registration_number,place_registered,full_name,title,forename,surname,dob_month,dob_year,...,address,premises,postal_code,city,country,url,start_date,end_date,etag,company_number
0,"[ownership-of-shares-25-to-50-percent, voting-...",individual-person-with-significant-control,,,Mrs Paula Joy Brittan,Mrs,Paula,Brittan,12.0,1967.0,...,13-19 London Road,Elizabeth House,RG14 1JL,Newbury,United Kingdom,/company/07817029/persons-with-significant-con...,2016-04-06,,b1449d9ba10cd310cd425394603ff33983ed6fb6,07817029
1,"[ownership-of-shares-50-to-75-percent, voting-...",individual-person-with-significant-control,,,Mr David Kenneth Brittan,Mr,David,Brittan,1.0,1966.0,...,13-19 London Road,Elizabeth House,RG14 1JL,Newbury,United Kingdom,/company/07817029/persons-with-significant-con...,2016-04-06,,d618849234047874014005a95228ad1e0fa6b5e1,07817029
0,[ownership-of-shares-75-to-100-percent],corporate-entity-person-with-significant-control,198500876g,Accounting And Corporate Regulatory Authority,Britoil Offshore Services Pte Ltd,,,,,,...,Tech Park Crescent,79,,Singapore,Singapore,/company/08279974/persons-with-significant-con...,2016-07-01,2019-11-15,c0c96c2f20babcc2eb8dde862df649637bb04397,08279974
1,[ownership-of-shares-50-to-75-percent],individual-person-with-significant-control,,,Mr David John Hill,Mr,David,Hill,5.0,1942.0,...,Old Gloucester Street,27,WC1N 3AX,London,United Kingdom,/company/08279974/persons-with-significant-con...,2016-07-01,,73566697a0d23651821981dbc78d06aead7232bf,08279974
0,"[ownership-of-shares-75-to-100-percent, voting...",corporate-entity-person-with-significant-control,00305943,"Registrar Of Companies, United Kingdom",Bp Exploration Operating Company Limited,,,,,,...,Chertsey Road,Bp Exploration Operating Company Limited,TW16 7BP,Sunbury-On-Thames,England,/company/SC077750/persons-with-significant-con...,2016-04-06,,69708476dfb259402f32831c39d316220ba17169,SC077750


In [84]:
df.to_csv(f'../data/company_info/registries/uk_ubos_{company_name}.csv', index=False)

# Instantiate database connection

In [None]:
begin_time = datetime.datetime.now()


class Neo4jConnection:
    '''Small wrapper around a Neo4j driver that runs one query per session.

    Errors while creating the driver or running a query are printed rather
    than raised.
    '''

    def __init__(self, uri, user, pwd):
        '''Store the connection settings and try to create the driver.'''
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        # Stays None when driver creation fails; query() asserts on it.
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        '''Close the driver if one was created.'''
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db=None):
        '''Run ``query`` in a fresh session and return its records as a list.

        ``db`` selects the database; None uses the driver's default session.
        Returns None when the query fails (the error is printed).
        '''
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            if db is None:
                session = self.__driver.session()
            else:
                session = self.__driver.session(database=db)
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            # The session is closed whether or not the query succeeded.
            if session is not None:
                session.close()
        return response

# Connection settings are read from the environment so that no credential is
# stored in the notebook. NEO4J_PASSWORD has no default and must be set (for
# example in the .env file loaded at the top of the notebook).
conn = Neo4jConnection(uri=os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
                       user=os.environ.get('NEO4J_USER', 'neo4j'),
                       pwd=os.environ.get('NEO4J_PASSWORD'))


In [None]:
# Create constraints companies

# IF NOT EXISTS makes the constraint creation safe to re-run, matching the
# index statement below.
conn.query('CREATE CONSTRAINT companyIdConstraint IF NOT EXISTS ON (company:Company) ASSERT company.companyId IS UNIQUE;')

# Create indexes
# NOTE(review): the uniqueness constraint above may already provide an index
# on Company.companyId, in which case this statement is redundant or
# rejected - confirm against the Neo4j version in use.

conn.query('''CREATE BTREE INDEX companyIdIndex IF NOT EXISTS
    FOR (c:Company)
    ON (c.companyId);''')



In [None]:
# Creates a Company node per search hit; properties are only set when the node
# is new (ON CREATE). Missing address parts are stored as the string 'NULL'.
company_merge_query = '''
                MERGE (c:Company {companyId: $v.company_number}) ON CREATE SET 
                c.companyName = $v.company_name,
                c.companyStatus =  $v.company_status,
                c.companyType = $v.company_type,
                c.companyUrl = $v.links.company_profile,
                c.companyCreationDate = $v.date_of_creation,
                c.companyAddress = COALESCE($v.registered_office_address.address_line_1, 'NULL'),
                c.companyCity = COALESCE($v.registered_office_address.locality, 'NULL'),
                c.companyCountry = COALESCE($v.registered_office_address.country, 'NULL')'''

for company in companies:
    # The whole company dict is passed as the single parameter $v.
    conn.query(company_merge_query, parameters={'v': company})

In [None]:
test = officers_history_total[0]
test

In [None]:
# CREATE OFFICER INDEX

# IF NOT EXISTS makes the constraint creation safe to re-run, matching the
# index statement below.
conn.query('''CREATE CONSTRAINT officerIdConstraint IF NOT EXISTS ON (officer:Officer) ASSERT officer.officerId IS UNIQUE;''')

# Create indexes
# NOTE(review): the uniqueness constraint above may already provide an index
# on Officer.officerId, in which case this statement is redundant or
# rejected - confirm against the Neo4j version in use.

conn.query('''CREATE BTREE INDEX officerIdIndex IF NOT EXISTS
    FOR (o:Officer)
    ON (o.officerId);''')

In [None]:
for companies in officers:
    for officer in companies['items']:
        print(officer)

In [None]:
# Store corporate officers (entries that carry an 'identification' block) as
# Company nodes and link each one to the company it serves.
# NOTE(review): assumes `officers` is a list of raw officer-list responses
# (dicts with an 'items' list). It is first assigned further down, in the
# Norway section, so confirm the intended input before running.
#
# The served company's number is taken from links.self, which looks like
# '/company/<number>/appointments/<id>'. Everything runs as one statement
# because a single query call cannot carry several ';'-separated statements.
corporate_officer_query = '''
    MERGE (o:Company {companyId: $v.identification.registration_number})
    ON CREATE SET
        o.companyName = $v.name,
        o.appointment = $v.links.officer.appointments
    MERGE (c:Company {companyId: split($v.links.self, '/')[2]})
    MERGE (o)-[r:ROLE {officerRole: $v.officer_role}]->(c)
    SET r.appointedOn = $v.appointed_on
'''

# The loop variable no longer reuses the name `companies`, which would
# overwrite the search results held under that name.
for officer_list in officers:
    items = officer_list.get('items', []) if isinstance(officer_list, dict) else []
    for officer in items:
        # MERGE rejects null property values, so entries without a
        # registration number are skipped.
        if not (officer.get('identification') or {}).get('registration_number'):
            continue
        conn.query(corporate_officer_query, parameters={'v': officer})

In [None]:
# Disabled draft of an `else:` branch that merges Officer nodes. It used to be
# wrapped in a triple-quoted string that itself contained ''' and therefore
# ended early, which made this cell a syntax error.
# NOTE(review): the Cypher below is unfinished (for example `{o:officerId: ...}`
# and two ';'-separated statements) - revise before enabling.
#
# else:
#             conn.query('''
#             MERGE (o:Officer {o:officerId: $v.links.self}) ON CREATE SET
#             o.officerName = $v.name,
#             o.officerDateOfBirth = COALESCE($v.date_of_birth.year + '-' + $v.date_of_birth.month, 'NULL'),
#             o.officerNationality = COALESCE($v.nationality, 'NULL'),
#             o.officerAddress = COALESCE($v.address.address_line_1 + ' ' + $v.address.premises + ', ' + $v.address.locality + ', ' + $v.address.country, 'NULL'),
#             o.officerOccupation = COALESCE($v.occupation, 'NULL');
#             MERGE (o:Officer {o:companyId:$v.links.self})-[:ROLE {o.officerRole: COALESCE($v.officer_role, 'NULL'), o.officerAppointedOn: COALESCE($v.appointed_on, 'NULL'),
#             o.officerResignedOn: COALESCE($v.resigned_on, 'NULL'),o.officerAppointmentId: COALESCE($v.links.officer.appointments, 'NULL')} ]->(c:Company {c.companyId: substring($v.links.self, 0, 17)});
#             ''', parameters = {'v': officer})

# NORWAY

In [None]:
def search_no_companies(query):
    '''Search the Norwegian register for companies by name.

    Requests ``no_url + query`` and prints every hit as
    ``<index> <navn> <self link>``.

    Parameters
    ----------
    query : str
        Company name to search for.

    Returns
    -------
    list of dict
        The entries under ``_embedded.enheter`` of the response; empty when
        the response has no such entries.
    '''
    result = requests.get(no_url + query).json()

    # Read defensively: a response without hits is treated as an empty list
    # instead of raising KeyError.
    companies = list((result.get('_embedded') or {}).get('enheter') or [])

    for i, company in enumerate(companies):
        # Name and link belong to each company, not to the response as a whole.
        link = ((company.get('_links') or {}).get('self') or {}).get('href')
        print(i, company.get('navn'), link)

    return companies


def filter_no_companies(companies, filter_out):
    '''Drop the companies at the given positions.

    Parameters
    ----------
    companies : list of dict or dict
        Either the list returned by ``search_no_companies`` or a raw search
        response holding the companies under ``_embedded.enheter``.
    filter_out : iterable of int
        Positions to discard.

    Returns
    -------
    list of dict
        The kept companies in their original order; each kept company's
        ``navn`` is printed.
    '''
    if isinstance(companies, dict):
        entries = (companies.get('_embedded') or {}).get('enheter') or []
    else:
        entries = companies

    excluded = set(filter_out)
    companies_clean = []

    for i, company in enumerate(entries):
        if i in excluded:
            continue
        companies_clean.append(company)
        print(company['navn'])

    return companies_clean


def get_no_officers(companies):
    '''Fetch the ``roller`` (roles) response of every given company.

    Parameters
    ----------
    companies : list of dict
        Companies with an ``organisasjonsnummer`` key.

    Returns
    -------
    list
        One parsed JSON response per company that could be fetched, in input
        order. Companies whose request or JSON parsing fails are reported and
        skipped.
    '''
    officers = []

    for company in companies:
        company_number = company.get('organisasjonsnummer')
        try:
            result = requests.get(f'https://data.brreg.no/enhetsregisteret/api/enheter/{company_number}/roller').json()
        except (requests.RequestException, ValueError):
            # ValueError covers a body that is not JSON. The f-string keeps
            # the message from failing when the company has no 'navn'.
            print(f"could not find officers for {company.get('navn')}", company_number)
            continue
        officers.append(result)

    return officers




# TODO: get the officer history for Norwegian (NO) officers

In [None]:
no_url = 'https://data.brreg.no/enhetsregisteret/api/enheter?navn='

In [None]:
companies = search_no_companies('CHEVRON')

In [None]:
# Positions (as printed by search_no_companies) of the hits to drop. An
# earlier run used [12, 13, 14, 15, 16, 17, 18, 19]; the empty list keeps
# every hit. The name was previously only defined in a comment.
to_filter = []
companies = filter_no_companies(companies, to_filter)

In [None]:
officers = get_no_officers(companies)

# NL

In [None]:
def search_nl_companies(query):
    '''Search Dutch companies by trade name.

    Uses the module-level ``NL_USER_NAME`` and ``NL_PASSWORD``, which must be
    set. Every hit is printed as ``<index> <name> <dossier_number>``.
    NOTE(review): ``xmltodict`` is not imported in the visible part of this
    notebook - confirm it is imported before this runs.

    Parameters
    ----------
    query : str
        Trade name to search for.

    Returns
    -------
    list of dict
        The ``response.results.entry`` items of the parsed response; empty
        when there are none.
    '''
    method = 'dutchBusinessSearchParametersV2'

    url = f'https://ws1.webservices.nl/rpc/get-simplexml/utf-8/{method}/{NL_USER_NAME}/{NL_PASSWORD}/trade_name={query}'

    result = xmltodict.parse(requests.get(url).text)

    results = (result.get('response') or {}).get('results') or {}
    entries = results.get('entry') or []
    # A lone <entry> element is presumably parsed as a single dict rather
    # than a one-item list - wrap it so the loop sees entries either way.
    if isinstance(entries, dict):
        entries = [entries]

    companies = []

    for i, entry in enumerate(entries):
        print(i, entry.get('name'), entry.get('dossier_number'))

        # Collect the entry itself, not the whole response once per entry.
        companies.append(entry)

    return companies



def filter_nl_companies(companies, names_to_keep):
    '''Keep only the companies at the given positions.

    Parameters
    ----------
    companies : list of dict or dict
        Either the list returned by ``search_nl_companies`` or a parsed
        search response holding the hits under ``response.results.entry``.
    names_to_keep : iterable of int
        Positions to keep.

    Returns
    -------
    list of dict
        The kept companies in their original order; each one is printed as
        ``<index> <name>``.
    '''
    if isinstance(companies, dict):
        results = (companies.get('response') or {}).get('results') or {}
        entries = results.get('entry') or []
        if isinstance(entries, dict):
            entries = [entries]
    else:
        entries = companies

    wanted = set(names_to_keep)
    companies_clean = []

    for i, company in enumerate(entries):
        if i not in wanted:
            continue
        companies_clean.append(company)
        print(i, company.get('name'))

    return companies_clean


In [None]:
spirit = search_nl_companies('SPIRIT ENERGY')

In [None]:
names_to_keep = [3]

In [None]:
companies = filter_nl_companies(spirit, names_to_keep)

In [None]:
# Credentials are read from the environment instead of being written into the
# notebook. NOTE(review): the values that used to be hard-coded in this cell
# should be treated as exposed and rotated.
user_name = os.environ.get('NL_COMPANYINFO_USER_NAME')
password = os.environ.get('NL_COMPANYINFO_PASSWORD')
method_parameters = '33211110'
query = 'SPIRIT ENERGY'
method = 'dutchBusinessSearchParametersV2'

url = f'https://ws1.webservices.nl/rpc/get-simplexml/utf-8/{method}/{user_name}/{password}/trade_name={query}'

# NOTE(review): xmltodict is not imported in the visible part of this
# notebook - confirm it is imported before this cell runs.
result = xmltodict.parse(requests.get(url).text)


#url = f'https://ws1.webservices.nl/rpc/get-simplexml/utf-8/{method_name}/{user_name}/{password}/{method_parameters}'

In [None]:
result

In [None]:
for i, entry in enumerate(result.get('response').get('results').get('entry')):
    print(i, entry.get('name'), entry.get('dossier_number'))

In [None]:
pos = requests.get(url)

In [None]:
pos.text

In [None]:
pd.read_xml(pos.text)

In [None]:
data_dict = xmltodict.parse(pos.text)

In [None]:
data_dict

In [None]:
method_name = 'dutchBusinessGetConcernRelationsDetails'
dossier_number = '33211110'
url = f'https://ws1.webservices.nl/rpc/get-simplexml/utf-8/{method_name}/{user_name}/{password}/{dossier_number}/include_source'

tree = requests.get(url)
tree.text

In [None]:
# NOTE(review): `func` is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel - confirm which response was
# meant to be parsed here.
f = xmltodict.parse(func.text)


In [None]:
for function in f['response']['previous_positions']['entry']:
    print(function['company'])