# Experimentation notebook

If you are familiar with Jupyter Notebooks, you can use this notebook to experiment with the variables that are currently available, and with your own detection logic if you wish.

The cells below will help you collect the data from Companies House and generate some PNG files of your charts without having to post to Twitter or join in with #joinosintbotnetwithus.



In [79]:
import collections
import json
import os
import random
import re
from collections import Counter
from datetime import datetime
from datetime import timedelta
from time import sleep

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import requests
import tweepy


You will need to register for a Companies House API key [here](https://developer.company-information.service.gov.uk/)

You will also need some Twitter Developer API keys with [elevated access](https://developer.twitter.com/en/portal/products/elevated)

In [80]:
# Credentials are read from environment variables so that real secrets never
# have to be saved inside the notebook. If a variable is not set, the original
# placeholder value is used and can still be edited here by hand.
api_key = os.environ.get("COMPANIES_HOUSE_API_KEY", "YOUR-COMPANIES-HOUSE-API-KEY") #register for one here


consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", '')
consumer_secret_key = os.environ.get("TWITTER_CONSUMER_SECRET_KEY", '')
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", '')
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", '')

# Twitter client used by tweeter(); building it does not contact Twitter.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

## PARAMETERS
At the moment these are the only parameters to experiment with when creating charts, unless you write your own logic functions.

More coming soon hopefully!

In [81]:


time = 1 # n number of days ago to start the search from - note that only 5000 registrations can be retrieved by csv. If time is large then the below limits will need to be tighter
lower_add_limit = 2 #keep addresses that have n or more companies registered to them (inclusive lower bound)
upper_add_limit = 5 #keep addresses that have n or fewer companies registered to them (inclusive upper bound)
chart_filter = 2 #only chart an address when at least one officer name appears n or more times across its companies
creation_limit = 3 #don't create or tweet a chart if it has less than n companies
check_if_tweeted_before = True #skip an address when a png for its postcode already exists in the working directory; can be changed to False

In [82]:
# Start of the incorporation-date search window: `time` days before now.
# get_csv() uses this as the "incorporation from" date.
reach_back = datetime.now() - timedelta(days=time)

In [83]:
def get_csv():
    """Download the Companies House advanced-search CSV for the search window.

    The window runs from the module-level ``reach_back`` date up to today.

    Returns:
        requests.Response: the HTTP response; the CSV content is in ``.text``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # Read the clock once so day, month and year always describe the same date,
    # even if the call happens to run across midnight.
    today = datetime.now()

    # Empty values are sent as "key=" which reproduces the original query string.
    params = {
        "companyNameIncludes": "",
        "companyNameExcludes": "",
        "registeredOfficeAddress": "",
        "incorporationFromDay": reach_back.strftime('%d'),
        "incorporationFromMonth": reach_back.strftime('%m'),
        "incorporationFromYear": reach_back.strftime('%Y'),
        "incorporationToDay": today.strftime('%d'),
        "incorporationToMonth": today.strftime('%m'),
        "incorporationToYear": today.strftime('%Y'),
        "sicCodes": "",
        "dissolvedFromDay": "",
        "dissolvedFromMonth": "",
        "dissolvedFromYear": "",
        "dissolvedToDay": "",
        "dissolvedToMonth": "",
        "dissolvedToYear": "",
    }
    csv = requests.get(
        "https://find-and-update.company-information.service.gov.uk/advanced-search/download",
        params=params,
        timeout=60,  # avoid hanging the notebook forever on a stalled connection
    )
    # Fail here with a clear HTTP error instead of parsing an error page as CSV.
    csv.raise_for_status()
    return csv


In [84]:
def make_dataframe(csv):
    """Turn the downloaded CSV response into a DataFrame of strings.

    The text in ``csv.text`` is split on newlines and then on commas; the first
    line supplies the column names and is removed from the data rows.

    Note: lines are split on '\\n' only, so when the text uses '\\r\\n' line
    endings the last field of every line (including the last column name)
    keeps a trailing '\\r'. Other cells rely on that exact column name.
    """
    split_lines = [line.split(',') for line in csv.text.split('\n')]
    table = pd.DataFrame(split_lines)

    header_row = table.iloc[0]   # first line holds the column names
    body = table[1:]             # everything after the header line
    body.columns = header_row

    return body

In [85]:
def coys_btwn(dataframe, lower=None, upper=None):
    """Keep only companies whose registered address is shared by a bounded number of companies.

    Args:
        dataframe: DataFrame with a 'registered_office_address\\r' column
            (the column name produced by make_dataframe).
        lower: minimum number of companies at an address, inclusive.
            Defaults to the notebook-level ``lower_add_limit``.
        upper: maximum number of companies at an address, inclusive.
            Defaults to the notebook-level ``upper_add_limit``.

    Returns:
        The rows of ``dataframe`` whose address occurs between ``lower`` and
        ``upper`` times (both inclusive).
    """
    if lower is None:
        lower = lower_add_limit
    if upper is None:
        upper = upper_add_limit

    address_col = 'registered_office_address\r'

    # Filter the counts Series directly. Converting it with to_frame() and
    # indexing by the address column name breaks on pandas >= 2.0, where the
    # counts column is called 'count' instead of the original column name.
    counts = dataframe[address_col].value_counts()
    wanted_addresses = counts[(counts >= lower) & (counts <= upper)].index

    return dataframe[dataframe[address_col].isin(wanted_addresses)]

In [86]:
def get_directors(coynumx):
    """Fetch the officers of one company from the Companies House API.

    Args:
        coynumx: sequence whose second element (index 1) is the company number.

    Returns:
        A list of ``[name, country_of_residence]`` pairs, one per officer.
        'BLANK' is used when an officer has no country of residence recorded.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        KeyError: if the response has no 'items' list or an officer has no 'name'.
    """
    url = "https://api.company-information.service.gov.uk/company/" + str(coynumx[1]) + "/officers"
    response = requests.get(url, auth=(api_key, ''), timeout=30)
    # Surface HTTP errors explicitly instead of failing later while decoding
    # an error body.
    response.raise_for_status()
    search_result = json.loads(response.text)

    # Only a missing country is tolerated; any other malformed entry still raises.
    return [
        [director['name'], director.get('country_of_residence', 'BLANK')]
        for director in search_result['items']
    ]

In [87]:
def dir_handler(sdf):
    """Look up the officers of every company in ``sdf``.

    Args:
        sdf: DataFrame with 'registered_office_address\\r', 'company_name' and
            'company_number' columns.

    Returns:
        A list of ``[address, company_name, company_number, directors]``
        records, one per company whose officers were retrieved. Companies
        whose lookup fails are reported and left out.
    """
    coynumlist = []

    total = len(sdf.index)
    if total == 0:
        # Nothing to look up; also avoids dividing by zero below.
        print('no companies to retrieve officers for')
        return coynumlist

    for i, r in sdf.iterrows():
        try:
            coynumlist.append([
                r['registered_office_address\r'],
                r['company_name'],
                r['company_number'],
                get_directors([r['company_name'], r['company_number']]),
            ])
        except Exception:
            # Best effort: one failed lookup must not abort the whole run.
            print('HTTP Error due to CH API for ' + str(r['company_name']) + ', continuing...')
            continue

    # Report the share of SUCCESSFUL lookups (the failure count was
    # previously reported here by mistake).
    retrieved_pct = int((len(coynumlist) / total) * 100)
    if retrieved_pct < 10:
        print('CH API only allows 600 requests every 5 minutes, you may have hit your rate limit')
    else:
        print('retrieved ' + str(retrieved_pct) + '% of officers for suspicious companies')

    return coynumlist
            



In [None]:
#feel free to play with colors!!!!! lets make twitter colorful

#note that tweeter is commented out and not present - charts will just write to local directory

def grapher_new(dataframe, address, postcode):
    """Draw the company/officer network for one address and save it as a PNG.

    Node colours: companies are red, the postcode is green, officers are blue
    and countries of residence are yellow. The image is written to the working
    directory as ``<postcode without spaces>.png``.

    Args:
        dataframe: DataFrame with 'address', 'company_name' and 'directors'
            columns, where 'directors' holds lists of [name, country] pairs.
        address: the address whose companies should be charted.
        postcode: postcode of that address, used for the file name.

    Nothing is drawn when the address has fewer than ``creation_limit``
    companies, or when any officer has a 'BLANK' country of residence.
    """
    postcode_pattern = "[A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2}"

    rows = dataframe.loc[dataframe['address'] == address]

    # Skip only when there are FEWER than creation_limit companies, as the
    # parameter comment and the message below describe (the old ">" test also
    # skipped addresses with exactly creation_limit companies).
    if len(rows['company_name'].unique()) < creation_limit:
        print('less than ' + str(creation_limit) + ' companies - skipping...')
        return

    G = nx.Graph()
    for i, r in rows.iterrows():
        company = r['company_name']
        row_postcode = re.findall(postcode_pattern, r["address"])[0]

        G.add_node(company, color="red")
        G.add_node(row_postcode, color="green")
        G.add_edge(company, row_postcode)

        for ent in r['directors']:
            G.add_node(ent[0], color="blue")

            if ent[1] != "BLANK":
                G.add_node(ent[1], color="yellow")

            G.add_edge(ent[0], company)
            # For a 'BLANK' country this creates an uncoloured "BLANK" node,
            # which is what triggers the skip below.
            G.add_edge(ent[0], ent[1])

    color = nx.get_node_attributes(G, "color")
    if any(node not in color for node in G):
        # Checked before a figure is created, so nothing is left open.
        print('Country of residence BLANK, skipping')
        return

    color_map = [color[node] for node in G]

    #feel free to play with colors and layout here
    f = plt.figure(figsize=(12, 14))
    try:
        nx.draw(G, with_labels=True, font_weight='bold', node_color=color_map, ax=f.add_subplot(111))
        f.savefig(postcode.replace(" ", "") + ".png")

        #tweeter(dataframe, address, postcode)
    except Exception as e:
        # Best effort: report the real problem and carry on with other addresses.
        print('could not draw chart for ' + str(postcode) + ': ' + str(e))
    finally:
        # Always release the figure, whether or not drawing succeeded.
        plt.close(f)

In [None]:
def tweeter(result, fc, postcode):
    """Tweet the chart for one address, then pause before the next tweet.

    Args:
        result: DataFrame with 'address', 'company_name' and 'directors'
            columns, where 'directors' holds lists of [name, country] pairs.
        fc: the address to report on.
        postcode: postcode of that address; the image
            ``<postcode without spaces>.png`` must already exist.
    """
    rows = result.loc[result['address'] == fc]

    companies = len(rows['company_name'].unique())

    # Count distinct officer names. The previous code counted distinct
    # stringified director *lists* (one per company), which is not the number
    # of officers the tweet text reports. It also wrote into a slice of
    # `result`, which triggers pandas' SettingWithCopyWarning.
    officer_names = {ent[0] for officers in rows['directors'] for ent in officers}
    directors = len(officer_names)

    # Blank lines separating the message from the hashtags.
    space = "\n    \n    \n    "

    tweet_text= 'There have been ' + str(directors) + " officer/s register " + str(companies) + " companies in the UK at the postcode " + str(postcode) + " detected in the last 24 hours" + space + "#OSINT #Data #opensource"
    image_path = postcode.replace(" ", "") + '.png'

    api.update_status_with_media(tweet_text, image_path)

    # Random 5-15 minute pause between tweets.
    sleep(random.randint(300, 900))

## HIGHLY RECOMMENDED
Run the cell below once to retrieve the relevant data and save it as the `temp` variable.
Re-running the data retrieval parts of the script too often may get you banned from the API or have your IP blacklisted.
Once the data is saved you can experiment with the chart layout in the `grapher_new()` function without fetching it again.

In [89]:
# Fetch everything once and keep it in `temp`: download the CSV, keep the
# addresses within the configured limits, then look up each company's officers.
# dir_handler() is required here because coy_director_filterer() expects
# [address, company_name, company_number, directors] records, not the raw
# CSV DataFrame.
temp = dir_handler(coys_btwn(make_dataframe(get_csv())))

In [None]:
    
def coy_director_filterer(listoflist):
    """Chart every address where some officer name recurs across its companies.

    Args:
        listoflist: records of the form
            ``[address, company_name, company_number, directors]`` where
            ``directors`` is a list of [name, country] pairs.

    For each address where at least one officer name appears ``chart_filter``
    or more times, the postcode is extracted from the address and
    grapher_new() is called. When ``check_if_tweeted_before`` is True,
    addresses whose PNG already exists in the working directory are skipped.
    """
    columns = ['address', 'company_name', 'company_number', 'directors']
    if len(listoflist) == 0:
        # An empty frame cannot take the four column names; nothing to chart.
        print('no companies to chart')
        return

    result = pd.DataFrame(listoflist)
    result.columns = columns

    postcode_pattern = "[A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2}"

    for add in result['address'].unique():
        rows = result.loc[result['address'] == add]

        # How often each officer name occurs among this address's companies.
        name_counts = collections.Counter(
            ent[0] for officers in rows['directors'] for ent in officers
        )
        if not any(count >= chart_filter for count in name_counts.values()):
            continue

        found = re.findall(postcode_pattern, add) if isinstance(add, str) else []
        if not found:
            print('postcode without a space not yet handled! skipping')
            continue
        postcode = found[0]

        if check_if_tweeted_before is True:
            # Test for the file without opening it, so no handle is left open.
            if os.path.exists(postcode.replace(" ", "") + ".png"):
                continue
            print('image not created already, starting grapher')

        try:
            grapher_new(result, add, postcode)
        except Exception as e:
            # Best effort: keep going, but report the real cause rather than
            # blaming the postcode format.
            print('could not create chart for ' + postcode + ': ' + str(e))



In [None]:
#run this to write the charts that would have been tweeted to the local directory as png files
#coy_director_filterer expects [address, company_name, company_number, directors] records
coy_director_filterer(temp)