# Test file for extracting data from webscraped raw data

In [62]:
import os
import sys
import lxml
import pickle
import requests
import regex as re
import pandas as pd
import os.path as path
from copy import deepcopy
from bs4 import BeautifulSoup

sys.setrecursionlimit(10000)

#### Importing the raw scraped html data

In [2]:
parent = path.abspath(path.join("brønnøysund_roller_scraped_extractor.ipynb" ,"../../.."))
with open(f'{parent}/Data/Raw/scraped_data_from_brønnøysund.obj', 'rb') as file:
	raw_html_data = pickle.load(file)

In [3]:
print(len(raw_html_data))

1003


In [4]:
def remove_seperators(row):
        
    result = ""
    line = str(row.text).strip()
    for character in line:
        #print(character)
        if character in ["\n", "\t"]:
            pass
            #break
        else:
            result = result + character
                    
    return result

In [31]:
def get_data_after(get_data_after, orgnumber, html_data):
    """Gets the data belonging to a category in a html file.

    Args:
        get_data_after (string): the name of a category e.g: Daglig leder/ adm.direktør:
        orgnumber (string): a 9 digit org number in string format.
        html_data (string): a string in html format, not parsed.

    Returns:
        list: returns a list with the values belonging to the category.
    """
    data = BeautifulSoup(html_data, 'lxml')
    get_data_after.strip()    
    acquired_data = False
    retrieve_data_rows = False
    retrieved_data = []
    
    for row in data.find_all('p'):
        if re.search("<b>[A-Z]", str(row)):
            retrieve_data_rows = False
        if retrieve_data_rows == True:
            if len(row.text) > 0 and row.text not in ["\n", "\n\n"]:
                retrieved_data.append(row.text)
                acquired_data = True
        if row.text == f"\n{get_data_after}\n":
            retrieve_data_rows = True
    
    if acquired_data == False:
        return False
    return retrieved_data

In [34]:
data_structure = {}
list_of_search_parameters = ["Daglig leder/ adm.direktør:", 
                            "Innehaver:",
                            "Kontaktperson:",
                            "Styrets leder:", 
                            "Nestleder:", 
                            "Styremedlem:",
                            "Varamedlem:"]

for key, value in raw_html_data.items():
    data_structure[key] = {}

    for search_parameter in list_of_search_parameters:
        data = get_data_after(get_data_after = search_parameter,
                                orgnumber = key,
                                html_data = value)
        if data != False:
            data_structure[key][search_parameter] = data

In [46]:
data_structure

{'927526891': {'Daglig leder/ adm.direktør:': ['Mehmet Celik'],
  'Styrets leder:': ['Mehmet Celik'],
  'Styremedlem:': ['Anette Wittingsrud Celik']},
 '926646419': {'Daglig leder/ adm.direktør:': ['Mohamed Ali Saleh Ahmed'],
  'Styrets leder:': ['Mubin Mohamed Ahmed'],
  'Styremedlem:': ['Abdallah Abdi Hussein']},
 '921505337': {'Daglig leder/ adm.direktør:': ['Gurkirt Singh'],
  'Styrets leder:': ['Gurkirt Singh']},
 '911956535': {'Daglig leder/ adm.direktør:': ['Sekoya Noreng'],
  'Styrets leder:': ['Sekoya Noreng']},
 '918928405': {'Daglig leder/ adm.direktør:': ['Tor Lundsholt'],
  'Styrets leder:': ['Bjørn Tore Furset'],
  'Styremedlem:': ['Gjøran Kvernes Sæther']},
 '930727334': {'Daglig leder/ adm.direktør:': ['Suye Mon'],
  'Styrets leder:': ['Tomas Adrian Glans'],
  'Nestleder:': ['Anne Aarrestad']},
 '928546861': {'Daglig leder/ adm.direktør:': ['Tharald Stray Laastad'],
  'Styrets leder:': ['Tharald Stray Laastad'],
  'Styremedlem:': ['Tore Kristian Tofte']},
 '919741570': 

### Filtering out all the people. With the assumption that people with the same name are the same people.

In [41]:
all_people = []

for orgnumber, company in data_structure.items():
    
    for role, person_list in company.items():

        for person in person_list:
            all_people.append(person)


In [56]:
# Removing the excess characters for names.
clean_list_of_people = []
for person in all_people:
    name = ""
    for character in person:
        if character in ["\n"]:
            break
        else:
            name = name + character
    clean_list_of_people.append(name)

In [58]:
unique_names = list(set(clean_list_of_people))

In [61]:
print(f"Complete number of names: {len(clean_list_of_people)}")
print(f"Number of unique names: {len(unique_names)}")

Complete number of names: 3047
Number of unique names: 1853


A big chunk of the loss comes from people having multiple roles within the same company.

### Creating the people dataframe:

In [86]:
# Adding the names to the dataframe and adding a person_id based on index.
df_people = pd.DataFrame(unique_names, columns =['navn'])
df_people["person_id"] = df_people.index
df_people

Unnamed: 0,navn,person_id
0,Marisa Rafaella Valente,0
1,Victor Joseph Demison Lionel,1
2,Gaute Houge,2
3,Berivan Hamide Ciftci,3
4,Jeevarajah Sivasamboo,4
...,...,...
1848,Rudolf Reim,1848
1849,Magnus Tvenge,1849
1850,Jan Abrahamsen,1850
1851,Shadi Haj Yahia,1851


### Creating the Junction tabels

In [139]:
org_num = [[],[],[],[],[],[],[]]
name = [[],[],[],[],[],[],[]]

for orgnumber, company in data_structure.items():
    for role, person_list in company.items():

        for num in range(len(list_of_search_parameters)):
            if role == list_of_search_parameters[num]:
                
                for person in person_list:
                    org_num[num].append(orgnumber)
                    name[num].append(person)

In [145]:
list_of_search_parameters

['Daglig leder/ adm.direktør:',
 'Innehaver:',
 'Kontaktperson:',
 'Styrets leder:',
 'Nestleder:',
 'Styremedlem:',
 'Varamedlem:']

#### Convert into dataframes

In [158]:
def uppdata_current_status(name):
    """Returns True if the name contains the string "Fratrådt".

    Args:
        name (string): string as part of a dataframe column.    

    Returns:
        bool: True or False based on the name.
    """
    if re.search("Fratrådt", name):
        return True
    else:
        return False

In [159]:
def remove_excess_in_name(name):
    """Removes the excess string "Fratrådt" from the name.

    Args:
        name (string): name of person.

    Returns:
        string: name without the "Fratrådt" string
    """
    if re.search("Fratrådt", name):
        return_name = ""
        for char in name:
            if char in ["\n"]:
                break
            else:
                return_name = return_name + char
        return return_name
    else:
        return name

In [184]:
def name_to_person_id(name):
    """Requires the df_people dataframe, gets the id of a person based on the name.

    Args:
        name (str): name of a person in string format, case sensetive.

    Returns:
        int: person id from the df_people dataframe.
    """
    return int(df_people.loc[df_people['navn']==name]['person_id'])

In [185]:
list_of_search_parameters

['Daglig leder/ adm.direktør:',
 'Innehaver:',
 'Kontaktperson:',
 'Styrets leder:',
 'Nestleder:',
 'Styremedlem:',
 'Varamedlem:']

In [187]:
list_of_role_dfs = []
for num in range(len(list_of_search_parameters)):
    list_of_role_dfs.append(pd.DataFrame({  'organisasjonsnummer': org_num[num],
                                            'person_id': name[num]}))
    list_of_role_dfs[num]['Fratrådt'] = list_of_role_dfs[num]['person_id'].map(uppdata_current_status)
    list_of_role_dfs[num]['person_id'] = list_of_role_dfs[num]['person_id'].map(remove_excess_in_name)
    list_of_role_dfs[num]['person_id'] = list_of_role_dfs[num]['person_id'].map(name_to_person_id)

In [200]:
df_people.head(2)

Unnamed: 0,navn,person_id
0,Marisa Rafaella Valente,0
1,Victor Joseph Demison Lionel,1


In [201]:
df_daglig_leder = list_of_role_dfs[0]
df_daglig_leder.head(2)

Unnamed: 0,organisasjonsnummer,person_id,Fratrådt
0,927526891,341,False
1,926646419,799,False


In [202]:
df_innehaver = list_of_role_dfs[1]
df_innehaver.head(2)

Unnamed: 0,organisasjonsnummer,person_id,Fratrådt
0,893105182,311,False
1,986113258,190,False


In [203]:
df_kontaktperson = list_of_role_dfs[2]
df_kontaktperson.head(2)

Unnamed: 0,organisasjonsnummer,person_id,Fratrådt
0,980147622,1824,False
1,991098577,552,False


In [204]:
df_styrets_leder = list_of_role_dfs[3]
df_styrets_leder.head(2)

Unnamed: 0,organisasjonsnummer,person_id,Fratrådt
0,927526891,341,False
1,926646419,969,False


In [205]:
df_nestleder = list_of_role_dfs[4]
df_nestleder.head(2)

Unnamed: 0,organisasjonsnummer,person_id,Fratrådt
0,930727334,1319,False
1,914192900,773,False


In [206]:
df_styremedlem = list_of_role_dfs[5]
df_styremedlem.head(2)

Unnamed: 0,organisasjonsnummer,person_id,Fratrådt
0,927526891,1728,False
1,926646419,972,False


In [207]:
df_varamedlem = list_of_role_dfs[6]
df_varamedlem.head(2)

Unnamed: 0,organisasjonsnummer,person_id,Fratrådt
0,814599892,830,False
1,958596936,601,False


In [208]:
# Function that stores the raw data as a pickle file.
# Gets the parent directory location enabeling storing the 
# data directly to the desired folder.

def pickler(file, file_name):	
    parent = path.abspath(path.join("brønnøysund_roller_scraped_extractor.ipynb", "../../.."))
    path = "/Data/Extracted/"
    with open(f'{parent + path + file_name}.obj', 'wb') as f:
        pickle.dump(file, f)

In [None]:
j = "junction_table"
pickler(df_people, "df_people")
pickler(df_daglig_leder, f"df_daglig_leder{j}")
pickler(df_innehaver, f"df_innehaver{j}")
pickler(df_kontaktperson, f"df_kontaktperson{j}")
pickler(df_nestleder, f"df_nestleder{j}")
pickler(df_styremedlem, f"df_styremedlem{j}")
pickler(df_styrets_leder, f"df_styrets_leder{j}")
pickler(df_varamedlem, f"df_varamedlem{j}")