# MHKDR Development

This is a test blurb.

This is another test edit to check whether the `.gitignore` is working.

### Cleaned 

In [17]:
def find_entry_id(entry_url):
    '''
    Extract the integer entry_id from the url of a MHKDR entry.

    The 'entry_id' is the integer at the end of the url, which is unique to each MHKDR entry.
    The LAST run of digits in the url is used, so a number that appears earlier in the url
    (for example in the host name) is not mistaken for the id. When the id is the only
    number in the url, this is the same as taking the first match.

    Parameters:
        entry_url: the url (string) of a MHKDR entry.

    Returns:
        The entry_id as an int.

    Raises:
        ValueError: if the url does not contain any digits.
    '''
    digit_runs = re.findall(r'\d+', entry_url)
    if not digit_runs:
        # Fail with a message that names the offending url instead of a bare IndexError.
        raise ValueError('No entry id found in url: {!r}'.format(entry_url))

    return int(digit_runs[-1])

In [18]:
def construct_authors_table(mhkdr_dataframe):
    '''
    Build a normalized (long-format) table of the json element "author".

    Each row pairs one author with the "entry_id" of the entry they are listed on, so the
    table can be joined to others on "entry_id". This disentangles the nested list structure
    present in the json to enable reporting e.g. associations among researchers, number of
    documents attributed to each author.

    Parameters:
        mhkdr_dataframe: DataFrame with a 'sourceURL' column (passed to find_entry_id) and an
            'author' column whose cells are iterated to get the individual authors.

    Returns:
        A DataFrame with columns 'entry_id' and 'author', one row per (entry, author) pair.
    '''
    entry_ids = []  # Repeats an entry id once for every author listed on that entry
    authors = []    # Repeats an author when they contribute to multiple entries

    # Walk the two columns in lockstep instead of indexing by position.
    for landing_page_url, entry_authors in zip(mhkdr_dataframe['sourceURL'],
                                               mhkdr_dataframe['author']):
        # "entry_id" is the key used for all later merge operations.
        entry_id = find_entry_id(landing_page_url)

        # One output row per author of this entry.
        entry_ids.extend([entry_id] * len(entry_authors))
        authors.extend(entry_authors)

    return pd.DataFrame({'entry_id': entry_ids, 'author': authors})

In [19]:
def construct_organizations_table(mhkdr_dataframe):
    '''
    Build a normalized table of the json element "organization".

    Each row pairs an "entry_id" with the organization of that entry, so the table can be
    joined to others on "entry_id". Only the FIRST item of each 'organization' cell is used;
    any further items in the cell are ignored. Entries whose 'organization' cell is empty
    contribute no row.

    Parameters:
        mhkdr_dataframe: DataFrame with a 'sourceURL' column (passed to find_entry_id) and an
            'organization' column whose cells are indexable sequences.

    Returns:
        A DataFrame with columns 'entry_id' and 'organization', at most one row per entry.
    '''
    entry_ids = []  # One entry id per emitted row
    orgs = []       # Repeats an organization when it is attached to multiple entries

    # Walk the two columns in lockstep instead of indexing by position.
    for landing_page_url, entry_orgs in zip(mhkdr_dataframe['sourceURL'],
                                            mhkdr_dataframe['organization']):
        # "entry_id" is the key used for all later merge operations.
        entry_id = find_entry_id(landing_page_url)

        # An empty cell has no first element; skip it rather than raise IndexError.
        # len() is used (not truthiness) so array-like cells are handled as well.
        if len(entry_orgs) == 0:
            continue

        entry_ids.append(entry_id)
        orgs.append(entry_orgs[0])

    return pd.DataFrame({'entry_id': entry_ids, 'organization': orgs})

In [20]:
def construct_tags_table(mhkdr_dataframe):
    '''
    Build a normalized (long-format) table of the json element "tags".

    Each row pairs one tag with the "entry_id" of the entry it is attached to, so the table
    can be joined to others on "entry_id". This disentangles the nested list structure
    present in the json to enable reporting e.g. number of entries carrying each tag.

    Parameters:
        mhkdr_dataframe: DataFrame with a 'sourceURL' column (passed to find_entry_id) and a
            'tags' column whose cells are iterated to get the individual tags.

    Returns:
        A DataFrame with columns 'entry_id' and 'tag', one row per (entry, tag) pair.
    '''
    entry_ids = []  # Repeats an entry id once for every tag attached to that entry
    tags = []       # Repeats a tag when it is attached to multiple entries

    # Walk the two columns in lockstep instead of indexing by position.
    for landing_page_url, entry_tags in zip(mhkdr_dataframe['sourceURL'],
                                            mhkdr_dataframe['tags']):
        # "entry_id" is the key used for all later merge operations.
        entry_id = find_entry_id(landing_page_url)

        # One output row per tag of this entry.
        entry_ids.extend([entry_id] * len(entry_tags))
        tags.extend(entry_tags)

    return pd.DataFrame({'entry_id': entry_ids, 'tag': tags})

### Cleaned - Tests

In [98]:
# Build the authors table once, keep it for later cells, and display the stored
# result instead of running the construction a second time.
authors_tbl = construct_authors_table(mhkdr_dataframe)
authors_tbl

Unnamed: 0,entry_id,author
0,545,James McVey
1,545,Molly Grear
2,545,Mikaela Freeman
3,545,Lysel Garavelli
4,534,Tyler Robertson
...,...,...
994,2,Tyler Mayer
995,1,Jon Weers
996,1,Nicole Taverna
997,1,Jay Huggins


In [92]:
# Build the organizations table once, keep it for later cells, and display the
# stored result instead of running the construction a second time.
orgs_tbl = construct_organizations_table(mhkdr_dataframe)
orgs_tbl

Unnamed: 0,entry_id,organization
0,545,Pacific Northwest National Laboratory
1,534,"Triton Systems, Inc."
2,532,Pacific Northwest National Laboratory
3,531,University of Washington
4,530,Oregon State University
...,...,...
384,14,"Dehlsen Associates, LLC"
385,5,"Dehlsen Associates, LLC"
386,3,"Dehlsen Associates, LLC"
387,2,"Dehlsen Associates, LLC"


In [47]:
# Display the tags table for a visual check; the result is not stored.
# NOTE(review): mhkdr_dataframe is not defined in the visible cells - presumably loaded earlier; confirm.
construct_tags_table(mhkdr_dataframe)

Unnamed: 0,entry_id,tag
0,545,wave
1,545,puerto rico
2,545,sea surface temperature
3,545,wave energy
4,545,wave measurements
...,...,...
10967,1,best practices
10968,1,guide
10969,1,API
10970,1,management
