In [29]:
import pandas as pd
import numpy as np
import csv
import difflib
from tqdm import tqdm
import nltk
from urllib.parse import urlsplit
tqdm.pandas(desc="Fuzzy Match Progress")

In [52]:
# Tab-separated input tables used by the loaders in this notebook.
# Paths are relative to the notebook's working directory.
location_assignee_file = "./location_assignee/location_assignee.tsv"  # assignee id <-> location id pairs
location_file = "./location/location.tsv"  # location id -> country (and other columns)
assignee_file = "./assignee/assignee.tsv"  # assignee id -> assignee details

def get_us_assignees(location_assignee_file, location_file):
    """Load the assignee/location pairs whose location is in the US.

    Parameters
    ----------
    location_assignee_file : str
        Path to a tab-separated file with at least the columns
        ``location_id`` and ``assignee_id``.
    location_file : str
        Path to a tab-separated file with at least the columns ``id`` and
        ``country``.

    Returns
    -------
    pandas.DataFrame
        Every row of the assignee/location file whose ``location_id`` matches
        a location with country ``"us"`` (compared case-insensitively), plus
        the matched location ``id`` column added by the join. Rows missing
        either id are dropped.
    """
    # Read the join keys as text. Letting pandas infer the type turns an
    # integer-looking id column containing a single blank cell into floats
    # ("1" -> "1.0"), which then silently fails to match the other table.
    assignee_locations = pd.read_csv(
        location_assignee_file,
        sep="\t",
        encoding="latin-1",
        dtype={"location_id": str, "assignee_id": str})
    assignee_locations = assignee_locations.dropna(
        subset=["location_id", "assignee_id"])

    # Load location details; a location without an id or a country cannot be
    # classified, so discard it.
    locations = pd.read_csv(
        location_file,
        sep="\t",
        encoding="latin-1",
        dtype={"id": str, "country": str})
    locations = locations.dropna(subset=["id", "country"])

    # Keep only the ids of US locations (country code compared in lower case).
    is_us = locations["country"].str.lower() == "us"
    us_location_ids = locations.loc[is_us, ["id"]]

    # Inner join: keeps only assignee rows located in the US.
    us_assignees = pd.merge(
        left=assignee_locations,
        right=us_location_ids,
        how="inner",
        left_on="location_id",
        right_on="id")
    return us_assignees


# Assignee/location pairs restricted to US locations; reused by every sector below.
us_assignees = get_us_assignees(
    location_assignee_file=location_assignee_file,
    location_file=location_file,
)

# Tab-separated table linking each patent id to its assignee id(s).
patent_assignee_file = "./patent_assignee/patent_assignee.tsv"
def load_patent_assignee_mapping(patent_assignee_file):
    """Load the patent-to-assignee mapping table.

    Parameters
    ----------
    patent_assignee_file : str
        Path to a tab-separated file with at least the columns ``patent_id``
        and ``assignee_id``.

    Returns
    -------
    pandas.DataFrame
        The table with both id columns held as text and with every row that
        lacks a patent id or an assignee id removed.
    """
    # Read both ids as text so they keep their exact spelling. With inferred
    # types, a numeric patent_id column containing one blank cell becomes
    # float and "9198873" turns into "9198873.0", which can no longer be
    # joined against search-result ids that are read as strings.
    patent_assignee_mapping = pd.read_csv(
        patent_assignee_file,
        sep="\t",
        low_memory=False,
        dtype={"patent_id": str, "assignee_id": str})

    # Rows missing either side of the mapping are unusable for joins.
    return patent_assignee_mapping.dropna(subset=["assignee_id", "patent_id"])

# Patent id -> assignee id table, loaded once and shared by all sector cells.
patent_assignee_mapping = load_patent_assignee_mapping(patent_assignee_file)


In [63]:
def get_search_results_assignee_details(patent_assignee_mapping,
                                        search_results_file,
                                        column_type_info,
                                        assignee_file,
                                        assignee_filter,
                                        selection_field_name=None,
                                        selection_start_boundary=0,
                                        selection_end_boundary=None):
    """Map the selected patents of a search-results file to assignee details.

    Parameters
    ----------
    patent_assignee_mapping : pandas.DataFrame
        Frame with ``patent_id`` and ``assignee_id`` columns.
    search_results_file : str
        Path to a comma-separated search-results file containing an ``id``
        column (the patent id) and one or more boolean selection columns.
    column_type_info : dict
        Column name -> dtype mapping forwarded to ``pandas.read_csv``.
    assignee_file : str
        Path to a tab-separated assignee file with an ``id`` column.
    assignee_filter : pandas.DataFrame
        Frame with an ``assignee_id`` column; only patents whose assignee id
        appears in it are kept.
    selection_field_name : str, optional
        Name of a single boolean column marking the selected patents. When
        given, the boundary arguments are only used for the preview print.
    selection_start_boundary, selection_end_boundary : int, optional
        Positional column slice ``[start:end]``; when ``selection_field_name``
        is ``None`` a patent is selected if any column in the slice is true.

    Returns
    -------
    pandas.DataFrame
        One row per (selected patent, allowed assignee) pair, with the columns
        of the search results, the mapping, and the assignee file. Because the
        search results and the assignee file both carry an ``id`` column,
        pandas suffixes them in the result.
    """
    # Load search results for the current sector. Rows without a patent id
    # cannot be joined to anything, so drop them before normalising to text.
    patent_results = pd.read_csv(search_results_file, dtype=column_type_info)
    patent_results = patent_results.dropna(subset=["id"])
    patent_results = patent_results.assign(id=patent_results["id"].astype(str))

    # Different sectors mark their selection differently; show the columns the
    # positional slice covers so a wrong boundary is easy to spot.
    selection_flags = patent_results.iloc[
        :, selection_start_boundary:selection_end_boundary]
    print(selection_flags.head())

    if selection_field_name is not None:
        # Selection criterion is a single True/False column.
        selected_patents = patent_results[patent_results[selection_field_name]]
    else:
        # Selection criterion is the OR of several flag columns.
        selected_patents = patent_results[selection_flags.any(axis=1)]

    # Map an assignee id onto each selected patent (patents without a known
    # assignee are kept for now with an empty id).
    selected_patent_details = pd.merge(
        selected_patents,
        patent_assignee_mapping,
        how="left",
        left_on="id",
        right_on="patent_id")
    # Assign the result back: a chained ``.fillna(..., inplace=True)`` on the
    # column accessor is not guaranteed to modify the frame itself.
    selected_patent_details["assignee_id"] = (
        selected_patent_details["assignee_id"].fillna(""))

    # Use the filter the caller passed in rather than a notebook global, so
    # the function works no matter which cells were run before it.
    allowed_assignee_ids = set(assignee_filter["assignee_id"].astype(str))

    # Keep only patents whose assignee is in the allowed set.
    us_selected_patent_details = selected_patent_details[
        selected_patent_details["assignee_id"].isin(allowed_assignee_ids)]

    # Load assignee details and attach them to the remaining patents.
    assignee_details = pd.read_csv(
        assignee_file, sep="\t", low_memory=False, encoding='latin-1')
    selected_patent_assignee_details = pd.merge(
        us_selected_patent_details,
        assignee_details,
        how="left",
        left_on="assignee_id",
        right_on="id")

    return selected_patent_assignee_details

In [67]:
# Nano-technology sector: selected patents are marked by the single boolean
# "selection" column of the search-results file.
nano_results_file = "../Phase-1-Search-Term/nano-technology/nano_utility_patents.csv"

# Every listed flag column is read as bool; the patent id is kept as text.
nano_dtype = {
    **dict.fromkeys(
        (
            'Biosensor 2 Term',
            'Biosensor Term',
            'Micro Term',
            'Molecular Motor Term',
            'Nano Term',
            'Quantum Term',
            'Quasi Term',
            'Self Term',
            'exclusion',
            'measure_exclusion',
            'selection',
        ),
        bool,
    ),
    'id': str,
}

nano_us_selected_patent_assignee_details = get_search_results_assignee_details(
    patent_assignee_mapping=patent_assignee_mapping,
    search_results_file=nano_results_file,
    column_type_info=nano_dtype,
    assignee_file=assignee_file,
    assignee_filter=us_assignees,
    selection_field_name="selection",
)

# Write the distinct organization names, one per line, without index or header.
nano_unique_organizations = pd.DataFrame(
    nano_us_selected_patent_assignee_details.organization.unique())
nano_unique_organizations.to_csv(
    "nano_unique_organizations.csv", index=False, header=None)

   Biosensor 2 Term  Biosensor Term  Micro Term  Molecular Motor Term  \
0             False           False       False                 False   
1             False           False       False                 False   
2             False           False       False                 False   
3             False           False       False                 False   
4             False           False       False                 False   

   Nano Term  Quantum Term  Quasi Term  Self Term  exclusion       id  \
0       True         False       False      False      False  9198873   
1       True         False       False      False      False  9198874   
2       True         False       False      False      False  9198928   
3       True         False       False      False      False  9198973   
4       True         False       False      False      False  9198974   

   measure_exclusion  selection  
0              False       True  
1              False       True  
2              False

In [48]:
# Green-technology sector: a patent is selected when any of the
# renewable-energy flag columns is true.
green_results_file = "../Phase-1-Search-Term/green-technology/green_utility_patents.csv"

# NOTE(review): the id key here is 'patent_id', while
# get_search_results_assignee_details reads the column named 'id' -- confirm
# which name the green CSV actually uses.
green_dtype = {
    'Renewable energy-All-purpose': bool,
    'Renewable energy-Biomass': bool,
    'Renewable energy-Geothermal': bool,
    'Renewable energy-Photovoltaic & solar': bool,
    'Renewable energy-Wave & Tidal': bool,
    'Renewable energy-Wind': bool,
    'patent_id': str,
}

# The function has no `selection_field_boundary` parameter (passing it raises
# TypeError); the positional slice is expressed with the start/end boundaries.
# Assumes the six flag columns are the first six columns of the CSV -- TODO confirm.
green_us_selected_patent_assignee_details = get_search_results_assignee_details(
    patent_assignee_mapping,
    green_results_file,
    green_dtype,
    assignee_file,
    us_assignees,
    selection_start_boundary=0,
    selection_end_boundary=6)

# Write the distinct organization names, one per line, without index or header.
pd.DataFrame(
    green_us_selected_patent_assignee_details.organization.unique()).to_csv(
        "green_unique_organizations.csv", index=False, header=None)

In [66]:
# Synthetic-biology sector: a patent is selected when any flag column in the
# positional slice [27:34] is true.
synbio_results_file = "../Phase-1-Search-Term/synthetic-biology/synbio_utility_patents.csv"

# NOTE(review): the preview printed by this cell shows the flag columns named
# with a "synbio-" prefix (e.g. "synbio-Bio Tech/Engg"), so these keys may not
# match any column in the CSV -- confirm against the file header.
synvio_dtype = {
    'Bio Tech/Engg': bool,
    'Cell biology': bool,
    'Chemical': bool,
    'General': bool,
    'Genetics': bool,
    'Nano technology': bool,
    'id': str,
}

synbio_us_selected_patent_assignee_details = get_search_results_assignee_details(
    patent_assignee_mapping,
    synbio_results_file,
    synvio_dtype,
    assignee_file,
    us_assignees,
    selection_start_boundary=27,
    selection_end_boundary=34)

# Write the distinct organization names, one per line, without index or header.
pd.DataFrame(
    synbio_us_selected_patent_assignee_details.organization.unique()).to_csv(
        "synbio_unique_organizations.csv", index=False, header=None)

   synbio-Bio Tech/Engg  synbio-Cell biology  synbio-Chemical  synbio-General  \
0                  True                False            False           False   
1                 False                False            False           False   
2                 False                False            False           False   
3                 False                False            False           False   
4                  True                False            False           False   

   synbio-Genetics  synbio-Nano technology  
0            False                   False  
1             True                   False  
2             True                   False  
3             True                   False  
4            False                   False  
