In [1]:
import pandas as pd
import numpy as np
import re

# Block 1: 
WIPO Block 2
UKIPO Keywords
Standalone keyword search

In [16]:
# define function to take key words and return a list of SQL queries specifying the tables and fields to search, with each query limited to 10,000 characters (patstat requirement)
def split_conditions_into_queries(conditions, max_spaces, template_overhead):
    """
    Split conditions into groups such that each group's SQL query remains within the specified character limit.
    """
    # Initialize variables to hold condition groups and current query length
    query_groups = []
    current_group = []
    current_length = template_overhead  # Start with overhead of the fixed part of the SQL template

    for condition in conditions:
        # Calculate the length of the condition with ' OR ' separator
        condition_length = len(condition) + 4  # Account for ' OR '

        if current_length + condition_length <= max_spaces:
            # If adding this condition doesn't exceed the limit, add it to the current group
            current_group.append(condition)
            current_length += condition_length
        else:
            # If it exceeds the limit, save the current group and start a new one
            query_groups.append(current_group)
            # Reset current group with the new condition and the overhead
            current_group = [condition]
            current_length = template_overhead + condition_length

    # Add the last group if it's not empty
    if current_group:
        query_groups.append(current_group)

    return query_groups


def generate_sql_queries_keywords(near_patterns, max_spaces=10000):
    """
    Generate SQL queries based on 'NEAR' patterns, limiting each query to a specified number of spaces.
    """
    # Generate conditions for all given 'NEAR' patterns
    all_conditions = [
        f"CONTAINS (appln_title, '{pattern}')" for pattern in near_patterns
    ] + [
        f"CONTAINS (appln_abstract, '{pattern}')" for pattern in near_patterns
    ]

    # The static part of the SQL query (before inserting conditions)
    sql_template_base = """
    SELECT DISTINCT TOP 10
        reg102_pat_publn.PUBLN_NR,
        reg102_pat_publn.PUBLN_KIND,
        reg102_pat_publn.publn_auth,
        reg102_pat_publn.publn_date,
        appln_title,

    FROM
        reg101_appln
    INNER JOIN
        reg102_pat_publn
    ON
        reg101_appln.id = reg102_pat_publn.id
    LEFT OUTER JOIN
        tls203_appln_abstr
    ON
        reg101_appln.appln_id = tls203_appln_abstr.appln_id
    LEFT OUTER JOIN
        tls202_appln_title
    ON
        reg101_appln.appln_id = tls202_appln_title.appln_id
    WHERE
        reg102_pat_publn.publn_auth = 'EP'
        AND (
            {conditions}
        )
    """

    # Get the length of the static part of the query
    template_overhead = len(sql_template_base.format(conditions=''))

    # Split conditions into groups with the given max spaces limit
    query_groups = split_conditions_into_queries(all_conditions, max_spaces, template_overhead)

    # List to hold the final SQL queries
    sql_queries = []

    # Generate a complete SQL query for each group of conditions
    for group in query_groups:
        formatted_conditions = " OR ".join(group)  # Join with 'OR'
        final_sql = sql_template_base.format(conditions=formatted_conditions)  # Insert into template
        sql_queries.append(final_sql)  # Add to list of final SQL queries

    return sql_queries


In [18]:
# list of key word (combinations) from WIPO and UK IPO reports
WIPO_UKIPO_keywords_standalone = ['NEAR(("artific*", "intelligen*"), 1, TRUE)',
    'NEAR(("computation*", "intelligen*"), 1, TRUE)',
    'NEAR(("neural", "network*"), 1, TRUE)','"neural-network*"',
    'NEAR(("bayes*", "network*"), 1, TRUE)',
    '"chatbot*"',
    '"chat bot*"',
    '"chat-bot*"',
    'NEAR(("data", "mining*"), 1, TRUE)',
    'NEAR(("decision", "model*"), 1, TRUE)',
    'NEAR(("deep", "learning*"), 1, TRUE)',
    '"deep-learning*"',
    'NEAR(("genetic", "algorithm*"), 1, TRUE)',
    'NEAR(("inductive", "programme*"), 8, TRUE)',
    'NEAR(("logic", "programme*"), 8, TRUE)',
    'NEAR(("machine", "learning*"), 1, TRUE)',
    '"machine-learning*"',
    'NEAR(("natural", "language"), 8, TRUE)',
    'NEAR(("reinforcement", "learning"), 1, TRUE)',
    '"supervised-learning*"',
    'NEAR(("supervised", "learning*"), 1, TRUE)',
    'NEAR(("swarm", "intelligen*"), 1, TRUE)',
    '"swarm-intelligen*"',
    '"supervised-learning*"',
    '"supervised learning*"',
    '"supervisedlearning*"',
    '"semi-supervised-learning*"',
    '"semi supervised learning*"',
    '"semi-supervised learning*"',
    '"semisupervised learning*"',
    'NEAR(("expert", "system*"), 1, TRUE)',
    'NEAR(("fuzzy", "logic*"), 1, TRUE)',
    '"transfer-learning*"',
    '"transferlearning*"',
    '"transfer learning*"',
    'NEAR(("transfer", "learning"), 1, TRUE)',
    'NEAR(("learning", "algorithm*"), 3, TRUE)',
    'NEAR(("learning", "model*"), 1, TRUE)',
    '"vector machine*"',
    '"vector-machine*"',
    '"random forest*"',
    '"random-forest*"',
    '"decision trees*"',
    '"decision-trees*"',
    '"gradient tree boosting"',
    '"gradient-tree-boosting"',
    '"xgboost"',
    '"xgboost*"',
    '"adaboost*"',
    '"rankboost"',
    '"logistic regression"',
    '"logistic-regression"',
    '"stochastic gradient descent"',
    '"stochastic-gradient-descent"',
    '"multilayer preceptron*"',
    '"multilayer-perceptron*"',
    '"latent semantic analysis"',
    '"latent-semantic-analysis"',
    '"latent dirichlet allocation"',
    '"latent-dirichlet-allocation"',
    '"multi-agent system*"',
    '"multi-agent-system*"',
    '"multi agent system*"',
    '"hidden markov model*"',
    '"supervised training"',
    '"unsupervised training"',
    '"semisupervised training"',
    '"semi-supervised training"',
    '"ant-colony"',
    '"factorization machin*"',
    '"high-dimensional*"',
    '"feature*"',
    '"particle-swarm*"',
    '"bee-colony"',
    '"factorisation machin*"',
    '"high-dimensional*"',
    '"input*"',
    '"pattern-recogni*"',
    '"fire-fly"',
    '"feature engineer*"',
    '"k-means"',
    '"policy-gradient method"',
    '"adversar*"',
    '"feature extract*"',
    '"kernel learn*"',
    '"q-learn*"',
    '"feature select*"',
    '"latent-variable*"',
    '"random-forest*"',
    '"association rule"',
    '"fuzzy-c"',
    '"link* predict*"',
    '"recommender system*"',
    '"auto-encod*"',
    '"fuzzy environment*"',
    '"machine intelligen*"',
    '"reinforc* learn*"',
    '"autonom* comput*"',
    '"fuzzy logic*"',
    '"machine learn*"',
    '"sentiment* analy*"',
    '"back-propagat*"',
    '"fuzzy number*"',
    '"map-reduce"',
    '"sparse represent*"',
    '"back-propogat*"',
    '"fuzzy set*"',
    '"memetic algorithm*"',
    '"sparse*-code*"',
    '"cognitiv* comput*"',
    '"fuzzy system*"',
    '"multi* label* classif*"',
    '"spectral cluster*"',
    '"collaborat* filter*"',
    '"gaussian mixture model"',
    '"multi*-objective* algorithm*"',
    '"stochastic*-gradient*"',
    '"deep-belief network*"',
    '"gaussian process*"',
    '"multi*-objective* optim*"',
    '"*supervis* learn*"',
    '"deep-learn*"',
    '"genetic program*"',
    '"natural-gradient"',
    '"differential*-evol* algorithm*"',
    '"genetic* algorithm"',
    '"neural-turing"',
    '"swarm behav*"',
    '"dimensional*-reduc*"',
    '"high-dimensional* model*"',
    '"neuro-morph comput*"',
    '"transfer-learn*"',
    '"evolution* algorithm*"',
    '"high-dimensional* space*"',
    '"non-negative matri* factor*"',
    '"variation*-infer*"',
    '"evolution* comput*"',
    '"high-dimensional* system*"',
    '"object-recogni*"',
    '"vector-machine*"'
]


WIPO_UKIPO_keyword_AI_queries = generate_sql_queries_keywords(WIPO_UKIPO_keywords_standalone)

def clean_query(query):
    """
    Remove unwanted backslashes, newlines, tabs, and extra spaces.
    """
    query = query.replace("\\", "")  # Remove backslashes
    query = query.replace("\n", " ")  # Replace newlines with a space
    query = query.replace("\t", " ")  # Replace tabs with a space
    query = query.replace("  ", " ")  # Remove double spaces
    return query.strip()  # Remove leading/trailing spaces

for query in WIPO_UKIPO_keyword_AI_queries:
    print(clean_query(query))
    print("\n")

SELECT DISTINCT TOP 10     reg102_pat_publn.PUBLN_NR,     reg102_pat_publn.PUBLN_KIND,     reg102_pat_publn.publn_auth,     reg102_pat_publn.publn_date,     appln_title,   FROM     reg101_appln   INNER JOIN     reg102_pat_publn   ON     reg101_appln.id = reg102_pat_publn.id   LEFT OUTER JOIN     tls203_appln_abstr   ON     reg101_appln.appln_id = tls203_appln_abstr.appln_id   LEFT OUTER JOIN     tls202_appln_title   ON     reg101_appln.appln_id = tls202_appln_title.appln_id   WHERE     reg102_pat_publn.publn_auth = 'EP'     AND (       CONTAINS (appln_title, 'NEAR(("artific*", "intelligen*"), 1, TRUE)') OR CONTAINS (appln_title, 'NEAR(("computation*", "intelligen*"), 1, TRUE)') OR CONTAINS (appln_title, 'NEAR(("neural", "network*"), 1, TRUE)') OR CONTAINS (appln_title, '"neural-network*"') OR CONTAINS (appln_title, 'NEAR(("bayes*", "network*"), 1, TRUE)') OR CONTAINS (appln_title, '"chatbot*"') OR CONTAINS (appln_title, '"chat bot*"') OR CONTAINS (appln_title, '"chat-bot*"') OR CONTAIN

## had to be split up because searching all abstracts exceeded memory limitations ("There is insufficient system memory in source pool internal to run this query")
### separate parts of second query
## 1
CONTAINS (appln_abstract, '"fire-fly"') OR CONTAINS (appln_abstract, '"feature engineer*"') OR CONTAINS (appln_abstract, '"k-means"') OR CONTAINS (appln_abstract, '"policy-gradient method"') OR CONTAINS (appln_abstract, '"adversar*"') OR CONTAINS (appln_abstract, '"feature extract*"') OR CONTAINS (appln_abstract, '"kernel learn*"') OR CONTAINS (appln_abstract, '"q-learn*"') OR CONTAINS (appln_abstract, '"feature select*"') OR CONTAINS (appln_abstract, '"latent-variable*"') OR CONTAINS (appln_abstract, '"random-forest*"')

## 2
CONTAINS (appln_abstract, '"association rule"') OR CONTAINS (appln_abstract, '"fuzzy-c"') OR CONTAINS (appln_abstract, '"link* predict*"') OR CONTAINS (appln_abstract, '"recommender system*"') OR CONTAINS (appln_abstract, '"auto-encod*"') OR CONTAINS (appln_abstract, '"fuzzy environment*"') OR CONTAINS (appln_abstract, '"machine intelligen*"') OR CONTAINS (appln_abstract, '"reinforc* learn*"') OR CONTAINS (appln_abstract, '"autonom* comput*"') OR CONTAINS (appln_abstract, '"fuzzy logic*"') OR CONTAINS (appln_abstract, '"machine learn*"') OR CONTAINS (appln_abstract, '"sentiment* analy*"') OR CONTAINS (appln_abstract, '"back-propagat*"') 

## 3
CONTAINS (appln_abstract, '"fuzzy number*"') OR CONTAINS (appln_abstract, '"map-reduce"') OR CONTAINS (appln_abstract, '"sparse represent*"') OR CONTAINS (appln_abstract, '"back-propogat*"') OR CONTAINS (appln_abstract, '"fuzzy set*"') OR CONTAINS (appln_abstract, '"memetic algorithm*"') OR CONTAINS (appln_abstract, '"sparse*-code*"') OR CONTAINS (appln_abstract, '"cognitiv* comput*"') OR CONTAINS (appln_abstract, '"fuzzy system*"') OR CONTAINS (appln_abstract, '"multi* label* classif*"') OR CONTAINS (appln_abstract, '"spectral cluster*"') OR CONTAINS (appln_abstract, '"collaborat* filter*"') 

## 4
CONTAINS (appln_abstract, '"gaussian mixture model"') OR CONTAINS (appln_abstract, '"multi*-objective* algorithm*"') OR CONTAINS (appln_abstract, '"stochastic*-gradient*"') OR CONTAINS (appln_abstract, '"deep-belief network*"') OR CONTAINS (appln_abstract, '"gaussian process*"') OR CONTAINS (appln_abstract, '"multi*-objective* optim*"') OR CONTAINS (appln_abstract, '"*supervis* learn*"') OR CONTAINS (appln_abstract, '"deep-learn*"') OR CONTAINS (appln_abstract, '"genetic program*"') 

## 5
CONTAINS (appln_abstract, '"natural-gradient"') OR CONTAINS (appln_abstract, '"differential*-evol* algorithm*"') OR CONTAINS (appln_abstract, '"genetic* algorithm"') OR CONTAINS (appln_abstract, '"neural-turing"') OR CONTAINS (appln_abstract, '"swarm behav*"') OR CONTAINS (appln_abstract, '"dimensional*-reduc*"') OR CONTAINS (appln_abstract, '"high-dimensional* model*"') OR CONTAINS (appln_abstract, '"neuro-morph comput*"') 

## 6
CONTAINS (appln_abstract, '"transfer-learn*"') OR CONTAINS (appln_abstract, '"evolution* algorithm*"') OR CONTAINS (appln_abstract, '"high-dimensional* space*"') OR CONTAINS (appln_abstract, '"non-negative matri* factor*"') OR CONTAINS (appln_abstract, '"variation*-infer*"') OR CONTAINS (appln_abstract, '"evolution* comput*"') OR CONTAINS (appln_abstract, '"high-dimensional* system*"') OR CONTAINS (appln_abstract, '"object-recogni*"') OR CONTAINS (appln_abstract, '"vector-machine*"')

# Example Query
SELECT Distinct 
reg102_pat_publn.PUBLN_NR, reg102_pat_publn.PUBLN_KIND, reg102_pat_publn.publn_auth, appln_title, appln_abstract, publn_claims
FROM reg101_appln
INNER JOIN reg102_pat_publn
ON reg101_appln.id = reg102_pat_publn.id
LEFT OUTER JOIN tls203_appln_abstr
ON reg101_appln.appln_id = tls203_appln_abstr.appln_id
LEFT OUTER JOIN tls202_appln_title
ON reg101_appln.appln_id = tls202_appln_title.appln_id
LEFT OUTER JOIN tls211_pat_publn
ON reg101_appln.appln_id =  tls211_pat_publn.appln_id
WHERE reg102_pat_publn.publn_auth = 'EP' 
AND (
CONTAINS (appln_title, 'NEAR(("artific*", "intelligen*"), 1, TRUE)')
OR CONTAINS (appln_title, 'NEAR(("computation*", "intelligen*"), 1, TRUE)')
OR CONTAINS (appln_title, 'NEAR(("neural", "network*"), 1, TRUE)')
OR CONTAINS (appln_title, '"neural-network*"')
OR CONTAINS (appln_title, 'NEAR(("bayes*", "network*"), 1, TRUE)')
OR CONTAINS (appln_title, '"chatbot*"')
OR CONTAINS (appln_title, '"chat bot*"')
OR CONTAINS (appln_title, '"chat-bot*"')
OR CONTAINS (appln_title, 'NEAR(("data", "mining*"), 1, TRUE)')
OR CONTAINS (appln_title, 'NEAR(("decision", "model*"), 1, TRUE)')
OR CONTAINS (appln_title, 'NEAR(("deep", "learning*"), 1, TRUE)')
OR CONTAINS (appln_title, '"deep-learning*"')
OR CONTAINS (appln_title, 'NEAR(("genetic", "algorithm*"), 1, TRUE)')
OR CONTAINS (appln_title, 'NEAR(("inductive", "programme*"), 8, TRUE)')
OR CONTAINS (appln_title, 'NEAR(("logic", "programme*"), 8, TRUE)')
OR CONTAINS (appln_title, 'NEAR(("machine", "learning*"), 1, TRUE)')
OR CONTAINS (appln_title, '"machine-learning*"')
OR CONTAINS (appln_title, 'NEAR(("natural", "language"), 8, TRUE)')

OR CONTAINS (appln_abstract, 'NEAR(("artific*", "intelligen*"), 1, TRUE)')
OR CONTAINS (appln_abstract, 'NEAR(("computation*", "intelligen*"), 1, TRUE)')
OR CONTAINS (appln_abstract, 'NEAR(("neural", "network*"), 1, TRUE)')
OR CONTAINS (appln_abstract, '"neural-network*"')
OR CONTAINS (appln_abstract, 'NEAR(("bayes*", "network*"), 1, TRUE)')
OR CONTAINS (appln_abstract, '"chatbot*"')
OR CONTAINS (appln_abstract, '"chat bot*"')
OR CONTAINS (appln_abstract, '"chat-bot*"')
OR CONTAINS (appln_abstract, 'NEAR(("data", "mining*"), 1, TRUE)')
OR CONTAINS (appln_abstract, 'NEAR(("decision", "model*"), 1, TRUE)')
OR CONTAINS (appln_abstract, 'NEAR(("deep", "learning*"), 1, TRUE)')
OR CONTAINS (appln_abstract, '"deep-learning*"')
OR CONTAINS (appln_abstract, 'NEAR(("genetic", "algorithm*"), 1, TRUE)')
OR CONTAINS (appln_abstract, 'NEAR(("inductive", "programme*"), 8, TRUE)')
OR CONTAINS (appln_abstract, 'NEAR(("logic", "programme*"), 8, TRUE)')
OR CONTAINS (appln_abstract, 'NEAR(("machine", "learning*"), 1, TRUE)')
OR CONTAINS (appln_abstract, '"machine-learning*"')
OR CONTAINS (appln_abstract, 'NEAR(("natural", "language"), 8, TRUE)')
)

# Block 2: 
WIPO Block 1 (CPC codes)
Patentscope CPC codes
UKIPO CPC codes

(all standalone)

### Codes as taken from the Background paper for WIPO Technology Trends: Artificial Intelligence (original query, block 1)
WIPO_AI_string_1 = "Y10S-706 OR G06N-003 OR G06N-005/003:G06N-005/027 OR G06N007/005:G06N-007/06 OR G06N-099/005 OR G06T2207/20081 OR G06T2207/20084 OR G06T-003/4046 OR G06T-009/002 OR G06F-017/16 OR G05B-013/027 OR G05B013/0275 OR G05B-013/028 OR G05B-013/0285 OR G05B-013/029 OR G05B-013/0295 OR G05B-2219/33002 OR G05D-001/0088 OR G06K-009 OR G10L-015 OR G10L-017 OR G06F-017/27:G06F-017/2795 OR G06F-017/28:G06F-017/289 OR G06F-017/30029:G06F017/30035 OR G06F-017/30247:G06F-017/30262 OR G06F-017/30401 OR G06F-017/3043 OR G06F-017/30522:G06F-017/3053 OR G06F-017/30654 OR G06F-017/30663 OR G06F-017/30666 OR G06F-017/30669 OR G06F-017/30672 OR G06F-017/30684 OR G06F-017/30687 OR G06F-017/3069 OR G06F-017/30702 OR G06F-017/30705:G06F017/30713 OR G06F-017/30731:G06F-017/30737 OR G06F-017/30743:G06F-017/30746 OR G06F-017/30784:G06F-017/30814 OR G06F-019/24 OR G06F-019/707 OR G01R031/2846:G01R-031/2848 OR G01N-2201/1296 OR G01N-029/4481 OR G01N-033/0034 OR G01R-031/3651 OR G01S-007/417 OR G06N-003/004:G06N-003/008 OR G06F011/1476 OR G06F-011/2257 OR G06F-011/2263 OR G06F-015/18 OR G06F-2207/4824 OR G06K-007/1482 OR G06N-007/046 OR G11B-020/10518 OR G10H-2250/151 OR G10H-2250/311 OR G10K-2210/3024 OR H01J-2237/30427 OR H01M-008/04992 OR H02H-001/0092 OR H02P-021/0014 OR H02P-023/0018 OR H03H-2017/0208 OR H03H2222/04 OR H04L-2012/5686 OR H04L-2025/03464 OR H04L-2025/03554 OR H04L025/0254 OR H04L-025/03165 OR H04L-041/16 OR H04L-045/08 OR H04N021/4662:H04N-021/4666 OR H04Q-2213/054 OR H04Q-2213/13343 OR H04Q-2213/343 OR H04R-025/507 OR G08B-029/186 OR B60G-2600/1876 OR B60G-2600/1878 OR B60G-2600/1879 OR B64G-2001/247 OR E21B-2041/0028 OR B23K-031/006 OR B29C2945/76979 OR B29C-066/965 OR B25J-009/161 OR A61B-005/7264:A61B-005/7267 OR Y10S-128/924 OR Y10S-128/925 OR F02D-041/1405 OR F03D-007/046 OR F05B2270/707 OR F05B-2270/709 OR F16H-2061/0081 OR F16H-2061/0084 OR B60W-030/06 OR B60W-030/10:B60W-030/12 OR B60W-030/14:B60W-030/17 OR B62D-015/0285 OR G06T-2207/30248:G06T-2207/30268 OR G06T-2207/30236 OR G05D-001 OR A61B005/7267 OR F05D-2270/709 OR G06T-2207/20084 OR G10K-2210/3038 OR G10L-025/30 OR H04N-021/4666 OR A63F-013/67 OR G06F-017/2282"

In [6]:
# turn this into a sql query
WIPO_AI_CPC_string = "Y10S-706 OR G06N-003 OR G06N-005/003:G06N-005/027 OR G06N007/005:G06N-007/06 OR G06N-099/005 OR G06T2207/20081 OR G06T2207/20084 OR G06T-003/4046 OR G06T-009/002 OR G06F-017/16 OR G05B-013/027 OR G05B013/0275 OR G05B-013/028 OR G05B-013/0285 OR G05B-013/029 OR G05B-013/0295 OR G05B-2219/33002 OR G05D-001/0088 OR G06K-009 OR G10L-015 OR G10L-017 OR G06F-017/27:G06F-017/2795 OR G06F-017/28:G06F-017/289 OR G06F-017/30029:G06F017/30035 OR G06F-017/30247:G06F-017/30262 OR G06F-017/30401 OR G06F-017/3043 OR G06F-017/30522:G06F-017/3053 OR G06F-017/30654 OR G06F-017/30663 OR G06F-017/30666 OR G06F-017/30669 OR G06F-017/30672 OR G06F-017/30684 OR G06F-017/30687 OR G06F-017/3069 OR G06F-017/30702 OR G06F-017/30705:G06F017/30713 OR G06F-017/30731:G06F-017/30737 OR G06F-017/30743:G06F-017/30746 OR G06F-017/30784:G06F-017/30814 OR G06F-019/24 OR G06F-019/707 OR G01R031/2846:G01R-031/2848 OR G01N-2201/1296 OR G01N-029/4481 OR G01N-033/0034 OR G01R-031/3651 OR G01S-007/417 OR G06N-003/004:G06N-003/008 OR G06F011/1476 OR G06F-011/2257 OR G06F-011/2263 OR G06F-015/18 OR G06F-2207/4824 OR G06K-007/1482 OR G06N-007/046 OR G11B-020/10518 OR G10H-2250/151 OR G10H-2250/311 OR G10K-2210/3024 OR H01J-2237/30427 OR H01M-008/04992 OR H02H-001/0092 OR H02P-021/0014 OR H02P-023/0018 OR H03H-2017/0208 OR H03H2222/04 OR H04L-2012/5686 OR H04L-2025/03464 OR H04L-2025/03554 OR H04L025/0254 OR H04L-025/03165 OR H04L-041/16 OR H04L-045/08 OR H04N021/4662:H04N-021/4666 OR H04Q-2213/054 OR H04Q-2213/13343 OR H04Q-2213/343 OR H04R-025/507 OR G08B-029/186 OR B60G-2600/1876 OR B60G-2600/1878 OR B60G-2600/1879 OR B64G-2001/247 OR E21B-2041/0028 OR B23K-031/006 OR B29C2945/76979 OR B29C-066/965 OR B25J-009/161 OR A61B-005/7264:A61B-005/7267 OR Y10S-128/924 OR Y10S-128/925 OR F02D-041/1405 OR F03D-007/046 OR F05B2270/707 OR F05B-2270/709 OR F16H-2061/0081 OR F16H-2061/0084 OR B60W-030/06 OR B60W-030/10:B60W-030/12 OR B60W-030/14:B60W-030/17 OR B62D-015/0285 OR G06T-2207/30248:G06T-2207/30268 OR G06T-2207/30236 OR G05D-001 OR A61B005/7267 OR F05D-2270/709 OR G06T-2207/20084 OR G10K-2210/3038 OR G10L-025/30 OR H04N-021/4666 OR A63F-013/67 OR G06F-017/2282"

def expand_code_range(code):
    code = code.replace("-0", "")
    if ":" in code:
        # Split the code into start and end parts
        start, end = code.split(":")

        # Find the longest common prefix between start and end
        # We consider up to the last alphanumeric or special character before the numbers
        common_prefix = start.rsplit("/", 1)[0]
        
        # Get the numeric parts for the range
        start_number_str = start.rsplit("/", 1)[1]  # Numeric portion of the start code
        end_number_str = end.rsplit("/", 1)[1]      # Numeric portion of the end code

        while len(start_number_str) < len(end_number_str):
            start_number_str = start_number_str + '0'

        while len(end_number_str) < len(start_number_str):
            end_number_str = end_number_str + '0'
        #print(start_number_str)
        #print(end_number_str)
        start_int = int(start_number_str)
        end_int = int(end_number_str)
        padding_length = len(start_number_str)

        # Generate the full range with consistent padding
        expanded_range = [
            f"{common_prefix}/{str(i).zfill(padding_length)}" 
            for i in range(start_int + 1, end_int)  # Start counting from the next number
        ]
        # Generate the full range
        #expanded_range = [f"{common_prefix}/{i}" for i in range(start_int+1, end_int + 1)]
        
        # Include the original start code (not expanded)
        expanded_range.insert(0, start)
        expanded_range.insert(-1, end)  # Add the original start code
        
        return expanded_range
    else:
        # If no colon, just return the code as-is
        return [code]

# Create a new list of codes with expanded colon ranges
expanded_codes = []
for code in WIPO_AI_CPC_string.split(" OR "):
    expanded_codes.extend(expand_code_range(code))


print(len(expanded_codes))




411


### Codes as taken from Patentscope AI Index 
Patentscope_AI_string = "A61B5/7264, A61B5/7267, A63F13/67, B23K31/006, B25J9/161, B29C2945/76979, B29C66/965, B60G2600/1876, B60G2600/1878, B60G2600/1879, B60W30/06, B60W30/10, B60W30/14, B62D15/0285, B64G2001/247, E21B2041/0028, F02D41/1405, F03D7/046, F05B2270/707, F05B2270/709, F05D2270/709, F16H2061/0081, F16H2061/0084, G01N2201/1296, G01N29/4481, G01N33/0034, G01R31/2846, G01R31/3651, G01S7/417, G05B13/027, G05B13/0275, G05B13/028, G05B13/0285, G05B13/029, G05B13/0295, G05B2219/33002, G05D1/00, G05D1/0088, G06F11/1476, G06F11/2257, G06F11/2263, G06F15/18, G06F17/16, G06F17/2282, G06F17/27, G06F17/28, G06F17/30029, G06F17/30247, G06F17/30401, G06F17/3043, G06F17/30522, G06F17/30654, G06F17/30663, G06F17/30666, G06F17/30669, G06F17/30672, G06F17/30684, G06F17/30687, G06F17/3069, G06F17/30702, G06F17/30705, G06F17/30731, G06F17/30743, G06F17/30784, G06F19/24, G06F19/707, G06F2207/4824, G06K7/1482, G06K9/00, G06N3/00, G06N3/004, G06N5/003, G06N7/005, G06N7/046, G06N99/005, G06T2207/20081, G06T2207/20084, G06T2207/20084, G06T2207/30236, G06T2207/30248, G06T3/4046, G06T9/002, G08B29/186, G10H2250/151, G10H2250/311, G10K2210/3024, G10K2210/3038, G10L15/00, G10L17/00, G10L25/30, G11B20/10518, H01J2237/30427, H01M8/04992, H02H1/0092, H02P21/0014, H02P23/0018, H03H2017/0208, H03H2222/04, H04L2012/5686, H04L2025/03464, H04L2025/03554, H04L25/0254, H04L25/03165, H04L41/16, H04L45/08, H04N21/4662, H04N21/4666, H04Q2213/054, H04Q2213/13343, H04Q2213/343, H04R25/507, Y10S128/924, Y10S128/925, Y10S706/00"

In [2]:
Patentscope_AI_CPC_string = "A61B5/7264, A61B5/7267, A63F13/67, B23K31/006, B25J9/161, B29C2945/76979, B29C66/965, B60G2600/1876, B60G2600/1878, B60G2600/1879, B60W30/06, B60W30/10, B60W30/14, B62D15/0285, B64G2001/247, E21B2041/0028, F02D41/1405, F03D7/046, F05B2270/707, F05B2270/709, F05D2270/709, F16H2061/0081, F16H2061/0084, G01N2201/1296, G01N29/4481, G01N33/0034, G01R31/2846, G01R31/3651, G01S7/417, G05B13/027, G05B13/0275, G05B13/028, G05B13/0285, G05B13/029, G05B13/0295, G05B2219/33002, G05D1/00, G05D1/0088, G06F11/1476, G06F11/2257, G06F11/2263, G06F15/18, G06F17/16, G06F17/2282, G06F17/27, G06F17/28, G06F17/30029, G06F17/30247, G06F17/30401, G06F17/3043, G06F17/30522, G06F17/30654, G06F17/30663, G06F17/30666, G06F17/30669, G06F17/30672, G06F17/30684, G06F17/30687, G06F17/3069, G06F17/30702, G06F17/30705, G06F17/30731, G06F17/30743, G06F17/30784, G06F19/24, G06F19/707, G06F2207/4824, G06K7/1482, G06K9/00, G06N3/00, G06N3/004, G06N5/003, G06N7/005, G06N7/046, G06N99/005, G06T2207/20081, G06T2207/20084, G06T2207/20084, G06T2207/30236, G06T2207/30248, G06T3/4046, G06T9/002, G08B29/186, G10H2250/151, G10H2250/311, G10K2210/3024, G10K2210/3038, G10L15/00, G10L17/00, G10L25/30, G11B20/10518, H01J2237/30427, H01M8/04992, H02H1/0092, H02P21/0014, H02P23/0018, H03H2017/0208, H03H2222/04, H04L2012/5686, H04L2025/03464, H04L2025/03554, H04L25/0254, H04L25/03165, H04L41/16, H04L45/08, H04N21/4662, H04N21/4666, H04Q2213/054, H04Q2213/13343, H04Q2213/343, H04R25/507, Y10S128/924, Y10S128/925, Y10S706/00"
codes = Patentscope_AI_CPC_string.split(", ")

# List for codes without a colon
codes = ["'" + code + "'" for code in codes if ":" not in code]

# Print the formatted results
print(len(codes))



114


cpc codes UK: 
A61B5/7267 G01N33/0034 G06F19/24 G10H2250/151 H04L2025/03464
B29C66/965 G01N2201/1296 G06F19/707 G10H2250/311 H04N21/4662
B29C2945/76979 G01S7/417 G06F2207/4824 G10K2210/3024 H04N21/4663
B60G2600/1876 G05B13/027 G06K7/1482 G10K2210/3038 H04N21/4665
B60G2600/1878 G05B13/0275 G06N3/004 G10L25/30 H04N21/4666
B60G2600/1879 G05B13/028 G06N3/02 G11B20/10518 H04Q2213/054
E21B2041/0028 G05B13/0285 G06N3/12 H01J2237/30427 H04Q2213/13343
F02D41/1405 G05B13/029 G06N5 H02P21/0014 H04Q2213/343
F03D7/046 G05B13/0295 G06N7 H02P23/0018 H04R25/507
F05B2270/707 G05B2219/33002 G06N20 H03H2017/0208 Y10S128/924
F05B2270/709 G05D1/0088 G06N99/005 H03H2222/04 Y10S128/925
F05D2270/707 G06F11/1476 G06T3/4046 H04L25/0254 Y10S706
F05D2270/709 G06F11/2257 G06T9/002 H04L25/03165
F16H2061/0081 G06F11/2263 G06T2207/20081 H04L41/16
F16H2061/0084 G06F15/18 G06T2207/20084 H04L45/08
G01N29/4481 G06F17/16 G08B29/186 H04L2012/5686

In [3]:
UKIPO_AI_CPC_string = "A61B5/7267 G01N33/0034 G06F19/24 G10H2250/151 H04L2025/03464 B29C66/965 G01N2201/1296 G06F19/707 G10H2250/311 H04N21/4662 B29C2945/76979 G01S7/417 G06F2207/4824 G10K2210/3024 H04N21/4663 B60G2600/1876 G05B13/027 G06K7/1482 G10K2210/3038 H04N21/4665 B60G2600/1878 G05B13/0275 G06N3/004 G10L25/30 H04N21/4666 B60G2600/1879 G05B13/028 G06N3/02 G11B20/10518 H04Q2213/054 E21B2041/0028 G05B13/0285 G06N3/12 H01J2237/30427 H04Q2213/13343 F02D41/1405 G05B13/029 G06N5 H02P21/0014 H04Q2213/343 F03D7/046 G05B13/0295 G06N7 H02P23/0018 H04R25/507 F05B2270/707 G05B2219/33002 G06N20 H03H2017/0208 Y10S128/924 F05B2270/709 G05D1/0088 G06N99/005 H03H2222/04 Y10S128/925 F05D2270/707 G06F11/1476 G06T3/4046 H04L25/0254 Y10S706 F05D2270/709 G06F11/2257 G06T9/002 H04L25/03165 F16H2061/0081 G06F11/2263 G06T2207/20081 H04L41/16 F16H2061/0084 G06F15/18 G06T2207/20084 H04L45/08 G01N29/4481 G06F17/16 G08B29/186 H04L2012/5686"
codes = UKIPO_AI_CPC_string.split(" ")

# List for codes without a colon
codes = ["'" + code + "'" for code in codes if ":" not in code]

# Print the formatted results
print(len(codes))



76


In [22]:
# join and deduplicate
codes_pat = Patentscope_AI_CPC_string.split(", ")
codes_wipo = []
for code in WIPO_AI_CPC_string.split(" OR "):
    codes_wipo.extend(expand_code_range(code))
codes_ukipo = UKIPO_AI_CPC_string.split(" ")

# join all codes
codes_all = codes_pat + codes_wipo + codes_ukipo
print(len(codes_all))
codes_all = list(set(codes_all))
print(len(codes_all))

# List for codes without a colon
CPC_AI_codes = ["'" + code + "'" for code in codes_all]

# Join the codes with a comma and a space for each list
result = ", ".join(CPC_AI_codes)


601
473


In [23]:
# define function to take key words and return a list of SQL queries specifying the tables and fields to search, with each query limited to 10,000 characters (patstat requirement)
def split_codes_into_queries(codes, max_spaces, template_overhead):
    """
    Split conditions into groups such that each group's SQL query remains within the specified character limit.
    """
    # Initialize variables to hold condition groups and current query length
    query_groups = []
    current_group = []
    current_length = template_overhead  # Start with overhead of the fixed part of the SQL template

    for code in codes:
        # Calculate the length of the condition with ' OR ' separator
        code_length = len(code) + 2  # Account for ', '

        if current_length + code_length <= max_spaces:
            # If adding this condition doesn't exceed the limit, add it to the current group
            current_group.append(code)
            current_length += code_length
        else:
            # If it exceeds the limit, save the current group and start a new one
            query_groups.append(current_group)
            # Reset current group with the new condition and the overhead
            current_group = [code]
            current_length = template_overhead + code_length

    # Add the last group if it's not empty
    if current_group:
        query_groups.append(current_group)

    return query_groups


def generate_sql_queries_cpc(codes, max_spaces=10000):
    """
    Generate SQL queries based on 'NEAR' patterns, limiting each query to a specified number of spaces.
    """
    # Generate conditions for all given 'NEAR' patterns
    
    # The static part of the SQL query (before inserting conditions)
    sql_template_base = """

    SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, cpc_class_symbol
    FROM reg101_appln
    INNER JOIN reg102_pat_publn
    ON reg101_appln.id = reg102_pat_publn.id
    INNER JOIN tls224_appln_cpc
    ON reg101_appln.appln_id = tls224_appln_cpc.appln_id
    WHERE publn_auth = 'EP' 
    AND REPLACE(cpc_class_symbol, ' ', '') IN 
    ( {codes}  )
    """

    # Get the length of the static part of the query
    template_overhead = len(sql_template_base.format(codes=''))

    # Split conditions into groups with the given max spaces limit
    query_groups = split_codes_into_queries(codes, max_spaces, template_overhead)

    # List to hold the final SQL queries
    sql_queries = []    

    # Generate a complete SQL query for each group of conditions
    for group in query_groups:
        formatted_codes = ", ".join(group)  # Join with 'OR'
        final_sql = sql_template_base.format(codes=formatted_codes)  # Insert into template
        sql_queries.append(final_sql)  # Add to list of final SQL queries

    return sql_queries


In [24]:
queries_AI_cpc = generate_sql_queries_cpc(codes = CPC_AI_codes, max_spaces=10000)

def clean_query(query):
    """
    Remove unwanted backslashes, newlines, tabs, and extra spaces.
    """
    query = query.replace("\\", "")  # Remove backslashes
    query = query.replace("\n", " ")  # Replace newlines with a space
    query = query.replace("\t", " ")  # Replace tabs with a space
    query = query.replace("  ", " ")  # Remove double spaces
    query = query.replace("-0", "")  # Remove dashes with zeroes
    query = query.replace("-", "")  # Remove dashes
    return query.strip()  # Remove leading/trailing spaces

for query in queries_AI_cpc:
    print(clean_query(query))
    print("\n")

SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, cpc_class_symbol   FROM reg101_appln   INNER JOIN reg102_pat_publn   ON reg101_appln.id = reg102_pat_publn.id   INNER JOIN tls224_appln_cpc   ON reg101_appln.appln_id = tls224_appln_cpc.appln_id   WHERE publn_auth = 'EP'   AND REPLACE(cpc_class_symbol, ' ', '') IN   ( 'G05B013/0275', 'G06F17/30796', 'G06F17/2756', 'G05B13/0285', 'G06N007/029', 'G06N007/012', 'G01N2201/1296', 'Y10S706', 'G06F17/3043', 'G06F17/2788', 'G06N05/021', 'G06F17/2754', 'G01N29/4481', 'G06N05/024', 'G06N007/006', 'G06F17/2730', 'G11B20/10518', 'G06N5', 'G06T2207/30248', 'G06F17/2727', 'G06N007/036', 'G06N007/020', 'G06F17/2779', 'G06F17/2765', 'G06T2207/30249', 'G06F17/2729', 'B29C66/965', 'Y10S128/925', 'G06N05/027', 'G06N05/023', 'G06F17/30810', 'G06F17/2705', 'A61B5/7264', 'G06F17/30801', 'G06F17/30809', 'H04N021/4662', 'G06N007/047', 'H04L25/03165', 'G06T2207/30266', 'G06N007/053', 'G06F17/287', 'G06T2207/30257', 'G06F19/24', 'G06F17/30711', 'G05B13/0295', 'G

## Block 2.5
IPC Codes (only given by UKIPO method)

G06F19/24 G06N3 G06N5 G06N7/02 G06N7/04 G06N7/06 G06N20 G06T1/40 G16B4/20 G16B4/30 G16C20/70

In [10]:
UKIPO_AI_IPC_string = "G06F19/24 G06N3 G06N5 G06N7/02 G06N7/04 G06N7/06 G06N20 G06T1/40 G16B4/20 G16B4/30 G16C20/70"

codes_ukipo_IPC = UKIPO_AI_IPC_string.split(" ")

# List for codes without a colon
codes_IPC = ["'" + code + "'" for code in codes_ukipo_IPC]

def generate_sql_queries_ipc(codes, max_spaces=10000):
    """
    Generate SQL queries based on 'NEAR' patterns, limiting each query to a specified number of spaces.
    """
    # Generate conditions for all given 'NEAR' patterns
    
    # The static part of the SQL query (before inserting conditions)
    sql_template_base = """

    SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, ipc_class_symbol
    FROM reg101_appln
    INNER JOIN reg102_pat_publn
    ON reg101_appln.id = reg102_pat_publn.id
    INNER JOIN tls209_appln_ipc
    ON reg101_appln.appln_id = tls209_appln_ipc.appln_id
    WHERE publn_auth = 'EP' 
    AND REPLACE(ipc_class_symbol, ' ', '') IN 
    ( {codes}  )
    """

    # Get the length of the static part of the query
    template_overhead = len(sql_template_base.format(codes=''))

    # Split conditions into groups with the given max spaces limit
    query_groups = split_conditions_into_queries(codes, max_spaces, template_overhead)

    # List to hold the final SQL queries
    sql_queries = []    

    # Generate a complete SQL query for each group of conditions
    for group in query_groups:
        formatted_codes = ", ".join(group)  # Join with 'OR'
        final_sql = sql_template_base.format(codes=formatted_codes)  # Insert into template
        sql_queries.append(final_sql)  # Add to list of final SQL queries

    return sql_queries

queries_AI_ipc = generate_sql_queries_ipc(codes_IPC)

def clean_query(query):
    """
    Remove unwanted backslashes, newlines, tabs, and extra spaces.
    """
    query = query.replace("\\", "")  # Remove backslashes
    query = query.replace("\n", " ")  # Replace newlines with a space
    query = query.replace("\t", " ")  # Replace tabs with a space
    query = query.replace("  ", " ")  # Remove double spaces
    return query.strip()  # Remove leading/trailing spaces

for query in queries_AI_ipc:
    print(clean_query(query))

NameError: name 'split_conditions_into_queries' is not defined

In [11]:
codes_IPC = pd.DataFrame(codes_IPC)
codes_IPC.to_csv('IPC_AI_codes.csv', index=False)

# Block 3
WIPO Block 3
Patentscope AI Index (General, duplicates removed)

In [6]:
# list of key word (combinations) from WIPO and UK IPO reports
WIPO_Patentscope_keyword_combinations = ['"clustering"',
    '"comput* creativity"',
    '"descriptive model*"',
    '"inductive reasoning"',
    '"overfitting"',
    'NEAR(("predictive", "analytics"), 1, TRUE)',
    'NEAR(("predictive", "model*"), 1, TRUE)',
    'NEAR(("target", "function"), 1, TRUE)',
    'NEAR(("test", "data set"), 1, TRUE)',
    'NEAR(("test", "dataset"), 1, TRUE)',
    'NEAR(("training", "data set"), 1, TRUE)',
    'NEAR(("training", "dataset"), 1, TRUE)',
    'NEAR(("validation", "data set"), 1, TRUE)',
    'NEAR(("validation", "dataset"), 1, TRUE)',
    'NEAR(("test data", "set"), 1, TRUE)',
    'NEAR(("training data", "set"), 1, TRUE)',
    'NEAR(("validation data", "set"), 1, TRUE)',
    '"backpropagation"',
    '"self-learning"',
    '"self learning"',
    '"selflearning"',
    '"objective function*"',
    '"feature* selection"',
    '"embedding*"',
    '"active learning"',
    '"regression model*"',
    '"stochastic approach*"',
    '"stochastic technique*"',
    '"stochastic method*"',
    '"stochastic algorithm*"',
    '"probabilist technique*"',
    '"probabilist approach*"',
    '"probabilist method*"',
    '"probabilist algorithm*"',
    '"recommend system*"',
    'NEAR(("text", "analysis"), 1, TRUE)',
    'NEAR(("speech", "analysis"), 1, TRUE)',
    'NEAR(("hand writing", "analysis"), 1, TRUE)',
    'NEAR(("handwriting", "analysis"), 1, TRUE)',
    'NEAR(("facial", "analysis"), 1, TRUE)',
    'NEAR(("face", "analysis"), 1, TRUE)',
    'NEAR(("text", "analytic*"), 1, TRUE)',
    'NEAR(("speech", "analytic*"), 1, TRUE)',
    'NEAR(("hand writing", "analytic*"), 1, TRUE)',
    'NEAR(("handwriting", "analytic*"), 1, TRUE)',
    'NEAR(("facial", "analytic*"), 1, TRUE)',
    'NEAR(("face", "analytic*"), 1, TRUE)',
    'NEAR(("text", "recognition"), 1, TRUE)',
    'NEAR(("speech", "recognition"), 1, TRUE)',
    'NEAR(("hand writing", "recognition"), 1, TRUE)',
    'NEAR(("handwriting", "recognition"), 1, TRUE)',
    'NEAR(("facial", "recognition"), 1, TRUE)',
    'NEAR(("face", "recognition"), 1, TRUE)',
    
]

In [7]:
Patentscope_keyword_combinations = ['"robot*"',
    '"deep blue"',
    '"combinatorial explosion"',
    '"autonomous system*"',
    '"medical imag*"',
    '"healthcare"',
    '"virtual assist*"',
    '"personal* medic*"',
    '"precision medic*"',
    '"genomic screening"',
    '"drug discover*"',
    '"medical diagnos*"',
    '"drug creation"',
    '"medication manag*"',
    '"autonomous vehicle*"',
    '"transportation"',
    '"driverless"',
    '"smart car*"',
    '"smart city*"',
    '"smart grid*"',
    '"automotive"',
    '"agriculture"',
    '"irrigation system*"',
    '"fintech"',
    '"banking"',
    '"finance"',
    '"economics"',
    '"cybersecurity"',
    '"predictive purchas*"',
    '"marketing analytic*"',
    '"video game*"'
]

In [51]:
formatted_conditions = " OR ".join(Patentscope_keyword_combinations)  
formatted_conditions

'"robot*" OR "deep blue" OR "combinatorial explosion" OR "autonomous system*" OR "medical imag*" OR "healthcare" OR "virtual assist*" OR "personal* medic*" OR "precision medic*" OR "genomic screening" OR "drug discover*" OR "medical diagnos*" OR "drug creation" OR "medication manag*" OR "autonomous vehicle*" OR "transportation" OR "driverless" OR "smart car*" OR "smart city*" OR "smart grid*" OR "automotive" OR "agriculture" OR "irrigation system*" OR "fintech" OR "banking" OR "finance" OR "economics" OR "cybersecurity" OR "predictive purchas*" OR "marketing analytic*" OR "video game*"'

In [8]:
Patentscope_IPC_combinations = "G10L13/00, G10L25/00, G10L99/00, G06F17/14, G06F17/153, G10H2250/005, G06F17/30, G06F17/50, G06Q, G06Q30/02, G06T7/00, G06T1/20"
Patentscope_CPC_combinations = "A61B5/00, A63F13/67, B23K31/00, B25J9/16, B29C65/00, B60W30/06, B60W30/10, B60W30/14, B62D15/02, B64G1/24, E21B41/00, F02D41/14, F03D7/04, F16H61/00, G01N29/44, G01N33/00, G01R31/28, G01R31/36, G01S7/41, G05B13/02, G05D1/00, G06E1/00, G06E3/00, G06F9/44, G06F11/14, G06F11/22, G06F15/00, G06F17/00, G06F19/00, G06G7/00, G06J1/00, G06K7/14, G06K9/00, G06N3/00, G06N5/00, G06N7/00, G06N99/00, G06T1/20, G06T1/40, G06T3/40, G06T7/00, G06T9/00, G08B29/18, G10L13/00, G10L15/00, G10L17/00, G10L25/00, G10L99/00, G11B20/10, G16H50/20, H01M8/04992, H02H1/00, H02P21/00, H02P23/00, H03H17/02, H04L12/24, H04L12/70, H04L12/751, H04L25/02, H04L25/03, H04N21/466, H04R25/00"

Patentscope_IPC_combinations = ["'" + code + "'" for code in Patentscope_IPC_combinations.split(", ")]
Patentscope_CPC_combinations = ["'" + code + "'" for code in Patentscope_CPC_combinations.split(", ")]

In [9]:
WIPO_CPC_combinations_string = "G06T-007 OR G06T-001/20 OR G10L-013 OR G10L-025 OR G10L-099 OR G06F017/14:G06F-017/148 OR G06F-017/153 OR G10H-2250/005:G10H-2250/021 OR G06F017/50 OR G06Q-030/02:G06Q-030/0284 OR G07C-009 OR G06F-021"
WIPO_IPC_combinations_string = "A61B-005 OR A63F-013/67 OR B23K-031 OR B25J-009/16:B25J-009/20 OR B29C065 OR B60W-030/06 OR B60W-030/10:B60W-030/12 OR B60W-030/14:B60W-030/17 OR B62D-015/02 OR B64G-001/24:B64G-001/38 OR E21B-041 OR F02D041/14:F02D-041/16 OR F03D-007/04:F03D-007/048 OR F16H-061 OR G01N029/44:G01N-029/52 OR G01N-033 OR G01R31/36 OR G01R31/364 OR G01R31/367 OR G01S-007/41:G01S-007/418 OR G05B-013/02 OR G05B-013/04 OR G05D-001 OR G06F-009/44+ OR G06F-011/14 OR G06F011/22 OR G06F011/24 OR G06F011/25 OR G06F011/26 OR G06F011/263 OR G06F011/267 OR G06F011/27 OR G06F011/273 OR G06F011/277 OR G06F-015/18 OR G06F-017/14 OR G06F-017/15 OR G06F017/16 OR G06F-017/20 OR G06F-017/27 OR G06F-017/28 OR G06F-019/24 OR G06K-009 OR G06N-003 OR G06N-005 OR G06N-007 OR G06N-099 OR G06T-001/20 OR G06T-001/40+ OR G06T-007 OR G06T-009 OR G08B-029/18:G08B-029/28 OR G10L-013 OR G10L-015 OR G10L-017 OR G10L-025 OR G10L-099 OR G11B-020/10:G11B-020/18 OR G16H-050/20 OR H01M-008/04992 OR H02H-001 OR H02P-021 OR H02P-023 OR H03H-017/02:H03H017/06 OR H04L-012/24+ OR H04L-012/70+ OR H04L-012/751+ OR H04L-025/02:H04L025/26 OR H04L-025/03 OR H04N-021/466:H04N-021/4668 OR H04R025 OR G07C-009 OR G06F-021 OR GO1R31/28 OR G01R31/309 OR G01R31/306 OR G01R31/302 OR G01R31/307 OR G01R31/308 OR G01R31/304 OR G01R31/30 OR G01R31/303 OR G01R31/305 OR G01R31/3173 OR G01R31/3177 OR G01R31/3187 OR G01R31/312 OR G01R31/317 OR G01R31/3183 OR G01R31/319 OR G01R31/3185 OR G01R31/315 OR G01R31/3193 OR G01R31/3167 OR G01R31/311 OR G01R31/3181 OR G01R31/3161 OR G01R31/3163 OR G01R31/316"

In [60]:
# turn this into a sql query

def expand_code_range(code):
    code = code.replace("-0", "")
    if ":" in code:
        # Split the code into start and end parts
        start, end = code.split(":")

        # Find the longest common prefix between start and end
        # We consider up to the last alphanumeric or special character before the numbers
        common_prefix = start.rsplit("/", 1)[0]
        
        # Get the numeric parts for the range
        start_number_str = start.rsplit("/", 1)[1]  # Numeric portion of the start code
        end_number_str = end.rsplit("/", 1)[1]      # Numeric portion of the end code

        while len(start_number_str) < len(end_number_str):
            start_number_str = start_number_str + '0'

        while len(end_number_str) < len(start_number_str):
            end_number_str = end_number_str + '0'
        #print(start_number_str)
        #print(end_number_str)
        start_int = int(start_number_str)
        end_int = int(end_number_str)
        padding_length = len(start_number_str)

        # Generate the full range with consistent padding
        expanded_range = [
            f"{common_prefix}/{str(i).zfill(padding_length)}" 
            for i in range(start_int + 1, end_int)  # Start counting from the next number
        ]
        # Generate the full range
        #expanded_range = [f"{common_prefix}/{i}" for i in range(start_int+1, end_int + 1)]
        
        # Include the original start code (not expanded)
        expanded_range.insert(0, start)
        expanded_range.insert(-1, end)  # Add the original start code
        
        return expanded_range
        
    else:
        # If no colon, just return the code as-is
        return [code]

# Create a new list of codes with expanded colon ranges
WIPO_CPC_combinations_expanded = []
for code in WIPO_CPC_combinations_string.split(" OR "):
    WIPO_CPC_combinations_expanded.extend(expand_code_range(code))


WIPO_IPC_combinations_expanded = []
for code in WIPO_IPC_combinations_string.split(" OR "):
    WIPO_IPC_combinations_expanded.extend(expand_code_range(code))

WIPO_IPC_combinations_plus = []
for code in WIPO_IPC_combinations_expanded:
    if "+" in code:
        WIPO_IPC_combinations_expanded.remove(code)
        WIPO_IPC_combinations_plus.append(code.replace("+", "%"))

print("IPC +", len(WIPO_IPC_combinations_plus))
print("IPC expanded (WIPO)", len(WIPO_IPC_combinations_expanded))
print("CPC expanded (WIPO)", len(WIPO_CPC_combinations_expanded))
print("IPC expanded (PAT)", len(Patentscope_IPC_combinations))
print("CPC expanded (PAT)", len(Patentscope_CPC_combinations))

WIPO_IPC_combinations_expanded = ["'" + code + "'" for code in WIPO_IPC_combinations_expanded]
WIPO_CPC_combinations_expanded = ["'" + code + "'" for code in WIPO_CPC_combinations_expanded]
WIPO_IPC_combinations_plus = ["'" + code + "'" for code in WIPO_IPC_combinations_plus]


IPC + 4
IPC expanded (WIPO) 198
CPC expanded (WIPO) 120
IPC expanded (PAT) 12
CPC expanded (PAT) 62


In [61]:
IPC_combinations_complete = set(WIPO_IPC_combinations_expanded).union(set(Patentscope_IPC_combinations))
CPC_combinations_complete = set(WIPO_CPC_combinations_expanded).union(set(Patentscope_CPC_combinations))
print(len(IPC_combinations_complete))
print(len(CPC_combinations_complete))


208
182


In [62]:
def generate_sql_queries_comb(codes, conditions, type, keywordfield):
    """
    Generate SQL queries based on IPC/CPC codes and keyword conditions.
    """
    # Generate conditions for all given 'NEAR' patterns
    if type == "CPC" and keywordfield == "abstract": 
    # The static part of the SQL query (before inserting conditions)
        sql_template_base = """

    SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, publn_date, cpc_class_symbol

    FROM reg101_appln

    INNER JOIN reg102_pat_publn
    ON reg101_appln.id = reg102_pat_publn.id
    INNER JOIN tls224_appln_cpc
    ON reg101_appln.appln_id = tls224_appln_cpc.appln_id
    INNER JOIN tls203_appln_abstr
    ON reg101_appln.appln_id = tls203_appln_abstr.appln_id

    WHERE publn_auth = 'EP' 
    AND REPLACE(cpc_class_symbol, ' ', '') IN 
    ( {inputcodes}  )
    AND (CONTAINS(tls203_appln_abstr.appln_abstract, '{inputconditions}'))
    """
    elif type == "CPC" and keywordfield == "title":
        sql_template_base = """

    SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, publn_date, cpc_class_symbol

    FROM reg101_appln

    INNER JOIN reg102_pat_publn
    ON reg101_appln.id = reg102_pat_publn.id
    INNER JOIN tls224_appln_cpc
    ON reg101_appln.appln_id = tls224_appln_cpc.appln_id
    INNER JOIN tls202_appln_title
    ON reg101_appln.appln_id = tls202_appln_title.appln_id

    WHERE publn_auth = 'EP' 
    AND REPLACE(cpc_class_symbol, ' ', '') IN 
    ( {inputcodes}  )
    AND (CONTAINS(tls202_appln_title.appln_title, '{inputconditions}'))
    """
    elif type == "IPC" and keywordfield == "abstract":
            sql_template_base = """

    SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, publn_date, ipc_class_symbol

    FROM reg101_appln

    INNER JOIN reg102_pat_publn
    ON reg101_appln.id = reg102_pat_publn.id
    INNER JOIN tls209_appln_ipc
    ON reg101_appln.appln_id = tls209_appln_ipc.appln_id
    INNER JOIN tls203_appln_abstr
    ON reg101_appln.appln_id = tls203_appln_abstr.appln_id

    WHERE publn_auth = 'EP' 
    AND REPLACE(ipc_class_symbol, ' ', '') IN 
    ( {inputcodes}  )
    AND (CONTAINS(tls203_appln_abstr.appln_abstract, '{inputconditions}'))
    """
    elif type == "IPC" and keywordfield == "title":
         sql_template_base = """

    SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, publn_date, ipc_class_symbol

    FROM reg101_appln

    INNER JOIN reg102_pat_publn
    ON reg101_appln.id = reg102_pat_publn.id
    INNER JOIN tls209_appln_ipc
    ON reg101_appln.appln_id = tls209_appln_ipc.appln_id
    INNER JOIN tls202_appln_title
    ON reg101_appln.appln_id = tls202_appln_title.appln_id
    
    WHERE publn_auth = 'EP' 
    AND REPLACE(ipc_class_symbol, ' ', '') IN 
    ( {inputcodes}  )
    AND (CONTAINS(tls202_appln_title.appln_title, '{inputconditions}'))
    """
    elif type == "IPC_plus" and keywordfield == "abstract":
           sql_template_base = """

    SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, publn_date, ipc_class_symbol

    FROM reg101_appln

    INNER JOIN reg102_pat_publn
    ON reg101_appln.id = reg102_pat_publn.id
    INNER JOIN tls209_appln_ipc
    ON reg101_appln.appln_id = tls209_appln_ipc.appln_id
    INNER JOIN tls203_appln_abstr
    ON reg101_appln.appln_id = tls203_appln_abstr.appln_id

    WHERE publn_auth = 'EP' 
    AND ({inputcodes})
    AND (CONTAINS(tls203_appln_abstr.appln_abstract, '{inputconditions}'))
    """   
    elif type == "IPC_plus" and keywordfield == "title":
           sql_template_base = """

    SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, publn_date, ipc_class_symbol

    FROM reg101_appln

    INNER JOIN reg102_pat_publn
    ON reg101_appln.id = reg102_pat_publn.id
    INNER JOIN tls209_appln_ipc
    ON reg101_appln.appln_id = tls209_appln_ipc.appln_id
    INNER JOIN tls202_appln_title
    ON reg101_appln.appln_id = tls202_appln_title.appln_id
    
    WHERE publn_auth = 'EP' 
    AND ({inputcodes})
    AND (CONTAINS(tls202_appln_title.appln_title, '{inputconditions}'))
    """          

    # List to hold the final SQL queries

    # Generate a complete SQL query for each group of conditions
    codes = [code.replace("-0", "") for code in codes]
    codes = [code.replace("-", "") for code in codes]
    if type == "IPC_plus":
        formatted_codes_list = [f"REPLACE(ipc_class_symbol, ' ', '') LIKE ({pattern})" for pattern in codes] 
        formatted_codes = " OR ".join(formatted_codes_list)
        formatted_conditions = " OR ".join(conditions)
        final_sql = sql_template_base.format(inputcodes=formatted_codes, inputconditions = formatted_conditions)  # Insert into template
    else: 
        formatted_codes = ", ".join(codes)
        formatted_conditions = " OR ".join(conditions) 
        final_sql = sql_template_base.format(inputcodes = formatted_codes, inputconditions=formatted_conditions)  # Insert into template
    return final_sql


In [63]:
queries_AI_combs = []
queries_AI_combs.append(generate_sql_queries_comb(codes = Patentscope_IPC_combinations, conditions = Patentscope_keyword_combinations, type = "IPC", keywordfield = "abstract"))
queries_AI_combs.append(generate_sql_queries_comb(codes = Patentscope_IPC_combinations, conditions = Patentscope_keyword_combinations, type = "IPC", keywordfield = "title"))
queries_AI_combs.append(generate_sql_queries_comb(codes = Patentscope_CPC_combinations, conditions = Patentscope_keyword_combinations, type = "CPC", keywordfield = "abstract"))
queries_AI_combs.append(generate_sql_queries_comb(codes = Patentscope_CPC_combinations, conditions = Patentscope_keyword_combinations, type = "CPC", keywordfield = "title"))
queries_AI_combs.append(generate_sql_queries_comb(codes = IPC_combinations_complete, conditions = WIPO_Patentscope_keyword_combinations, type = "IPC", keywordfield = "abstract"))
queries_AI_combs.append(generate_sql_queries_comb(codes = IPC_combinations_complete, conditions = WIPO_Patentscope_keyword_combinations, type = "IPC", keywordfield = "title"))
queries_AI_combs.append(generate_sql_queries_comb(codes = CPC_combinations_complete, conditions = WIPO_Patentscope_keyword_combinations, type = "CPC", keywordfield = "abstract"))
queries_AI_combs.append(generate_sql_queries_comb(codes = CPC_combinations_complete, conditions = WIPO_Patentscope_keyword_combinations, type = "CPC", keywordfield = "title"))
queries_AI_combs.append(generate_sql_queries_comb(codes = WIPO_IPC_combinations_plus, conditions = WIPO_Patentscope_keyword_combinations, type = "IPC_plus", keywordfield = "abstract"))
queries_AI_combs.append(generate_sql_queries_comb(codes = WIPO_IPC_combinations_plus, conditions = WIPO_Patentscope_keyword_combinations, type = "IPC_plus", keywordfield = "title"))

In [64]:
for query in queries_AI_combs:
    print(clean_query(query))
    print("\n")

SELECT Distinct PUBLN_NR, PUBLN_KIND, publn_auth, publn_date, ipc_class_symbol   FROM reg101_appln   INNER JOIN reg102_pat_publn   ON reg101_appln.id = reg102_pat_publn.id   INNER JOIN tls209_appln_ipc   ON reg101_appln.appln_id = tls209_appln_ipc.appln_id   INNER JOIN tls203_appln_abstr   ON reg101_appln.appln_id = tls203_appln_abstr.appln_id   WHERE publn_auth = 'EP'   AND REPLACE(ipc_class_symbol, ' ', '') IN   ( 'G10L13/00', 'G10L25/00', 'G10L99/00', 'G06F17/14', 'G06F17/153', 'G10H2250/005', 'G06F17/30', 'G06F17/50', 'G06Q', 'G06Q30/02', 'G06T7/00', 'G06T1/20' )   AND (CONTAINS(tls203_appln_abstr.appln_abstract, '"robot*" OR "deep blue" OR "combinatorial explosion" OR "autonomous system*" OR "medical imag*" OR "healthcare" OR "virtual assist*" OR "personal* medic*" OR "precision medic*" OR "genomic screening" OR "drug discover*" OR "medical diagnos*" OR "drug creation" OR "medication manag*" OR "autonomous vehicle*" OR "transportation" OR "driverless" OR "smart car*" OR "smart cit