In [3]:
import json
import re
import pandas as pd

In [4]:
patent_data=pd.read_csv(r"../titles_abstracts_20170307.tsv", sep="\t", nrows=10000)

In [3]:
patent_data.head()

Unnamed: 0,id,title,abstract
0,3930271,Golf glove,A golf glove is disclosed having an extra fin...
1,3930272,Crib leg lock,A lock for a height-adjustable crib or plaype...
2,3930273,Bed safety side rail arrangement,A bed safety side rail arrangement which incl...
3,3930274,Assembly for use in recreational activities,The assembly includes a longitudinal axis and...
4,3930275,Method of fabricating a slipper,A novel slipper and its method of fabrication...


In [7]:
terms={'clauses': ['substan*', 'bound* edg*',{'clauses': ['random*', 'edg*'], 'joinwith': 'AND'}], 'joinwith': 'AND'}

In [5]:
with open('./green-technology/dummy.json') as data_file:    
    patterns = json.load(data_file)

In [6]:
patterns

[{'id': 1,
  'sector': 'General',
  'subsector': '',
  'terms': {'clauses': ['sustainab*',
    'green good*',
    'green technolog*',
    'green innov*',
    'eco*innov*',
    'green manufac*',
    'green prod*',
    'pollut*',
    'ecolabel',
    'environ* product declarat*',
    {'clauses': ['EPD', 'environ*'], 'joinwith': 'AND'},
    'environ* prefer* product*',
    'environ* label*'],
   'joinwith': 'OR'}}]

In [6]:
def build_re(terms, regexps):
    regexps["joinwith"]=terms["joinwith"]
    regexps["clauses"]=[]
    for clause in terms["clauses"]:
        found_flag=False
        if isinstance(clause, dict):
            current_reg=build_re(clause,{})
        else:
            current_expression=clause.replace('*',"[\w]*")
            current_expression=current_expression.replace(r" ","[\s]*?")
            #current_expression=r".*[ ]*"+current_expression+"[ ]*.*"
            if sum(1 for c in clause if c.isupper()) == len(clause):
                current_reg= re.compile(current_expression) 
            else:
                current_reg= re.compile(current_expression, re.IGNORECASE) 
        regexps["clauses"].append(current_reg)
    return regexps

In [7]:
pattern_with_regexps=[]
for pattern in patterns:
    pattern_with_regexps.append({"id":pattern["id"],"sector":pattern["sector"],"subsector":pattern["subsector"],"terms":build_re(pattern["terms"],{})})

In [45]:
pattern_with_regexps

[{'id': 1,
  'sector': 'General',
  'subsector': '',
  'terms': {'clauses': [re.compile(r'sustainab[\w]*',
    re.IGNORECASE|re.UNICODE),
    re.compile(r'green[\s]*?good[\w]*', re.IGNORECASE|re.UNICODE),
    re.compile(r'green[\s]*?technolog[\w]*', re.IGNORECASE|re.UNICODE),
    re.compile(r'green[\s]*?innov[\w]*', re.IGNORECASE|re.UNICODE),
    re.compile(r'eco[\w]*innov[\w]*', re.IGNORECASE|re.UNICODE),
    re.compile(r'green[\s]*?manufac[\w]*', re.IGNORECASE|re.UNICODE),
    re.compile(r'green[\s]*?prod[\w]*', re.IGNORECASE|re.UNICODE),
    re.compile(r'pollut[\w]*', re.IGNORECASE|re.UNICODE),
    re.compile(r'ecolabel', re.IGNORECASE|re.UNICODE),
    re.compile(r'environ[\w]*[\s]*?product[\s]*?declarat[\w]*',
    re.IGNORECASE|re.UNICODE),
    {'clauses': [re.compile(r'EPD', re.UNICODE),
      re.compile(r'environ[\w]*', re.IGNORECASE|re.UNICODE)],
     'joinwith': 'AND'},
    re.compile(r'environ[\w]*[\s]*?prefer[\w]*[\s]*?product[\w]*',
    re.IGNORECASE|re.UNICODE),
    re.comp

In [8]:
test_string="no EPDs masters"
print(pattern_with_regexps[0]["terms"]["clauses"][10]["clauses"][0].search(test_string))

<_sre.SRE_Match object; span=(3, 6), match='EPD'>


In [51]:
def search_pattern(string,terms):
    for clause in terms["clauses"]:
        found_flag=False
        # If current element is a dictionary, indicates there is a nested condition
        if isinstance(clause, dict):
            found_flag=search_pattern(string,clause)
        # If not simple pattern checking
        else:
            try:
                if clause.match(string) is not None:
                    found_flag=True
                else:
                    found_flag=False
            except:
                print(current_expression)
        # For OR condition, its sufficient that only one pattern has to match
        if terms["joinwith"] == "OR" and found_flag ==True:
            break
        # For AND condition, even one match failure leads to not matching the set of clauses
        if terms["joinwith"] == "AND" and found_flag ==False:
            break
    return found_flag

In [42]:
# Given a dataframe row and set of patterns returns a series of boolean values indicating if the patterns were found
## Called on dataframe using apply function
def search_current_row(row, patterns):
    match_results={}
    for pattern in pattern_with_regexps:
        key=pattern["sector"] + "-"+pattern["subsector"]
        match_results[key ]=search_pattern(row.title, pattern["terms"]) and search_pattern(row.abstract,pattern["terms"])
    return pd.Series(match_results)

In [56]:
## "Loop" through each row and look for patterns
match_results=patent_data.apply(search_current_row,1, args=(patterns,) )

result_frame = patent_data.join(match_results) 

## Write all fields leaving out title and abstract
result_frame[result_frame.columns.difference(['title','abstract'])].to_csv("patent_ids_with_pattern_matches.csv", index=False)

In [3]:
patent_data.shape[0]

10000

In [9]:
start=1
for i in range(100000, patent_data.shape[0], 100000):
    print("Start: "+str(start)+", End: "+str(i))
    patent_data.iloc[start:i,:]
    start=i+1
    

Start: 1, End: 100000
Start: 100001, End: 200000
Start: 200001, End: 300000
Start: 300001, End: 400000
Start: 400001, End: 500000
Start: 500001, End: 600000
Start: 600001, End: 700000
Start: 700001, End: 800000
Start: 800001, End: 900000
Start: 900001, End: 1000000
Start: 1000001, End: 1100000
Start: 1100001, End: 1200000
Start: 1200001, End: 1300000
Start: 1300001, End: 1400000
Start: 1400001, End: 1500000
Start: 1500001, End: 1600000
Start: 1600001, End: 1700000
Start: 1700001, End: 1800000
Start: 1800001, End: 1900000
Start: 1900001, End: 2000000
Start: 2000001, End: 2100000
Start: 2100001, End: 2200000
Start: 2200001, End: 2300000
Start: 2300001, End: 2400000
Start: 2400001, End: 2500000
Start: 2500001, End: 2600000
Start: 2600001, End: 2700000
Start: 2700001, End: 2800000
Start: 2800001, End: 2900000
Start: 2900001, End: 3000000
Start: 3000001, End: 3100000
Start: 3100001, End: 3200000
Start: 3200001, End: 3300000
Start: 3300001, End: 3400000
Start: 3400001, End: 3500000
Start: 35

In [7]:
 patent_data.shape[0]

5380911

In [7]:
green_search_results=pd.read_csv("./green csvs/all_patent_green_terms_searches.csv", low_memory=False)

In [16]:
counts=green_search_results.loc[:,green_search_results.columns.difference(["id"])].apply(pd.value_counts)

In [18]:
counts.to_csv("green_terms_search_counts.csv")