In [1]:
import pandas as pd
import re
import numpy as np
import json
import html
import random
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap
import os
# TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# SKLearn libraries for splitting sample and validation
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np

In [2]:
patents_text = pd.read_csv("patents_text_deduplicated.csv")

  patents_text = pd.read_csv("patents_text_deduplicated.csv")


In [3]:
labelled_data = pd.read_csv("labelled_patents.csv")

In [4]:
unseen_data = pd.read_csv("unseen_data.csv")

# Prep for Classifier
Step 1: 
Only use titles and abstracts
Use Scibert
5 fold cross validation
Start with Narrow

In [29]:
labelled_data.columns

Index(['publication_number_EPO', 'AI for Sus', 'Sus of AI', 'AI ', 'Sus',
       'Energy grid/production', 'Electronic component/battery/charging',
       'Example patent'],
      dtype='object')

In [5]:
training_data = labelled_data[["publication_number_EPO", "Sus of AI", "AI for Sus"]]
training_data = training_data.merge(patents_text[['publication_number_EPO', 'abstract', 'title', 'description', 'claims', 'cpc_codes_EPO']], on='publication_number_EPO', how='left')
training_data['title'] = training_data['title'].fillna('')
training_data['abstract'] = training_data['abstract'].fillna('')
training_data['description'] = training_data['description'].fillna('')
training_data['claims'] = training_data['claims'].fillna('')

# Function to extract first 200 words from a text
def extract_first_300_words(text):
    words = re.findall(r'\b\w+\b', text)
    return ' '.join(words[:300])

def extract_first_3000_words(text):
    words = re.findall(r'\b\w+\b', text)
    return ' '.join(words[:3000])

# Apply the function to each row in training_data

# Concatenate "title" and "abstract" columns, replacing NaN with empty string
training_data["text_abstr"] = (training_data["title"] + " " + training_data["abstract"]).fillna('')
training_data["text_descr300"] = (training_data["title"] + " " + training_data["abstract"] + " " + training_data["description"].fillna('').apply(extract_first_300_words))
training_data["text_descr"] = (training_data["title"] + " " + training_data["abstract"] + " " + training_data["description"]).fillna('')
training_data["text_descr_claims"] = (training_data["title"] + " " + training_data["abstract"] + " " + training_data["description"] + " " + training_data["claims"]).fillna('')
training_data["text_descr3000_claims"] = (training_data["title"] + " " + training_data["abstract"] + " " + training_data["description"].fillna('').apply(extract_first_3000_words)) + " " + training_data["claims"].fillna('')
training_data["text_claims"] = (training_data["title"] + " " + training_data["abstract"] + " " + training_data["claims"]).fillna('')

In [6]:
# print average length of text in words for each column
for col in ["text_abstr", "text_descr300", "text_descr", "text_descr_claims", "text_descr3000_claims", "text_claims"]:
    print(f"{col}: {training_data[col].str.split().apply(len).mean()} words")

# check average length of text in words for each column taking only rows with claims and description not being null strings
for col in ["text_abstr", "text_descr300", "text_descr", "text_descr_claims", "text_descr3000_claims", "text_claims"]:
    print(f"{col}: {training_data[(training_data['claims'] != '') & (training_data['description'] != '')][col].str.split().apply(len).mean()} words")


text_abstr: 205.73926073926074 words
text_descr300: 426.01898101898104 words
text_descr: 9074.454545454546 words
text_descr_claims: 9887.696303696304 words
text_descr3000_claims: 3151.2607392607392 words
text_claims: 1018.981018981019 words
text_abstr: 211.19727891156464 words
text_descr300: 511.19727891156464 words
text_descr: 12289.542857142857 words
text_descr_claims: 13397.10068027211 words
text_descr3000_claims: 4222.717006802721 words
text_claims: 1318.7551020408164 words


In [7]:
training_data["SofAI"] = training_data["Sus of AI"]
training_data["AIforS"] = training_data["AI for Sus"]

In [8]:
training_data.to_csv("training_data_new3.csv")

# Prep for Multimodal Classifier

In [10]:
patents_text["cpc_codes_EPO"].head()

0    ['B01D 53/0476', 'B01D 53/0476', 'B01D 53/0431...
1    ['B60K 6/00', 'B60K 6/48', 'B60K 6/485', 'B60K...
2    ['G06F 13/387', 'G06F 13/387', 'H04W 4/80', 'H...
3    ['H02J 13/00016', 'H02J 13/00016', 'Y02E 60/00...
4    ['H02M 1/10', 'H02M 1/10', 'H02M 3/33592', 'H0...
Name: cpc_codes_EPO, dtype: object

In [12]:
for i in range(5):
    for code in patents_text["cpc_codes_EPO"].iloc[i]:
        print(code)
    

[
'
B
0
1
D
 
5
3
/
0
4
7
6
'
,
 
'
B
0
1
D
 
5
3
/
0
4
7
6
'
,
 
'
B
0
1
D
 
5
3
/
0
4
3
1
'
,
 
'
B
0
1
D
 
5
3
/
0
4
3
1
'
,
 
'
B
0
1
D
 
5
3
/
0
4
4
6
'
,
 
'
B
0
1
D
 
5
3
/
0
4
4
6
'
,
 
'
B
0
1
D
 
2
2
5
6
/
1
2
'
,
 
'
B
0
1
D
 
2
2
5
6
/
1
2
'
,
 
'
B
0
1
D
 
2
2
5
7
/
1
0
2
'
,
 
'
B
0
1
D
 
2
2
5
7
/
1
0
2
'
,
 
'
B
0
1
D
 
2
2
5
9
/
4
0
0
0
7
'
,
 
'
B
0
1
D
 
2
2
5
9
/
4
0
0
0
7
'
,
 
'
B
0
1
D
 
2
2
5
9
/
4
0
1
'
,
 
'
B
0
1
D
 
2
2
5
9
/
4
0
1
'
,
 
'
B
0
1
D
 
2
2
5
9
/
4
0
2
'
,
 
'
B
0
1
D
 
2
2
5
9
/
4
0
2
'
,
 
'
B
0
1
D
 
2
2
5
9
/
4
0
3
'
,
 
'
B
0
1
D
 
2
2
5
9
/
4
0
3
'
]
[
'
B
6
0
K
 
6
/
0
0
'
,
 
'
B
6
0
K
 
6
/
4
8
'
,
 
'
B
6
0
K
 
6
/
4
8
5
'
,
 
'
B
6
0
K
 
6
/
5
4
'
,
 
'
B
6
0
W
 
1
0
/
0
2
'
,
 
'
B
6
0
W
 
1
0
/
0
6
'
,
 
'
B
6
0
W
 
1
0
/
0
8
'
,
 
'
B
6
0
W
 
1
0
/
1
0
'
,
 
'
Y
0
2
T
 
1
0
/
6
2
'
]
[
'
G
0
6
F
 
1
3
/
3
8
7
'
,
 
'
G
0
6
F
 
1
3
/
3
8
7
'
,
 
'
H
0
4
W
 
4
/
8
0
'
,
 
'
H
0
4
W
 
4
/
8
0
'
,
 
'
G
0
6
F
 
2
2
1
3
/
3
8
1
4
'
,
 


In [13]:
pattern = r"'(.*?)'"

patents_text["cpc_codes_EPO_list"] = patents_text["cpc_codes_EPO"].apply(lambda x: set(re.findall(pattern, x)))


# Prep unseen data

In [14]:
len(unseen_data)

19457

In [15]:
unseen_data['title'] = unseen_data['title'].fillna('')
unseen_data['abstract'] = unseen_data['abstract'].fillna('')
unseen_data['description'] = unseen_data['description'].fillna('')
unseen_data['claims'] = unseen_data['claims'].fillna('')

# Function to extract first 200 words from a text
def extract_first_300_words(text):
    words = re.findall(r'\b\w+\b', text)
    return ' '.join(words[:300])

def extract_first_3000_words(text):
    words = re.findall(r'\b\w+\b', text)
    return ' '.join(words[:3000])

# Apply the function to each row in training_data

# Concatenate "title" and "abstract" columns, replacing NaN with empty string
unseen_data["text_abstr"] = (unseen_data["title"] + " " + unseen_data["abstract"]).fillna('')
unseen_data["text_descr300"] = (unseen_data["title"] + " " + unseen_data["abstract"] + " " + unseen_data["description"].fillna('').apply(extract_first_300_words))
unseen_data["text_descr"] = (unseen_data["title"] + " " + unseen_data["abstract"] + " " + unseen_data["description"]).fillna('')
unseen_data["text_descr_claims"] = (unseen_data["title"] + " " + unseen_data["abstract"] + " " + unseen_data["description"] + " " + unseen_data["claims"]).fillna('')
unseen_data["text_descr3000_claims"] = (unseen_data["title"] + " " + unseen_data["abstract"] + " " + unseen_data["description"].fillna('').apply(extract_first_3000_words)) + " " + unseen_data["claims"].fillna('')
unseen_data["text_claims"] = (unseen_data["title"] + " " + unseen_data["abstract"] + " " + unseen_data["claims"]).fillna('')

In [16]:
len(unseen_data)

19457

In [None]:
# unseen_data.to_csv("unseen_data_new.csv")