In [0]:
%pip install azure-storage-blob

In [0]:
import io
import os
import tarfile

from azure.storage.blob import ContainerClient
from pyspark.sql import functions as F

In [0]:
# Read the storage connection string from the environment so the secret never
# has to be pasted into (and saved with) the notebook. Falls back to "" when
# the variable is not set.
conn_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "")

In [0]:
# Client bound to the 'training-data' container.
client = ContainerClient.from_connection_string(
    conn_str=conn_string,
    container_name="training-data",
)

In [0]:
# Get train data file names.
# The first blob listed in the container is downloaded and read as a text
# manifest with one "<folder>/<file>" entry per line.
# next(iter(...)) takes that first blob without paging through the whole listing.
train_paths = next(iter(client.list_blobs())).name
blob_client = client.get_blob_client(train_paths)
raw_file = blob_client.download_blob().readall()
# Decode the bytes instead of calling str() on them: str(bytes) yields the
# "b'...'" repr, which leaves a stray quote on the last entry and keeps
# carriage returns as literal escapes. "utf-8-sig" also drops a leading BOM.
paths = raw_file.decode("utf-8-sig").splitlines()
files = []
for name in paths:
    name = name.strip()
    if not name:
        # Blank lines carry no file name.
        continue
    try:
        folder, path = name.split('/')
        files.append(path)
    except ValueError:
        # Entry is not exactly "<folder>/<file>"; show it for manual inspection.
        print(name)

In [0]:
# Rebind `client` to the 'updates' container and start a listing of its blobs.
client = ContainerClient.from_connection_string(
    conn_str=conn_string,
    container_name="updates",
)
blob_list = client.list_blobs()

In [0]:
# Download train data.
# Every blob in `blob_list` is read as a tar archive. Members whose file name
# appears in `files` are collected in `patents`; other members are collected
# in `false_examples` until `max_examples` of them have been gathered.
patents = []
false_examples = []
max_examples = 2000
c = 0  # number of false examples collected so far
# Set lookup keeps the per-member membership test O(1).
wanted_files = set(files)

for i, blob_name in enumerate(blob_list):
    try:
        # NOTE(review): 92 is a hard-coded total, presumably the number of archives - confirm.
        print("Reading folder {}/{}".format(i, 92 ))
        # Address the blob by name, as done for the manifest blob.
        blob_client = client.get_blob_client(blob_name.name)
        raw_file = blob_client.download_blob().readall()
        # The context manager closes the archive even if reading a member fails.
        with tarfile.open(fileobj=io.BytesIO(raw_file)) as tar:
            for member in tar.getmembers():
                xml_file = member.name
                name_parts = xml_file.split('/')
                # Skip directories/links and names without a "<folder>/<file>"
                # shape; such entries would otherwise raise and abort the
                # rest of the archive.
                if not member.isfile() or len(name_parts) < 2:
                    continue
                x = name_parts[1]
                if x in wanted_files:
                    # Decode to real text; str(bytes) would store the "b'...'" repr.
                    f = tar.extractfile(member).read().decode("utf-8", errors="replace")
                    patents.append((xml_file, f))
                    print("######PATENT FOUND########")
                    print(blob_name.name)
                    print("##########################")
                elif c < max_examples:
                    f = tar.extractfile(member).read().decode("utf-8", errors="replace")
                    false_examples.append((xml_file, f))
                    c = c+1

    except Exception as E:
        # Best effort: report the failing archive and continue with the next one.
        print("ERROR {}. {}".format(blob_name.name, E))
    

In [0]:
# Create Dataframe with train data.
# An explicit schema is supplied because Spark cannot infer column types from
# an empty list; with it the cell also works when no patent was matched.
df = spark.createDataFrame(patents, schema="file_name: string, raw_txt: string")
df.count()

In [0]:
# Dataframe with the false examples. The explicit schema keeps this working
# when `false_examples` is empty (type inference fails on an empty list).
df_false = spark.createDataFrame(false_examples, schema="file_name: string, raw_txt: string")
df_false.count()

In [0]:
# Start from a clean output tree: remove the raw-data folder (second argument
# True = recurse), then recreate one sub-folder per example set.
dbutils.fs.rm("/gustavo/train_data_raw/", True)
for raw_dir in ("/gustavo/train_data_raw/good/", "/gustavo/train_data_raw/bad/"):
    dbutils.fs.mkdirs(raw_dir)

In [0]:
# Persist both raw example sets as snappy-compressed parquet, replacing any
# existing output.
df.write.mode("overwrite").option("compression", "snappy").parquet("/gustavo/train_data_raw/good/")
df_false.write.mode("overwrite").option("compression", "snappy").parquet("/gustavo/train_data_raw/bad/")

In [0]:
# Reload both raw example sets from the parquet folders written above.
df = spark.read.format("parquet").load("/gustavo/train_data_raw/good/")
df_false = spark.read.format("parquet").load("/gustavo/train_data_raw/bad/")

In [0]:
# Preview the first rows of the false examples (20 rows, long cells truncated).
df_false.show(n=20, truncate=True)

In [0]:
# Add padding
def pad_sequence(features, max_len, t):
    """Drop the first item of `features`, then force the rest to `max_len` items.

    Args:
        features: sequence of extracted values; its first element is always
            discarded before padding or truncation.
        max_len: number of elements in the returned list.
        t: "str" pads with the string "0A"; any other value pads with -1.

    Returns:
        A new list: the remaining items cut to `max_len`, or extended with the
        pad value until it holds `max_len` items.
    """
    filler = "0A" if t == "str" else -1
    tail = features[1:]
    missing = max_len - len(tail)
    if missing < 0:
        # More items than requested: keep only the leading ones.
        return tail[0:max_len]
    return tail + [filler] * missing

In [0]:
# Gets all the classification-ipcr items
def get_classification(txt):
    """Split the <classifications-ipcr> section of `txt` into its entries.

    Args:
        txt: raw document text.

    Returns:
        The text between "<classifications-ipcr>" and
        "</classifications-ipcr>", split on "</classification-ipcr>" (so the
        last element is whatever follows the final entry), or "" when the
        section is missing. A closing tag without the exact opening tag also
        yields "" instead of raising IndexError.
    """
    open_tag = "<classifications-ipcr>"
    close_tag = "</classifications-ipcr>"
    if close_tag not in txt:
        return ""
    pieces = txt.split(open_tag)
    if len(pieces) < 2:
        # Closing tag present but no exact opening tag to anchor on.
        return ""
    section = pieces[1].split(close_tag)[0]
    return section.split("</classification-ipcr>")

# Extracts all the distinct values of the specified tag
def get_tags(class_list, tag, max_len, t):
    """Collect the distinct values of `tag` and pad them to a fixed length.

    Args:
        class_list: iterable of text fragments, one per classification entry;
            None is treated as empty.
        tag: tag name without angle brackets, e.g. "section".
        max_len: length of the returned list.
        t: padding type forwarded to pad_sequence ("str" or anything else).

    Returns:
        The list produced by pad_sequence: the distinct non-empty values in
        sorted order, truncated or padded to `max_len`.
    """
    open_tag = "<" + tag + ">"
    close_tag = "</" + tag + ">"
    values = set()
    for txt in class_list or []:
        if close_tag not in txt:
            continue
        pieces = txt.split(open_tag)
        if len(pieces) < 2:
            # Closing tag without the exact opening tag: nothing to extract.
            continue
        values.add(pieces[1].split(close_tag)[0])
    values.discard("")
    # pad_sequence always discards element 0, so the empty placeholder is put
    # there explicitly instead of relying on where a set happens to yield it.
    # Sorting makes the feature order deterministic.
    raw_tag = [""] + sorted(values)
    return pad_sequence(raw_tag, max_len, t)

In [0]:
# Extract training features
# F.udf is used so the cell does not depend on a `udf` name being injected by
# the runtime. UDFs default to a string return type, so the classification UDF
# declares an array type; otherwise the tag UDFs would receive one string and
# get_tags would iterate it character by character, finding no tags at all.
# list(...) turns the "" returned for documents without classifications into [].
get_classifications = F.udf(lambda x: list(get_classification(x)), "array<string>")
# The tag UDFs keep the default string return type, so the feature columns
# keep their existing schema.
get_levels = F.udf(lambda x: get_tags(x, "classification-level", 2, "str"))
get_sections = F.udf(lambda x: get_tags(x, "section", 2, "str"))
get_classes = F.udf(lambda x: get_tags(x, "class", 3, "int"))
get_subclasses = F.udf(lambda x: get_tags(x, "subclass", 3, "str"))
get_groups = F.udf(lambda x: get_tags(x, "main-group", 4, "int"))
get_subgroups = F.udf(lambda x: get_tags(x, "subgroup", 4, "int"))

In [0]:
def _extract_features(frame, target):
    """Replace `raw_txt` with the classification feature columns plus `target`.

    Args:
        frame: DataFrame with a `raw_txt` column.
        target: literal value written to the `target` column of every row.

    Returns:
        DataFrame with the feature columns and `target` added, and the
        `raw_txt` and intermediate `classification` columns dropped.
    """
    frame = frame.withColumn("classification", get_classifications("raw_txt"))
    # Column names (including their spelling) are part of the saved schema.
    feature_udfs = [
        ("levels", get_levels),
        ("sections", get_sections),
        ("clasess", get_classes),
        ("subclasess", get_subclasses),
        ("groups", get_groups),
        ("subgroups", get_subgroups),
    ]
    for column, extractor in feature_udfs:
        frame = frame.withColumn(column, extractor("classification"))
    frame = frame.withColumn("target", F.lit(target))
    return frame.drop("raw_txt", "classification")


# DF with true examples (target 0)
df = _extract_features(df, 0)

# DF with false examples (target 1)
df_false = _extract_features(df_false, 1)

df = df.union(df_false)
df.show()

In [0]:
# Write the combined feature table as snappy-compressed parquet, replacing any
# existing output.
dbutils.fs.mkdirs("/gustavo/train_data/")
df.write.mode("overwrite").option("compression", "snappy").parquet("/gustavo/train_data/")