In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are available in the input directory
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Create a BigQuery helper bound to the "patentsview" dataset of the
# "patents-public-data" project.
# Background on the helper package:
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
import bq_helper
from bq_helper import BigQueryHelper

patentsview = BigQueryHelper(
    active_project="patents-public-data",
    dataset_name="patentsview",
)

In [None]:
# List the names of the tables contained in the patentsview dataset
bq_assistant = BigQueryHelper(
    active_project="patents-public-data",
    dataset_name="patentsview",
)
table_names = bq_assistant.list_tables()
table_names

In [None]:
# Preview the first three rows of the patent table
patent_preview = bq_assistant.head("patent", num_rows=3)
patent_preview

In [None]:
# Show the column information (schema) of the patent table
patent_schema = bq_assistant.table_schema("patent")
patent_schema

In [None]:
# Preview the first ten rows of the ipcr table
ipcr_preview = bq_assistant.head("ipcr", num_rows=10)
ipcr_preview

In [None]:
# Show the column information (schema) of the ipcr table
ipcr_schema = bq_assistant.table_schema("ipcr")
ipcr_schema

## What is the IPC?

https://en.wikipedia.org/wiki/International_Patent_Classification

In [None]:
# Distinct (patent id, abstract, IPC section) rows for utility patents,
# obtained by joining the patent table with the ipcr table on the patent id.
# The query has no ORDER BY, so which 2000 rows are returned (and in which
# order) is not fixed by the query itself.
query4 = """
SELECT DISTINCT
  a.id, a.abstract, b.section
FROM
  `patents-public-data.patentsview.patent` a
INNER JOIN
  `patents-public-data.patentsview.ipcr` b
ON
  a.id = b.patent_id
WHERE
  a.type = 'utility'
LIMIT
  2000;
        """

# Estimate the size of the query before actually running it
query4_size = bq_assistant.estimate_query_size(query4)
query4_size

In [None]:
# Run the query and keep the result; then show its first ten rows
response4 = patentsview.query_to_pandas(query4)
response4.iloc[:10]

In [None]:
# (number of rows, number of columns) of the query result
response4_shape = response4.shape
response4_shape

In [None]:
# Input text (abstract) and target (IPC section) columns of the query result
features, labels = response4["abstract"], response4["section"]

In [None]:
# For every IPC section letter, sum how often it occurs across all labels
sections = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
Section_dict = {letter: labels.str.count(letter).sum() for letter in sections}

Section_dict

In [None]:
# Total number of section occurrences over all section letters
section_total = sum(Section_dict.values())
section_total

In [None]:
# Plot the absolute distribution of IPC sections
import matplotlib.pyplot as plt

section_names = list(Section_dict.keys())
section_counts = list(Section_dict.values())
bar_positions = range(len(section_names))

# Explicit figure/axes so the plot carries its own title and axis labels
fig, ax = plt.subplots()
ax.bar(bar_positions, section_counts, align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(section_names)
ax.set_xlabel("IPC section")
ax.set_ylabel("Number of occurrences")
ax.set_title("Distribution of IPC sections")
plt.show()

In [None]:
# Plot the relative distribution of IPC sections: each section count is
# divided by the total number of label rows (len(labels)).
section_names = list(Section_dict.keys())
section_shares = np.array(list(Section_dict.values())) / len(labels)
bar_positions = range(len(section_names))

# Explicit figure/axes so the plot carries its own title and axis labels
fig, ax = plt.subplots()
ax.bar(bar_positions, section_shares, align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(section_names)
ax.set_xlabel("IPC section")
ax.set_ylabel("Share of label rows")
ax.set_title("Relative distribution of IPC sections")
plt.show()

In [None]:
def to_multilabel_encode(labels, features, num_codes):
    """Collapse (feature, label) rows into one multi-hot label vector per feature.

    Every distinct feature is emitted exactly once, in order of first
    appearance, and all labels recorded for it are merged into one vector,
    regardless of whether its rows are adjacent in the input.

    Input:
        labels: list of the labels associated with each feature in features,
                wherein the labels are single uppercase letters starting at 'A'
                ('A' -> position 0, 'B' -> position 1, ...)
        features: list of the features (hashable values such as strings);
                  rows whose feature is None are skipped entirely
        num_codes: int, number of label codes, i.e. length of each label vector
    Output:
        tuple of two numpy arrays, wherein the first array contains the
        distinct features and the second array, of shape
        (number of distinct features, num_codes), contains the corresponding
        labels in a "multi"-hot-encoding

    Labels that do not map to a position in [0, num_codes) (None, multi-character
    strings, letters beyond the last code, characters before 'A', ...) are
    ignored; the feature itself is still kept.
    """
    assert len(labels) == len(features)
    assert num_codes > 0
    assert isinstance(labels, list)
    assert isinstance(features, list)

    features_final = []
    labels_final = []
    # feature -> its label vector (the same array object stored in labels_final)
    vector_for = {}

    for feature, label in zip(features, labels):
        if feature is None:
            # No text to classify: drop the row instead of emitting None
            continue

        if feature not in vector_for:
            vector_for[feature] = np.zeros(num_codes)
            features_final.append(feature)
            labels_final.append(vector_for[feature])

        # Explicit validation instead of a bare except: a negative offset
        # (e.g. '@') must not wrap around to the end of the vector.
        if isinstance(label, (str, bytes)) and len(label) == 1:
            code = ord(label) - ord('A')
            if 0 <= code < num_codes:
                vector_for[feature][code] = 1

    if not features_final:
        # Keep the documented shapes even when nothing could be encoded
        return (np.array([], dtype=object), np.zeros((0, num_codes)))

    return (np.array(features_final), np.array(labels_final))

In [None]:
# Convert both columns to plain Python lists (to_multilabel_encode asserts list inputs)
features, labels = (
    response4[column].tolist() for column in ("abstract", "section")
)

In [None]:
# Length of each label vector: one position per section letter A-H
num_codes = 8
encoded = to_multilabel_encode(labels, features, num_codes)
features, labels = encoded

In [None]:
# Inspect the encoded features followed by their label vectors
for encoded_array in (features, labels):
    print(encoded_array)

In [None]:
from keras.models import Model, Input, Sequential
from keras.layers import Dense, Embedding, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam

In [None]:
# Hyperparameters for preprocessing and the model

# number of words we want to keep
max_features = 20000
# max length of the abstract in the model
maxlen = 500
# batch size for the model
batch_size = 64
# dimension of the hidden variable, i.e. the embedding dimension
embedding_dims = 20

In [None]:
# Split the features in the middle: first half for training, second half for testing
split_at = len(features) // 2
X_train, X_test = features[:split_at], features[split_at:]

In [None]:
#continue here

In [None]:
# Shape of the test half of the features
X_test_shape = X_test.shape
X_test_shape

In [None]:
# Sanity check: report every training entry that is not a string or that is
# shorter than min_length characters, together with its position in X_train.
min_length = 1
# Unpack enumerate() so the length test applies to the entry itself rather
# than to the (index, entry) tuple, whose length is always 2.
for index, element in enumerate(X_train):
    if not isinstance(element, str) or len(element) < min_length:
        print(index, repr(element))

In [None]:
# Fit the tokenizer on the texts of both splits, keeping at most max_features words
tok = Tokenizer(num_words=max_features)
all_texts = X_train.tolist() + X_test.tolist()
tok.fit_on_texts(all_texts)

# Turn every text of each split into its sequence of word indices
x_train, x_test = (tok.texts_to_sequences(split) for split in (X_train, X_test))

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Sequence lengths per split, averaged with an integer result
train_lengths = [len(sequence) for sequence in x_train]
test_lengths = [len(sequence) for sequence in x_test]
print('Average train sequence length: {}'.format(np.mean(train_lengths, dtype=int)))
print('Average test sequence length: {}'.format(np.mean(test_lengths, dtype=int)))

In [None]:
# Feed-forward multi-label classifier: two hidden ReLU layers followed by one
# sigmoid output per label code.
nn = Sequential()
# Input width is tied to the tokenizer vocabulary size instead of a hard-coded
# 10000. NOTE(review): assumes the network is fed one vocabulary-sized row per
# abstract (e.g. from tok.texts_to_matrix) -- TODO confirm once the input
# pipeline is finished.
nn.add(Dense(64, activation="relu", input_shape=(max_features,)))
# input_shape is only meaningful on the first layer, so it is not repeated here
nn.add(Dense(64, activation="relu"))
nn.add(Dense(num_codes, activation="sigmoid"))

# Sigmoid outputs with binary cross-entropy score each label code independently
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
#https://www.depends-on-the-definition.com/guide-to-multi-label-classification-with-neural-networks/
#https://www.depends-on-the-definition.com/classify-toxic-comments-on-wikipedia/