### Imports

In [1]:
import pandas as pd
import numpy as np
import json

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm

from sklearn.decomposition import TruncatedSVD

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

## Load Data

In [2]:
with open('successful.json') as f:
    patents = json.load(f)

In [3]:
seed_data = pd.DataFrame(patents['seed'])
antiseed_data = pd.DataFrame(patents['antiseed'])
antiseed_data['seed'] = 0
seed_data['seed'] = 1
all_data = antiseed_data
all_data = all_data.append(seed_data)
all_data = all_data.reset_index(drop=True)
all_data = all_data.sample(frac=1).reset_index(drop=True)
all_data['text'] = all_data['title'] + " " + all_data['abstract']
all_data[:5]

Unnamed: 0,id,title,abstract,seed,text
0,8806085,Application specific integrated circuit (ASIC)...,An input/output module for use in an industria...,0,Application specific integrated circuit (ASIC)...
1,8447714,System for electronic learning synapse with sp...,"A system, method and computer program product ...",1,System for electronic learning synapse with sp...
2,8388568,Shunt device and method for treating ocular di...,Shunt devices and a method for continuously de...,0,Shunt device and method for treating ocular di...
3,9047569,Genetic optimization method and system,A multi-objective optimization method. The met...,1,Genetic optimization method and system A multi...
4,5101452,Apparatus and method for dynamic step quantiza...,The apparatus and method of the present invent...,0,Apparatus and method for dynamic step quantiza...


In [4]:
all_data['seed'].value_counts()

0    780
1    736
Name: seed, dtype: int64

## Prepare Data

### Create Vectorizer

In [5]:
stopwords = []
with open('stopwords.txt') as f:
    lines = f.readlines()
    for line in lines:
        stopwords.append(line[:-1])

In [6]:
vectorizer = CountVectorizer(stop_words = stopwords)

In [7]:
X_raw = vectorizer.fit_transform(all_data['text'].values)
svd = TruncatedSVD(n_components=100,random_state=42)
X_raw = svd.fit_transform(X_raw)
y_raw = all_data['seed'].values

In [8]:
X_raw.shape

(1516, 100)

### Split training set

In [9]:
n_initial = 100

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw)

initial_idx = np.random.choice(range(X_train.shape[0]), size=n_initial, replace=False)

X_initial, y_initial = X_train[initial_idx], y_train[initial_idx]
X_pool, y_pool = np.delete(X_train, initial_idx, axis=0), np.delete(y_train, initial_idx, axis=0)

In [11]:
y_initial

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0], dtype=int64)

## Initialize learner

In [12]:
learner = ActiveLearner(
    estimator=svm.SVC(kernel='linear', gamma='scale', C=2, probability = True),
    query_strategy=uncertainty_sampling,
    X_training=X_initial, y_training=y_initial
)

In [13]:
predictions = learner.predict(X_raw)
is_correct = (predictions == y_raw)

predictions

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [14]:
unqueried_score = learner.score(X_raw, y_raw)


In [15]:
unqueried_score

0.8825857519788918

### Active Learning loop

In [16]:
N_QUERIES = 20
performance_history = [unqueried_score]

# Allow our model to query our unlabeled dataset for the most
# informative points according to our query strategy (uncertainty sampling).
for index in range(N_QUERIES):
    query_index, query_instance = learner.query(X_pool)

    # Teach our ActiveLearner model the record it has requested.
    X, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    learner.teach(X=X, y=y)

    # Remove the queried instance from the unlabeled pool.
    X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index)

    # Calculate and report our model's accuracy.
    model_accuracy = learner.score(X_test, y_test)
    print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy))

    # Save our model's performance for plotting.
    performance_history.append(model_accuracy)

Accuracy after query 1: 0.8813
Accuracy after query 2: 0.8786
Accuracy after query 3: 0.8681
Accuracy after query 4: 0.8681
Accuracy after query 5: 0.8707
Accuracy after query 6: 0.8839
Accuracy after query 7: 0.8839
Accuracy after query 8: 0.8813
Accuracy after query 9: 0.8918
Accuracy after query 10: 0.8865
Accuracy after query 11: 0.8786
Accuracy after query 12: 0.8839
Accuracy after query 13: 0.8839
Accuracy after query 14: 0.8813
Accuracy after query 15: 0.8918
Accuracy after query 16: 0.8918
Accuracy after query 17: 0.8892
Accuracy after query 18: 0.8945
Accuracy after query 19: 0.8997
Accuracy after query 20: 0.8945
