### Imports

In [163]:
import pandas as pd
import numpy as np
import json

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm

from sklearn.decomposition import TruncatedSVD

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

## Load Data

In [151]:
with open('../successful.json') as f:
    patents = json.load(f)

In [152]:
seed_data = pd.DataFrame(patents['seed'])
antiseed_data = pd.DataFrame(patents['antiseed'])
antiseed_data['seed'] = 0
seed_data['seed'] = 1
all_data = antiseed_data
all_data = all_data.append(seed_data)
all_data = all_data.reset_index(drop=True)
all_data = all_data.sample(frac=1).reset_index(drop=True)
all_data['text'] = all_data['title'] + " " + all_data['abstract']
all_data[:5]

Unnamed: 0,id,title,abstract,seed,text
0,8253263,Wave-power system for extracting simultaneousl...,A wave energy conversion system or device incl...,0,Wave-power system for extracting simultaneousl...
1,8630964,Using a genetic algorithm employing an expedit...,Apparatus and method for at least partially fi...,1,Using a genetic algorithm employing an expedit...
2,6311172,"Method for determination of weights, suitable ...",The training phase of a neural network NN is s...,1,"Method for determination of weights, suitable ..."
3,8812414,Low-power event-driven neural computing archit...,A neural network includes an electronic synaps...,1,Low-power event-driven neural computing archit...
4,10380482,Training neural networks on partitioned traini...,"Methods, systems, and apparatus, including com...",1,Training neural networks on partitioned traini...


In [153]:
all_data['seed'].value_counts()

0    780
1    736
Name: seed, dtype: int64

## Prepare Data

### Create Vectorizer

In [154]:
stopwords = []
with open('../stopwords.txt') as f:
    lines = f.readlines()
    for line in lines:
        stopwords.append(line[:-1])

In [164]:
vectorizer = CountVectorizer(stop_words = stopwords)

In [187]:
X_raw = vectorizer.fit_transform(all_data['text'].values)
svd = TruncatedSVD(n_components=100,random_state=42)
X_raw = svd.fit_transform(X_raw)
y_raw = all_data['seed'].values

In [188]:
X.shape

(1, 100)

### Split training set

In [189]:
n_initial = 100

In [190]:
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw)

initial_idx = np.random.choice(range(X_train.shape[0]), size=n_initial, replace=False)

X_initial, y_initial = X_train[initial_idx], y_train[initial_idx]
X_pool, y_pool = np.delete(X_train, initial_idx, axis=0), np.delete(y_train, initial_idx, axis=0)

In [191]:
y_initial

array([1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0], dtype=int64)

## Initialize learner

In [192]:
learner = ActiveLearner(
    estimator=svm.SVC(kernel='linear', gamma='scale', C=2, degree=20, probability = True),
    query_strategy=uncertainty_sampling,
    X_training=X_initial, y_training=y_initial
)

In [193]:
predictions = learner.predict(X_raw)
is_correct = (predictions == y_raw)

predictions

array([0, 1, 1, ..., 0, 1, 0], dtype=int64)

In [194]:
unqueried_score = learner.score(X_raw, y_raw)


### Active Learning loop

In [195]:
N_QUERIES = 20
performance_history = [unqueried_score]

# Allow our model to query our unlabeled dataset for the most
# informative points according to our query strategy (uncertainty sampling).
for index in range(N_QUERIES):
    query_index, query_instance = learner.query(X_pool)

    # Teach our ActiveLearner model the record it has requested.
    X, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    learner.teach(X=X, y=y)

    # Remove the queried instance from the unlabeled pool.
    X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index)

    # Calculate and report our model's accuracy.
    model_accuracy = learner.score(X_raw, y_raw)
    print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy))

    # Save our model's performance for plotting.
    performance_history.append(model_accuracy)

Accuracy after query 1: 0.8588
Accuracy after query 2: 0.8582
Accuracy after query 3: 0.8588
Accuracy after query 4: 0.8621
Accuracy after query 5: 0.8641
Accuracy after query 6: 0.8654
Accuracy after query 7: 0.8701
Accuracy after query 8: 0.8786
Accuracy after query 9: 0.8832
Accuracy after query 10: 0.8786
Accuracy after query 11: 0.8819
Accuracy after query 12: 0.8813
Accuracy after query 13: 0.8852
Accuracy after query 14: 0.8813
Accuracy after query 15: 0.8865
Accuracy after query 16: 0.8872
Accuracy after query 17: 0.8892
Accuracy after query 18: 0.8912
Accuracy after query 19: 0.8879
Accuracy after query 20: 0.8879
