### Imports

In [1]:
import pandas as pd
import numpy as np
import json

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm

from sklearn.decomposition import TruncatedSVD

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

import pickle
import pymongo
from pymongo import MongoClient

## Load Data

In [2]:
#We connect to the database
cluster = MongoClient("mongodb://compute1.cognac.cs.fiu.edu:59122/PatentData?readPreference=secondary&ssl=false")

#Access the test patent data cluster
db = cluster['TestDatabase']

#Access the test labels Patent
patent_db = db['LabeledPatents']

In [3]:
df = pd.DataFrame(list(patent_db.find()))
df = df.iloc[:,1:]
df['text'] = df['title'] + " " + df['abstract']
df = df.rename(columns={"documentId":"id","MachineLearningPatent": "ml", "Hardware": "hardware","EVO":"evo","NLP":"nlp","Vision":"vision","Planning":"planning","KnowledgeProcessing":"knowledgeplanning"})

In [4]:
ml_df = df[['id','text','ml']]
ml_df = ml_df.rename(columns = {"documentId":"id", "ml": "seed"})
ml_df['seed'] = ml_df.seed.map(dict(Yes=1, No=0))

In [5]:
ml_df

Unnamed: 0,id,text,seed
0,07328147,Automatic Resolution of Segmentation Ambiguiti...,1
1,08037928,Chromium-enriched Oxide Containing Material an...,0
2,05664062,High Performance Max-Min Circuit for a Fuzzy I...,0
3,09037463,Efficient Exploitation of Model Complementarin...,1
4,09206987,Wire Mesh Thermal Radiative Element and Use in...,0
...,...,...,...
81,10012544,Homogenization of Light Beam for Spectral Feat...,0
82,05339090,Spatial Light Modulators \n A smart pixel i...,0
83,04461201,Safety Closure Lock \n A safety closure loc...,0
84,05202066,Method of Plasticizing Molding Material and Ap...,0


In [6]:
# with open('successful.json') as f:
#     patents = json.load(f)

In [7]:
# seed_data = pd.DataFrame(patents['seed'])
# antiseed_data = pd.DataFrame(patents['antiseed'])
# antiseed_data['seed'] = 0
# seed_data['seed'] = 1
# all_data = antiseed_data
# all_data = all_data.append(seed_data)
# all_data = all_data.reset_index(drop=True)
# all_data = all_data.sample(frac=1).reset_index(drop=True)
# all_data['text'] = all_data['title'] + " " + all_data['abstract']
# all_data[:5]

In [8]:
ml_df['seed'].value_counts()

0    68
1    18
Name: seed, dtype: int64

In [9]:
all_data = ml_df

## Prepare Data

### Create Vectorizer

In [10]:
stopwords = []
with open('stopwords.txt') as f:
    lines = f.readlines()
    for line in lines:
        stopwords.append(line[:-1])

In [11]:
vectorizer = CountVectorizer(stop_words = stopwords)

In [12]:
X_raw = vectorizer.fit_transform(all_data['text'].values)
svd = TruncatedSVD(n_components=100,random_state=42)
X_raw = svd.fit_transform(X_raw)
y_raw = all_data['seed'].values

In [13]:
X_raw.shape

(86, 86)

### Split training set

In [14]:
n_initial = 20

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw)

initial_idx = np.random.choice(range(X_train.shape[0]), size=n_initial, replace=False)

X_initial, y_initial = X_train[initial_idx], y_train[initial_idx]
X_pool, y_pool = np.delete(X_train, initial_idx, axis=0), np.delete(y_train, initial_idx, axis=0)

In [16]:
y_initial

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

## Initialize learner

In [17]:
learner = ActiveLearner(
    estimator=svm.SVC(kernel='linear', gamma='scale', C=2, probability = True),
    query_strategy=uncertainty_sampling,
    X_training=X_initial, y_training=y_initial
)

In [18]:
learner.estimator

SVC(C=2, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [19]:
# import pickle
# pickle.dump(learner.estimator, open('models/model0.sav','wb'))
# loaded_model = pickle.load(open('models/model0.sav','rb'))

In [20]:
predictions = learner.predict(X_raw)
is_correct = (predictions == y_raw)

predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [21]:
unqueried_score = learner.score(X_raw, y_raw)


In [22]:
unqueried_score

0.8255813953488372

### Active Learning loop

In [23]:
N_QUERIES = 20
performance_history = [unqueried_score]

# Allow our model to query our unlabeled dataset for the most
# informative points according to our query strategy (uncertainty sampling).
for index in range(N_QUERIES):
    query_index, query_instance = learner.query(X_pool)

    # Teach our ActiveLearner model the record it has requested.
    X, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    learner.teach(X=X, y=y)

    # Remove the queried instance from the unlabeled pool.
    X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index)

    # Calculate and report our model's accuracy.
    model_accuracy = learner.score(X_test, y_test)
    
    filename = 'models/model'+str(index)+'.sav'
    pickle.dump(learner.estimator, open(filename,'wb'))
    print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy))

    # Save our model's performance for plotting.
    performance_history.append(model_accuracy)

Accuracy after query 1: 0.5909
Accuracy after query 2: 0.5909
Accuracy after query 3: 0.6364
Accuracy after query 4: 0.6364
Accuracy after query 5: 0.6364
Accuracy after query 6: 0.6364
Accuracy after query 7: 0.6364
Accuracy after query 8: 0.6364
Accuracy after query 9: 0.6364
Accuracy after query 10: 0.6364
Accuracy after query 11: 0.6364
Accuracy after query 12: 0.6364
Accuracy after query 13: 0.6364
Accuracy after query 14: 0.6364
Accuracy after query 15: 0.7273
Accuracy after query 16: 0.7273
Accuracy after query 17: 0.7273
Accuracy after query 18: 0.7273
Accuracy after query 19: 0.7273
Accuracy after query 20: 0.7273
