# SVM Model

In [1]:
import pandas as pd
import json
import numpy as np
import sklearn
#import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import svm

## Load Data

In [2]:
# Load the patent dump; later cells read its 'seed' and 'antiseed' entries.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open('../successful.json', encoding='utf-8') as f:
    patents = json.load(f)

In [3]:
# Sanity check: number of entries stored under the 'antiseed' key.
print(len(patents['antiseed']))

780


In [4]:
# Build one DataFrame per label group found in the loaded JSON.
seed_data, antiseed_data = (
    pd.DataFrame(patents[group]) for group in ('seed', 'antiseed')
)

In [5]:
# Attach the binary target column: 1 for seed rows, 0 for antiseed rows.
seed_data = seed_data.assign(seed=1)
antiseed_data = antiseed_data.assign(seed=0)

In [6]:
# Start the combined frame from the antiseed rows. This only binds a second
# name to the same DataFrame object; no copy is made.
all_data = antiseed_data

In [7]:
# Stack the antiseed rows first, then the seed rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. Building from the two source frames (rather
# than from all_data itself) also keeps this cell idempotent: re-running it
# cannot add the seed rows a second time.
all_data = pd.concat([antiseed_data, seed_data])

In [8]:
# Give the combined frame a fresh 0..n-1 index; drop=True discards the old
# index values instead of keeping them as an extra column.
all_data = all_data.reset_index(drop=True)

# Prepare Data

## Create Sample

In [9]:
# training_set = all_data.loc[all_data['seed']==1].sample(n=500).append(all_data.loc[all_data['seed']==0].sample(n=500))

In [10]:
# Class balance: position 0 of the bincount is the number of antiseed rows
# (label 0), position 1 the number of seed rows (label 1).
# It is printed explicitly because a notebook only displays a cell's last
# expression, so a bare bincount call on an earlier line is silently discarded.
print(np.bincount(all_data.seed))
#pd.set_option('display.max_rows', 100)
all_data

Unnamed: 0,id,title,abstract,seed
0,7319635,Memory system with registered memory module an...,A memory module and related method are disclos...,0
1,7547047,Coupler and method of making molded coupler,A coupler device for fluid transport that incl...,0
2,8280734,Systems and arrangements for titling audio rec...,Generally methods for titling segments of reco...,0
3,8399370,Glass composition,A glass composition which is reduced in the am...,0
4,7094688,Method for manufacturing semiconductor device,"A via hole is first formed, and an embedded ma...",0
...,...,...,...,...
1511,5519811,"Neural network, processor, and pattern recogni...",Apparatus for realizing a neural network of a ...,1
1512,9052896,Adjusting mobile device state based on user in...,"In one embodiment, when a computing system is ...",1
1513,8909574,Systems for matching sparkle appearance of coa...,This disclosure is directed to a process for p...,1
1514,5390261,Method and apparatus for pattern classificatio...,"A method for pattern classification and, in pa...",1


The dataset contains 780 antiseed and 736 seed patents.

## Train/Test Split

In [11]:
def split_by_position(column, n_antiseed, n_train_per_class):
    """Split a label-ordered Series into a train part and a test part.

    The Series is expected to hold all antiseed rows first (positions
    ``0 .. n_antiseed - 1``) followed by all seed rows.

    Parameters
    ----------
    column : pandas.Series
        Column to split (e.g. the abstracts or the labels).
    n_antiseed : int
        Number of leading antiseed rows, i.e. the position of the first seed row.
    n_train_per_class : int
        How many rows of each class go into the training part.

    Returns
    -------
    (train, test) : tuple of pandas.Series
        ``train`` holds the first ``n_train_per_class`` rows of each class,
        ``test`` holds the remaining rows of each class, antiseed rows first.
    """
    seed_train_end = n_antiseed + n_train_per_class
    # Series.append was removed in pandas 2.0; pd.concat is the replacement.
    # .iloc makes the positional intent of the slices explicit.
    train = pd.concat([
        column.iloc[:n_train_per_class],
        column.iloc[n_antiseed:seed_train_end],
    ])
    test = pd.concat([
        column.iloc[n_train_per_class:n_antiseed],
        column.iloc[seed_train_end:],
    ])
    return train, test


# The antiseed rows are stacked before the seed rows, so the number of 0 labels
# is also the position at which the seed rows begin.
N_ANTISEED = int((all_data.seed == 0).sum())
TRAIN_PER_CLASS = 500

X_train, X_test = split_by_position(all_data.abstract, N_ANTISEED, TRAIN_PER_CLASS)
Y_train, Y_test = split_by_position(all_data.seed, N_ANTISEED, TRAIN_PER_CLASS)

### Create Vectorizer

In [12]:
# Read one stop word per line.
# strip() removes the line terminator (including a Windows '\r\n') without
# cutting a real character off a final line that has no trailing newline,
# which the previous line[:-1] slice would do. Blank lines are skipped so no
# empty string ends up in the stop-word list.
with open('../stopwords.txt') as f:
    stopwords = [word for word in (line.strip() for line in f) if word]

In [13]:
# Token-count vectorizer configured with the custom `stopwords` list.
vectorizer = CountVectorizer(stop_words = stopwords)

In [14]:
# Learn the vocabulary from the training abstracts only, then encode the test
# abstracts with that same fitted vocabulary.
# NOTE: X_train / X_test are rebound here from the text Series to the
# vectorizer's output, so re-running this cell on its own would call .values on
# that output instead of on the Series; re-run the split cell first.
X_train = vectorizer.fit_transform(X_train.values)
X_test = vectorizer.transform(X_test.values)

In [15]:
# Vocabulary size (number of feature columns).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# the fitted vocabulary_ mapping has exactly one entry per feature and is
# available in every release.
len(vectorizer.vocabulary_)

6978

In [16]:
# Mean accuracy of a default logistic regression under 5-fold
# cross-validation on the training split.
scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=Y_train.values,
    cv=5,
)
print(scores.mean())

0.9490000000000001


In [17]:
# Fit a default logistic regression on the whole training split.
# The fit() call is the cell's last expression, so its return value is what
# the cell displays.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

LogisticRegression()

In [18]:
# Accuracy on the training split, then on the held-out test split
# (one value per output line).
print(
    *(logreg.score(X, y) for X, y in ((X_train, Y_train), (X_test, Y_test))),
    sep='\n',
)

1.0
0.9302325581395349


In [19]:
# Confusion matrix of the logistic regression on the test split
# (rows: true label, columns: predicted label).
pred_logreg = logreg.predict(X_test)
confusion = confusion_matrix(y_true=Y_test, y_pred=pred_logreg)
confusion

array([[270,  10],
       [ 26, 210]])

In [20]:
# Random forests are stochastic (bootstrap samples and random feature subsets
# per split). Pin the seed so the reported scores are reproducible on a
# Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, Y_train)
# Accuracy on the training split, then on the held-out test split.
print(rf.score(X_train, Y_train))
print(rf.score(X_test, Y_test))

1.0
0.9457364341085271


In [21]:
# Multinomial naive Bayes baseline: fit() returns the estimator itself, so
# construction and fitting are chained. Prints train accuracy, then test
# accuracy (one value per output line).
nb = MultinomialNB().fit(X_train, Y_train)
print(
    *(nb.score(X, y) for X, y in ((X_train, Y_train), (X_test, Y_test))),
    sep='\n',
)

0.975
0.9554263565891473


## SVM

In [22]:
# Linear-kernel SVM with probability estimates enabled.
# - `gamma` and `degree` were dropped: the linear kernel ignores both
#   (gamma applies to rbf/poly/sigmoid, degree to poly only), so passing them
#   only suggested tuning that never took effect.
# - probability=True fits the probability model with an internal, shuffled
#   cross-validation; random_state pins that shuffle so predict_proba is
#   reproducible between runs.
svm_classifier = svm.SVC(kernel='linear', C=2, probability=True, random_state=42)

In [23]:
# Train the SVM and predict the test split in one chain (fit() returns the
# estimator), then show how many test samples there are.
y_predict = svm_classifier.fit(X_train, Y_train).predict(X_test)
len(Y_test)

516

In [24]:
from sklearn.metrics import classification_report

# Confusion matrix on the test split (rows: true label, columns: predicted).
print(confusion_matrix(y_true=Y_test, y_pred=y_predict))
# Accuracy on the training split, then on the test split.
print(
    *(
        svm_classifier.score(X, y.values)
        for X, y in ((X_train, Y_train), (X_test, Y_test))
    ),
    sep='\n',
)
# Raw test predictions and how many there are.
print(y_predict)
print(len(y_predict))

[[268  12]
 [ 26 210]]
1.0
0.9263565891472868
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1
 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0

In [25]:
# for i, elem in enumerate(y_predict):
#     print(i, elem)

<img src="../img/confusion_matrix.png">

In [33]:
# Print NumPy arrays in full: with an infinite threshold, no array is long
# enough to be summarised with "...".
np.set_printoptions(threshold=np.inf)

## Sample of how to determine confidence scores and a sample threshold for a decent error ratio

In [46]:
def count_low_confidence_errors(probabilities, predicted, expected, threshold=0.7):
    """Count low-confidence predictions and how many of them are wrong.

    A prediction counts as "low confidence" when its highest class probability
    is strictly below ``threshold``.

    Parameters
    ----------
    probabilities : array-like of shape (n_samples, n_classes)
        Per-class probabilities for each sample.
    predicted : array-like of shape (n_samples,)
        Predicted label for each sample.
    expected : array-like of shape (n_samples,)
        True label for each sample.
    threshold : float, default 0.7
        Confidence cut-off; samples whose top probability is below it are
        inspected.

    Returns
    -------
    (errors, total) : tuple of int
        ``total`` is the number of low-confidence samples and ``errors`` the
        number of those whose prediction differs from the expected label.
    """
    probabilities = np.asarray(probabilities)
    # Row-wise maximum works for any number of classes, not just two.
    low_confidence = probabilities.max(axis=1) < threshold
    wrong = np.asarray(predicted) != np.asarray(expected)
    return int((low_confidence & wrong).sum()), int(low_confidence.sum())


prob = svm_classifier.predict_proba(X_test)
# counter: wrong predictions among the low-confidence samples
# total:   all low-confidence samples
counter, total = count_low_confidence_errors(prob, y_predict, Y_test.values)
print(counter, total)

14 25


In [30]:
# for i, values in enumerate(prob):
#     print(y_predict[i], Y_test.values[i])