# Invalidation — linear SVC skeleton

In [32]:
import pandas as pd
import csv
from pandas import DataFrame
import numpy as np

patent_data = DataFrame.from_csv('./records_sample_10000_2.tsv', sep='\t', header=0)
patent_text = pd.read_csv('./patent_title_abstract_claims.tsv.00', sep='\t', names=['patent_id', 'patent_text'], index_col='patent_id')

In [33]:
patent_text.head()

Unnamed: 0_level_0,patent_text
patent_id,Unnamed: 1_level_1
D500396,Glove. ABSTRACT. CLAIMS. The ornamental des...
D500397,Shoe. ABSTRACT. CLAIMS. The ornamental desi...
D500398,Shoe. ABSTRACT. CLAIMS. The ornamental desi...
D500399,Shoe having an enclosed toe. ABSTRACT. CLAI...
D500400,Portion of an article of footwear. ABSTRACT. ...


In [35]:
patent_text.describe()

Unnamed: 0,patent_text
count,84527
unique,82556
top,Footwear upper. ABSTRACT. CLAIMS. The ornam...
freq,64


There are ~2K duplicate rows in the claims dataset. For now, we'll eliminate the duplicates.

In [116]:
patent_text = patent_text.drop_duplicates()
patent_text.describe()

Unnamed: 0,patent_text
count,82556
unique,82556
top,Method for making a puffed food starch product...
freq,1


Next, we inner join the `patent_text dataset` with the `patent_data` dataset (on patent ID).

In [117]:
merged = patent_data.merge(patent_text, how='inner', left_index=True, right_index=True)
np.random.seed(42)

# This will be replaced with actual joined dataset from Dany 
merged['invalidated'] = np.random.choice(range(0, 2), merged.shape[0])
merged.head()

Unnamed: 0,GrantDate,ApplNo,ApplNoSimple,ApplDate,AssigneeName,AssigneeLocation,AssigneeCountryCode,AssigneeStateCode,AssigneeCityCode,InventorName,...,InventorStateCode,InventorCityCode,Reference,USClass,CPCClass,Ack,ErrorFlag,IsGrant,patent_text,invalidated
6837216,2005-01-04,"10/786,327",10786327,2004-02-26,Mitsubishi Denki Kabushiki Kaisha,"Tokyo, JP",JP,,Tokyo,Tanabe; Tsuneo,...,,Hyogo,5746174+6711492+6775609+2003/0182050,123/396+123/397+123/399+701/114+701/115+73/114...,F02D 11/107 (20130101)+F02D 9/105 (20130101),,0,1.0,Electronic throttle control system. ABSTRACT....,0
6837270,2005-01-04,"10/303,881",10303881,2002-11-26,,,,,,Douglas; Andrew S.+Walsh; Patrick A.,...,NULL+NULL,"London, Ontario+London, Ontario",1706051+2119473+2591531+2819097+2823699+286182...,137/613+128/204.26+128/205.24+137/557+251/148+...,F17C 13/04 (20130101)+Y10T 137/8326 (20150401)...,,0,1.0,Gas delivery system and pneumatic yoke for a p...,1
6837514,2005-01-04,"10/623,845",10623845,2003-07-22,"Toyoda Gosei Co., Ltd.","Aichi-ken, JP",JP,,Aichi-ken,Fujita; Yoshiyuki+Isomura; Motoi,...,NULL+NULL,Aichi-ken+Aichi-ken,4148503+5499841+5893581+5964477+6013884+610964...,280/731+200/61.54+280/728.2,B60Q 1/0082 (20130101)+B60R 21/2037 (20130101)...,,0,1.0,Steering wheel having airbag apparatus. ABSTR...,0
6838044,2005-01-04,"10/647,770",10647770,2003-08-25,KM Europa Metal AG,DE,DE,,,Hornschemeyer; Wolfgang,...,,An der Alten Schmiede,5678806+6470958,266/44+266/193,C21B 7/10 (20130101)+F27B 1/24 (20130101)+F27D...,,0,1.0,Cooling plate and method for manufacturing a c...,0
6838426,2005-01-04,"10/161,485",10161485,2002-05-31,"Magic American Products, Inc.","Cleveland, OH",US,OH,Cleveland,Zeilinger; Scott,...,OH,Pepper Pike,4306989+4315828+4362638+5330673+5523014+552525...,510/417+510/238+510/365+510/421+510/424+510/43...,C11D 1/528 (20130101)+C11D 3/18 (20130101)+C11...,,0,1.0,Compositions for water-based and solvent-based...,0


In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn import cross_validation

In [109]:
X = merged['patent_text'].as_matrix()
y = merged['invalidated'].as_matrix()

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=40)

In [110]:
# this normalizes word frequency based on use in each patent vs. all patents

tfidf = TfidfVectorizer()
tfidf.fit(X)
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

In [114]:
svc_class = LinearSVC()
model = svc_class.fit(X_train, y_train)
model.score(X_test, y_test)

0.5

Right now the accuracy is the same as randomly choosing an outcome. Hopefully that changes when we train on real data :). 