# Accessing the data

## Imports

In [5]:
import os
from pprint import pprint

import pymongo
from pymongo import MongoClient

## Database Connection

In [6]:
# We connect to the database.
# The connection URI is read from the MONGODB_URI environment variable so that
# credentials are never committed together with the notebook.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection URI."
    )
cluster = MongoClient(mongo_uri)

# 'PatentData' is the database; 'labels' and 'Patents' are the collections
# inside it that the rest of the notebook queries.
db = cluster['PatentData']
labels_db = db['labels']
patent_db = db['Patents']

### Example of a patent entry

In [12]:
# Fetch one patent document by its document id, keep it for the cells below,
# and pretty-print it.
test_patent = patent_db.find_one({"documentId": "06958839"})
pprint(test_patent)

{'_id': ObjectId('5fbd9231c00b71b87435bde2'),
 'abstract': '\\n   A scanning optical system is configured to include a light '
             'source, an anamorphic optical element, a polygonal mirror, and '
             'an imaging optical system. The imaging optical system has a '
             'scanning lens including a first lens provided on a polygonal '
             'mirror side and a second lens provided on a surface side, and a '
             'compensation lens provided on the surface side with respect to '
             'the scanning lens, the compensation lens compensating for '
             'curvature of field. The scanning lens includes at least one '
             'convex surface that has a toric surface having a stronger power '
             'in the auxiliary scanning direction than in the main scanning '
             'direction. One surface of the compensation lens has an '
             'anamorphic aspherical surface, which is a surface whose radius '
             'of curvatu

### Example of the label for the same patent

In [13]:
# Fetch the label document for the same patent number, keep it for the cells
# below, and pretty-print it.
test_label = labels_db.find_one({"document": "06958839"})
pprint(test_label)

{'ActiveLearningPatent': 'Yes',
 'MachineLearningPatent': 'Yes',
 '__v': 0,
 '_id': ObjectId('6010861261ff287238a5c280'),
 'document': '06958839'}


### Sample of an Ideal entry

In [23]:
# Combine fields from the label record and the patent record into one summary.
# Every field line ends with an explicit "\n" in addition to the physical line
# break, so the fields are separated by a blank line when printed.
sample_entry = f'''Patent#: {test_label["document"]}\n
Title: {test_patent["title"]}\n
Abstract: {test_patent["abstract"]}\n
Active Learning: {test_label["ActiveLearningPatent"]}\n
Machine Learning: {test_label["MachineLearningPatent"]}'''
print(sample_entry)

Patent#: 06958839

Title: Scanning Optical System

Abstract: \n   A scanning optical system is configured to include a light source, an anamorphic optical element, a polygonal mirror, and an imaging optical system. The imaging optical system has a scanning lens including a first lens provided on a polygonal mirror side and a second lens provided on a surface side, and a compensation lens provided on the surface side with respect to the scanning lens, the compensation lens compensating for curvature of field. The scanning lens includes at least one convex surface that has a toric surface having a stronger power in the auxiliary scanning direction than in the main scanning direction. One surface of the compensation lens has an anamorphic aspherical surface, which is a surface whose radius of curvature in the auxiliary scanning direction at a point spaced from the optical axis thereof is determined independently from a cross-sectional shape thereof along the main scanning direction.\n

Ac

### Simple way to iterate through all patent label objects

In [35]:
# Print every document stored in the labels collection, one per line.
# NOTE(review): the original author questioned whether any documents are
# really labeled yet -- TODO confirm against the output below.
for label_doc in labels_db.find():
    print(label_doc)

{'_id': ObjectId('6010861261ff287238a5c280'), 'document': '06958839', 'MachineLearningPatent': 'Yes', 'ActiveLearningPatent': 'Yes', '__v': 0}
{'_id': ObjectId('6010862561ff287238a5c281'), 'document': '08051028', 'MachineLearningPatent': 'Yes', 'ActiveLearningPatent': 'Yes', '__v': 0}


## Bag of Words Model

### Let's create a sample dataset

In [107]:
# Build a small sample from the first 10 patents, keeping only the fields the
# model needs. The abstract is stored with 5 leading and 4 trailing characters
# that are sliced off here; every sample gets the placeholder target 1.
data_list = [
    {
        "id": patent["documentId"],
        "title": patent["title"],
        "abstract": patent["abstract"][5:-4],
        "target": 1,
    }
    for patent in patent_db.find().limit(10)
]
pprint(data_list)

ServerSelectionTimeoutError: compute1.cognac.cs.fiu.edu:59122: [Errno -2] Name or service not known, Timeout: 30s, Topology Description: <TopologyDescription id: 6021036d3e083c0d6a870b8c, topology_type: Single, servers: [<ServerDescription ('compute1.cognac.cs.fiu.edu', 59122) server_type: Unknown, rtt: None, error=AutoReconnect('compute1.cognac.cs.fiu.edu:59122: [Errno -2] Name or service not known')>]>

### New Imports

In [84]:
import sklearn
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [75]:
# The patent titles are the corpus for the bag-of-words model.
titles = [entry["title"] for entry in data_list]

# Learn the vocabulary from the titles; the fitted vectorizer is the cell output.
vect = CountVectorizer()
vect.fit(titles)

CountVectorizer()

In [78]:
# Show the learned vocabulary: each token mapped to its column index in the
# bag-of-words matrix.
print(vect.vocabulary_)

{'connectors': 7, 'with': 48, 'primary': 31, 'and': 1, 'secondary': 35, 'lock': 23, 'structure': 39, 'fire': 14, 'protection': 32, 'apparatus': 2, 'polyhydroxyl': 29, 'monosulfoxide': 25, 'shampoo': 36, 'compositions': 5, 'differential': 8, 'amplifier': 0, 'light': 22, 'concrete': 6, 'method': 24, 'of': 27, 'preparing': 30, 'the': 42, 'same': 34, 'display': 9, 'arrangements': 3, 'employing': 10, 'thermochromic': 43, 'tiltable': 44, 'visor': 47, 'for': 15, 'helmets': 18, 'in': 19, 'particular': 28, 'motorcyclists': 26, 'similar': 37, 'system': 41, 'starting': 38, 'internal': 20, 'combustion': 4, 'engines': 12, 'hand': 16, 'held': 17, 'laser': 21, 'surveying': 40, 'rod': 33, 'variable': 46, 'valve': 45, 'event': 13, 'engine': 11}


In [79]:
# Encode every title as a row of token counts using the fitted vocabulary.
bag_of_words = vect.transform(titles)

In [81]:
# Printing the sparse matrix lists only the non-zero entries as
# "(row, column)  count".
print(bag_of_words)

  (0, 1)	1
  (0, 7)	1
  (0, 23)	1
  (0, 31)	1
  (0, 35)	1
  (0, 39)	1
  (0, 48)	1
  (1, 2)	1
  (1, 14)	1
  (1, 32)	1
  (2, 5)	1
  (2, 25)	1
  (2, 29)	1
  (2, 36)	1
  (3, 0)	1
  (3, 8)	1
  (4, 1)	1
  (4, 6)	1
  (4, 22)	1
  (4, 24)	1
  (4, 27)	1
  (4, 30)	1
  (4, 34)	1
  (4, 42)	1
  (5, 3)	1
  :	:
  (5, 43)	1
  (6, 1)	1
  (6, 15)	1
  (6, 18)	2
  (6, 19)	1
  (6, 26)	1
  (6, 28)	1
  (6, 37)	1
  (6, 44)	1
  (6, 47)	1
  (7, 4)	1
  (7, 12)	1
  (7, 15)	1
  (7, 20)	1
  (7, 38)	1
  (7, 41)	1
  (8, 16)	1
  (8, 17)	1
  (8, 21)	1
  (8, 33)	1
  (8, 40)	1
  (9, 11)	1
  (9, 13)	1
  (9, 45)	1
  (9, 46)	1


In [82]:
# Convert to a dense array so every count, zeros included, is visible.
print(bag_of_words.toarray())

[[0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1
  0 0 0 1 0 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0
  0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0
  0 1 0 0 0 0 0 0 1 0 0 1 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [101]:
# Load the sample records (a list of dicts) into a DataFrame, one row per patent.
data_frame = pd.DataFrame(data_list)

In [104]:
# Display the DataFrame as the cell's rich output.
data_frame

Unnamed: 0,id,title,abstract,target
0,4017141,Connectors with Primary and Secondary Lock Str...,Electrical connectors include an insulating h...,1
1,4058167,Fire Protection Apparatus,Fire protection apparatus having a conduit fo...,1
2,4058629,Polyhydroxyl Monosulfoxide Shampoo Compositions,"A polyhydroxyl monosulfoxide surfactant, usef...",1
3,4078206,Differential Amplifier,Source-coupled first and second FET's arrange...,1
4,4126470,Light Concrete and Method of Preparing the Same,A light concrete prepared from a mixture of P...,1
5,4142782,Display Arrangements Employing Thermochromic C...,Display arrangements useful as art forms and ...,1
6,4223410,"Tiltable Visor for Helmets, in Particular Moto...",This invention concerns a tiltable shield or ...,1
7,4232521,System for Starting Internal Combustion Engines,A system for starting internal combustion eng...,1
8,4240208,Hand-Held Laser Surveying Rod,A telescoping surveying rod carries on its up...,1
9,4253434,Variable Valve Event Engine,A variable valve event engine has a plurality...,1


In [None]:
# Fresh, unfitted CountVectorizer instance.
# NOTE(review): this cell has not been executed and nothing visible uses
# `vectorizer` yet -- presumably the start of the next modelling step.
vectorizer = CountVectorizer()