# What Patents Reveal About Facebook?
## Abstract Analysis

In [1]:
import pandas as pd
import re
import time
import datetime
import numpy as np


import matplotlib.pyplot as plt
from cycler import cycler


% matplotlib inline

In [2]:
years = []
n=2000
for i in range(18):
    n+=1
    years.append(n)


## Glue the documents together for each year

In [3]:
#read in my big table csv
df = pd.read_csv('allpatents.csv', parse_dates = True)
df.head()

Unnamed: 0,abstract,applicant,assignee,date_filed,date_issued,inventor,p_num,title,type,url,filed_date
0,A social networking system generates socially-...,,"FACEBOOK, INC.","April 13, 2012","July 24, 2018","Timothy Kendall, Matthew R. Cohler, Mark E. Zu...",10032189,Sponsored stories and news stories within a ne...,Grant,https://patents.justia.com/patent/10032189,2012-04-13
1,The various embodiments described herein inclu...,,"FACEBOOK, INC.","August 3, 2015","July 24, 2018","Scott Snibbe, Graham McDermott, Justin Ponczek...",10031921,Methods and systems for storage of media item ...,Grant,https://patents.justia.com/patent/10031921,2015-08-03
2,"In one embodiment, a method includes deploying...",,"FACEBOOK, INC.","January 26, 2016","July 24, 2018","Michael Dudley Johnson, Mathieu Benjamin Tozer...",10032186,Native application testing,Grant,https://patents.justia.com/patent/10032186,2016-01-26
3,"In one embodiment, a method includes receiving...",,"FACEBOOK, INC.","October 9, 2015","July 24, 2018","Kittipat Virochsiri, Sriram Sankar",10032047,User search based on private information,Grant,https://patents.justia.com/patent/10032047,2015-10-09
4,"In one embodiment, a method includes receiving...",,"FACEBOOK, INC.","October 17, 2012","July 24, 2018","Matthew Nicholas Papakipos, Michael John McKen...",10032233,Social context in augmented reality,Grant,https://patents.justia.com/patent/10032233,2012-10-17


In [13]:
df.dtypes

abstract       object
applicant      object
assignee       object
date_filed     object
date_issued    object
inventor       object
p_num          object
title          object
type           object
url            object
filed_date     object
dtype: object

In [4]:
# Make a new column that is filed_date as a datetime
# usually with .groupby you give it a column name, but you can also give it a list of values
#      you're going to use .dt.year to pull the year out of each of those filing dates
# so it will then be grouped by year
# you then pull out the .abstract column
# and use .apply to do something to each group of abstracts
# ' '.join(abstracts) will take the list of abtracts and join them all together with spaces, making a new string
# the end!
# probably you want to .reset_index() to turn it into a dataframe

df.groupby(pd.to_datetime(df.filed_date).dt.year).abstract.apply(lambda abstracts: ' '.join(abstracts))

filed_date
2001    A digital audio playback device that includes ...
2002    A process, system and computer software are ro...
2003    Linking multiple identities from a single serv...
2004    An Internet/world-wide-web based keyword bid m...
2005    Systems and techniques for transferring electr...
2006    Content maintained in an online social network...
2007    An iterative language translation system. The ...
2008    Embodiments of the invention provide technique...
2009    To avoid the need to operate in-chassis fans t...
2010    A method and apparatus for fine-grained, trust...
2011    A wireless communication system and in particu...
2012    A social networking system generates socially-...
2013    In one embodiment, a method includes receiving...
2014    A server system receives a first request for a...
2015    The various embodiments described herein inclu...
2016    In one embodiment, a method includes deploying...
2017    A tracking system generates a structured light...
201

In [5]:
df_abstract = df.groupby(pd.to_datetime(df.filed_date).dt.year).abstract.apply(lambda abstracts: ' '.join(abstracts)).reset_index()

In [6]:
df_abstract

Unnamed: 0,filed_date,abstract
0,2001,A digital audio playback device that includes ...
1,2002,"A process, system and computer software are ro..."
2,2003,Linking multiple identities from a single serv...
3,2004,An Internet/world-wide-web based keyword bid m...
4,2005,Systems and techniques for transferring electr...
5,2006,Content maintained in an online social network...
6,2007,An iterative language translation system. The ...
7,2008,Embodiments of the invention provide technique...
8,2009,To avoid the need to operate in-chassis fans t...
9,2010,"A method and apparatus for fine-grained, trust..."


## Analyzing the abstract of the patents

### 1.Tokenizing (or phrasing?)

In [7]:
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import math

In [8]:
def tokenize(s):
    blob = TextBlob(s.lower())
    words = [token for token in blob.words if len(token)>=2]
    return words

In [9]:
def phrase(s):
    blob = TextBlob(s.lower())
    phrase = [phrase for phrase in blob.noun_phrases]
    return phrase

In [19]:
df_abstract.abstract.apply(tokenize)

0     [digital, audio, playback, device, that, inclu...
1     [process, system, and, computer, software, are...
2     [linking, multiple, identities, from, single, ...
3     [an, internet/world-wide-web, based, keyword, ...
4     [systems, and, techniques, for, transferring, ...
5     [content, maintained, in, an, online, social, ...
6     [an, iterative, language, translation, system,...
7     [embodiments, of, the, invention, provide, tec...
8     [to, avoid, the, need, to, operate, in-chassis...
9     [method, and, apparatus, for, fine-grained, tr...
10    [wireless, communication, system, and, in, par...
11    [social, networking, system, generates, social...
12    [in, one, embodiment, method, includes, receiv...
13    [server, system, receives, first, request, for...
14    [the, various, embodiments, described, herein,...
15    [in, one, embodiment, method, includes, deploy...
16    [tracking, system, generates, structured, ligh...
17    [in, one, embodiment, method, includes, st

### 2. Counting and TF-IDF on abstarct analysis

In [20]:
def print_sorted_vector(v):
    sorted_list = sorted(v.items(), key=lambda x: (x[1],x[0]), reverse=True) 
    sorted_list = sorted_list[:20]
    print('\n'.join([str(x) for x in sorted_list]))

In [10]:
def return_sorted_vector(v):
    sorted_list = sorted(v.items(), key=lambda x: (x[1],x[0]), reverse=True) 
    sorted_list = sorted_list[:20]
    return sorted_list

***Counting-term frequency***

In [34]:
# Count Vectorizer with stop words
vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize)

matrix = vectorizer.fit_transform(df_abstract.abstract)

#convert matrix to a dataframe
results = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names())
results.head()



Unnamed: 0,'s,100,144th,145th,27,289th,290th,30,300,3d,...,zeros,zfp,zone,zones,zones—areas,zookeeper,zoom,zooming,˜10,—are
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
big_list = []
for i in range(0,18):
    year = return_sorted_vector(results.iloc[i,:])
    big_list.append(year)

In [36]:
df_counting=pd.DataFrame(big_list).T
df_counting

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,"(digital, 9)","(multidimensional, 7)","(user, 12)","(message, 10)","(instant, 6)","(content, 20)","(social, 35)","(social, 60)","(user, 60)","(user, 413)","(user, 524)","(user, 1627)","(user, 1324)","(user, 1110)","(user, 689)","(user, 482)","(user, 149)","(access, 13)"
1,"(audio, 7)","(data, 4)","(list, 8)","(members, 9)","(network, 5)","(social, 19)","(user, 34)","(user, 56)","(social, 56)","(social, 195)","(social, 351)","(information, 635)","(social, 674)","(social, 596)","(device, 387)","(device, 274)","(device, 116)","(content, 10)"
2,"(playback, 5)","(space, 3)","(identity, 8)","(member, 9)","(messaging, 5)","(user, 17)","(network, 33)","(website, 31)","(document, 51)","(users, 133)","(networking, 235)","(social, 619)","(device, 574)","(device, 541)","(data, 302)","(social, 243)","(network, 92)","(device, 7)"
3,"(device, 4)","(retrieved, 3)","(account, 8)","(individual, 9)","(users, 4)","(network, 17)","(time, 15)","(network, 30)","(information, 42)","(networking, 129)","(information, 219)","(device, 591)","(data, 486)","(information, 442)","(social, 267)","(content, 193)","(based, 84)","(associated, 7)"
4,"(computing, 4)","(recommendation, 3)","(wallet, 6)","(reply, 8)","(communications, 4)","(items, 14)","(web-based, 14)","(networking, 28)","(structured, 38)","(second, 108)","(device, 205)","(content, 509)","(associated, 463)","(content, 406)","(second, 264)","(associated, 188)","(request, 66)","(posting, 6)"
5,"(platform, 3)","(information, 3)","(secondary, 4)","(social, 7)","(message, 3)","(associated, 14)","(method, 13)","(member, 28)","(application, 36)","(information, 104)","(users, 197)","(based, 481)","(content, 461)","(based, 401)","(associated, 262)","(based, 185)","(associated, 64)","(code, 6)"
6,"(wireless, 2)","(recommendation-specific, 2)","(primary, 4)","(original, 7)","(mail, 3)","(method, 13)","(users, 11)","(content, 28)","(data, 34)","(content, 99)","(content, 194)","(associated, 457)","(based, 438)","(associated, 390)","(based, 258)","(second, 150)","(second, 59)","(second, 5)"
7,"(players, 2)","(query, 2)","(identifier, 4)","(network, 7)","(gateway, 3)","(data, 12)","(second, 11)","(based, 28)","(client, 32)","(associated, 90)","(based, 131)","(users, 420)","(networking, 405)","(second, 346)","(content, 194)","(information, 150)","(social, 56)","(messaging, 5)"
8,"(enable, 2)","(cube, 2)","(buddy, 4)","(information, 7)","(e-mail, 3)","(media, 11)","(includes, 10)","(information, 26)","(target, 31)","(based, 88)","(second, 126)","(message, 390)","(includes, 404)","(networking, 342)","(includes, 190)","(data, 143)","(content, 51)","(user, 4)"
9,"(content, 2)","(based, 2)","(associated, 4)","(based, 7)","(configuring, 3)","(member, 10)","(trust, 9)","(associated, 23)","(resources, 31)","(web, 65)","(associated, 121)","(networking, 385)","(information, 392)","(data, 339)","(media, 179)","(networking, 139)","(data, 49)","(location, 4)"


***tf-idf***

In [11]:
df_abstract

Unnamed: 0,filed_date,abstract
0,2001,A digital audio playback device that includes ...
1,2002,"A process, system and computer software are ro..."
2,2003,Linking multiple identities from a single serv...
3,2004,An Internet/world-wide-web based keyword bid m...
4,2005,Systems and techniques for transferring electr...
5,2006,Content maintained in an online social network...
6,2007,An iterative language translation system. The ...
7,2008,Embodiments of the invention provide technique...
8,2009,To avoid the need to operate in-chassis fans t...
9,2010,"A method and apparatus for fine-grained, trust..."


In [16]:
#remove puctuation
df_abstract.abstract = df_abstract.abstract.str.replace('[^\w\s]','')

In [17]:
#lemnization
from textblob import Word
df_abstract.abstract = df_abstract.abstract.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [24]:
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize)

matrix = vectorizer.fit_transform(df_abstract.abstract)

# The easiest way to see what happenned is to make a dataframe
tfidf = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names())
tfidf.shape

(18, 5966)

In [27]:
big_list = []
for i in range(0,18):
    year = return_sorted_vector(tfidf.iloc[i,:])
    big_list.append(year)

In [32]:
df_tfidf=pd.DataFrame(big_list).T
df_tfidf.columns = years
df_tfidf

Unnamed: 0,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,"(digital, 0.5093593752472022)","(multidimensional, 0.7042078623671597)","(wallet, 0.34117465799583585)","(reply, 0.2678499950035751)","(instant, 0.4256768367266237)","(social, 0.2749502329564752)","(social, 0.38959948945428985)","(social, 0.37061557028094705)","(document, 0.29145670724458134)","(user, 0.5867473358014293)","(user, 0.47197511353743693)","(user, 0.5228683147190759)","(user, 0.47704032230985316)","(user, 0.45533349290928765)","(user, 0.4342410011679197)","(user, 0.4358201798856833)","(user, 0.333753568422941)","(access, 0.4219901721360733)"
1,"(audio, 0.4715154226323161)","(440, 0.22986895714175268)","(user, 0.29640390409549355)","(individual, 0.24987805889668013)","(mail, 0.2962560322426169)","(content, 0.25981552778906875)","(network, 0.3479219477218573)","(user, 0.29445923938657825)","(social, 0.23910199926170003)","(social, 0.3254400047831769)","(social, 0.37138997853843064)","(social, 0.23368491183785312)","(social, 0.28527404016289354)","(social, 0.28720244274516166)","(device, 0.257212717768784)","(device, 0.2612646956731295)","(device, 0.27401069011386664)","(content, 0.34231730332685345)"
2,"(playback, 0.40996216616082126)","(410, 0.22986895714175268)","(identity, 0.2750126094636282)","(members, 0.22288680722075824)","(configuring, 0.2962560322426169)","(items, 0.24002319994441185)","(user, 0.3221766045796355)","(networking, 0.2049062175433301)","(user, 0.2180776603945611)","(networking, 0.2550649191833866)","(networking, 0.2945883861262219)","(information, 0.20406968644536766)","(device, 0.21809658479752173)","(device, 0.2340311954547239)","(data, 0.20071896838804335)","(social, 0.2581082121559337)","(network, 0.22928744749654564)","(posting, 0.28776799328197306)"
3,"(computing, 0.2395173995265161)","(400, 0.22986895714175268)","(accountholder, 0.25302816241551557)","(message, 0.22196191893518288)","(network, 0.2835904679164947)","(network, 0.23300592449632643)","(web-based, 0.19600871087202348)","(website, 0.2023555119383207)","(structured, 0.18140739757623706)","(users, 0.1992611275520081)","(information, 0.1972567745509517)","(device, 0.20029131628091607)","(networking, 0.20308697658018432)","(networking, 0.19525073184324057)","(social, 0.1976778998095072)","(associated, 0.18913458443253464)","(based, 0.19842153422038616)","(code, 0.25581246485614295)"
4,"(platform, 0.2020780382709926)","(recommendation-specific, 0.20120224639061707)","(account, 0.24530638159848356)","(original, 0.21956729777184505)","(messaging, 0.2835904679164947)","(user, 0.20941807385073521)","(time, 0.1669712097661242)","(network, 0.1755137787591796)","(client, 0.16187129959397323)","(second, 0.17071733913073311)","(device, 0.1947204397048069)","(content, 0.17250131977493446)","(associated, 0.18560979821022766)","(information, 0.1813129764557704)","(second, 0.18512639117192733)","(content, 0.1840295119157445)","(request, 0.17366761356829963)","(associated, 0.25281906874037885)"
5,"(device, 0.1923071661544377)","(cube, 0.20120224639061707)","(list, 0.23212820453169042)","(indicator, 0.21527077989495003)","(communications, 0.25313080432046337)","(associated, 0.1918872319381512)","(trust, 0.15337894935057372)","(member, 0.17295393279777532)","(target, 0.15681282148166156)","(content, 0.14832219268908872)","(users, 0.18712159327730227)","(networking, 0.17219694716337583)","(data, 0.18466017458466125)","(associated, 0.17800164996757553)","(associated, 0.18372391851153397)","(based, 0.17640134561871879)","(associated, 0.15950431130194478)","(device, 0.23962211232879743)"
6,"(players, 0.1766333598290466)","(recommendation, 0.19778523856771174)","(secondary, 0.17769320918446327)","(member, 0.210913038775488)","(gateway, 0.24045511827173666)","(media, 0.1885896570991807)","(method, 0.13706016122376197)","(applications, 0.15884606564406129)","(resources, 0.15681282148166156)","(information, 0.14775235574660692)","(content, 0.18427202586698802)","(associated, 0.16340817319470752)","(content, 0.17516119441055317)","(content, 0.17563154409356357)","(based, 0.17147514517918935)","(networking, 0.17491821785895717)","(social, 0.14735433878522394)","(second, 0.1805850491002706)"
7,"(wireless, 0.13471869218066176)","(space, 0.17412059450739278)","(buddy, 0.17769320918446327)","(ppcses, 0.17510583550429867)","(e-mail, 0.24045511827173666)","(method, 0.1781810010854261)","(gift, 0.13633684386717665)","(terms, 0.15538153523939033)","(phrase, 0.15490296656871688)","(associated, 0.1422644492756109)","(second, 0.12627319250070754)","(based, 0.16301205267533103)","(based, 0.1664221326503737)","(based, 0.17346859404315024)","(media, 0.15700901267998574)","(second, 0.15090525353659678)","(second, 0.14704303698148036)","(messaging, 0.1805850491002706)"
8,"(enable, 0.13471869218066176)","(retrieved, 0.17412059450739278)","(primary, 0.15534833899657124)","(social, 0.16404347460315735)","(users, 0.21502981494879372)","(buyer, 0.1577398560988492)","(organization, 0.13088478814721838)","(content, 0.15526195780003)","(application, 0.15370842809680715)","(based, 0.13184194905696778)","(based, 0.12443111025038882)","(users, 0.142339006494052)","(includes, 0.1455621527289884)","(second, 0.15791941253533623)","(client, 0.15437757671328203)","(client, 0.14345810673324544)","(video, 0.12576375606812135)","(location, 0.17054164323742865)"
9,"(playlists, 0.11675400994741758)","(data, 0.1657015199936708)","(sites, 0.14354929162267316)","(network, 0.15537334325462804)","(message, 0.17015428074989683)","(data, 0.15588931667344125)","(travel, 0.1272275188710474)","(based, 0.15526195780003)","(information, 0.15265436227619278)","(web, 0.11463853227402687)","(associated, 0.12126235152845725)","(message, 0.13945117624931277)","(information, 0.1412385244301076)","(data, 0.14664801341802478)","(content, 0.1289386750572199)","(data, 0.136353472559334)","(content, 0.12047021720523444)","(geographic, 0.17054164323742865)"


## HMMMM,let's try tokenize by noun phrase

In [18]:
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=phrase)

matrix = vectorizer.fit_transform(df_abstract.abstract)

# The easiest way to see what happenned is to make a dataframe
tfidf = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names())


In [27]:

big_list = []
for i in range(0,18):
    content = return_sorted_vector(tfidf.iloc[i,:])
    big_list.append(content)

In [28]:
df_tfidf=pd.DataFrame(big_list).T
df_tfidf.columns = years

# df_tfidf.to_csv('abs_tfidf_lem.csv',index=False)

In [49]:
df_tfidf[df_tfidf[2012]].plot.scatter(marker='o',s=)

KeyError: "[('electronic message', 0.29977677180642387)\n ('mobile device', 0.22734209249962897)\n ('instant message', 0.19518983185255231)\n ('content item', 0.16917245275486606)\n ('geographic location', 0.15297785663526436)\n ('user profile', 0.13984922761068927)\n ('visible portion', 0.13354959229028382)\n ('input tool', 0.13354959229028382) ('social graph', 0.12962054346104598)\n ('user interface', 0.1296062396493212) ('web page', 0.12110746983625095)\n ('data request', 0.11548443699293567)\n ('subordinate user', 0.10720155070057925)\n ('browser request', 0.10543388865022407)\n ('particular embodiment', 0.10375910435631784)\n ('communication system', 0.09992559060214128)\n ('alert gate', 0.09840496274020913)\n ('profile information', 0.09759491592627616)\n ('user interaction', 0.09348646794377266)\n ('target user', 0.09223900670966888)] not in index"

In [None]:
# This makes lots of sense...

In [43]:
## Cleanup the table a little to show only the key phrases for each year

In [23]:


big_list = []
for i in range(0,18):
    content = return_sorted_vector(tfidf.iloc[i,:])
    words=[]
    for n in range(len(content)):
        if content[n][1] != 0:
            words.append(content[n][0])
        else:
            words.append('None')
      
    big_list.append(words)



In [25]:
df_tfidf=pd.DataFrame(big_list).T
df_tfidf.columns = years
df_tfidf.to_csv('abs_tfidf_words_only_lem.csv',index=False)

In [26]:
df_tfidf

Unnamed: 0,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,digital audio playback device,multidimensional space,secondary identity,original message,communication system,product category,social network,top term,browser application,web page,mobile device,electronic message,character stream,mobile device,client device,client device,network request,access code
1,wireless communication link,recommendationspecific query,primary identity,reply message,email message,news item,digital file,social network,abstract phrase,threshold degree,anchor term,mobile device,social graph,content item,content item,mobile device,rewrite rule,access device
2,user request copy,multidimensional data,user identifier,social network,host system,medium content,travel time,demographic information,executable code segment,candidate user,map view,instant message,interactive element,social network,social graph,target word,natural language user request,remote web browser
3,mobile digital audio player,multidimensional cube,common graphical user interface,mobile telephone,delivers email message,social network environment,trust value,text phrase,data center,particular embodiment access,particular embodiment,content item,content item,future activity,mobile device,social graph,network resource,user identifier
4,enable wireless communication,variable information,secondlevel password,reply source indicator,certain user,real life friend,multiple identity,external system,text phrase,search query,timeline unit,geographic location,electronic message,online system,medium item,content identifier,object graph,present geographic location
5,digital playback device andor,process system,onetime challenge,destination indicator,particular moment,privacy summary,master keywords,picture file,text value,user comment,social network,user profile,social network,client device,online system,user profile,mobile device,content creation event
6,digital audio content,computer software,additional service,memory device,electronic data,social network,data unit,thirdparty application server,response phase,user node,multimedia content,visible portion,digital file,social graph,social network,content item,source video,access device techniques
7,addition digital content,retrieve data,account holder,cpuintensive computational task,instant message,digital medium,temporal period,calendar entry,social network,access code,content object,input tool,medium item,timeline unit,computerreadable medium,computer system,bit rate,geographic location
8,digital audio player,,multiple identity,visibility preference,various type,central computer,gift account,authentication code,social captcha,social network,content item,social graph,noncompletion character,computerreadable medium,user account,online system,antenna element,retrieve message data
9,,,secondary identity interacts,contactability preference,,data item,trust level,user profile,response portion,participant list,search result,user interface,user interface,medium content item,medium presentation system,machine translation system,network access component,message experience


In [40]:
#this is my final keyphrase table

In [None]:
**Facebook first came up with the idea of tracking you down location: 

In [None]:
**mimic human...in 2004**

In [None]:
identity

In [None]:
digital/audio

In [41]:
df[df.abstract.str.contains('mimic')]

Unnamed: 0,abstract,applicant,assignee,date_filed,date_issued,inventor,p_num,title,type,url,filed_date
475,"An audio system, such as an audio system inclu...",,"FACEBOOK, INC.","December 28, 2015","October 17, 2017",Ian Werris,9794691,Using bone transducers to imply positioning of...,Grant,https://patents.justia.com/patent/9794691,2015-12-28
669,An Internet/world-wide-web based keyword bid m...,,"FACEBOOK, INC.","April 12, 2004","July 25, 2017","David Gilbert Carlson, Frank Wouter Watervoort",9715692,System for managing bids for pay-per-click sea...,Grant,https://patents.justia.com/patent/9715692,2004-04-12
