In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
import dask
import dask.dataframe as dd
import dask.array as da
# Percentiles
from scipy.stats import percentileofscore
from IPython.display import display

  from ._conv import register_converters as _register_converters


## Inventor mobility effects on citations at new location

1.1 Find new location citations to inventor's old patents and inventor's new patents
- Show new patents have greater rates of citation compared to old patents

1.2 Show rate of citation at new location increases, but doesn't translate to more similar inventions
1. Find new patent's citations in new location
2. Find new patents' similarity to new citations
3. Find new patents' similarity to old patents of same assignee who don't cite new patent

### Find inventors who moved and their patents

In [95]:
yv = "appyear"
# All inventors who have moved
ip = pd.read_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")
print(len(ip))

pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "inv_msa", "gyear", "appyear"])

# Add application year
ip[yv] = ip["patent"].map(dict(zip(pdf["patent"], pdf[yv])))

# Sort by inventor, grant year
ip = ip.sort_values(["inventor_id", yv])

# Only look at inventors' first and second cities
ip = ip.loc[(ip["inv_asg_rank"] <= 1)]

# Inventors' second cities
sc = ip.loc[(ip["inv_asg_rank"] == 1), ["inventor_id", "inv_msa", yv]].drop_duplicates(["inventor_id", "inv_msa"])

# Inventors' second city compared to first
ip["sec_inv_msa"] = ip["inventor_id"].map(dict(zip(sc["inventor_id"], sc["inv_msa"])))

# Second city's first grant year
ip["sec_fyear"] = ip["inventor_id"].map(dict(zip(sc["inventor_id"], sc[yv])))

# Get rid of the inventors whose second MSA matches the first
ip = ip.loc[~(ip["inv_msa"] == ip["sec_inv_msa"])]
print(len(ip))

# Get rid inventors with any missing cities
missing_cities = ip.loc[(ip["inv_msa"].isnull() | ip["sec_inv_msa"].isnull()), "inventor_id"].tolist()
ip = ip.loc[~ip["inventor_id"].isin(missing_cities)]
print(len(ip))

140076
12846
10826


### Citations to each of mobile inventors' patents

In [96]:
cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")

%time c2 = cit[cit["cited"].isin(ip["patent"])].compute()

# Remove self-citations
asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))

%time asg_match = (set(asgs.get(cited, [])).intersection(asgs.get(citing, [])) for cited, citing \
                   in zip(c2["cited"], c2["citing"]))
%time asg_match = [len(i) for i in asg_match]
c2["asg_match"] = asg_match
c2 = c2.loc[c2["asg_match"] == 0]
c2 = c2[["citing", "cited"]]

# Add assignees
c2["cited_asg"] = c2["cited"].map(asgs)
c2["citing_asg"] = c2["citing"].map(asgs)
del(asgs)

print(len(c2))

CPU times: user 2min 14s, sys: 34.7 s, total: 2min 49s
Wall time: 1min 25s
CPU times: user 50 ms, sys: 0 ns, total: 50 ms
Wall time: 47.2 ms
CPU times: user 1.72 s, sys: 0 ns, total: 1.72 s
Wall time: 1.62 s
331597


#### Similarities for citation pairs

In [97]:
def grouper(n, iterable):
    """
    >>> list(grouper(3, 'ABCDEFG'))
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
    """
    iterable = iter(iterable)
    return iter(lambda: list(itertools.islice(iterable, n)), [])


import scipy.spatial.distance as distance
dms = ["ldavecs", "docvecs"]

print("Getting row values")
print(datetime.datetime.now())
pat_dict = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
pat_dict = dict(zip(pat_dict, range(len(pat_dict))))

l2 = pd.DataFrame({"tp": c2["cited"], "op": c2["citing"]})

for dm in dms:
    print((dm,"started"))
    print("Loading matrix and dict")
    print(datetime.datetime.now())
    # Store copy as array
    l3 = l2.loc[l2["tp"].isin(pat_dict.keys()) & l2["op"].isin(pat_dict.keys()), ["tp", "op"]].copy()

    if dm == "ldavecs":
        ncols = 60
    else:
        ncols = 100

    pm = fastparquet.ParquetFile("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm))\
.to_pandas().values

    # Convert to chunks
    print("Getting chunks")
    print(datetime.datetime.now())
    # Split into chunks
    n_rows = 3000
    n_chunks = int(np.round(len(l3)/n_rows))
    tp_chunks = grouper(n_rows, pm[[pat_dict[p[1]] for p in l3["tp"].iteritems()]])
    op_chunks = grouper(n_rows, pm[[pat_dict[p[1]] for p in l3["op"].iteritems()]])
    del(pm)
    chunks = itertools.zip_longest(tp_chunks, op_chunks)

    print("Getting patent pair cosine similarity")
    print(datetime.datetime.now())
    # Cosine

    cos_dis = np.empty(len(l3))

    for r, c in enumerate(chunks):
        cos_dis[r*n_rows:r*n_rows+n_rows] = np.diag(distance.cdist(c[0],c[1], metric = "cosine"))

    l3["sim_{0}".format(dm)] = 1-cos_dis
    
    # Rename columns
    l3 = l3.rename(columns={"tp": "cited", "op": "citing"})
    c2 = c2.merge(l3, how = "left", on = ["cited", "citing"])
    del(l3)
    print("finished")
    print(datetime.datetime.now())

Getting row values
2018-09-12 17:18:15.702909
('ldavecs', 'started')
Loading matrix and dict
2018-09-12 17:18:18.019595
Getting chunks
2018-09-12 17:18:27.765365
Getting patent pair cosine similarity
2018-09-12 17:18:28.850094
finished
2018-09-12 17:18:48.055166
('docvecs', 'started')
Loading matrix and dict
2018-09-12 17:18:48.055292
Getting chunks
2018-09-12 17:19:06.406469
Getting patent pair cosine similarity
2018-09-12 17:19:08.236596
finished
2018-09-12 17:19:34.887668


In [98]:
# Get location for each patent
# pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
# .to_pandas(["patent", "inv_msa", "gyear", "appyear"])

# Get MSA of cited patent
c2["cited_inv_msa"] = c2["cited"].map(dict(zip(pdf["patent"], pdf["inv_msa"])))
c2["citing_inv_msa"] = c2["citing"].map(dict(zip(pdf["patent"], pdf["inv_msa"])))

# Get gyear of cited patent
c2["cited_"+yv] = c2["cited"].map(dict(zip(pdf["patent"], pdf[yv])))
c2["citing_"+yv] = c2["citing"].map(dict(zip(pdf["patent"], pdf[yv])))
del(pdf)

# Get second cities for each patent
c2["sec_inv_msa"] = c2["cited"].map(dict(zip(ip["patent"], ip["sec_inv_msa"])))

# Get second cities first grant year for each patent
c2["sec_fyear"] = c2["cited"].map(dict(zip(ip["patent"], ip["sec_fyear"])))

# Matching citing patent MSA to second MSA
c2["sec_inv_msa_match"] = (c2["citing_inv_msa"] == c2["sec_inv_msa"])

# Match rate to second MSA
# Before move
prior = c2.loc[(c2["citing_"+yv] < c2["sec_fyear"]), ["cited", "sec_inv_msa_match"]].groupby("cited").mean()
# After move
post = c2.loc[(c2["citing_"+yv] >= c2["sec_fyear"]), ["cited", "sec_inv_msa_match"]].groupby("cited").mean()

# Average similarity before and after move
for c in ["sim_docvecs", "sim_ldavecs"]:
    # Average of second MSA citations similarity to cited patent, before and after move
    c3 = c2.loc[(c2["citing_"+yv] < c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True),
                ["cited", c]].groupby("cited").mean()
    c4 = c2.loc[(c2["citing_"+yv] >= c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True),
                ["cited", c]].groupby("cited").mean()
    prior = pd.concat([prior, c3], axis=1)
    post = pd.concat([post, c4], axis=1)

In [99]:
# Get match rate to second MSA for each patent
for c in ["sec_inv_msa_match", "sim_docvecs", "sim_ldavecs"]:
    ip["{0}_prior".format(c)] = ip["patent"].map(prior[c])
    ip["{0}_post".format(c)] = ip["patent"].map(post[c])
print(len(ip))
# Get rid of the inventors whose second MSA matches the first
ip = ip.loc[~(ip["inv_msa"] == ip["sec_inv_msa"])]
print(len(ip))

10826
10826


In [100]:
ip[["inv_asg_rank", "sec_inv_msa_match_prior", "sec_inv_msa_match_post",
   "sim_docvecs_prior", "sim_docvecs_post"]].groupby("inv_asg_rank").mean()

Unnamed: 0_level_0,sec_inv_msa_match_prior,sec_inv_msa_match_post,sim_docvecs_prior,sim_docvecs_post
inv_asg_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.053176,0.088089,0.311498,0.311007
1,0.083393,0.070523,0.317783,0.299359


In [101]:
c2.to_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")
ip.to_pickle("DataStore/2018-08/inv_move_pats_0912.pkl")

### Compare new assignees at new location's similarity to assignees who already cite prior patent

In [78]:
yv = "appyear"
c2 = pd.read_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")
ip = pd.read_pickle("DataStore/2018-08/inv_move_pats_0912.pkl")

# Use unique assignees
c2 = c2.drop(["cited_asg", "citing_asg"],1)
asgs = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_unique_0628.parq").to_pandas(["patent", "assignee_id"])
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "primclass", "appyear"])
pdf = pdf.merge(asgs, how = "left", on = "patent")

c2 = c2.merge(asgs, how="left", left_on="citing", right_on="patent").rename(columns={"assignee_id": "citing_asg"}).drop("patent",1)
c2 = c2.merge(asgs, how="left", left_on="cited", right_on="patent").rename(columns={"assignee_id": "cited_asg"}).drop("patent",1)
del(asgs)

# New firms that cite prior patent post move
a1 = c2.loc[(c2["citing_appyear"] < c2["sec_fyear"]), "citing_asg"].tolist()
a2 = c2.loc[(c2["citing_appyear"] >= c2["sec_fyear"]), "citing_asg"].tolist()
new_cite_asgs = list(set(a2).difference(set(a1)))
prev_cite_asgs = list(set(a2).intersection(set(a1)))

In [80]:
csim = {}
csim["prev_prior"] = c2.loc[(c2["citing_"+yv] < c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True)\
            & c2["citing_asg"].isin(prev_cite_asgs),
            ["cited", "sim_docvecs", "sim_ldavecs"]].groupby("cited").mean().add_prefix("prev_prior").reset_index()
csim["prev_post"] = c2.loc[(c2["citing_"+yv] >= c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True)\
            & c2["citing_asg"].isin(prev_cite_asgs),
            ["cited", "sim_docvecs", "sim_ldavecs"]].groupby("cited").mean().add_prefix("prev_post").reset_index()
csim["new_post"] = c2.loc[(c2["citing_"+yv] >= c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True)\
            & c2["citing_asg"].isin(new_cite_asgs),
            ["cited", "sim_docvecs", "sim_ldavecs"]].groupby("cited").mean().add_prefix("new_post_").reset_index()

In [84]:
ip.columns

Index(['patent', 'inventor_id', 'location_id', 'city', 'state', 'country',
       'inv_msa', 'gyear', 'assignee_id', 'inv_asg_rank', 'appyear',
       'sec_inv_msa', 'sec_fyear', 'sec_inv_msa_match_prior',
       'sec_inv_msa_match_post', 'sim_docvecs_prior', 'sim_docvecs_post',
       'sim_ldavecs_prior', 'sim_ldavecs_post', 'prev_priorsim_docvecs',
       'prev_priorsim_ldavecs', 'prev_postsim_docvecs', 'prev_postsim_ldavecs',
       'new_post_sim_docvecs', 'new_post_sim_ldavecs'],
      dtype='object')

In [82]:
# Get match rate to second MSA for each patent
for k,v in csim.items():
    ip = ip.merge(v, how="left", left_on="patent", right_on="cited").drop("cited",1)

In [85]:
ip[['sec_inv_msa_match_prior',
       'sec_inv_msa_match_post', 'sim_docvecs_prior', 'sim_docvecs_post',
       'sim_ldavecs_prior', 'sim_ldavecs_post', 'prev_priorsim_docvecs',
       'prev_priorsim_ldavecs', 'prev_postsim_docvecs', 'prev_postsim_ldavecs',
       'new_post_sim_docvecs', 'new_post_sim_ldavecs']].mean()

sec_inv_msa_match_prior    0.057886
sec_inv_msa_match_post     0.084252
sim_docvecs_prior          0.312521
sim_docvecs_post           0.308676
sim_ldavecs_prior          0.550057
sim_ldavecs_post           0.527295
prev_priorsim_docvecs      0.312827
prev_priorsim_ldavecs      0.556394
prev_postsim_docvecs       0.291704
prev_postsim_ldavecs       0.526657
new_post_sim_docvecs       0.316823
new_post_sim_ldavecs       0.524154
dtype: float64

### Alternatively: Find control patent for each post-move citation
- Find new firms that cite prior patent post-move
- For each post-move citation at second MSA to inventor's prior move patent, collect all prior 5 year patents from the new citations.
- Find "pre-move" control in same primary class as citation
- Compare their similarities to the prior patent

In [2]:
yv = "appyear"
c2 = pd.read_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")
ip = pd.read_pickle("DataStore/2018-08/inv_move_pats_0912.pkl")

In [4]:
# Use unique assignees
c2 = c2.drop(["cited_asg", "citing_asg"],1)
asgs = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_unique_0628.parq").to_pandas(["patent", "assignee_id"])
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "primclass", "appyear"])
pdf = pdf.merge(asgs, how = "left", on = "patent")

c2 = c2.merge(asgs, how="left", left_on="citing", right_on="patent").rename(columns={"assignee_id": "citing_asg"}).drop("patent",1)
c2 = c2.merge(asgs, how="left", left_on="cited", right_on="patent").rename(columns={"assignee_id": "cited_asg"}).drop("patent",1)
del(asgs)
# New firms that cite prior patent post move
a1 = c2.loc[(c2["citing_appyear"] < c2["sec_fyear"]), "citing_asg"].tolist()
a2 = c2.loc[(c2["citing_appyear"] >= c2["sec_fyear"]), "citing_asg"].tolist()
new_cite_asgs = list(set(a2).difference(set(a1)))

# New cites
c3 = c2.loc[(c2["citing_appyear"] >= c2["sec_fyear"]) & c2["citing_asg"].isin(new_cite_asgs)]

# Merge new citing patents with pdf
c3["citing_primclass"] = c3["patent"].map(dict(zip(pdf["patent"], pdf["primclass"])))

In [19]:
# Patents by newly citing assignees
pdf = pdf.loc[pdf["assignee_id"].isin(new_cite_asgs)]
len(pdf)

# Sort by assignee, primclass, app year
pdf = pdf.sort_values(["assignee_id", "primclass", yv], ascending = [1,1,0])

# Groupby assignee, primclass, app year
p2 = pdf.groupby(["assignee_id", "primclass"])

# Control patent by assignee, primclass and by assignee
cdict = {}
adict = {}
for yr in range(1975, 2016):
    print(yr)
    print(datetime.datetime.now())
    # Control patent by assignee, primclass
    p2 = pdf.loc[pdf["appyear"] <= yr].groupby(["assignee_id", "primclass"])
    p2 = {n+(yr,): (g["patent"].tolist()[0] if len(g["patent"].tolist()) >= 1 else None) for n,g in p2}
    cdict.update(p2)
    del(p2)
    
    # Control patent by assignee
    p2 = pdf.loc[pdf["appyear"] <= yr].groupby(["assignee_id"])
    p2 = {n+(yr,): (g["patent"].tolist()[0] if len(g["patent"].tolist()) >= 1 else None) for n,g in p2}
    adict.update(p2)
    del(p2)

(1975, datetime.datetime(2018, 9, 12, 18, 28, 51, 635257))
(1976, datetime.datetime(2018, 9, 12, 18, 28, 52, 135947))
(1977, datetime.datetime(2018, 9, 12, 18, 28, 52, 835931))
(1978, datetime.datetime(2018, 9, 12, 18, 28, 53, 726811))
(1979, datetime.datetime(2018, 9, 12, 18, 28, 55, 496199))
(1980, datetime.datetime(2018, 9, 12, 18, 28, 56, 776997))
(1981, datetime.datetime(2018, 9, 12, 18, 28, 59, 236089))
(1982, datetime.datetime(2018, 9, 12, 18, 29, 0, 969631))
(1983, datetime.datetime(2018, 9, 12, 18, 29, 3, 8536))
(1984, datetime.datetime(2018, 9, 12, 18, 29, 4, 930146))
(1985, datetime.datetime(2018, 9, 12, 18, 29, 7, 101588))
(1986, datetime.datetime(2018, 9, 12, 18, 29, 9, 467290))
(1987, datetime.datetime(2018, 9, 12, 18, 29, 11, 997276))
(1988, datetime.datetime(2018, 9, 12, 18, 29, 14, 729875))
(1989, datetime.datetime(2018, 9, 12, 18, 29, 17, 744897))
(1990, datetime.datetime(2018, 9, 12, 18, 29, 21, 451266))
(1991, datetime.datetime(2018, 9, 12, 18, 29, 25, 27712))
(1992

In [36]:
adict = {}
for yr in range(1975, 2016):
    print(yr)
    print(datetime.datetime.now())
    # Control patent by assignee
    p2 = pdf.loc[pdf["appyear"] <= yr].groupby(["assignee_id"])
    p2 = {(n,yr): (g["patent"].tolist()[0] if len(g["patent"].tolist()) >= 1 else None) for n,g in p2}
    adict.update(p2)
    del(p2)

1975
2018-09-12 18:40:39.833854
1976
2018-09-12 18:40:40.281794
1977
2018-09-12 18:40:40.829632
1978
2018-09-12 18:40:41.458402
1979
2018-09-12 18:40:41.817494
1980
2018-09-12 18:40:42.116286
1981
2018-09-12 18:40:42.429928
1982
2018-09-12 18:40:42.777987
1983
2018-09-12 18:40:43.177171
1984
2018-09-12 18:40:43.583857
1985
2018-09-12 18:40:44.020038
1986
2018-09-12 18:40:44.491436
1987
2018-09-12 18:40:45.025346
1988
2018-09-12 18:40:45.604174
1989
2018-09-12 18:40:46.243011
1990
2018-09-12 18:40:46.963860
1991
2018-09-12 18:40:48.049021
1992
2018-09-12 18:40:48.867267
1993
2018-09-12 18:40:49.760100
1994
2018-09-12 18:40:51.412896
1995
2018-09-12 18:40:52.493863
1996
2018-09-12 18:40:53.827819
1997
2018-09-12 18:40:55.554711
1998
2018-09-12 18:40:56.962803
1999
2018-09-12 18:40:58.554763
2000
2018-09-12 18:41:00.336829
2001
2018-09-12 18:41:02.270491
2002
2018-09-12 18:41:04.289600
2003
2018-09-12 18:41:06.791542
2004
2018-09-12 18:41:09.193989
2005
2018-09-12 18:41:11.706246
2006
201

In [56]:
c3["citing_control_asg_pc"] = [cdict.get((asg, pc, fyr)) for asg,pc,fyr in \
                        zip(c3["citing_asg"], c3["citing_primclass"], c3["sec_fyear"])]
c3["citing_control_asg"] = [adict.get((asg, fyr)) for asg,fyr in \
                        zip(c3["citing_asg"], c3["sec_fyear"])]

In [57]:
c3.to_pickle("DataStore/2018-08/post_move_new_asgs_cites_0912.pkl")

In [71]:
def grouper(n, iterable):
    """
    >>> list(grouper(3, 'ABCDEFG'))
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
    """
    iterable = iter(iterable)
    return iter(lambda: list(itertools.islice(iterable, n)), [])


import scipy.spatial.distance as distance
dms = ["ldavecs", "docvecs"]

print("Getting row values")
print(datetime.datetime.now())
pat_dict = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
pat_dict = dict(zip(pat_dict, range(len(pat_dict))))


l2 = c3.copy()

for dm in dms:
    print((dm,"started"))
    print("Loading matrix and dict")
    print(datetime.datetime.now())
    
    if dm == "ldavecs":
        ncols = 60
    else:
        ncols = 100

    pm = fastparquet.ParquetFile("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm))\
    .to_pandas().values
    
    for col in ["citing", "citing_control_asg_pc", "citing_control_asg"]:
        l3 = pd.DataFrame({"tp": c3["cited"], "op": c3[col]})
        l3 = l3.dropna(how="any").drop_duplicates()
        # Store copy as array
        l3 = l3.loc[l3["tp"].isin(pat_dict.keys()) & l3["op"].isin(pat_dict.keys())]
        print(len(l3))

        # Convert to chunks
        print("Getting chunks")
        print(datetime.datetime.now())
        # Split into chunks
        n_rows = 3000
        n_chunks = int(np.round(len(l3)/n_rows))
        tp_chunks = grouper(n_rows, pm[[pat_dict[p[1]] for p in l3["tp"].iteritems()]])
        op_chunks = grouper(n_rows, pm[[pat_dict[p[1]] for p in l3["op"].iteritems()]])
        
        chunks = itertools.zip_longest(tp_chunks, op_chunks)

        print("Getting patent pair cosine similarity")
        print(datetime.datetime.now())
        # Cosine

        cos_dis = np.empty(len(l3))

        for r, c in enumerate(chunks):
            cos_dis[r*n_rows:r*n_rows+n_rows] = np.diag(distance.cdist(c[0],c[1], metric = "cosine"))

        l3["sim_{0}_{1}".format(dm,col)] = 1-cos_dis

        # Rename columns
        l3 = l3.rename(columns={"tp": "cited", "op": col})
        l2 = l2.merge(l3, how = "left", on = ["cited", col])
        print(len(l2))
        del(l3)
        print("finished")
        print(datetime.datetime.now())
    del(pm)

Getting row values
2018-09-12 19:00:47.175140
('ldavecs', 'started')
Loading matrix and dict
2018-09-12 19:00:48.730896
20279
Getting chunks
2018-09-12 19:01:00.924875
Getting patent pair cosine similarity
2018-09-12 19:01:01.124369
27817
finished
2018-09-12 19:01:08.122148
3155
Getting chunks
2018-09-12 19:01:08.680456
Getting patent pair cosine similarity
2018-09-12 19:01:08.713049
27817
finished
2018-09-12 19:01:09.393616
5177
Getting chunks
2018-09-12 19:01:10.158479
Getting patent pair cosine similarity
2018-09-12 19:01:10.209352
27817
finished
2018-09-12 19:01:10.764584
('docvecs', 'started')
Loading matrix and dict
2018-09-12 19:01:10.844574
20279
Getting chunks
2018-09-12 19:01:27.229880
Getting patent pair cosine similarity
2018-09-12 19:01:27.420612
27817
finished
2018-09-12 19:01:30.879961
3155
Getting chunks
2018-09-12 19:01:31.411230
Getting patent pair cosine similarity
2018-09-12 19:01:31.446447
27817
finished
2018-09-12 19:01:32.018743
5177
Getting chunks
2018-09-12 19:

In [76]:
c3 = l2
c4 = c3[["cited", "sim_docvecs_citing", "sim_docvecs_citing_control_asg_pc", "sim_docvecs_citing_control_asg"]]\
.groupby("cited").mean()

In [74]:
l2.dropna(subset=["sim_docvecs_citing", "sim_docvecs_citing_control_asg_pc"], how="any")\
[["sim_docvecs_citing", "sim_docvecs_citing_control_asg_pc"]].mean()

sim_docvecs_citing                   0.296676
sim_docvecs_citing_control_asg_pc    0.259042
dtype: float64

In [75]:
l2.dropna(subset=["sim_docvecs_citing", "sim_docvecs_citing_control_asg"], how="any")\
[["sim_docvecs_citing", "sim_docvecs_citing_control_asg"]].mean()

sim_docvecs_citing                0.281617
sim_docvecs_citing_control_asg    0.176468
dtype: float64

In [77]:
c4

Unnamed: 0_level_0,sim_docvecs_citing,sim_docvecs_citing_control_asg_pc,sim_docvecs_citing_control_asg
cited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3930285,0.152503,-0.031136,0.310015
3932329,0.248710,,0.087314
3932330,0.351866,0.350457,-0.027972
3932797,0.434263,,
3933628,0.286781,,
3934161,0.301041,,
3934528,0.441375,0.161984,0.055646
3934617,0.276021,,0.094469
3934618,0.146126,,0.002138
3934727,0.399669,,
