## Use the vocabulary gathered on the semantic scholar dataset to vectorize the data.

- The vocabulary is about 1 million terms
- we don't actually need a 1 million x 2 million matrix

In [2]:
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
import random
import pickle
import pandas as pd

import sys
import time
import csv
sys.path.append("../../tools")
import my_stopwords3

from sys import getsizeof

stop = my_stopwords3.get_stopwords()

In [3]:
vocabulary = pickle.load(open("vocabulary.p", "rb"))

### Semantic scholar

In [3]:
years = list(range(1981, 2021))
vectors = []
for year in years:
    t0 = time.time()
    with open("../../Data/semantic_scholar_cleaned_langdetect/"+str(year)+".txt", "r") as f:
        documents = f.readlines()
        documents = [d.strip() for d in documents] 

    vectorizer = CountVectorizer(strip_accents='unicode',
                             ngram_range=(1,4),
                             vocabulary=vocabulary,
                             stop_words=stop
                            )
    
    vector = vectorizer.fit_transform(documents)
    
    del documents
    
    vector[vector>1] = 1
    summed = np.squeeze(np.asarray(np.sum(vector, axis=0)))
    
    del vector
    
    vectors.append(summed)
    
    del summed
    
    print(year, time.time()-t0)
    
# Turn the vector into a pandas dataframe
# This has to happen because list indexing is waaaaaay too slow
df = pd.DataFrame(vectors, columns=vocabulary) 
df.index = years
pickle.dump(df, open("../stacked_vectors/semantic_scholar.p", "wb"))

del vectors
del df

1981 1.953984260559082
1982 2.189453363418579
1983 2.5410115718841553
1984 2.602524518966675
1985 3.117190361022949
1986 3.5410685539245605
1987 4.0352842807769775
1988 4.608623504638672
1989 5.237196207046509
1990 7.498921155929565
1991 7.936845541000366
1992 8.258606433868408
1993 9.754759311676025
1994 11.668789625167847
1995 13.430156946182251
1996 12.849214315414429
1997 14.286081790924072
1998 16.66336727142334
1999 17.9203839302063


**Get year sizes**

In [10]:
years = list(range(1981, 2021))
document_count_per_year = []
for year in years:
    t0 = time.time()
    with open("../../Data/semantic_scholar_cleaned_langdetect/"+str(year)+".txt", "r") as f:
        documents = f.readlines()
        documents = [d.strip() for d in documents]
        document_count_per_year.append(len(documents))
    
df = pd.DataFrame(document_count_per_year, columns=['documents']) 
df.index = years
pickle.dump(df, open("../stacked_vectors/semantic_scholar_document_count.p", "wb"))

### USPTO



In [3]:

for y in range(2018, 2021):
    vectors = []
    files = []
    for m in ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']:
        filename = str(y)+m
        files.append(filename)
        
        t0 = time.time()
        with open("../../Data/uspto_cleaned/"+str(filename)+".txt", "r") as f:
            documents = f.readlines()
            documents = [d.strip() for d in documents]

        vectorizer = CountVectorizer(strip_accents='unicode',
                                 ngram_range=(1,4),
                                 vocabulary=vocabulary,
                                 stop_words=stop
                                )

        vector = vectorizer.fit_transform(documents)

        del documents

        vector[vector>1] = 1
        summed = np.squeeze(np.asarray(np.sum(vector, axis=0)))

        del vector

        vectors.append(summed)

        del summed
        
        if m=='12':
            # Turn the vector into a pandas dataframe
            # This has to happen because list indexing is waaaaaay too slow
            df = pd.DataFrame(vectors, columns=vocabulary) 
            df.index = files
            pickle.dump(df, open("../stacked_vectors/uspto_"+str(y)+".p", "wb"))            
            
            del df
            del vectors 
        print(filename, time.time()-t0)
        


201801 80.6875684261322
201802 69.93550062179565
201803 87.11116528511047
201804 70.86444330215454
201805 73.76353549957275
201806 77.17390489578247
201807 71.7418704032898
201808 74.30070400238037
201809 66.70476150512695
201810 68.35228276252747
201811 64.8125205039978
201812 156.31264281272888
201901 57.26950144767761
201902 48.47320866584778
201903 55.42344832420349
201904 52.347954511642456
201905 48.598304271698
201906 41.65903568267822
201907 50.00160551071167
201908 33.42923831939697
201909 29.382107257843018
201910 26.390461921691895
201911 21.33091115951538
201912 119.59065771102905
202001 14.68824315071106
202002 15.876067399978638
202003 10.64989447593689
202004 8.24580979347229
202005 6.135050296783447
202006 5.312021017074585
202007 3.696535348892212
202008 1.9552507400512695
202009 1.5853545665740967
202010 1.0329370498657227
202011 0.7327725887298584
202012 92.4698874950409


**Splice these stacked vectors into one vector**


In [6]:
uspto = pickle.load(open("../stacked_vectors/uspto_2006.p", "rb"))
for year in range(2007,2021):
    new = pickle.load(open("../stacked_vectors/uspto_"+str(year)+".p", "rb"))
    uspto = pd.concat([uspto, new])
    
pickle.dump(uspto, open("../stacked_vectors/uspto.p", "wb"))     

**get doc count**

In [3]:
document_count_per_year = []
years = list(range(2006,2021))

for y in range(2006, 2021):
    count = 0
    for m in ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']:
        filename = str(y)+m
        with open("../../Data/uspto_cleaned/"+str(filename)+".txt", "r") as f:
            documents = f.readlines()
            documents = [d.strip() for d in documents]
            count+=len(documents)
    document_count_per_year.append(count)
            
    
df = pd.DataFrame(document_count_per_year, columns=['documents']) 
df.index = years
pickle.dump(df, open("../stacked_vectors/uspto_document_count_by_year.p", "wb"))

In [19]:
df = pd.DataFrame(columns = vocabulary)

In [32]:
for y in range(2006, 2021):
    year_filenames = []
    for m in ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']:
        year_filenames.append(str(y)+m)
    series = uspto.loc[year_filenames].sum()
    series = series.rename(y)
    df = df.append(series)
    

In [34]:
pickle.dump(df, open("../stacked_vectors/uspto_by_year.p", "wb"))  

In [35]:
df

Unnamed: 0,multilingual,bottleneck,feature,improving,asr,performance,code,switched,speech,resourced,...,interface definition language idl,mu double,spl mu double,block motion compensation,ttcn,layout tool,1999a,verification reactive system,embedded bit stream,event list
2006,60,1319,66490,13353,143,36846,29087,9519,2949,5,...,24,0,0,21,1,56,3,0,7,59
2007,66,1283,71218,13970,197,38797,32875,9569,3273,5,...,22,0,0,27,0,39,2,0,16,62
2008,64,1380,74139,14101,151,39308,34879,9626,3215,6,...,22,0,0,22,0,52,2,0,4,78
2009,35,1183,69315,13584,150,36562,31853,9367,2784,9,...,3,0,0,32,0,35,2,0,4,55
2010,58,1342,75721,14570,144,39957,34498,9643,2987,11,...,6,0,0,21,0,44,2,0,4,57
2011,61,1430,86729,16795,148,45513,39234,10498,3666,7,...,13,0,0,23,1,43,3,0,5,53
2012,63,1749,103367,18927,265,52327,48687,11908,4682,14,...,13,0,0,26,0,41,4,0,4,81
2013,68,1875,115690,20987,312,57262,53011,12579,5199,20,...,12,0,0,27,1,46,4,0,4,79
2014,67,1913,120178,22000,316,58763,54035,12731,4998,26,...,13,0,0,36,0,40,2,0,0,79
2015,80,1945,124431,22448,302,60273,57093,12704,5044,28,...,4,0,0,47,1,41,1,0,5,81


### NSF


In [5]:
document_count_per_year = []
funders = ['Direct_For_Computer_&_Info_Scie_&_Enginr', 'Directorate_For_Engineering', 'Direct_For_Mathematical_&_Physical_Scien']

years = list(range(2000, 2021))
vectors = []
for year in years:
    t0 = time.time()
    documents = []
    for funder in funders:
        with open("../../Data/nsf_cleaned/"+funder+'/'+str(year)+".txt", "r") as f:
            docs = f.readlines()
            documents += [d.strip() for d in docs] 
    
    document_count_per_year.append(len(documents))

    vectorizer = CountVectorizer(strip_accents='unicode',
                             ngram_range=(1,4),
                             vocabulary=vocabulary,
                             stop_words=stop
                            )
    
    vector = vectorizer.fit_transform(documents)
    
    del documents
    
    vector[vector>1] = 1
    summed = np.squeeze(np.asarray(np.sum(vector, axis=0)))
    
    del vector
    
    vectors.append(summed)
    
    del summed
    
    print(year, time.time()-t0)
    
# Turn the vector into a pandas dataframe
# This has to happen because list indexing is waaaaaay too slow
df = pd.DataFrame(vectors, columns=vocabulary) 
df.index = years
pickle.dump(df, open("../stacked_vectors/nsf.p", "wb"))

del vectors
del df

df = pd.DataFrame(document_count_per_year, columns=['documents']) 
df.index = years
pickle.dump(df, open("../stacked_vectors/nsf_document_count.p", "wb"))

2000 3.6608619689941406
2001 3.271656036376953
2002 3.6599881649017334
2003 4.528121709823608
2004 4.262118577957153
2005 4.344035863876343
2006 4.5587427616119385
2007 5.31236457824707
2008 6.152634859085083
2009 6.9369494915008545
2010 6.028690576553345
2011 5.961461544036865
2012 5.977890968322754
2013 5.619373083114624
2014 6.165205240249634
2015 6.9628005027771
2016 7.047996997833252
2017 6.690347909927368
2018 7.752470254898071
2019 7.742755651473999
2020 7.8365442752838135


In [11]:
pickle.dump(result, open("../stacked_vectors/semantic_scholar.p", "wb"))