### Imports

In [1]:
import pandas as pd
import re
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Connect to MongoDB

In [2]:
client = MongoClient()
sci = client.metis_p4_db.science

In [5]:
year_2017 = sci.find({'year': 2017})

In [6]:
year_2017[0]

{'_id': ObjectId('59ff7729127ab46797dafb56'),
 'description': 'Producing mass quantities of chemicals has its roots in the industrial revolution. But industrial synthesis leads to sizeable sustainability and socioeconomic challenges. The rapid advances in biotechnology suggest that biological manufacturing may soon be a feasible alternative, but can it produce chemicals at scale? Clomburg et al. review the progress made in industrial biomanufacturing, including the tradeoffs between highly tunable biocatalysts and units of scale. The biological conversion of single-carbon compounds such as methane, for example, has served as a testbed for more sustainable, decentralized production of desirable compounds.',
 'title': 'Industrial biomanufacturing: The future of chemical production',
 'year': 2017}

In [7]:
all_descriptions = []

for record in year_2017:
    all_descriptions.append(record['description'])

In [8]:
all_descriptions[:2]

['Producing mass quantities of chemicals has its roots in the industrial revolution. But industrial synthesis leads to sizeable sustainability and socioeconomic challenges. The rapid advances in biotechnology suggest that biological manufacturing may soon be a feasible alternative, but can it produce chemicals at scale? Clomburg et al. review the progress made in industrial biomanufacturing, including the tradeoffs between highly tunable biocatalysts and units of scale. The biological conversion of single-carbon compounds such as methane, for example, has served as a testbed for more sustainable, decentralized production of desirable compounds.',
 'The human genome generates many thousands of long noncoding RNAs (lncRNAs). A very small number of lncRNAs have been shown to be functional. Liu et al. carried out a large-scale CRISPR-based screen to assess the function of ∼17,000 lncRNAs in seven different human cell lines. A considerable number (∼500) of the tested lncRNAs influenced cell

In [9]:
len(all_descriptions)

1492

## Workflow

#### Write function for tokenizer

In [10]:
from spacy.en import STOP_WORDS as stopwords
import string
import spacy
nlp = spacy.load('en')
stopwords.update(['et', 'al', "'s", "—", '-'])
punctuations = string.punctuation

In [11]:
def spacy_tokenizer_1(doc):
    doc = doc.replace('—', '').replace(',', '').replace("'s", '')
    doc = re.sub('\s\d+\s', '', doc)
    tokens = nlp(doc)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]     
    return tokens

In [12]:
def spacy_tokenizer_2(doc):
    doc = doc.replace('—', '').replace(',', '').replace("'s", '')
    doc = re.sub('\s\d+(\s)?', '', doc)
    tokens = nlp(doc)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]     
    return tokens

In [13]:
def spacy_tokenizer_3(doc):
    doc = doc.replace('—', '').replace(',', '').replace("'s", '')
    # doc = re.sub('\s\d+(\s)?(,)?', ' ', doc)
    doc = re.sub('\D\d+(\D)?', ' ', doc)
    doc = doc.replace('–', '')
    tokens = nlp(doc)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]     
    return tokens

#### Functions for vectorizing

In [14]:
def count_vectorizer(tokenizer, max_feat, X, ngram_start=1, ngram_stop=2, max_df=0.6):

    vectorizer = CountVectorizer(tokenizer = tokenizer,
                                 ngram_range=(ngram_start, ngram_stop),
                                 max_df = max_df,
                                 max_features=max_feat)
    return vectorizer, vectorizer.fit_transform(X)

In [15]:
def tfidf_vectorizer(tokenizer, max_feat, X, ngram_start=1, ngram_stop=2, max_df=0.6):

    vectorizer = TfidfVectorizer(tokenizer = tokenizer,
                                 ngram_range=(ngram_start, ngram_stop),
                                 max_df = max_df,
                                 max_features=max_feat)
    return vectorizer, vectorizer.fit_transform(X)

#### Write functions for models

In [16]:
from sklearn.decomposition import NMF, TruncatedSVD

In [17]:
def lda_cv(X, n_comp, n_iter = 10):
    lda = LatentDirichletAllocation(n_components=n_comp,
                                    max_iter=n_iter,
                                    random_state=42,
                                    learning_method='online')
    return lda, lda.fit_transform(X)

def lda_tfidf(X, n_comp, n_iter = 10):
    lda = LatentDirichletAllocation(n_components=n_comp,
                                    max_iter=n_iter,
                                    random_state=42,
                                    learning_method='online')
    return lda, lda.fit_transform(X)

In [18]:
def lsa_tfidf(X, n_comp):
    lsa = TruncatedSVD(n_components=n_comp, random_state=42)
    return lsa, lsa.fit_transform(X)
def lsa_cv(X, n_comp):
    lsa = TruncatedSVD(n_components=n_comp, random_state=42)
    return lsa, lsa.fit_transform(X)
def nmf_tfidf(X, n_comp):
    nmf = NMF(n_components=n_comp, random_state=42)
    return nmf, nmf.fit_transform(X)
def nmf_cv(X, n_comp):
    nmf = NMF(n_components=n_comp, random_state=42)
    return nmf, nmf.fit_transform(X)

#### function for displaying topics

In [19]:
def display_topics(model, feature_names, no_top_words):
    for ix, topic in enumerate(model.components_):
        print("Topic ", ix)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

#### Function to display modeling results

In [20]:
def use_vectorizer(cv_vect, cv_vect_trans, n_comp=10):
    models = [lsa_cv, nmf_cv, lda_tfidf]
    names = ['LSA', 'NMF', 'LDA']
    i = 0
    for item in models:
        model, model_transformed = item(cv_vect_trans, n_comp=n_comp)
        print('\n\n-------' + names[i] + '------\n\n')
        display_topics(model, cv_vect.get_feature_names(), 10)
        i += 1

# Make different vectorizers and test models (LSA, NMF, LDA)

#### Start with tokenizer 3 as this worked best for a single year

In [21]:
cv_vect1, cv_vect1_transformed = count_vectorizer(spacy_tokenizer_3, 5000, all_descriptions)

In [22]:
use_vectorizer(cv_vect1, cv_vect1_transformed)



-------LSA------


Topic  0
cell use protein new human science study year gene cancer
Topic  1
cell protein t cancer stem cell stem t cell immune gene response
Topic  2
science cell research u.s scientist new president trump agency world
Topic  3
material cell carbon energy quantum use state solar temperature electron
Topic  4
protein science dna structure complex bind chromosome research sequence president
Topic  5
cancer protein c carbon science tumor pd drug patient therapy
Topic  6
cancer material protein information offer drug interest potential laboratory researcher
Topic  7
dna carbon change c climate science gene use chromosome genome
Topic  8
protein carbon year climate c plant change researcher structure million
Topic  9
dna particle cancer dark hole physicist black black hole new gene


-------NMF------


Topic  0
cell stem stem cell single type immune single cell gene cell type tissue
Topic  1
year new particle researcher dark physicist black hole million black hole
Topic

count vectorizer 2

In [23]:
cv_vect2, cv_vect2_transformed = count_vectorizer(spacy_tokenizer_3, 5000, all_descriptions, ngram_stop=4)

In [24]:
use_vectorizer(cv_vect2, cv_vect2_transformed)



-------LSA------


Topic  0
cell use protein new human science study year gene cancer
Topic  1
cell protein t cancer stem cell t cell stem immune pd response
Topic  2
material information potential researcher interest offer laboratory apparatus newly instrumentation
Topic  3
science cell research u.s scientist new president trump world agency
Topic  4
carbon cell material energy use quantum c state climate temperature
Topic  5
cancer c pd therapy patient t tumor t cell response drug
Topic  6
protein science structure dna pd cancer bind complex c target
Topic  7
dna carbon change gene c climate science genome chromosome histone
Topic  8
dna particle year hole black black hole new physicist dark protein
Topic  9
dna cancer quantum use gene science material state particle light


-------NMF------


Topic  0
cell stem stem cell type single immune single cell tissue gene cell type
Topic  1
particle black hole black hole dark physicist matter year new massive
Topic  2
researcher material i

count vectorizer 3

In [25]:
cv_vect3, cv_vect3_transformed = count_vectorizer(spacy_tokenizer_3, 110000, all_descriptions)

In [26]:
use_vectorizer(cv_vect3, cv_vect3_transformed)



-------LSA------


Topic  0
cell use protein new human science study year gene issue
Topic  1
cell protein t cancer stem cell stem t cell immune gene response
Topic  2
science cell research u.s scientist new president trump agency world
Topic  3
cell material carbon energy quantum use solar state temperature climate
Topic  4
protein science dna structure complex bind chromosome research histone nucleosome
Topic  5
protein cancer drug science c patient tumor therapy target pd
Topic  6
carbon change climate dna c gene use emission histone plant
Topic  7
material cancer information dna potential newly interest offer laboratory apparatus
Topic  8
particle dark physicist hole black black hole cancer new dna matter
Topic  9
protein carbon c researcher climate plant year potential change apparatus


-------NMF------


Topic  0
cell stem stem cell single type immune single cell tissue cell type gene
Topic  1
disease vaccine study brain virus cause new hiv neuron health
Topic  2
science resea

### Lets try tfidf vectorizer to compare
tfidf vectorizer 1

In [27]:
tf_vect1, tf_vect1_transformed = tfidf_vectorizer(spacy_tokenizer_3, 5000, all_descriptions)

### NMF GOOD

In [28]:
use_vectorizer(tf_vect1, tf_vect1_transformed)



-------LSA------


Topic  0
roundup information newly apparatus laboratory newly offer roundup information offer instrumentation weekly weekly roundup laboratory material interest researcher
Topic  1
cell science human use new protein research study year change
Topic  2
cell protein dna gene chromosome structure cancer mouse function expression
Topic  3
cell science cancer research gene patient u.s stem cell stem scientist
Topic  4
dna human genome neandertal modern ancient protein gene sequence specie
Topic  5
dna protein science structure chromosome research nucleosome bind quantum u.s
Topic  6
carbon climate c change plant dna cell protein climate change emission
Topic  7
cancer protein drug vaccine patient tumor carbon disease bond therapy
Topic  8
dna cancer neandertal carbon cell ancient solar year hydrogen black hole
Topic  9
dna cancer chromosome quantum change tumor genome nucleosome histone epigenetic


-------NMF------


Topic  0
instrumentation apparatus information newly

tfidf vectorizer 2

In [29]:
tf_vect2, tf_vect2_transformed = tfidf_vectorizer(spacy_tokenizer_3, 5000, all_descriptions, ngram_stop=4)

In [30]:
use_vectorizer(tf_vect2, tf_vect2_transformed)



-------LSA------


Topic  0
weekly roundup information newly interest researcher laboratory material potential interest laboratory material potential material potential interest researcher roundup information newly offer apparatus laboratory apparatus laboratory material apparatus laboratory material potential instrumentation apparatus laboratory material
Topic  1
cell science human use new protein research study year change
Topic  2
cell protein dna gene chromosome structure cancer function mouse expression
Topic  3
quantum material carbon electron light energy use temperature bond hydrogen
Topic  4
human dna genome neandertal modern ancient specie gene year sequence
Topic  5
dna protein science chromosome structure bind nucleosome research complex strand
Topic  6
carbon climate c cell change plant dna climate change protein emission
Topic  7
cancer protein drug patient vaccine carbon tumor disease bond trial
Topic  8
dna cancer neandertal carbon cell solar hydrogen ancient year hol

tfidf vectorizer 3

In [31]:
tf_vect3, tf_vect3_transformed = tfidf_vectorizer(spacy_tokenizer_3, 110000, all_descriptions)

In [32]:
use_vectorizer(tf_vect3, tf_vect3_transformed)



-------LSA------


Topic  0
roundup information offer instrumentation material potential potential interest information newly weekly instrumentation apparatus interest researcher laboratory material newly offer
Topic  1
cell protein science human use new cancer study dna gene
Topic  2
cell protein cancer pd t cell t gene immune mouse tumor
Topic  3
pd cancer science cell t cell therapy t research patient tumor
Topic  4
pd c carbon t cell t therapy quantum cancer tcr pd therapy
Topic  5
cell science perovskite stem cell quantum solar cell stem material solar research
Topic  6
dna histone science nucleosome epigenetic silence trump chromosome replication pd
Topic  7
carbon cell climate plant change c atmosphere soil specie climate change
Topic  8
quantum cancer gene neandertal spin particle epigenetic histone hole state
Topic  9
neandertal dna black hole hole modern modern human black ancient protein ancestor


-------NMF------


Topic  0
instrumentation apparatus newly offer offer ins

#### Try tokenizer 1 to compare

In [33]:
cv_vect1, cv_vect1_transformed = count_vectorizer(spacy_tokenizer_1, 5000, all_descriptions)

In [34]:
use_vectorizer(cv_vect1, cv_vect1_transformed)



-------LSA------


Topic  0
cell use protein new human science study 1 gene issue
Topic  1
cell protein t cancer stem cell stem t cell immune cell type type
Topic  2
science cell research scientist u.s new president trump world agency
Topic  3
cell material carbon energy quantum science state solar use electron
Topic  4
protein science dna structure bind complex chromosome sequence research histone
Topic  5
researcher material information potential laboratory offer newly apparatus instrumentation offer instrumentation
Topic  6
cancer protein drug patient tumor therapy material pd-1 structure offer
Topic  7
particle new dark physicist hole black black hole light matter cancer
Topic  8
dna carbon change use climate cancer co2 gene base histone
Topic  9
science cancer dna quantum use gene system state issue material


-------NMF------


Topic  0
cell stem stem cell type single immune gene cell type single cell tissue
Topic  1
human new study researcher disease gene modern genome brain v

In [35]:
cv_vect2, cv_vect2_transformed = count_vectorizer(spacy_tokenizer_1, 5000, all_descriptions, ngram_stop=4)

In [36]:
use_vectorizer(cv_vect2, cv_vect2_transformed)



-------LSA------


Topic  0
cell use protein new human science study 1 gene issue
Topic  1
cell protein t cancer t cell stem cell stem immune response cell type
Topic  2
material information potential interest researcher offer laboratory apparatus newly instrumentation
Topic  3
science cell research scientist u.s new researcher president trump world
Topic  4
human cancer disease dna gene patient therapy drug tumor pd-1
Topic  5
cancer pd-1 carbon cd28 t therapy t cell response patient tumor
Topic  6
protein science structure dna bind complex signal pd-1 research receptor
Topic  7
dna particle use dark gene physicist hole black new black hole
Topic  8
carbon change dna climate science co2 gene emission plant chromosome
Topic  9
science dna quantum material issue cancer use system 1 human


-------NMF------


Topic  0
cell stem stem cell type single immune gene cell type single cell tissue
Topic  1
human new study researcher disease gene vaccine brain modern virus
Topic  2
material res

In [37]:
cv_vect3, cv_vect3_transformed = count_vectorizer(spacy_tokenizer_1, 110000, all_descriptions)

In [38]:
use_vectorizer(cv_vect3, cv_vect3_transformed)



-------LSA------


Topic  0
cell use protein new human science study 1 gene issue
Topic  1
cell protein t cancer stem cell stem t cell immune gene cell type
Topic  2
science cell research scientist u.s new president trump world agency
Topic  3
cell material carbon energy quantum solar state temperature electron science
Topic  4
protein dna science structure bind chromosome histone complex nucleosome sequence
Topic  5
researcher dna gene human material new cell particle genome information
Topic  6
carbon change climate gene co2 human plant emission term dna
Topic  7
cancer dna use tumor carbon therapy patient drug material require
Topic  8
particle dark physicist new black protein hole black hole matter experiment
Topic  9
science quantum gene cancer issue 1 system pageof pageof issue state


-------NMF------


Topic  0
cell stem stem cell type single immune cell type single cell tissue gene
Topic  1
new researcher study research scientist vaccine world year health work
Topic  2
scien

### Lets try tfidf vectorizer with tokenizer 2 to compare
tfidf vectorizer 1

In [39]:
tf_vect1, tf_vect1_transformed = tfidf_vectorizer(spacy_tokenizer_2, 5000, all_descriptions)

In [40]:
use_vectorizer(tf_vect1, tf_vect1_transformed)



-------LSA------


Topic  0
material potential instrumentation apparatus roundup information roundup potential interest laboratory material weekly roundup weekly newly offer apparatus laboratory
Topic  1
cell science use human new protein study research change scientist
Topic  2
cell protein dna gene chromosome structure cancer function mouse expression
Topic  3
cell science cancer research gene u.s patient stem cell stem scientist
Topic  4
dna human genome neandertal modern ancient protein gene sequence chromosome
Topic  5
dna protein science chromosome structure bind research nucleosome strand u.s
Topic  6
carbon climate cell change plant co2 climate change dna atmosphere emission
Topic  7
cancer carbon protein drug bond vaccine patient tumor reaction disease
Topic  8
protein brain neuron quantum structure state memory behavior function 1
Topic  9
dna cancer quantum chromosome change climate tumor histone epigenetic nucleosome


-------NMF------


Topic  0
weekly roundup potential 

tfidf vectorizer 2

In [64]:
tf_vect2, tf_vect2_transformed = tfidf_vectorizer(spacy_tokenizer_2, 5000, all_descriptions, ngram_stop=4)

In [65]:
use_vectorizer(tf_vect2, tf_vect2_transformed)



-------LSA------


Topic  0
cell protein new science gene use human research study change
Topic  1
offer instrumentation apparatus laboratory weekly roundup information newly instrumentation apparatus laboratory material instrumentation apparatus offer instrumentation apparatus roundup information newly offer information newly offer instrumentation information newly offer information newly weekly roundup
Topic  2
cell protein gene signal receptor rna t bind expression mouse
Topic  3
cell science research week stem stem cell national u.s cancer university
Topic  4
week story story science download story science download pdf science download pdf entire science download pdf science download entire section roundup week story science roundup week story roundup week
Topic  5
cell quantum stem stem cell electron state t energy t cell material
Topic  6
£ university press university press isbn protein york.pp new york.pp cambridge new
Topic  7
£ cell press university university press climate 

tfidf vectorizer 3

In [66]:
tf_vect3, tf_vect3_transformed = tfidf_vectorizer(spacy_tokenizer_2, 110000, all_descriptions)

In [67]:
use_vectorizer(tf_vect3, tf_vect3_transformed)



-------LSA------


Topic  0
apparatus laboratory weekly roundup newly offer instrumentation apparatus roundup information information newly potential interest offer instrumentation laboratory material interest researcher
Topic  1
cell science new protein use gene human research change study
Topic  2
link roundup pdf entire science download download pdf entire section roundup week follow link week story pdf story science
Topic  3
science roundup recent monthly roundup news project publisher american recent news project science science publisher association advancement advancement science
Topic  4
cell protein news project monthly roundup roundup recent publisher american recent news project science science publisher monthly
Topic  5
cell research science national stem stem cell u.s week scientist new
Topic  6
cell quantum stem stem cell state electron energy spin atom t
Topic  7
cell climate ice change ocean climate change global sea water stem cell
Topic  8
£ press university univers

# RESUME HERE

In [49]:
use_vectorizer(cv_vect3, cv_vect3_transformed, n_comp=11)



-------LSA------


Topic  0
cell protein use gene human new system signal study change
Topic  1
cell t t cell stem stem cell signal immune mouse receptor tissue
Topic  2
protein gene bind rna dna complex structure function signal expression
Topic  3
science gene research human new protein genome disease scientist dna
Topic  4
protein science research structure bind new state membrane complex national
Topic  5
change climate protein ice ocean global climate change water year increase
Topic  6
human disease system virus study immune new infection brain drug
Topic  7
quantum state signal system change gene response receptor control climate
Topic  8
change use dna climate human state quantum specie climate change cell
Topic  9
t t cell virus response rna infection immune dna structure science
Topic  10
state quantum system dna human protein t rna t cell virus


-------NMF------


Topic  0
cell stem stem cell signal tissue cancer type tumor mouse division
Topic  1
use structure high molec

tfidf vectorizer 1

In [421]:
tf_vect1, tf_vect1_transformed = tfidf_vectorizer(spacy_tokenizer_1, 5000, all_descriptions)

In [422]:
use_count_vectorizer(tf_vect1, tf_vect1_transformed)



-------LSA------


Topic  0
apparatus laboratory interest researcher weekly potential interest roundup information offer instrumentation instrumentation apparatus information newly weekly roundup newly offer
Topic  1
cell science new use human research study change gene protein
Topic  2
cell protein t gene t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell research immune disease tumor virus
Topic  4
cell science t t cell quantum material atom research solar electron
Topic  5
protein dna gene rna structure science complex genome enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst plant climate enzyme science methane
Topic  8
protein change climate structure science policy climate change electron state bind
Topic  9
protein ice planet earth surface structure year star solar gravitational


-------NMF------


Topic  0
offer instrumentation interest researcher weekly w

tfidf vectorizer 2

In [423]:
tf_vect2, tf_vect2_transformed = tfidf_vectorizer(spacy_tokenizer_1, 5000, all_descriptions, ngram_stop=4)

In [424]:
use_count_vectorizer(tf_vect2, tf_vect2_transformed)



-------LSA------


Topic  0
potential interest newly offer instrumentation apparatus instrumentation apparatus weekly roundup instrumentation apparatus laboratory instrumentation apparatus laboratory material information newly offer instrumentation weekly roundup information newly offer instrumentation roundup information newly
Topic  1
cell science new use human research study change gene protein
Topic  2
cell protein gene t t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell research immune tumor disease virus
Topic  4
gene specie human change climate plant genome virus genetic rna
Topic  5
protein dna gene rna structure science genome complex enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst plant enzyme climate protein science
Topic  8
protein change climate structure science climate change policy bind electron rna
Topic  9
ice protein planet earth year gravita

tfidf vectorizer 3

In [425]:
tf_vect3, tf_vect3_transformed = tfidf_vectorizer(spacy_tokenizer_1, 110000, all_descriptions)

In [426]:
use_count_vectorizer(tf_vect3, tf_vect3_transformed)



-------LSA------


Topic  0
offer instrumentation apparatus laboratory newly offer instrumentation apparatus weekly roundup weekly potential interest information newly interest researcher roundup information
Topic  1
cell new science use human protein gene change system study
Topic  2
cell t t cell immune protein gene tumor cancer mouse tissue
Topic  3
surface pluto solar system material new horizon atom horizon electron quantum
Topic  4
pluto cell new horizon horizon solar system surface moon solar system t
Topic  5
gene protein virus rna antibody dna human vaccine genome zika
Topic  6
virus antibody zika vaccine infect zika virus viral ebola ebola virus infection
Topic  7
climate change ice climate change specie plant global co2 impact increase
Topic  8
carbon bond c reaction h catalyst – enzyme c – complex
Topic  9
protein structure leucine mtorc1 electron climate change bind complex t


-------NMF------


Topic  0
offer instrumentation weekly roundup laboratory material instrumen

# test with tokenizer 3

count vectorizer 1

In [437]:
cv_vect1, cv_vect1_transformed = count_vectorizer(spacy_tokenizer_3, 5000, all_descriptions)

In [438]:
use_count_vectorizer(cv_vect1, cv_vect1_transformed)



-------LSA------


Topic  0
cell new use science human study year protein research system
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
science cell research new world scientist u.s year national researcher
Topic  3
system surface solar pluto cell material new new horizon horizon datum
Topic  4
gene human protein new system disease virus pluto horizon new horizon
Topic  5
change climate climate change human virus global specie ice cause impact
Topic  6
virus researcher material potential information offer human laboratory instrumentation roundup
Topic  7
virus use zika antibody vaccine infect state atom zika virus quantum
Topic  8
protein virus antibody structure zika vaccine infect science change bind
Topic  9
state system quantum issue science research united page immune page issue


-------NMF------


Topic  0
science research u.s new national world scientist institute health scientific
Topic  1
cell t t cell immune cancer stem mouse stem cell 

try fewer topics

In [439]:
use_count_vectorizer(cv_vect1, cv_vect1_transformed, n_comp=12)



-------LSA------


Topic  0
cell new use science human study year protein research system
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
science cell research new world scientist u.s year national researcher
Topic  3
system surface solar pluto cell material new new horizon horizon datum
Topic  4
gene system protein human new disease virus pluto horizon new horizon
Topic  5
change climate climate change human global virus specie ice cause impact
Topic  6
virus researcher material potential information human offer laboratory instrumentation roundup
Topic  7
virus use zika vaccine antibody state infect atom quantum zika virus
Topic  8
protein virus structure antibody zika vaccine change bind science infect
Topic  9
year carbon protein earth new wave gravitational form ice scientist
Topic  10
state protein quantum time brain study electron change wave year
Topic  11
immune t issue system page t cell page issue human science response


-------NMF------


count vectorizer 2

In [442]:
cv_vect2, cv_vect2_transformed = count_vectorizer(spacy_tokenizer_3, 5000, all_descriptions, ngram_stop=4)

In [443]:
use_count_vectorizer(cv_vect2, cv_vect2_transformed)



-------LSA------


Topic  0
cell new use science human study year protein research system
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system material surface pluto solar use new horizon horizon complex protein
Topic  3
system pluto surface solar new horizon horizon present solar system include complex
Topic  4
cell science new research system pluto solar horizon year new horizon
Topic  5
gene protein human disease virus dna genome new rna genetic
Topic  6
change climate virus human climate change specie global cause ice increase
Topic  7
virus use zika study vaccine infect antibody zika virus atom infection
Topic  8
protein virus antibody structure zika vaccine infect bind zika virus hiv
Topic  9
year carbon protein c new wave gravitational ice earth plant


-------NMF------


Topic  0
use state material atom high quantum issue electron page page issue
Topic  1
cell t t cell immune cancer stem mouse stem cell tumor tissue
Topic  2
system pluto s

count vectorizer 3

In [444]:
cv_vect3, cv_vect3_transformed = count_vectorizer(spacy_tokenizer_3, 110000, all_descriptions)

In [445]:
use_count_vectorizer(cv_vect3, cv_vect3_transformed)



-------LSA------


Topic  0
cell new use science human study year protein research system
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
science cell research new world scientist u.s year national researcher
Topic  3
system surface solar cell material pluto new horizon horizon datum ice
Topic  4
system new gene human disease pluto protein virus horizon new horizon
Topic  5
change climate virus climate change human specie global cause ice impact
Topic  6
virus researcher material use study information human zika potential vaccine
Topic  7
virus antibody protein zika vaccine structure infect zika virus hiv infection
Topic  8
protein change potential climate interest material newly offer material potential apparatus
Topic  9
state system quantum issue science research immune united page page issue


-------NMF------


Topic  0
science research new u.s national world scientist institute health scientific
Topic  1
cell t t cell immune cancer stem mouse 

tfidf vectorizer 1

In [446]:
tf_vect1, tf_vect1_transformed = tfidf_vectorizer(spacy_tokenizer_3, 5000, all_descriptions)

In [447]:
use_count_vectorizer(tf_vect1, tf_vect1_transformed)



-------LSA------


Topic  0
apparatus laboratory weekly roundup laboratory material potential interest instrumentation apparatus offer instrumentation interest researcher weekly newly offer information newly
Topic  1
cell science new use human research year study change gene
Topic  2
cell protein gene t t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell immune research disease tumor virus
Topic  4
cell science t t cell quantum material solar atom state research
Topic  5
climate change cell climate change ice t t cell global impact immune
Topic  6
virus zika vaccine antibody zika virus infect hiv infection viral ebola
Topic  7
carbon reaction c bond catalyst protein science enzyme plant climate
Topic  8
plant year reaction star gravitational planet carbon new researcher wave
Topic  9
protein ice planet surface year dna earth structure climate rna


-------NMF------


Topic  0
instrumentation apparatus roundup information weekly laboratory material newly of

tfidf vectorizer 2

In [448]:
tf_vect2, tf_vect2_transformed = tfidf_vectorizer(spacy_tokenizer_3, 5000, all_descriptions, ngram_stop=4)

In [449]:
use_count_vectorizer(tf_vect2, tf_vect2_transformed)



-------LSA------


Topic  0
instrumentation apparatus laboratory instrumentation apparatus laboratory material laboratory material potential interest newly offer instrumentation material potential interest researcher apparatus laboratory material potential information newly offer information newly offer instrumentation material potential interest laboratory material potential
Topic  1
cell science new use human research year study change gene
Topic  2
cell protein gene t t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell immune research tumor disease virus
Topic  4
cell science t t cell quantum material atom research solar state
Topic  5
climate change cell climate change ice t t cell global impact immune
Topic  6
virus zika vaccine antibody zika virus infect hiv infection viral ebola
Topic  7
carbon reaction c bond catalyst protein enzyme plant science climate
Topic  8
protein change climate science structure policy state climate change electron quantum


tfidf vectorizer 3

In [450]:
tf_vect3, tf_vect3_transformed = tfidf_vectorizer(spacy_tokenizer_3, 110000, all_descriptions)

In [451]:
use_count_vectorizer(tf_vect3, tf_vect3_transformed)



-------LSA------


Topic  0
weekly roundup newly offer roundup information laboratory material offer instrumentation instrumentation apparatus information newly potential interest weekly apparatus laboratory
Topic  1
cell new science use human protein gene change research study
Topic  2
cell t t cell protein immune gene tumor cancer mouse dna
Topic  3
pluto surface solar system new horizon material horizon atom electron quantum
Topic  4
pluto cell new horizon horizon solar system t t cell moon solar system
Topic  5
virus gene protein antibody rna dna vaccine human zika genome
Topic  6
virus antibody vaccine zika infect hiv zika virus viral ebola infection
Topic  7
climate change ice climate change specie plant c global impact temperature
Topic  8
protein carbon complex leucine reaction mtor c bond enzyme science
Topic  9
flow fluid electron plant protein immune graphene response t t cell


-------NMF------


Topic  0
potential interest instrumentation apparatus newly offer laboratory

for 2016, the best topics were obtained from cv vect 1:

-------NMF------


Topic  0
science research u.s new national world scientist institute health scientific
Topic  1
cell t t cell immune cancer stem mouse stem cell tumor tissue
Topic  2
state quantum electron atom magnetic field optical system material structure
Topic  3
system surface pluto solar new horizon new horizon datum solar system present
Topic  4
gene dna genome disease genetic human expression rna mutation cancer
Topic  5
change climate climate change global ice impact policy temperature increase model
Topic  6
virus zika vaccine antibody infect zika virus health human infection disease
Topic  7
material researcher information potential offer laboratory interest newly apparatus material potential
Topic  8
protein structure complex bind function site target rna membrane enzyme
Topic  9
new year study researcher human time scientist work people find
Topic  10
use reaction carbon c bond material high enzyme catalyst metal
Topic  11
issue page plant page issue specie system report response immune mechanism

# end!!

### Test LSA / tune parameters with tokenizer 1

Different vectorizers

In [397]:
lsa_cv_vect, lsa_cv_vect_transformed = count_vectorizer(spacy_tokenizer_1, 5000, all_descriptions)

In [306]:
lsa_cv_vect2, lsa_cv_vect_transformed2 = count_vectorizer(spacy_tokenizer_1, 5000, all_descriptions, ngram_stop=4)

In [307]:
lsa_cv_vect3, lsa_cv_vect_transformed3 = count_vectorizer(spacy_tokenizer_1, 110000, all_descriptions, ngram_stop=4)

In [308]:
lsa_tf_vect, lsa_tf_vect_transformed = tfidf_vectorizer(spacy_tokenizer_1, 5000, all_descriptions)

In [309]:
lsa_tf_vect2, lsa_tf_vect_transformed2 = tfidf_vectorizer(spacy_tokenizer_1, 5000, all_descriptions, ngram_stop=4)

In [310]:
lsa_tf_vect3, lsa_tf_vect_transformed3 = tfidf_vectorizer(spacy_tokenizer_1, 110000, all_descriptions, ngram_stop=4)

Different models

In [311]:
lsa_cv1_model, lsa_cv1_model_transformed = lsa_cv(lsa_cv_vect_transformed, n_comp=10)

In [312]:
display_topics(lsa_cv1_model, lsa_cv_vect.get_feature_names(),10)

Topic  0
cell new use human science study protein research system disease
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
science research cell new world scientist u.s national researcher institute
Topic  3
system surface solar pluto new material new horizon horizon cell datum
Topic  4
gene protein human disease new virus system dna rna genome
Topic  5
change climate climate change human virus global specie ice cause impact
Topic  6
virus researcher material information potential offer human laboratory instrumentation roundup
Topic  7
virus zika vaccine antibody infect use zika virus infection atom report
Topic  8
protein virus structure antibody change climate zika bind vaccine science
Topic  9
1 issue 2 pageof pageof issue science 3 system report immune


In [313]:
lsa_cv1_model2, lsa_cv1_model_transformed2 = lsa_cv(lsa_cv_vect_transformed2, n_comp=10)

In [314]:
display_topics(lsa_cv1_model2, lsa_cv_vect2.get_feature_names(),10)

Topic  0
cell new use human science study protein research system disease
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system surface material pluto solar new horizon horizon solar system complex present
Topic  3
researcher material information potential offer laboratory interest newly apparatus material potential
Topic  4
science cell new research system pluto solar researcher horizon new horizon
Topic  5
gene protein human disease virus dna rna genome genetic new
Topic  6
change climate human virus climate change specie global cause ice response
Topic  7
virus zika vaccine study use infect antibody zika virus human infection
Topic  8
protein virus antibody structure zika vaccine carbon bind complex science
Topic  9
1 issue 2 pageof pageof issue science 3 report system immune


In [315]:
lsa_cv1_model3, lsa_cv1_model_transformed3 = lsa_cv(lsa_cv_vect_transformed3, n_comp=10)

In [316]:
display_topics(lsa_cv1_model3, lsa_cv_vect3.get_feature_names(),10)

Topic  0
cell new use human science study protein system research disease
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system pluto surface solar new horizon horizon solar system present moon analysis
Topic  3
science cell research new world scientist u.s national t institute
Topic  4
researcher material information laboratory offer potential interest apparatus newly roundup
Topic  5
protein gene human disease rna virus dna new genome genetic
Topic  6
science protein structure complex research use state new atom material
Topic  7
change climate protein climate change leucine complex structure mtorc1 science policy
Topic  8
gene science dna expression mutation genome change genetic climate crispr
Topic  9
state human leucine study mtorc1 gene growth time find electron


In [317]:
lsa_tf_model1, lsa_tf_model_transformed1 = lsa_tfidf(lsa_tf_vect_transformed, n_comp=10)

In [318]:
display_topics(lsa_tf_model1, lsa_tf_vect.get_feature_names(),10)

Topic  0
apparatus laboratory interest researcher weekly potential interest roundup information offer instrumentation instrumentation apparatus information newly weekly roundup newly offer
Topic  1
cell science new use human research study change gene protein
Topic  2
cell protein t gene t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell research immune disease tumor virus
Topic  4
cell science t t cell quantum material atom research solar electron
Topic  5
protein dna gene rna structure science complex genome enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst plant climate enzyme science methane
Topic  8
protein change climate structure science policy climate change electron state bind
Topic  9
protein ice planet earth surface structure year star solar gravitational


In [319]:
lsa_tf_model2, lsa_tf_model_transformed2 = lsa_tfidf(lsa_tf_vect_transformed2, n_comp=10)

In [320]:
display_topics(lsa_tf_model2, lsa_tf_vect2.get_feature_names(),10)

Topic  0
potential interest newly offer instrumentation apparatus instrumentation apparatus weekly roundup instrumentation apparatus laboratory instrumentation apparatus laboratory material information newly offer instrumentation weekly roundup information newly offer instrumentation roundup information newly
Topic  1
cell science new use human research study change gene protein
Topic  2
cell protein gene t t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell research immune tumor disease virus
Topic  4
gene specie human change climate plant genome virus genetic rna
Topic  5
protein dna gene rna structure science genome complex enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst plant enzyme climate protein science
Topic  8
protein change climate structure science climate change policy bind electron rna
Topic  9
ice protein planet earth year gravitational star surface w

In [321]:
lsa_tf_model3, lsa_tf_model_transformed3 = lsa_tfidf(lsa_tf_vect_transformed3, n_comp=10)

In [322]:
display_topics(lsa_tf_model3, lsa_tf_vect3.get_feature_names(),10)

Topic  0
interest researcher newly offer instrumentation apparatus material potential interest potential interest researcher material potential interest researcher weekly roundup information newly information newly offer roundup information apparatus laboratory material potential
Topic  1
cell science new human use research change study gene protein
Topic  2
cell t t cell immune protein gene tumor dna cancer expression
Topic  3
pluto surface solar system material carbon new horizon horizon quantum atom
Topic  4
cell pluto new horizon horizon t t cell science solar system new
Topic  5
cell t t cell science quantum material solar atom light state
Topic  6
climate change climate change ice t t cell global specie impact cell
Topic  7
virus zika antibody vaccine zika virus infect infection viral zikv ebola
Topic  8
carbon bond c reaction – protein h c – catalyst enzyme
Topic  9
protein structure climate ice change bind leucine gravitational rna mtorc1


### Test LSA / tune parameters with tokenizer 2

Different vectorizers

In [324]:
lsa_cv_vect, lsa_cv_vect_transformed = count_vectorizer(spacy_tokenizer_2, 5000, all_descriptions)

In [325]:
lsa_cv_vect2, lsa_cv_vect_transformed2 = count_vectorizer(spacy_tokenizer_2, 5000, all_descriptions, ngram_stop=4)

In [326]:
lsa_cv_vect3, lsa_cv_vect_transformed3 = count_vectorizer(spacy_tokenizer_2, 110000, all_descriptions, ngram_stop=4)

In [327]:
lsa_tf_vect, lsa_tf_vect_transformed = tfidf_vectorizer(spacy_tokenizer_2, 5000, all_descriptions)

In [328]:
lsa_tf_vect2, lsa_tf_vect_transformed2 = tfidf_vectorizer(spacy_tokenizer_2, 5000, all_descriptions, ngram_stop=4)

In [329]:
lsa_tf_vect3, lsa_tf_vect_transformed3 = tfidf_vectorizer(spacy_tokenizer_2, 110000, all_descriptions, ngram_stop=4)

Different models

In [330]:
lsa_cv1_model, lsa_cv1_model_transformed = lsa_cv(lsa_cv_vect_transformed, n_comp=10)

In [331]:
display_topics(lsa_cv1_model, lsa_cv_vect.get_feature_names(),10)

Topic  0
cell new use human science study protein research system disease
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
science cell research new world scientist u.s national researcher institute
Topic  3
system surface solar pluto new material cell new horizon horizon datum
Topic  4
gene protein human disease new virus system dna rna pluto
Topic  5
change climate climate change human virus global specie ice cause impact
Topic  6
virus researcher material information potential offer human laboratory instrumentation roundup
Topic  7
virus zika vaccine antibody infect use zika virus infection atom report
Topic  8
protein virus antibody structure change climate zika science bind vaccine
Topic  9
issue 1 pageof pageof issue 2 science system 3 immune report


In [332]:
lsa_cv1_model2, lsa_cv1_model_transformed2 = lsa_cv(lsa_cv_vect_transformed2, n_comp=10)

In [333]:
display_topics(lsa_cv1_model2, lsa_cv_vect2.get_feature_names(),10)

Topic  0
cell new use science human study protein research system disease
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system surface pluto solar new horizon horizon material solar system present moon
Topic  3
material researcher information potential offer interest laboratory newly apparatus material potential
Topic  4
science cell new research researcher scientist world u.s pluto national
Topic  5
gene protein human disease virus dna rna genome genetic new
Topic  6
change climate human virus climate change specie global cause ice increase
Topic  7
virus zika vaccine study use infect antibody zika virus human infection
Topic  8
protein virus antibody structure zika vaccine science bind carbon complex
Topic  9
issue 1 pageof pageof issue 2 – science 3 bond report


In [334]:
lsa_cv1_model3, lsa_cv1_model_transformed3 = lsa_cv(lsa_cv_vect_transformed3, n_comp=10)

In [335]:
display_topics(lsa_cv1_model3, lsa_cv_vect3.get_feature_names(),10)

Topic  0
cell new use human science study protein system research disease
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system pluto surface solar new horizon horizon solar system present moon analysis
Topic  3
science cell research new world scientist u.s national t year
Topic  4
researcher material information laboratory offer potential interest apparatus roundup instrumentation
Topic  5
protein gene human disease rna virus dna new genome genetic
Topic  6
science protein structure complex research use state new atom material
Topic  7
change climate protein climate change leucine complex structure mtorc1 science policy
Topic  8
gene science change dna expression mutation genome climate genetic crispr
Topic  9
state human leucine study mtorc1 growth gene time electron sestrin2


In [336]:
lsa_tf_model1, lsa_tf_model_transformed1 = lsa_tfidf(lsa_tf_vect_transformed, n_comp=10)

In [337]:
display_topics(lsa_tf_model1, lsa_tf_vect.get_feature_names(),10)

Topic  0
potential interest weekly newly offer weekly roundup laboratory material apparatus laboratory offer instrumentation roundup information information newly instrumentation apparatus
Topic  1
cell science new use human research study change gene protein
Topic  2
cell protein gene t t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell research immune disease tumor virus
Topic  4
cell science t t cell quantum material atom research solar electron
Topic  5
protein dna gene rna structure science genome complex enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst climate plant science enzyme methane
Topic  8
protein change climate science structure policy climate change electron state bind
Topic  9
protein ice planet earth surface year structure star solar crater


In [338]:
lsa_tf_model2, lsa_tf_model_transformed2 = lsa_tfidf(lsa_tf_vect_transformed2, n_comp=10)

In [339]:
display_topics(lsa_tf_model2, lsa_tf_vect2.get_feature_names(),10)

Topic  0
apparatus laboratory material information newly offer instrumentation potential interest researcher material potential interest newly offer interest researcher apparatus laboratory material potential material potential interest researcher apparatus laboratory newly offer instrumentation
Topic  1
cell science new use human research study change gene protein
Topic  2
cell protein gene t t cell immune tumor dna cancer expression
Topic  3
cell science cancer t t cell research immune tumor disease virus
Topic  4
cell science t t cell quantum material atom research electron state
Topic  5
protein dna gene rna structure science genome complex enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst plant climate enzyme protein science
Topic  8
protein change climate structure science climate change policy bind electron state
Topic  9
ice protein planet earth year star gravitational surface wave crat

In [340]:
lsa_tf_model3, lsa_tf_model_transformed3 = lsa_tfidf(lsa_tf_vect_transformed3, n_comp=10)

In [341]:
display_topics(lsa_tf_model3, lsa_tf_vect3.get_feature_names(),10)

Topic  0
potential interest offer instrumentation material potential interest researcher information newly offer instrumentation apparatus laboratory material instrumentation apparatus laboratory laboratory material potential interest information newly interest researcher instrumentation apparatus
Topic  1
cell science new human use research change study gene protein
Topic  2
cell t t cell protein immune gene tumor dna cancer expression
Topic  3
pluto surface solar system material new horizon carbon horizon quantum atom
Topic  4
cell pluto new horizon t horizon t cell science solar immune system
Topic  5
cell t t cell science quantum material solar light atom state
Topic  6
climate change climate change ice t t cell global virus specie impact
Topic  7
virus zika antibody vaccine zika virus infect viral infection ebola zikv
Topic  8
carbon bond c reaction protein – h c – enzyme science
Topic  9
protein structure climate ice change bind leucine rna antibody mtorc1


### Test LSA / tune parameters with tokenizer 3

Different vectorizers

In [343]:
lsa_cv_vect, lsa_cv_vect_transformed = count_vectorizer(spacy_tokenizer_3, 5000, all_descriptions)

In [344]:
lsa_cv_vect2, lsa_cv_vect_transformed2 = count_vectorizer(spacy_tokenizer_3, 5000, all_descriptions, ngram_stop=4)

In [345]:
lsa_cv_vect3, lsa_cv_vect_transformed3 = count_vectorizer(spacy_tokenizer_3, 110000, all_descriptions)

In [346]:
lsa_tf_vect, lsa_tf_vect_transformed = tfidf_vectorizer(spacy_tokenizer_3, 5000, all_descriptions)

In [347]:
lsa_tf_vect2, lsa_tf_vect_transformed2 = tfidf_vectorizer(spacy_tokenizer_3, 5000, all_descriptions, ngram_stop=4)

In [348]:
lsa_tf_vect3, lsa_tf_vect_transformed3 = tfidf_vectorizer(spacy_tokenizer_3, 110000, all_descriptions)

Different models

In [349]:
lsa_cv1_model, lsa_cv1_model_transformed = lsa_cv(lsa_cv_vect_transformed, n_comp=10)

In [350]:
display_topics(lsa_cv1_model, lsa_cv_vect.get_feature_names(),10)

Topic  0
cell new use science human study protein research year system
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
science cell research new scientist world u.s researcher national year
Topic  3
system surface solar pluto material new cell new horizon horizon datum
Topic  4
gene protein human disease new system virus pluto dna rna
Topic  5
change climate climate change human virus global specie ice cause impact
Topic  6
researcher virus material information potential offer laboratory instrumentation roundup human
Topic  7
virus zika antibody vaccine infect use zika virus infection atom report
Topic  8
protein virus antibody structure change climate zika science vaccine bind
Topic  9
issue 1 page page issue 2 science system 3 report immune


In [351]:
lsa_cv1_model2, lsa_cv1_model_transformed2 = lsa_cv(lsa_cv_vect_transformed2, n_comp=10)

In [352]:
display_topics(lsa_cv1_model2, lsa_cv_vect2.get_feature_names(),10)

Topic  0
cell new use science human study protein research system year
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system material surface use solar pluto protein complex new horizon page
Topic  3
researcher information potential offer laboratory interest material newly apparatus roundup
Topic  4
cell new science system pluto solar research horizon new horizon surface
Topic  5
gene protein human disease virus dna rna genome genetic new
Topic  6
change climate human virus climate change specie global cause ice response
Topic  7
virus zika use study vaccine infect antibody zika virus human researcher
Topic  8
virus protein antibody structure zika vaccine science infect zika virus bind
Topic  9
issue 1 page page issue 2 science system 3 report immune


In [353]:
lsa_cv1_model3, lsa_cv1_model_transformed3 = lsa_cv(lsa_cv_vect_transformed3, n_comp=10)

In [354]:
display_topics(lsa_cv1_model3, lsa_cv_vect3.get_feature_names(),10)

Topic  0
cell new use science human study protein research system year
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system pluto surface solar new horizon horizon solar system present moon analysis
Topic  3
science cell research new scientist world u.s year national t
Topic  4
researcher material information potential offer laboratory interest apparatus newly instrumentation
Topic  5
protein gene human disease rna dna virus new genome genetic
Topic  6
science protein structure complex research state use new atom material
Topic  7
change climate protein climate change leucine complex structure mtorc1 science policy
Topic  8
gene science expression mutation genome dna change genetic climate crispr
Topic  9
state human leucine study mtorc1 year find growth gene time


In [355]:
lsa_tf_model1, lsa_tf_model_transformed1 = lsa_tfidf(lsa_tf_vect_transformed, n_comp=10)

In [356]:
display_topics(lsa_tf_model1, lsa_tf_vect.get_feature_names(),10)

Topic  0
weekly roundup information newly potential interest interest researcher newly offer instrumentation apparatus apparatus laboratory roundup information offer instrumentation weekly
Topic  1
cell science new use human research study change gene year
Topic  2
cell protein gene t t cell immune tumor dna cancer expression
Topic  3
cell science cancer t t cell immune research disease tumor virus
Topic  4
cell science t t cell quantum material atom research solar electron
Topic  5
protein dna gene science structure rna complex genome enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst climate plant science enzyme methane
Topic  8
protein change climate science structure policy climate change electron state bind
Topic  9
protein ice planet year earth surface structure solar star crater


In [357]:
lsa_tf_model2, lsa_tf_model_transformed2 = lsa_tfidf(lsa_tf_vect_transformed2, n_comp=10)

In [358]:
display_topics(lsa_tf_model2, lsa_tf_vect2.get_feature_names(),10)

Topic  0
information newly offer instrumentation material potential interest apparatus laboratory material newly offer instrumentation apparatus newly offer instrumentation potential interest researcher offer instrumentation apparatus laboratory newly offer roundup information newly offer instrumentation apparatus laboratory
Topic  1
cell science new use human research study change gene year
Topic  2
cell protein gene t t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell immune research disease tumor virus
Topic  4
cell science t t cell quantum material atom research electron state
Topic  5
protein dna gene science rna structure complex genome enzyme research
Topic  6
virus zika antibody vaccine zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst plant climate enzyme protein science
Topic  8
protein change climate science structure policy climate change electron bind state
Topic  9
ice protein year planet earth dna surfa

In [359]:
lsa_tf_model3, lsa_tf_model_transformed3 = lsa_tfidf(lsa_tf_vect_transformed3, n_comp=10)

In [360]:
display_topics(lsa_tf_model3, lsa_tf_vect3.get_feature_names(),10)

Topic  0
offer instrumentation apparatus information newly offer instrumentation material potential interest researcher material potential interest roundup information newly offer roundup information newly information newly information newly offer weekly roundup information newly apparatus laboratory material
Topic  1
cell science new human use research change gene year study
Topic  2
cell t t cell protein immune gene tumor dna expression cancer
Topic  3
surface pluto solar system material carbon new horizon atom horizon quantum
Topic  4
cell pluto new horizon horizon t t cell solar science system immune
Topic  5
cell t t cell quantum science material solar atom light state
Topic  6
climate change climate change ice t global t cell specie virus impact
Topic  7
virus antibody zika vaccine zika virus infect viral infection ebola zikv
Topic  8
carbon bond c reaction protein – h enzyme c – science
Topic  9
gene bond c specie – h c – sleep atom health


# Test NMF / tune parameters with tokenizer 1

Different vectorizers

In [362]:
nmf_cv_vect, nmf_cv_vect_transformed = count_vectorizer(spacy_tokenizer_1, 5000, all_descriptions)

In [363]:
nmf_cv_vect2, nmf_cv_vect_transformed2 = count_vectorizer(spacy_tokenizer_1, 5000, all_descriptions, ngram_stop=4)

In [364]:
nmf_cv_vect3, nmf_cv_vect_transformed3 = count_vectorizer(spacy_tokenizer_1, 110000, all_descriptions)

In [365]:
nmf_tf_vect, nmf_tf_vect_transformed = tfidf_vectorizer(spacy_tokenizer_1, 5000, all_descriptions)

In [366]:
nmf_tf_vect2, nmf_tf_vect_transformed2 = tfidf_vectorizer(spacy_tokenizer_1, 5000, all_descriptions, ngram_stop=4)

In [367]:
nmf_tf_vect3, nmf_tf_vect_transformed3 = tfidf_vectorizer(spacy_tokenizer_1, 110000, all_descriptions)

Different models

In [368]:
nmf_cv1_model, nmf_cv1_model_transformed = nmf_cv(nmf_cv_vect_transformed, n_comp=10)

In [369]:
display_topics(nmf_cv1_model, nmf_cv_vect.get_feature_names(),10)

Topic  0
use state material atom high electron quantum carbon reaction light
Topic  1
cell t t cell immune cancer stem mouse stem cell tumor tissue
Topic  2
science research new world scientist u.s national researcher institute university
Topic  3
system surface pluto new solar horizon new horizon datum solar system present
Topic  4
gene human dna genome genetic disease study rna expression mutation
Topic  5
change climate climate change global ice impact increase carbon model temperature
Topic  6
virus zika vaccine human antibody infect zika virus health infection cause
Topic  7
material researcher information potential offer laboratory interest newly apparatus material potential
Topic  8
protein structure complex bind function site enzyme target rna membrane
Topic  9
1 issue 2 pageof pageof issue 3 report 4 specie system


In [371]:
nmf_cv1_model2, nmf_cv1_model_transformed2 = nmf_cv(nmf_cv_vect_transformed2, n_comp=10)

In [372]:
display_topics(nmf_cv1_model2, nmf_cv_vect2.get_feature_names(),10)

Topic  0
use material state atom high electron quantum light carbon reaction
Topic  1
cell t t cell immune cancer mouse stem stem cell tumor tissue
Topic  2
system pluto surface new solar horizon new horizon datum solar system present
Topic  3
material researcher information potential offer laboratory interest newly apparatus material potential
Topic  4
science research new world scientist u.s national researcher institute university
Topic  5
gene human dna genome genetic disease study rna expression mutation
Topic  6
change climate climate change global ice impact increase carbon model temperature
Topic  7
virus zika vaccine human antibody infect zika virus health infection cause
Topic  8
protein structure complex bind function site enzyme target rna membrane
Topic  9
1 issue 2 pageof pageof issue 3 report system 4 specie


In [373]:
nmf_cv1_model3, nmf_cv1_model_transformed3 = nmf_cv(nmf_cv_vect_transformed3, n_comp=10)

In [374]:
display_topics(nmf_cv1_model3, nmf_cv_vect3.get_feature_names(),10)

Topic  0
use state material atom electron quantum high reaction magnetic carbon
Topic  1
cell t t cell immune cancer mouse stem stem cell tumor tissue
Topic  2
science research new world scientist u.s national researcher university institute
Topic  3
system surface pluto new solar horizon new horizon datum solar system present
Topic  4
gene human dna genome genetic disease study rna expression mutation
Topic  5
change climate climate change global ice impact carbon temperature increase model
Topic  6
virus zika vaccine human antibody infect zika virus infection health cause
Topic  7
material researcher information potential offer laboratory interest newly apparatus material potential
Topic  8
protein structure complex bind function site enzyme target rna membrane
Topic  9
1 issue 2 pageof pageof issue 3 report 4 specie system


In [375]:
nmf_tf_model1, nmf_tf_model_transformed1 = nmf_tfidf(nmf_tf_vect_transformed, n_comp=10)

In [376]:
display_topics(nmf_tf_model1, nmf_tf_vect.get_feature_names(),10)

Topic  0
offer instrumentation interest researcher weekly weekly roundup roundup information instrumentation apparatus apparatus laboratory laboratory material information newly newly offer
Topic  1
gene genome dna human genetic expression disease mutation study rna
Topic  2
cell t t cell immune tumor cancer tissue immune cell stem mouse
Topic  3
quantum material electron optical atom system magnetic state light use
Topic  4
science research new u.s scientist world national university institute researcher
Topic  5
climate change climate change global ice impact ecosystem specie water temperature
Topic  6
virus zika vaccine antibody zika virus infect infection health ebola viral
Topic  7
reaction carbon bond catalyst c metal molecule h hydrogen methane
Topic  8
plant crop soil growth specie yield light sugar nitrogen pathway
Topic  9
protein structure complex rna bind membrane enzyme function site dna


In [377]:
nmf_tf_model2, nmf_tf_model_transformed2 = nmf_tfidf(nmf_tf_vect_transformed2, n_comp=10)

In [378]:
display_topics(nmf_tf_model2, nmf_tf_vect2.get_feature_names(),10)

Topic  0
roundup information newly offer laboratory material offer instrumentation apparatus laboratory offer instrumentation apparatus offer instrumentation interest researcher laboratory material potential interest laboratory material potential roundup information instrumentation apparatus laboratory material
Topic  1
gene genome human genetic dna expression plant disease evolution mutation
Topic  2
cell t t cell immune tumor cancer immune cell tissue mouse stem
Topic  3
quantum electron material optical atom state magnetic device use particle
Topic  4
science research new u.s world scientist national university institute researcher
Topic  5
climate change climate change global specie ecosystem temperature impact increase water
Topic  6
virus zika vaccine antibody zika virus infect infection health ebola viral
Topic  7
reaction carbon bond catalyst c metal molecule methane hydrogen organic
Topic  8
protein structure complex rna bind dna enzyme membrane function site
Topic  9
star pla

In [379]:
nmf_tf_model3, nmf_tf_model_transformed3 = nmf_tfidf(nmf_tf_vect_transformed3, n_comp=10)

In [380]:
display_topics(nmf_tf_model3, nmf_tf_vect3.get_feature_names(),10)

Topic  0
offer instrumentation weekly roundup laboratory material instrumentation apparatus potential interest information newly interest researcher roundup information apparatus laboratory weekly
Topic  1
science research new u.s world scientist national researcher institute university
Topic  2
cell t t cell immune tumor cancer immune cell stem cell stem tissue
Topic  3
quantum electron atom optical material magnetic state device use light
Topic  4
pluto new horizon horizon system solar surface moon solar system analysis present
Topic  5
gene dna genome rna genetic expression mutation human crispr disease
Topic  6
virus antibody zika vaccine infect zika virus viral ebola infection ebola virus
Topic  7
climate change climate change ice specie plant global impact temperature ecosystem
Topic  8
bond reaction c carbon catalyst h metal – c – h bond
Topic  9
protein structure complex leucine mtorc1 bind sestrin2 enzyme function membrane


# Test NMF / tune parameters with tokenizer 2

Different vectorizers

In [381]:
nmf_cv_vect, nmf_cv_vect_transformed = count_vectorizer(spacy_tokenizer_2, 5000, all_descriptions)

In [385]:
nmf_cv_vect2, nmf_cv_vect_transformed2 = count_vectorizer(spacy_tokenizer_2, 5000, all_descriptions, ngram_stop=4)

In [386]:
nmf_cv_vect3, nmf_cv_vect_transformed3 = count_vectorizer(spacy_tokenizer_2, 110000, all_descriptions)

In [387]:
nmf_tf_vect, nmf_tf_vect_transformed = tfidf_vectorizer(spacy_tokenizer_2, 5000, all_descriptions)

In [388]:
nmf_tf_vect2, nmf_tf_vect_transformed2 = tfidf_vectorizer(spacy_tokenizer_2, 5000, all_descriptions, ngram_stop=4)

In [389]:
nmf_tf_vect3, nmf_tf_vect_transformed3 = tfidf_vectorizer(spacy_tokenizer_2, 110000, all_descriptions)

Different models

In [391]:
nmf_cv1_model, nmf_cv1_model_transformed = nmf_cv(nmf_cv_vect_transformed, n_comp=10)

In [392]:
display_topics(nmf_cv1_model, nmf_cv_vect.get_feature_names(),10)

Topic  0
use state material atom electron quantum high reaction light magnetic
Topic  1
cell t t cell immune cancer mouse stem stem cell tumor tissue
Topic  2
science research new world scientist u.s national researcher year institute
Topic  3
system surface pluto new solar horizon new horizon datum solar system present
Topic  4
virus zika vaccine human antibody infect zika virus health infection cause
Topic  5
change climate climate change global ice impact carbon increase model temperature
Topic  6
material researcher information potential offer laboratory interest newly apparatus material potential
Topic  7
gene human dna genome genetic disease study rna expression mutation
Topic  8
protein structure complex bind function site enzyme target rna membrane
Topic  9
1 issue pageof pageof issue 2 3 report specie system plant


In [393]:
nmf_cv1_model2, nmf_cv1_model_transformed2 = nmf_cv(nmf_cv_vect_transformed2, n_comp=10)

In [394]:
display_topics(nmf_cv1_model2, nmf_cv_vect2.get_feature_names(),10)

Topic  0
use state material quantum electron magnetic high light atom energy
Topic  1
cell t t cell immune cancer stem mouse stem cell tumor tissue
Topic  2
system pluto surface new solar horizon new horizon solar system datum present
Topic  3
material researcher potential information offer interest laboratory newly apparatus material potential
Topic  4
science research new world scientist u.s national researcher year institute
Topic  5
gene human dna disease genome genetic study expression rna mutation
Topic  6
change climate climate change global ice impact increase policy specie temperature
Topic  7
virus zika vaccine human antibody infect zika virus health infection cause
Topic  8
protein structure complex bind function site enzyme target rna design
Topic  9
issue 1 bond pageof pageof issue – carbon reaction 2 c


In [334]:
lsa_cv1_model3, lsa_cv1_model_transformed3 = lsa_cv(lsa_cv_vect_transformed3, n_comp=10)

In [335]:
display_topics(lsa_cv1_model3, lsa_cv_vect3.get_feature_names(),10)

Topic  0
cell new use human science study protein system research disease
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system pluto surface solar new horizon horizon solar system present moon analysis
Topic  3
science cell research new world scientist u.s national t year
Topic  4
researcher material information laboratory offer potential interest apparatus roundup instrumentation
Topic  5
protein gene human disease rna virus dna new genome genetic
Topic  6
science protein structure complex research use state new atom material
Topic  7
change climate protein climate change leucine complex structure mtorc1 science policy
Topic  8
gene science change dna expression mutation genome climate genetic crispr
Topic  9
state human leucine study mtorc1 growth gene time electron sestrin2


In [336]:
lsa_tf_model1, lsa_tf_model_transformed1 = lsa_tfidf(lsa_tf_vect_transformed, n_comp=10)

In [337]:
display_topics(lsa_tf_model1, lsa_tf_vect.get_feature_names(),10)

Topic  0
potential interest weekly newly offer weekly roundup laboratory material apparatus laboratory offer instrumentation roundup information information newly instrumentation apparatus
Topic  1
cell science new use human research study change gene protein
Topic  2
cell protein gene t t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell research immune disease tumor virus
Topic  4
cell science t t cell quantum material atom research solar electron
Topic  5
protein dna gene rna structure science genome complex enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst climate plant science enzyme methane
Topic  8
protein change climate science structure policy climate change electron state bind
Topic  9
protein ice planet earth surface year structure star solar crater


In [338]:
lsa_tf_model2, lsa_tf_model_transformed2 = lsa_tfidf(lsa_tf_vect_transformed2, n_comp=10)

In [339]:
display_topics(lsa_tf_model2, lsa_tf_vect2.get_feature_names(),10)

Topic  0
apparatus laboratory material information newly offer instrumentation potential interest researcher material potential interest newly offer interest researcher apparatus laboratory material potential material potential interest researcher apparatus laboratory newly offer instrumentation
Topic  1
cell science new use human research study change gene protein
Topic  2
cell protein gene t t cell immune tumor dna cancer expression
Topic  3
cell science cancer t t cell research immune tumor disease virus
Topic  4
cell science t t cell quantum material atom research electron state
Topic  5
protein dna gene rna structure science genome complex enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst plant climate enzyme protein science
Topic  8
protein change climate structure science climate change policy bind electron state
Topic  9
ice protein planet earth year star gravitational surface wave crat

In [340]:
lsa_tf_model3, lsa_tf_model_transformed3 = lsa_tfidf(lsa_tf_vect_transformed3, n_comp=10)

In [341]:
display_topics(lsa_tf_model3, lsa_tf_vect3.get_feature_names(),10)

Topic  0
potential interest offer instrumentation material potential interest researcher information newly offer instrumentation apparatus laboratory material instrumentation apparatus laboratory laboratory material potential interest information newly interest researcher instrumentation apparatus
Topic  1
cell science new human use research change study gene protein
Topic  2
cell t t cell protein immune gene tumor dna cancer expression
Topic  3
pluto surface solar system material new horizon carbon horizon quantum atom
Topic  4
cell pluto new horizon t horizon t cell science solar immune system
Topic  5
cell t t cell science quantum material solar light atom state
Topic  6
climate change climate change ice t t cell global virus specie impact
Topic  7
virus zika antibody vaccine zika virus infect viral infection ebola zikv
Topic  8
carbon bond c reaction protein – h c – enzyme science
Topic  9
protein structure climate ice change bind leucine rna antibody mtorc1


### Test LSA / tune parameters with tokenizer 3

Different vectorizers

In [343]:
lsa_cv_vect, lsa_cv_vect_transformed = count_vectorizer(spacy_tokenizer_3, 5000, all_descriptions)

In [344]:
lsa_cv_vect2, lsa_cv_vect_transformed2 = count_vectorizer(spacy_tokenizer_3, 5000, all_descriptions, ngram_stop=4)

In [345]:
lsa_cv_vect3, lsa_cv_vect_transformed3 = count_vectorizer(spacy_tokenizer_3, 110000, all_descriptions, ngram_stop=4)

In [346]:
lsa_tf_vect, lsa_tf_vect_transformed = tfidf_vectorizer(spacy_tokenizer_3, 5000, all_descriptions)

In [347]:
lsa_tf_vect2, lsa_tf_vect_transformed2 = tfidf_vectorizer(spacy_tokenizer_3, 5000, all_descriptions, ngram_stop=4)

In [348]:
lsa_tf_vect3, lsa_tf_vect_transformed3 = tfidf_vectorizer(spacy_tokenizer_3, 110000, all_descriptions, ngram_stop=4)

Different models

In [349]:
lsa_cv1_model, lsa_cv1_model_transformed = lsa_cv(lsa_cv_vect_transformed, n_comp=10)

In [350]:
display_topics(lsa_cv1_model, lsa_cv_vect.get_feature_names(),10)

Topic  0
cell new use science human study protein research year system
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
science cell research new scientist world u.s researcher national year
Topic  3
system surface solar pluto material new cell new horizon horizon datum
Topic  4
gene protein human disease new system virus pluto dna rna
Topic  5
change climate climate change human virus global specie ice cause impact
Topic  6
researcher virus material information potential offer laboratory instrumentation roundup human
Topic  7
virus zika antibody vaccine infect use zika virus infection atom report
Topic  8
protein virus antibody structure change climate zika science vaccine bind
Topic  9
issue 1 page page issue 2 science system 3 report immune


In [351]:
lsa_cv1_model2, lsa_cv1_model_transformed2 = lsa_cv(lsa_cv_vect_transformed2, n_comp=10)

In [352]:
display_topics(lsa_cv1_model2, lsa_cv_vect2.get_feature_names(),10)

Topic  0
cell new use science human study protein research system year
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system material surface use solar pluto protein complex new horizon page
Topic  3
researcher information potential offer laboratory interest material newly apparatus roundup
Topic  4
cell new science system pluto solar research horizon new horizon surface
Topic  5
gene protein human disease virus dna rna genome genetic new
Topic  6
change climate human virus climate change specie global cause ice response
Topic  7
virus zika use study vaccine infect antibody zika virus human researcher
Topic  8
virus protein antibody structure zika vaccine science infect zika virus bind
Topic  9
issue 1 page page issue 2 science system 3 report immune


In [353]:
lsa_cv1_model3, lsa_cv1_model_transformed3 = lsa_cv(lsa_cv_vect_transformed3, n_comp=10)

In [354]:
display_topics(lsa_cv1_model3, lsa_cv_vect3.get_feature_names(),10)

Topic  0
cell new use science human study protein research system year
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
system pluto surface solar new horizon horizon solar system present moon analysis
Topic  3
science cell research new scientist world u.s year national t
Topic  4
researcher material information potential offer laboratory interest apparatus newly instrumentation
Topic  5
protein gene human disease rna dna virus new genome genetic
Topic  6
science protein structure complex research state use new atom material
Topic  7
change climate protein climate change leucine complex structure mtorc1 science policy
Topic  8
gene science expression mutation genome dna change genetic climate crispr
Topic  9
state human leucine study mtorc1 year find growth gene time


In [355]:
lsa_tf_model1, lsa_tf_model_transformed1 = lsa_tfidf(lsa_tf_vect_transformed, n_comp=10)

In [356]:
display_topics(lsa_tf_model1, lsa_tf_vect.get_feature_names(),10)

Topic  0
weekly roundup information newly potential interest interest researcher newly offer instrumentation apparatus apparatus laboratory roundup information offer instrumentation weekly
Topic  1
cell science new use human research study change gene year
Topic  2
cell protein gene t t cell immune tumor dna cancer expression
Topic  3
cell science cancer t t cell immune research disease tumor virus
Topic  4
cell science t t cell quantum material atom research solar electron
Topic  5
protein dna gene science structure rna complex genome enzyme research
Topic  6
virus zika vaccine antibody zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst climate plant science enzyme methane
Topic  8
protein change climate science structure policy climate change electron state bind
Topic  9
protein ice planet year earth surface structure solar star crater


In [357]:
lsa_tf_model2, lsa_tf_model_transformed2 = lsa_tfidf(lsa_tf_vect_transformed2, n_comp=10)

In [358]:
display_topics(lsa_tf_model2, lsa_tf_vect2.get_feature_names(),10)

Topic  0
information newly offer instrumentation material potential interest apparatus laboratory material newly offer instrumentation apparatus newly offer instrumentation potential interest researcher offer instrumentation apparatus laboratory newly offer roundup information newly offer instrumentation apparatus laboratory
Topic  1
cell science new use human research study change gene year
Topic  2
cell protein gene t t cell immune tumor dna expression cancer
Topic  3
cell science cancer t t cell immune research disease tumor virus
Topic  4
cell science t t cell quantum material atom research electron state
Topic  5
protein dna gene science rna structure complex genome enzyme research
Topic  6
virus zika antibody vaccine zika virus infect infection viral ebola outbreak
Topic  7
carbon reaction bond c catalyst plant climate enzyme protein science
Topic  8
protein change climate science structure policy climate change electron bind state
Topic  9
ice protein year planet earth dna surfa

In [359]:
lsa_tf_model3, lsa_tf_model_transformed3 = lsa_tfidf(lsa_tf_vect_transformed3, n_comp=10)

In [360]:
display_topics(lsa_tf_model3, lsa_tf_vect3.get_feature_names(),10)

Topic  0
offer instrumentation apparatus information newly offer instrumentation material potential interest researcher material potential interest roundup information newly offer roundup information newly information newly information newly offer weekly roundup information newly apparatus laboratory material
Topic  1
cell science new human use research change gene year study
Topic  2
cell t t cell protein immune gene tumor dna expression cancer
Topic  3
surface pluto solar system material carbon new horizon atom horizon quantum
Topic  4
cell pluto new horizon horizon t t cell solar science system immune
Topic  5
cell t t cell quantum science material solar atom light state
Topic  6
climate change climate change ice t global t cell specie virus impact
Topic  7
virus antibody zika vaccine zika virus infect viral infection ebola zikv
Topic  8
carbon bond c reaction protein – h enzyme c – science
Topic  9
gene bond c specie – h c – sleep atom health


## Testing functions to reproduce above results

In [236]:
vect, vect_transformed = count_vectorizer(spacy_tokenizer, 5000, all_descriptions)

In [237]:
model, lda = LDA_options(vectorizer, 10)

In [238]:
lda[:3]

array([[ 0.00232595,  0.00232564,  0.00232618,  0.00232725,  0.00232609,
         0.97906605,  0.0023256 ,  0.00232587,  0.00232564,  0.00232574],
       [ 0.00312508,  0.00312504,  0.00312527,  0.82336822,  0.00312502,
         0.00312575,  0.003125  ,  0.15163022,  0.00312527,  0.00312513],
       [ 0.00125006,  0.00125002,  0.0012501 ,  0.98874822,  0.00125   ,
         0.00125038,  0.00125   ,  0.00125019,  0.00125056,  0.00125047]])

In [223]:
len(vect.get_feature_names())

5000

In [224]:
len(topic)

118353

In [225]:
len(vect.get_feature_names())

5000

In [228]:
a = [vect.get_feature_names()[i] for i in topic.argsort()[:-10 - 1:-1]]

IndexError: list index out of range

In [241]:
for ix, topic in enumerate(model.components_):
    b = " ".join([vect.get_feature_names()[i] for i in topic.argsort()[:-10 - 1:-1]])

In [240]:
display_topics(model, vect.get_feature_names(),10)

Topic  0
cell cancer t tumor t cell memory immune ray protein tissue
Topic  1
wave gravitational channel gravitational wave peptide physicist 3d hole calcium heart
Topic  2
science new research scientist world researcher human health year u.s
Topic  3
cell gene dna disease human mouse protein rna genetic genome
Topic  4
laboratory researcher information potential offer material apparatus newly instrumentation interest
Topic  5
use change 1 plant 2 issue protein model pageof issue pageof
Topic  6
system obama pluto new horizon new horizon barack obama president barack barack solar
Topic  7
use material quantum reaction light electron atom high metal molecule
Topic  8
earth ice surface ocean space planet year star gas large
Topic  9
virus antibody translation viral mrna partner hiv-1 envelope codon vaccine


In [158]:
display_topics(model, vect.get_feature_names(),10)

Topic  0
cell cancer t tumor t cell memory immune ray protein tissue
Topic  1
wave gravitational channel gravitational wave peptide physicist 3d hole calcium heart
Topic  2
science new research scientist world researcher human health year u.s
Topic  3
cell gene dna disease human mouse protein rna genetic genome
Topic  4
laboratory researcher information potential offer material apparatus newly instrumentation interest
Topic  5
use change 1 plant 2 issue protein model pageof issue pageof
Topic  6
system obama pluto new horizon new horizon barack obama president barack barack solar
Topic  7
use material quantum reaction light electron atom high metal molecule
Topic  8
earth ice surface ocean space planet year star gas large
Topic  9
virus antibody translation viral mrna partner hiv-1 envelope codon vaccine


In [153]:
display_topics(model, vect.get_feature_names(),10)

Topic  0
center cardiovascular target tunneling targeting metastatic importance real time provision tomography
Topic  1
wavelength growth factor chinese growth hormone persistent placoderm absorption house representative capable heterogeneity
Topic  2
scientific community new zealand resemble seasonal world large resistant hydrocarbon helminth year ago u.s science
Topic  3
center geneticist dopamine neuron distribute hydrocarbon muscular provision rocky geology pluto girl
Topic  4
lastyear resistant injection precise onset mechanical army nix intensity inthe
Topic  5
use new chimpanzee 1 play key 2–4 julythe new provision molecule paper publish paper new
Topic  6
tackle occurrence pollinator new zealand human body news belt prime belong song
Topic  7
use new mechanical r&d rearrange liquid embrace avenue historically mice morgan
Topic  8
edge image surveillance ongoing sparsely play central year ago stellar genetic engineering latitudinal
Topic  9
visible approval transplant virus outb

In [148]:
display_topics(model, vect.get_feature_names(),10)

Topic  0
center cardiovascular target tunneling targeting metastatic importance real time provision tomography
Topic  1
wavelength growth factor chinese growth hormone persistent placoderm absorption house representative capable heterogeneity
Topic  2
scientific community new zealand resemble seasonal world large resistant hydrocarbon helminth year ago u.s science
Topic  3
center geneticist dopamine neuron distribute hydrocarbon muscular provision rocky geology pluto girl
Topic  4
lastyear resistant injection precise onset mechanical army nix intensity inthe
Topic  5
use new chimpanzee 1 play key 2–4 julythe new provision molecule paper publish paper new
Topic  6
tackle occurrence pollinator new zealand human body news belt prime belong song
Topic  7
use new mechanical r&d rearrange liquid embrace avenue historically mice morgan
Topic  8
edge image surveillance ongoing sparsely play central year ago stellar genetic engineering latitudinal
Topic  9
visible approval transplant virus outb

In [182]:
vect, vect_transformed = count_vectorizer(spacy_tokenizer, 111971, all_descriptions)

In [183]:
model, lda = LDA_options(vectorizer, 10)

In [184]:
lda[:3]

array([[ 0.00232595,  0.00232564,  0.00232618,  0.00232725,  0.00232609,
         0.97906605,  0.0023256 ,  0.00232587,  0.00232564,  0.00232574],
       [ 0.00312508,  0.00312504,  0.00312527,  0.82336822,  0.00312502,
         0.00312575,  0.003125  ,  0.15163022,  0.00312527,  0.00312513],
       [ 0.00125006,  0.00125002,  0.0012501 ,  0.98874822,  0.00125   ,
         0.00125038,  0.00125   ,  0.00125019,  0.00125056,  0.00125047]])

In [185]:
display_topics(model, vect.get_feature_names(), 10)

Topic  0
36)is 3 little amyotrophic lateral ancestral early anabolic advance synthetic activity ultimately akhmediev aim design analysis trend
Topic  1
animal association activate n 4 provide activate nuclease agent purge aggressively judge 1 application activity deep 3 demonstrate active material
Topic  2
alloy catalyst afflict neurodegenerative alkama allure paleontologist animal robust alkyl arylamine activity likely active climate animal victimize ancient collagen
Topic  3
36)is act like abundant seafloor absurdity classical activity likely affect accord aim design allow distant act reduce act terminate
Topic  4
address legislation alkyl arylamine adapt agree design age 3 advance clock 1)their aficionado convince adaptation possibility adaptively alter
Topic  5
andean 4 polycrystalline 'll restore ago contrast -ryanodol addition organic aim design advocate offer agency esa agency endanger
Topic  6
amyloid associate aftermath sputnik ago large afflict neurodegenerative activity east

In [180]:
display_topics(model, vect.get_feature_names(), 10)

Topic  0
2 essa 2 achieve alarm house allosterically alarm morning activate nuclease access carbohydrate aeruginosa vogeley adversary align graphene
Topic  1
alter neural abscission schardon 2 policy absence adenosylmethionine sam administer 5% 1 achieve academic researcher 1–4 light abundance conventional
Topic  2
agency funder actually amplify africa million agency maximize alternative process africa startlingly accelerate fifth abu dis altitude allow darpa
Topic  3
2 essa abnormal meiotic 40-year 4-hectare accelerate fifth activity dependent adversary age incandescent abolish aboutbc reconcile
Topic  4
achieve bulk africa startlingly accompany increase adsorption 111 adaptation tissue action individual 1.3 actuator account baseline accounting open
Topic  5
allow switching 2 phenomenon 'll restore administration speak -ryanodol accurate enzymatic adversary active site additional impact additional gorilla
Topic  6
alain adaptation diverse admit wait actually amplify academy art actual

### Trying LSA!!!

In [269]:
lsa_vect, lsa_vect_transformed = count_vectorizer(spacy_tokenizer, 5000, all_descriptions)

In [270]:
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=5000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function spacy_tokenizer at 0x128885620>,
        vocabulary=None)

In [271]:
lsa_model, lsa = lsa_cv(lsa_vect_transformed, 10)

In [272]:
lsa[:3]

array([[ 0.94153473, -0.34751925, -0.27912847, -0.47465336,  0.3840288 ,
        -0.27063718,  0.3931151 ,  0.17325652, -0.1917408 , -0.43232284],
       [ 0.66561169, -0.13334834, -0.31210556, -0.49606575,  0.3804588 ,
         0.33364062,  0.02592389, -0.38060436, -0.22006209, -0.21158919],
       [ 1.93195539,  0.58805522, -0.25000451, -0.57404095,  0.25727837,
         0.89451977,  0.43677663, -0.01895827, -0.03202209,  0.08014516]])

In [199]:
vect.get_feature_names()

['.-',
 '1',
 '1 billion',
 '1 pageof',
 '1 provide',
 '1 report',
 '1–3',
 '1–4',
 '2',
 '2 pageof',
 '2 provide',
 '2 report',
 '2 respectively',
 '2d',
 '2–4',
 '3',
 '3 pageof',
 '3 report',
 '3d',
 '4',
 '4 provide',
 '4 report',
 '40',
 '5',
 '5 pageof',
 '6',
 '7',
 'aaas',
 'abandon',
 'ability',
 'able',
 'abnormality',
 'aborigine',
 'about%',
 'aboutyear',
 'aboutyear ago',
 'absence',
 'absolute',
 'absorb',
 'absorption',
 'abstract',
 'abundance',
 'abundant',
 'academia',
 'academic',
 'academic researcher',
 'academy',
 'academy science',
 'accelerate',
 'accelerator',
 'accept',
 'access',
 'accessible',
 'accident',
 'accompany',
 'accord',
 'account',
 'accretion',
 'accumulate',
 'accumulation',
 'accuracy',
 'accurate',
 'accuse',
 'acetylation',
 'acetylene',
 'achieve',
 'achieve high',
 'acid',
 'acidic',
 'acknowledge',
 'acquire',
 'acquisition',
 'act',
 'actin',
 'actin filament',
 'action',
 'activate',
 'activation',
 'active',
 'active site',
 'actively',

In [273]:
display_topics(lsa_model, lsa_vect.get_feature_names(),10)

Topic  0
cell new use human science study protein research system disease
Topic  1
cell t t cell immune protein cancer mouse tumor tissue stem cell
Topic  2
science research cell new world scientist u.s national researcher institute
Topic  3
system surface solar pluto new material new horizon horizon cell datum
Topic  4
gene protein human disease new virus system dna rna genome
Topic  5
change climate climate change human virus global specie ice cause impact
Topic  6
virus researcher material information potential offer human laboratory instrumentation roundup
Topic  7
virus zika vaccine antibody infect use zika virus infection atom report
Topic  8
protein virus structure antibody change climate zika bind vaccine science
Topic  9
1 issue 2 pageof pageof issue science 3 system report immune


In [262]:
len(lsa_vect.get_feature_names())

5000

In [264]:
test = topic.argsort()[:-10 - 1:-1]
test

array([ 5053, 37532, 37533, 16471, 71441, 16284, 56981, 50714, 84717, 75804])

In [267]:
lsa_vect.get_feature_names()[5053]

IndexError: list index out of range

In [265]:
for item in test:
    print(item, lsa_vect.get_feature_names()[item])

IndexError: list index out of range

In [249]:
vect.get_feature_names()

['.-',
 '1',
 '1 2',
 '1 billion',
 '1 pageof',
 '1 report',
 '1.4',
 '1.5',
 '10%',
 '1960']

In [254]:
for ix, topic in enumerate(lsa_model.components_):
    print(len(vect.get_feature_names()))
    b = " ".join([vect.get_feature_names()[i] for i in topic.argsort()[:-10 - 1:-1]])

5000


IndexError: list index out of range

In [202]:
topic

array([ 0.01496662,  0.00014335,  0.00027796, ...,  0.00027975,
        0.00027975,  0.00027975])

In [209]:
len(model.components_)

10

In [203]:
len(topic)

118353

In [207]:
len(topic)
len(vect.get_feature_names())

5000

In [None]:
vect.get_feature_names()[]

In [213]:
topic.argsort()[:-10]

array([103417,  39916,  94688, ...,  21371,  31746,  78271])

In [None]:
def display_topics(model, feature_names, no_top_words):
    for ix, topic in enumerate(model.components_):
        print("Topic ", ix)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [None]:
[:-10:-1]

In [217]:
a = [vect.get_feature_names()[i] for i in topic.argsort()]

IndexError: list index out of range

In [214]:
for ix, topic in enumerate(model.components_):
        print("Topic ", ix)
        print(" ".join([vect.get_feature_names()[i]
                        for i in topic.argsort()[:-10:-1]]))

Topic  0


IndexError: list index out of range

In [None]:
[:-no_top_words - 1:-1]

In [87]:
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2), max_df = 0.6, max_features=5000)

In [88]:
X_spacy = vectorizer.fit_transform(all_descriptions)

In [89]:
X_spacy.shape

(1877, 5000)

In [90]:
n_topics = 10
n_iter = 10
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=n_iter,
                                random_state=42,
                               learning_method='online')
data = lda.fit_transform(X_spacy)
data[:3]

array([[ 0.00232632,  0.00232559,  0.00232633,  0.97906561,  0.00232601,
         0.00232589,  0.00232612,  0.00232588,  0.00232582,  0.00232642],
       [ 0.00312546,  0.003125  ,  0.1815002 ,  0.37891587,  0.20312908,
         0.00312537,  0.0031252 ,  0.00312579,  0.00312561,  0.21770241],
       [ 0.00131587,  0.00131579,  0.00131595,  0.00131595,  0.04398058,
         0.00131592,  0.00131588,  0.00131591,  0.0013158 ,  0.94549233]])

In [91]:
display_topics(lda,vectorizer.get_feature_names(),10)

Topic  0
science research new scientific scientist human state drug work social
Topic  1
planet solar star surface system crater ice comet solar system spacecraft
Topic  2
climate change global year impact climate change world water increase ocean
Topic  3
protein gene dna rna genetic genome potential use information interest
Topic  4
cell cancer disease immune t tissue tumor t cell mouse virus
Topic  5
quantum use material state electron earth system optical atom particle
Topic  6
new researcher research u.s world year science health scientist national
Topic  7
reaction carbon molecule use bond metal enzyme complex structure oxygen
Topic  8
gravitational wave crispr physicist university gravitational wave student muscle career new
Topic  9
human plant study brain animal 1 specie new page system


### Workflow for getting issues from each year

In [None]:
url = 'http://science.sciencemag.org/content/by/year/2017'

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page,"html5lib")

In [None]:
link_search = soup.find_all(class_ = "highlight-image-linked")
link_search

In [None]:
issues2 = []
for link in link_search:
    a = link.get('href')
    issues2.append(a)

In [None]:
len(issues)

In [None]:
url = 'http://science.sciencemag.org/content/by/year/2016'

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page,"html5lib")

In [None]:
link_search = soup.find_all(class_ = "highlight-image-linked")
link_search

### Find workflow for getting the links to articles from each issue

In [None]:
path = 'http://science.sciencemag.org'
response = requests.get(path + issues2[0])
page = response.text
soup = BeautifulSoup(page,"html5lib")

In [None]:
classes = ['abstract first', 'editor-summary first', 'summary first']
i = 1
article_content_links = []
for entry in classes:
    search = soup.find_all(class_ = entry)
    for item in search:
        article_content_links.append(item.find('a').get('href'))

In [None]:
article_content_links

### workflow 

In [None]:
content_classes = ['section editor-summary', 'section summary', 'section abstract']

In [None]:
article_content_links[0]

In [None]:
path = 'http://science.sciencemag.org'
response = requests.get(path + article_content_links[0])
page = response.text
soup = BeautifulSoup(page,"html5lib")
title = soup.find(class_ = 'highwire-cite-title').text
print(title)
for entry in content_classes:
    a = soup.find(class_ = entry)
    if a:
        # print(a.text)
        print(a.find('p').text)

In [None]:
test = 'http://science.sciencemag.org/content/308/5719'
response = requests.get(test)
page = response.text
soup = BeautifulSoup(page,"html5lib")

In [None]:
'abstract first', 'editor-summary first'
title_ref = soup.find_all(class_ = 'summary first')
title_ref

In [None]:
 next_base = 'http://science.sciencemag.org/'

In [137]:
all_descriptions[8]

'H2O exists in two spin isomers, ortho and para, in a ratio of 3:1 at room temperature. Some astronomical observations have found water with a ratio of less than 3, thought to be due to water being photodesorbed from ice that had been formed at very low temperatures (≾30 K). Hama et al. tested this idea in the laboratory, by forming water ice at low temperature and then photodesorbing it to measure the ortho:para ratio. They found a ratio of 3, even at 10 K. Thus, another explanation for the low ratios in some astronomical objects must be found.'

In [138]:
doctest = re.sub('\s\d+(\s)?(,)?', '', all_descriptions[8])

In [139]:
doctest

'H2O exists in two spin isomers, ortho and para, in a ratio of:1 at room temperature. Some astronomical observations have found water with a ratio of less than thought to be due to water being photodesorbed from ice that had been formed at very low temperatures (≾30 K). Hama et al. tested this idea in the laboratory, by forming water ice at low temperature and then photodesorbing it to measure the ortho:para ratio. They found a ratio of even atK. Thus, another explanation for the low ratios in some astronomical objects must be found.'