<a href="https://colab.research.google.com/github/chughrohit/Newslinker/blob/master/text_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!pwd

/content


In [0]:
!ls ./drive/My\ Drive/CS255_Project/

CORD-19  __init__.py  PV_DBOW.txt  PV_DM.txt  __pycache__  text_process.ipynb


In [0]:
import json
from collections import defaultdict
import gensim
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import os
import glob
nlp = spacy.load('en', disable=['parser', 'ner'])

In [0]:
class textProcess:

    def preprocessText(self, text):

        text_gensim = ' '.join(gensim.utils.simple_preprocess(text, deacc=True))
        text_obj = nlp(text_gensim)
        text_lemma = [token.lemma_ for token in text_obj]
        return ' '.join(text_lemma)

    def createVocab(self, corpus, ngram_range=(1, 1)):

        countVec = CountVectorizer(stop_words='english', ngram_range=ngram_range,
                                   analyzer='word', strip_accents='unicode')
        countVocab = countVec.fit(corpus)
        return countVocab.vocabulary_

    def generateDoc2Vec(self, articles, is_dbow=False, is_hs=False, embedding_dim=100, max_epochs=10, win=5, min_c=1, neg=5):

        if is_dbow == True and is_hs == False:
                d2v_model_name = "PV-DBOW"
        elif is_dbow == False and is_hs == True:
                d2v_model_name = "PV-DM with hierarchical softmax"
        elif is_dbow == False and is_hs == False:
                d2v_model_name = "PV-DM with negative sampling"
        docTrain = []
        for artl in articles:
          docTrain.append(TaggedDocument(artl['words'], [artl['paper_id']]))
        if is_dbow:
                d2v_model = Doc2Vec(vector_size=embedding_dim, alpha=0.025, min_alpha=0.00025, window=win, min_count=min_c, workers=5, dm=0, dbow_words=0)
        elif is_hs and not is_dbow:
                d2v_model = Doc2Vec(vector_size=embedding_dim, alpha=0.025, min_alpha=0.00025, window=win, min_count=min_c, workers=5, dm=1,  hs=1, negative=neg, dm_mean=1, dm_concat=0)
        elif not is_hs and not is_dbow:
                d2v_model = Doc2Vec(vector_size=embedding_dim, alpha=0.025, min_alpha=0.00025, window=win, min_count=min_c, workers=5, dm=1,  hs=0, negative=neg, dm_mean=1, dm_concat=0)
        d2v_model.build_vocab(docTrain)
        print("Training Doc2Vec ({}) model on a corpus of {} articles".format(d2v_model_name, d2v_model.corpus_count))
        d2v_model.train(docTrain, total_examples=d2v_model.corpus_count, epochs=max_epochs)
        print("{} dimensional document vectors generated using {}".format(embedding_dim, d2v_model_name))
        d2v_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
        return d2v_model


In [0]:
data_path_arxiv = './drive/My Drive/CS255_Project/CORD-19/arxiv/arxiv/pdf_json/'
data_path_bioarxiv = './drive/My Drive/CS255_Project/CORD-19/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/'
arxiv_articles = []
for fl in glob.glob(data_path_arxiv + "*json"):
    arxiv_articles.append(json.load(open(fl, "r")))
for fl in glob.glob(data_path_bioarxiv + "*json"):
    arxiv_articles.append(json.load(open(fl, "r")))

In [0]:
pd.DataFrame(arxiv_articles).paper_id.unique().shape

(3457,)

In [0]:
textproc = textProcess()
for artl in arxiv_articles:
  body_text = ''
  for text_json in artl['body_text']:
    body_text += text_json['text']
    body_text += " "
  artl['preprocessedText'] = textproc.preprocessText(body_text)
  artl['words'] = list(gensim.utils.simple_tokenize(artl['preprocessedText']))

In [0]:
## PV-DBOW
textproc = textProcess()
PV_DBOW_arxiv = textproc.generateDoc2Vec(arxiv_articles, is_dbow=True)

Training Doc2Vec (PV-DBOW) model on a corpus of 3457 articles
100 dimensional document vectors generated using PV-DBOW


In [0]:
## PV-DM

PV_DM_arxiv = textproc.generateDoc2Vec(arxiv_articles, is_dbow=False)

Training Doc2Vec (PV-DM with negative sampling) model on a corpus of 3457 articles
100 dimensional document vectors generated using PV-DM with negative sampling


In [0]:
PV_DBOW_arxiv.docvecs.vectors_docs.shape

(3457, 100)

In [0]:
output_file="./drive/My Drive/CS255_Project/PV_DBOW.doc2vec"
PV_DBOW_arxiv.save(output_file)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
output_file2="./drive/My Drive/CS255_Project/PV_DM.doc2vec"
PV_DM_arxiv.save(output_file2)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
import os
os.getcwd()

'/content'

In [0]:
!ls .

drive  sample_data


In [0]:
from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
km_model = KMeans(n_clusters=5)
output_file="./drive/My Drive/CS255_Project/PV_DBOW.doc2vec"
model = Doc2Vec.load(output_file)
X = km_model.fit(model.docvecs.vectors_docs)
#vector = model.infer_vector(["system", "response"])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
model.docvecs.vectors_docs.shape

(3457, 100)

In [0]:
labels=X.labels_.tolist()

In [0]:
print(labels)

[1, 4, 2, 4, 4, 4, 4, 1, 4, 4, 3, 4, 1, 4, 4, 1, 1, 1, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 2, 4, 1, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 4, 4, 2, 1, 4, 1, 4, 4, 1, 4, 1, 4, 2, 2, 4, 1, 1, 4, 4, 1, 4, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 1, 4, 1, 2, 1, 4, 1, 3, 1, 4, 4, 1, 4, 1, 4, 4, 1, 4, 1, 4, 4, 1, 4, 4, 4, 4, 1, 1, 4, 1, 4, 1, 4, 2, 1, 4, 1, 4, 4, 4, 2, 4, 4, 1, 4, 4, 4, 4, 3, 4, 4, 4, 1, 4, 1, 4, 2, 4, 1, 4, 1, 4, 1, 4, 4, 1, 1, 1, 4, 4, 4, 4, 4, 4, 1, 1, 1, 0, 0, 4, 4, 1, 4, 4, 4, 4, 1, 4, 4, 4, 1, 2, 1, 1, 4, 4, 1, 4, 1, 1, 4, 4, 4, 4, 0, 3, 4, 1, 1, 4, 1, 4, 4, 4, 1, 1, 1, 4, 1, 4, 3, 1, 4, 4, 1, 0, 4, 4, 4, 4, 4, 4, 1, 1, 2, 1, 1, 4, 2, 4, 1, 4, 4, 4, 4, 1, 1, 4, 4, 4, 4, 4, 4, 4, 1, 1, 4, 4, 4, 2, 4, 1, 4, 1, 4, 4, 1, 4, 2, 4, 1, 4, 1, 4, 4, 2, 4, 4, 1, 4, 4, 4, 1, 4, 2, 4, 1, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 1, 4, 4, 1, 4, 1, 4, 4, 4, 1, 1, 4, 4, 4, 4, 1, 4, 4, 4, 1, 4, 4, 1, 4, 4, 4, 4, 1, 4, 4, 4, 4, 1, 4, 4, 0, 4, 4, 4, 2, 1, 4, 4, 4, 4, 4, 4, 2, 