In [3]:
import gensim
import nltk
import re
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models, similarities
import pandas as pd

In [4]:
file = 'newmeta.csv'

new_data = pd.read_csv(file, low_memory=False)
print(new_data.head(5))

                                            abstract publish_time
0  OBJECTIVE: This retrospective chart review des...   2001-07-04
1  Inflammatory diseases of the respiratory tract...   2000-08-15
2  Surfactant protein-D (SP-D) participates in th...   2000-08-25
3  Endothelin-1 (ET-1) is a 21 amino acid peptide...   2001-02-22
4  Respiratory syncytial virus (RSV) and pneumoni...   2001-05-11


In [5]:
def clean(text):
    text = re.sub("[^a-zA-Z]", "", str(text))
    text = text.lower()
    text = nltk.word_tokenize(text)
    return text

# as seen in the first few runs of lda, these words contributed to the noise
# when it came to topic modeling. To remove the noise, we'll remove these words
# and get the relevant topics
common_words = stopwords.words('english')
common_words.extend(['of', 'and', 'the', 'in', 'were', 'to', 'nan', 'with'])
def remove_words(text):
  return [word for word in text if word not in common_words]

# stemming helps reduce some of the noise too. It chops some words, but we still
# know what the word says
stemmer = PorterStemmer()
def stem_words(text):
  text = [stemmer.stem(word) for word in text]
  return text

# this function applies the stemmer, removal of common words, removal of punctuation,
# lowercases the text, and tokenizes everything
def preprocess(text):
  return stem_words(remove_words(clean(text)))

# token the data to be used with gensim, just looking at the abstracts
new_data['tokenized_data'] = new_data['abstract'].apply(preprocess)

# print the first two rows of the cleaned data
# this will show the tokenized data from the abstract, basically splits up
# the abstract into single words
print(new_data.head(2))

'''
# (3) CREATE GENSIM LIBRARY AND CORPUS
# gensim dictionary from tokenized data
token = new_data['tokenized_data']

# dictionary will be used in corpus
dictionary = corpora.Dictionary(token)

# filter keywords, we want the ones that show up the most in abstracts
dictionary.filter_extremes(no_below = 1, no_above = 0.8)         # filter keywords

# dictionary to corpus
corpus = [dictionary.doc2bow(tokens) for tokens in token]

# prints the corpus for the first document
# corpus (1, 1) implies that the word with the id of 1 has occurred only once in the first document
# corpus (14, 4) implies that the word with the id of 14 has occurred 4 times in the first document
print(corpus[:1])'''

                                            abstract publish_time  \
0  OBJECTIVE: This retrospective chart review des...   2001-07-04   
1  Inflammatory diseases of the respiratory tract...   2000-08-15   

                                      tokenized_data  
0  [objectivethisretrospectivechartreviewdescribe...  
1  [inflammatorydiseasesoftherespiratorytractarec...  


"\n# (3) CREATE GENSIM LIBRARY AND CORPUS\n# gensim dictionary from tokenized data\ntoken = new_data['tokenized_data']\n\n# dictionary will be used in corpus\ndictionary = corpora.Dictionary(token)\n\n# filter keywords, we want the ones that show up the most in abstracts\ndictionary.filter_extremes(no_below = 1, no_above = 0.8)         # filter keywords\n\n# dictionary to corpus\ncorpus = [dictionary.doc2bow(tokens) for tokens in token]\n\n# prints the corpus for the first document\n# corpus (1, 1) implies that the word with the id of 1 has occurred only once in the first document\n# corpus (14, 4) implies that the word with the id of 14 has occurred 4 times in the first document\nprint(corpus[:1])"

In [9]:
# (3) CREATE GENSIM LIBRARY AND CORPUS
# gensim dictionary from tokenized data
token = new_data['tokenized_data']

# dictionary will be used in corpus
dictionary = corpora.Dictionary(token)

# filter keywords, we want the ones that show up the most in abstracts
dictionary.filter_extremes(no_below = 1, no_above = 0.8)         # filter keywords

# dictionary to corpus
corpus = [dictionary.doc2bow(tokens) for tokens in token]

# prints the corpus for the first document
# corpus (1, 1) implies that the word with the id of 1 has occurred only once in the first document
# corpus (14, 4) implies that the word with the id of 14 has occurred 4 times in the first document
print(corpus[:1])

[[(0, 1)]]


In [11]:
model = gensim.models.word2vec(sentences=corpus)

Dictionary(99039 unique tokens: ['objectivethisretrospectivechartreviewdescribestheepidemiologyandclinicalfeaturesofpatientswithcultureprovenmycoplasmapneumoniaeinfectionsatkingabdulazizuniversityhospitaljeddahsaudiarabiamethodspatientswithpositivempneumoniaeculturesfromrespiratoryspecimensfromjanuarythroughdecemberwereidentifiedthroughthemicrobiologyrecordschartsofpatientswerereviewedresultspatientswereidentifiedofwhomrequiredadmissionmostinfectionswerecommunityacquiredtheinfectionaffectedallagegroupsbutwasmostcommonininfantsandpreschoolchildrenitoccurredyearroundbutwasmostcommoninthefallandspringmorethanthreequartersofpatientshadcomorbiditiestwentyfourisolateswereassociatedwithpneumoniawithupperrespiratorytractinfectionsandwithbronchiolitiscoughfeverandmalaisewerethemostcommonsymptomsandcrepitationsandwheezeswerethemostcommonsignsmostpatientswithpneumoniahadcrepitationsbutonlyhadbronchialbreathingimmunocompromisedpatientsweremorelikelythannonimmunocompromisedpatientstopresentwithpneu

In [28]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

# Load pre-trained Word2Vec model.
model = gensim.models.Word2Vec.load("w2v.model")

In [30]:
%pip install sklearn
%pip install plotly

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np 

def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    vectors = [] # positions in vector space
    labels = [] # keep track of words to label our data again later
    for word in model.wv.vocab:
        vectors.append(model.wv[word])
        labels.append(word)

    # convert both lists into numpy vectors for reduction
    vectors = np.asarray(vectors)
    labels = np.asarray(labels)

    # reduce using t-SNE
    vectors = np.asarray(vectors)
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    '''trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)'''

NameError: name 'model' is not defined