Allowing NLTK to see the data folder

In [3]:
from dotenv import load_dotenv
import os
word2vec_dir = os.path.dirname(os.getcwd())
dotenv_path = os.path.join(word2vec_dir, '.env')
load_dotenv(dotenv_path)
NLTK_PATH = os.getenv("NLTK_DATA")
USER_PATH = os.getenv("USER_PATH")

Gensim

In [4]:
import os
import gensim
from gensim import utils
import random

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""
    
    def __init__(self, 
                 user_filename=None,
                 user_path = USER_PATH):
        self.user_filename = user_filename
        self.neg_path = os.path.join(NLTK_PATH, 'corpora/movie_reviews/neg')
        self.pos_path = os.path.join(NLTK_PATH, 'corpora/movie_reviews/pos')
        self.user_path = user_path
        
    def __iter__(self):
        if self.user_filename==None:    
            for file_path in self._get_default_file_paths():
                yield from self._read_and_preprocess_file(file_path)
        fil = os.path.join(self.user_path, self.user_filename)
        yield from self._read_and_preprocess_file(fil)
        
    
    def _get_default_file_paths(self, randomize=True):
        neg_files = [os.path.join(self.neg_path, file) for file in os.listdir(self.neg_path)]
        pos_files = [os.path.join(self.pos_path, file) for file in os.listdir(self.pos_path)]
        if randomize == True:
            random.shuffle(neg_files + pos_files)
        return neg_files + pos_files
    
    
    def _read_and_preprocess_file(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                yield utils.simple_preprocess(line)


In [5]:
sentences = MyCorpus() 
for s in sentences:
    break
print(s)

['synopsis', 'in', 'sooner', 'than', 'you', 'think', 'america', 'the', 'future', 'of', 'law', 'enforcement', 'resides', 'in', 'blade', 'squad', 'ragtag', 'group', 'of', 'culturally', 'diverse', 'rollerblading', 'cops', 'with', 'jetpacks', 'strapped', 'on', 'their', 'backs']


In [None]:
sentences = MyCorpus()    #TODO build path parameter for MyCorpus Object
model = gensim.models.Word2Vec(sentences=sentences, workers=10)


# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-download-auto-examples-tutorials-run-word2vec-py
# https://radimrehurek.com/gensim/auto_examples/tutorials/index.html
# https://github.com/piskvorky/gensim

In [60]:
a = model.wv.most_similar(positive=['car'], topn=5)
print(a)

[('room', 0.8718190789222717), ('apartment', 0.8587338924407959), ('hotel', 0.855017900466919), ('plane', 0.8471948504447937), ('house', 0.8300197124481201)]


In [5]:
from collections import Counter
from gensim import corpora

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

stoplist = set(['the', 'of', 'and', 'for', 'in', 'a', 'to', 'on', 'with'])

texts = [[token for token in document.split() if token.lower() not in stoplist] for document in documents]
all_tokens = [token for text in texts for token in text]
word_counts = Counter(all_tokens)
singleton_words = set(word for word, count in word_counts.items() if count == 1)
texts = [[token for token in text if token not in singleton_words] for text in texts]

# print(texts)

dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)



{'computer': 0, 'interface': 1, 'response': 2, 'survey': 3, 'system': 4, 'time': 5, 'user': 6, 'EPS': 7, 'trees': 8, 'Graph': 9, 'minors': 10}


In [1]:
# https://github.com/plotly/plotly.py

import plotly.express as px
# This dataframe has 244 lines, but 4 distinct values for `day`
df = px.data.tips()
fig = px.pie(df, values='tip', names='day')
fig.show()

Streamlit