In [1]:
from dask.distributed import Client
import dask.array as da
import dask.dataframe as dd
import joblib
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords

In [2]:
# Start a Dask distributed client. No scheduler address is passed, so this
# relies on Client's default behaviour of starting a local cluster.
client = Client()

In [3]:
# Display the client summary (scheduler address, dashboard URL, workers, cores, memory).
client

0,1
Client  Scheduler: tcp://127.0.0.1:62157  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 17.06 GB


In [4]:
def _load_single_partition(pickle_path):
    """Load a pickled pandas object with joblib and wrap it in a one-partition Dask collection."""
    # NOTE: joblib.load unpickles the file, so only point it at trusted artefacts.
    return dd.from_pandas(joblib.load(pickle_path), npartitions=1)


# Term-frequency matrices, one per category, each kept as a single partition.
tf_project_matrix = _load_single_partition("./models/tf_project_matrix.pkl")
tf_libs_matrix = _load_single_partition("./models/tf_lib_matrix.pkl")
tf_frameworks_matrix = _load_single_partition("./models/tf_framework_matrix.pkl")
tf_dbs_matrix = _load_single_partition("./models/tf_dbs_matrix.pkl")

# Companion objects loaded as-is (presumably the fitted vectorizers that
# produced the matrices above - confirm against the training notebook).
tf_projects, tf_libs, tf_frameworks, tf_dbs = (
    joblib.load(pickle_path)
    for pickle_path in (
        "./models/tf_project.pkl",
        "./models/tf_lib.pkl",
        "./models/tf_framework.pkl",
        "./models/tf_dbs.pkl",
    )
)

In [5]:
# Load the pickled context-based feature matrix with joblib and wrap it in a
# single-partition Dask collection.
# NOTE(review): presumably BERT-derived features, going by the "BERT and LDA
# Models" grouping used when publishing - confirm.
context_based_feature_matrix = dd.from_pandas(joblib.load("./models/context_based_feature_matrix.pkl"), npartitions=1)

In [6]:
# Restore the saved gensim Dictionary and the saved LDA model from ./models.
id2word = Dictionary.load("./models/dictionary")
lda_model = LdaModel.load("./models/ldamodel")

In [7]:
# Persist each feature collection on the cluster and rebind the notebook
# names to the persisted versions (same calls, same order, one assignment).
(
    tf_project_matrix,
    tf_libs_matrix,
    tf_frameworks_matrix,
    tf_dbs_matrix,
    context_based_feature_matrix,
) = [
    client.persist(collection)
    for collection in (
        tf_project_matrix,
        tf_libs_matrix,
        tf_frameworks_matrix,
        tf_dbs_matrix,
        context_based_feature_matrix,
    )
]

In [8]:
def get_stopwords(path="long_stopwords.txt", language="english"):
    """Build the combined stop-word list.

    Merges the custom stop words stored one per line in ``path`` with the
    NLTK stop-word list for ``language``.

    Parameters
    ----------
    path : str, optional
        Text file holding one stop word per line. Defaults to
        ``"long_stopwords.txt"`` in the current working directory.
    language : str, optional
        Language name passed to ``stopwords.words``. Defaults to
        ``"english"``.

    Returns
    -------
    list of str
        The unique stop words from both sources, sorted so the result is
        identical from run to run, with no empty entries.

    Raises
    ------
    OSError
        If ``path`` cannot be opened.
    """
    with open(path, "r") as fp:
        # Strip each line so the newline and stray surrounding whitespace do
        # not become part of a stop word.
        custom_words = {line.strip() for line in fp}
    # Blank lines (including the one implied by a trailing newline) would
    # otherwise contribute "" as a stop word.
    custom_words.discard("")
    return sorted(custom_words.union(stopwords.words(language)))

In [9]:
# Extra words excluded on top of the base list returned by get_stopwords().
# NOTE(review): these look like report boilerplate and web/markup residue
# specific to this corpus - confirm before reusing elsewhere.
extra_stop_words = [
    "input", "output", "fig", "database", "pict", "time", "paper",
    "description", "function", "class", "based", "process", "cid", "http",
    "www", "com", "electronics", "telecommunication",
    "project", "phase", "fulfillment", "partial", "requirements",
    "engineering", "pune", "bachelor", "submitted", "computer", "phule",
    "savitribai", "university", "degree",
]

# dict.fromkeys drops repeated words while keeping first-seen order, so a
# word present in both the base list and the extras is stored only once.
stop_words = list(dict.fromkeys(get_stopwords() + extra_stop_words))

In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from dask_ml.metrics.pairwise import euclidean_distances
import string
import re
import dask.array as da
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [11]:
# Single NLTK WordNetLemmatizer instance; it is published to the cluster
# together with the other dependencies further down.
lemmatizer = WordNetLemmatizer()

In [12]:
# Publish every shared object on the scheduler under its notebook variable
# name, grouped the same way as before.
datasets_to_publish = {
    # Tf-idf models
    "tf_project_matrix": tf_project_matrix,
    "tf_dbs_matrix": tf_dbs_matrix,
    "tf_libs_matrix": tf_libs_matrix,
    "tf_frameworks_matrix": tf_frameworks_matrix,
    "tf_dbs": tf_dbs,
    "tf_libs": tf_libs,
    "tf_projects": tf_projects,
    "tf_frameworks": tf_frameworks,
    # BERT and LDA Models
    "context_based_feature_matrix": context_based_feature_matrix,
    "id2word": id2word,
    "lda_model": lda_model,
    "stop_words": stop_words,
    # Dependencies
    "lemmatizer": lemmatizer,
    "word_tokenize": word_tokenize,
    "euclidean_distances": euclidean_distances,
    "string": string,
    "re": re,
    "da": da,
    "dd": dd,
    "np": np,
    "pd": pd,
}

# publish_dataset raises KeyError when a name is already published, which
# made this cell fail on a second run against the same scheduler. Dropping
# the stale entry first keeps the cell re-runnable.
already_published = set(client.list_datasets())
for dataset_name, dataset in datasets_to_publish.items():
    if dataset_name in already_published:
        client.unpublish_dataset(dataset_name)
    client.publish_dataset(**{dataset_name: dataset})