In [46]:
import numpy as np
import torch 
from DNN import FFN
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [66]:
class model():
    
    def __init__(self,
                 des_path,
                 des_feat_path,
                 install_path,
                 install_feat_path,
                 invoc_path,
                 invoc_feat_path,
                 cite_path,
                 cite_feat_path
                ):
        self.desc = "This class contains all four binary classifier"
        self.des_path = des_path
        self.install_path = install_path
        self.invoc_path = invoc_path
        self.cite_path = cite_path
        
        self.des_feat_path = des_feat_path
        self.install_feat_path = install_feat_path
        self.invoc_feat_path = invoc_feat_path
        self.cite_feat_path = cite_feat_path
        
        self.build()
        
    def build(self):
        self.model_desc = torch.load(self.des_path)
        self.model_install = torch.load(self.install_path)
        self.model_invoc = torch.load(self.invoc_path)
        self.model_cite = torch.load(self.cite_path)
        
        desc = pd.read_csv(self.des_feat_path).values
        install = pd.read_csv(self.install_feat_path).values
        invoc = pd.read_csv(self.invoc_feat_path).values
        cite = pd.read_csv(self.cite_feat_path).values
        self.desc_feat = {}
        self.install_feat = {}
        self.invoc_feat = {}
        self.cite_feat = {}
        for i in range(len(desc)):
            self.desc_feat[desc[i][0]] = i
        for i in range(len(install)):
            self.install_feat[install[i][0]] = i
        for i in range(len(invoc)):
            self.invoc_feat[invoc[i][0]] = i
        for i in range(len(cite)):
            self.cite_feat[cite[i][0]] = i        

        self.desc_vec = CountVectorizer(vocabulary = self.desc_feat)
        self.install_vec = CountVectorizer(vocabulary = self.install_feat)
        self.invoc_vec = CountVectorizer(vocabulary = self.invoc_feat)
        self.cite_vec = CountVectorizer(vocabulary = self.cite_feat)
        
        
        
    def inference(self,corpus):
        new_corpus = list(map(self.lower_stopwords,corpus.split(".")))
        encoded_corpus = self.desc_vec.transform(new_corpus).toarray()
        o1 = self.model_desc(torch.tensor(encoded_corpus).float())
        o2 = self.model_install(torch.tensor(encoded_corpus).float())
        o3 = self.model_invoc(torch.tensor(encoded_corpus).float())
        o4 = self.model_cite(torch.tensor(encoded_corpus).float())
        desc_pred = torch.argmax(o1,dim=1)
        install_pred = torch.argmax(o2,dim=1)
        invoc_pred = torch.argmax(o3,dim=1)
        cite_pred = torch.argmax(o4,dim=1)
        return new_corpus,desc_pred,install_pred,invoc_pred,cite_pred
    def lower_stopwords(self,x):
        x = re.sub(r'[^a-zA-Z\s]', '', x, re.I|re.A)
        x = x.lower()
        x = x.strip()
        text_tokens = [word for word in word_tokenize(x) if word not in stopwords.words()]
        return " ".join(text_tokens)
        
    

In [67]:
des_path = "../saved_models/DNN_description.pt"
des_feat_path = "../saved_models/description_feat.csv"

install_path = des_path
install_feat_path = des_feat_path

invoc_path = des_path
invoc_feat_path = des_feat_path

cite_path = des_path
cite_feat_path = des_feat_path

classifier = model(des_path,
                 des_feat_path,
                 install_path,
                 install_feat_path,
                 invoc_path,
                 invoc_feat_path,
                 cite_path,
                 cite_feat_path
            )
corpus = "pandas is a Python package that provides fast,\
    flexible, and expressive data structures designed to make \
    working with structured (tabular, multidimensional, potenti\
    ally heterogeneous) and time series data both easy and intu\
    itive. It aims to be the fundamental high-level building blo\
    ck for doing practical, real world data analysis in Python. A\
    dditionally, it has the broader goal of becoming the most power\
    ful and flexible open source data analysis / manipulation tool a\
    vailable in any language. It is already well on its way toward this goal."
classifier.inference(corpus)

(['pandas python package provides fast flexible expressive data structures designed make working structured tabular multidimensional potenti ally heterogeneous time series data easy intu itive',
  'aims fundamental highlevel building blo ck practical real world data analysis python',
  'dditionally broader goal becoming power ful flexible open source data analysis manipulation tool vailable language',
  'already well way toward goal',
  ''],
 tensor([1, 1, 1, 1, 0]),
 tensor([1, 1, 1, 1, 0]),
 tensor([1, 1, 1, 1, 0]),
 tensor([1, 1, 1, 1, 0]))

In [11]:
aa = torch.load("../saved_models/DNN_description.pt")