In [29]:
import os
import re
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import seaborn as sns
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [30]:
class preparedata():
    #Load the data
    def jsontocsv():
        dataset_path = r'C:\Deepankar\response_50.json'
        with open(dataset_path, encoding="utf8") as f:
            data = json.load(f)
        
        print('{} docs found in the JSON and type is {}'.format(len(data["response"]["docs"]), type(data["response"]["docs"])))
      
        #now will format the data into dataframe - id, pagenum, tag, sentences
        try:
            sentences = []
            for doc in data["response"]["docs"]:
                id_ = doc["id"]
                pagenum = doc["pagenumber"][0]
                tag = doc["tags"][0]
                
                try:
                    if doc["metainfo"] is not None:
                        for sen in doc["metainfo"]:
                            sen = re.sub(r'\d+', '', sen)
                            sen = sen.strip()
                            sen = sen.encode('ascii', 'ignore')
                            sentences.append([id_, pagenum, tag, sen])
                except:
                    #print('error in line num {}'.format(id_))
                    pass

            df = pd.DataFrame(sentences, columns=['id', 'pagenum', 'tag', 'sentence'])
            df.to_csv(r'C:\Deepankar\NLP\model\datafiles\50.csv')
            return df
        except:
            print('error in line num {}'.format(id_))

class getsenembeddings():
    """
    Obtains sentence embeddings for each sentence in the page
    """
    
    def __init__(self):
        
        self.embed = hub.Module(r"C:\Deepankar\NLP\model\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47",trainable=True)
        self.similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
        self.similarity_message_encodings = self.embed(self.similarity_input_placeholder)
        print('About to create tf session..')
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        self.session.run(tf.tables_initializer())
        print('Model Loaded..')
        
    
    def groupsentences(self, df):
        data_frame = df
        embed = []
        for key in df.groupby(['tag','pagenum']).groups.keys():            
            print('num of sentences in page {} of book {} is {}'.format(key[1],
                                                                        key[0],
                                                                        len(df.groupby(['tag','pagenum']).groups[key]))
                                                                        )
            sen_nums = df.groupby(['tag','pagenum']).groups[key] #This will give index of each sentences         
            if len(sen_nums) >= 6:
                page, embed = self.pagesenembeddings(sen_nums, df)
                embed = embed.reshape(embed.shape[0], -1)                
                summary = self.summarize(page, embed)
                print('Summary of page {} of book {} is {}'.format(key[1], key[0], summary))
            else:
                pass
            
    
    def pagesenembeddings(self, sen_nums, df):
        print('.....................Embedding of page started.....................')
        page_embed = []
        page = []
        for i in sen_nums:
            #print('sentence is {}'.format(str(df.iloc[i]['sentence'])))
            page.append(str(df.iloc[i]['sentence']))
            page_embed.append(self.session.run(self.similarity_message_encodings, feed_dict =
                                             {self.similarity_input_placeholder: [str(df.iloc[i]['sentence'])]}))
        page_embed = np.array(page_embed)
        
        print('.....................next page.....................')
        return page, page_embed
    
    def summarize(self, page, embed):
        print('xxxxxxxxxxxxxxxxxxxx Clustering Started xxxxxxxxxxxxxxxxxxxx')
        n_clusters = int(np.ceil(len(embed)**0.5))
        
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans = kmeans.fit(embed)
        avg = []
        closest = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
            
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,embed)

        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        summary = ' '.join([page[closest[idx]] for idx in ordering])
        
        print('xxxxxxxxxxxxxxxxxxxx Clustering Finished xxxxxxxxxxxxxxxxxxxx')
        return summary
    
    def start(self, df):        
        self.groupsentences(df)

In [27]:
df = preparedata.jsontocsv()
df.head(5)
#df.to_csv(r'C:\Deepankar\nlp\response_1000.csv')
obj = getsenembeddings()
obj.start(df)

50 docs found in the JSON and type is <class 'list'>


Unnamed: 0,id,pagenum,tag,sentence
0,b1761496-a33a-40fe-9288-a02685e486f0,7,5950-skip-thought-vectors.pdf,b'and vice-versa for sentences.'
1,b1761496-a33a-40fe-9288-a02685e486f0,7,5950-skip-thought-vectors.pdf,b'We also report the median rank of the closes...
2,b1761496-a33a-40fe-9288-a02685e486f0,7,5950-skip-thought-vectors.pdf,"b'Recently, showed that by using Fishervector..."
3,b1761496-a33a-40fe-9288-a02685e486f0,7,5950-skip-thought-vectors.pdf,b'Thus the method of is a strong baseline to ...
4,b1761496-a33a-40fe-9288-a02685e486f0,7,5950-skip-thought-vectors.pdf,"b'For our experiments, we represent images usi..."
