### Import relevant libraries

In [1]:
import pandas as pd
import os
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

## Read the Train data

In [2]:
from glob import glob
#list to attach the respective data
train_doc = []
train_tags = []
#directory to the text files in the train documents folder
filenames_doc = glob('Train_docs/case*.txt')
#directory to the text files in the train tags folder
filenames_tag = glob('Train_tags/case*.txt')

#iterate over each text file
for file_doc in filenames_doc:
    #open the current text file
    with open((file_doc), 'r') as f:
        #read that text file
        read_doc = f.read()
        #attach to the train_doc list
        train_doc.append(read_doc)
for file_tag in filenames_tag:
    with open((file_tag), 'r') as n:
        read_tag = n.read()
        #attach to the train_tags list
        train_tags.append(read_tag)

## Preprocess the text

In [3]:
STOP_WORDS = set(stopwords.words('english'))
#keep data in a data frame
train_doc = pd.DataFrame(train_doc[:])

def preprocess(text):
    
    #lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    #remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)

    return text

In [4]:
#convert back to list
doc = train_doc[0].tolist()
#create a vocabulary of words, ignore words that appear
#in 85% of documents, and eliminate stopwords
count_vect = CountVectorizer(max_df=0.85,stop_words=STOP_WORDS, preprocessor=preprocess)
word_count_vect = count_vect.fit_transform(doc)

In [5]:
#here we have 80 rows and a vocabulary size of 12737
word_count_vect.shape

(80, 11141)

In [6]:
#check 10 words from our vocabulary
list(count_vect.vocabulary_.keys())[:10]

['kurian',
 'joseph',
 'leave',
 'granted',
 'special',
 'petition',
 'civil',
 'around',
 'acres',
 'land']

## TfidfTransformer to Compute Inverse Document Frequency(IDF)

In [7]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True, norm='max',sublinear_tf=False)
tfidf_transformer.fit(word_count_vect)
#here are some Inverse document frequencies(IDF)s
tfidf_transformer.idf_

array([4.29583687, 4.70130197, 4.70130197, ..., 4.70130197, 4.70130197,
       4.29583687])

Once we have our IDF computed, we are now ready to compute TF-IDF and extract the top tags from the test document

## Read the Test data

In [8]:
#read the test data as we did earlier
test_doc = []
#directory to the text files in the train documents folder
filenames_doc = glob('Test_docs/case*.txt')

#iterate over each text file
for file_doc in filenames_doc:
    #open the current text file
    with open((file_doc), 'r') as f:
        #read that text file
        read_doc = f.read()
        #attach to the train_doc list
        test_doc.append(read_doc)

In [9]:
def sort_coo(coo_matrix):
    #join both the coo_matrix.col and data together
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1],x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        feature_name = feature_names[idx]
        
        #keep track of feature names and its corresponding score 
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
        
    #create a tuples of feature,score
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [10]:
# you only need to do this once
feature_names=count_vect.get_feature_names()

def get_tags(idx):
    
    # get the document that we want to extract tags from
    #doc=test_doc[0]
    
    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(count_vect.transform([test_doc[idx]]))
    
    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    
    #extract only the top n; n here is 10
    tags=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return tags

# now print the results
def print_result(idx,tags):
    print('===='*10+'Body'+'===='*10)
    print(test_doc[idx])
    print('===='*10+'tags'+'===='*10)
    for t in tags:
        print(t,tags[t])

## Generate tags from the test data

In [11]:
#enter the document number, idx max is 100
idx = 16
tags = get_tags(idx)
print_result(idx,tags)



1. The Petitioner is being tried before the Sessions Judge, Jodhpur for offences punishable Under Sections 370(4), 342, 354A, 376(2)(f), 376D, 506, 509/34, 120B, Indian Penal Code, Sections 23 and 26, Juvenile Justice (Care and Protection of Children) Act, 2000 and Sections 5(F)/6, 5(G)/6, 7/8, Protection of Children From Sexual Offences Act, 2012. As earlier application for bail filed by him having been declined, a second application was moved before the Trial Court which too came to be rejected by the said Court. The matter was then taken up before the High Court who has concurred with the view taken by the Trial Court and dismissed the plea for bail. The present special leave petition calls in question the correctness of the said order.

2. When this petition initially came up before us on 15.10.2014 Mr. Salman Khurshid, learned senior counsel appearing for the Appellant submitted that although several witnesses for the prosecution had already been examined at the trial, the Petit

# Generate tags from the whole data

In [12]:
#generate tf-idf for all documents in your list 
tf_idf_vector = tfidf_transformer.transform(count_vect.transform(test_doc))

results = []
for i in range(tf_idf_vector.shape[0]):
    
    #get vector for each document
    curr_vector=tf_idf_vector[i]
    
    #sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(curr_vector.tocoo())

    #extract only the top n, n=10
    tags=extract_topn_from_vector(feature_names,sorted_items,10)
    
    results.append(tags)

Test_doc_tags=pd.DataFrame(zip(test_doc,results),columns=['Test Document','Test Tags'])
#save document as csv
Test_doc_tags.to_csv('Test tags.csv')
Test_doc_tags

Unnamed: 0,Test Document,Test Tags
0,"\n\nP. Venkatarama Reddi, J.\n\n1. The opinion...","{'assessee': 1.0, 'tax': 0.745, 'income': 0.50..."
1,"\n\nG.B. Pattanaik, J.\n\n1. This batch of Spe...","{'shops': 1.0, 'excise': 0.924, 'liquor': 0.70..."
2,\n\n1. After hearing the Learned Counsel for b...,"{'excise': 1.0, 'notification': 0.519, 'cestat..."
3,\n\n1. This appeal is directed against the jud...,"{'pw': 1.0, 'appellant': 0.807, 'trial': 0.697..."
4,\n\n1. An issue raised by the appellants was t...,"{'search': 1.0, 'shop': 0.73, 'count': 0.311, ..."
...,...,...
96,"\n\nVikramajit Sen, J.\n\n1. This Appeal assai...","{'shares': 1.0, 'buy': 0.473, 'target': 0.37, ..."
97,"\n\nAnil R. Dave, J.\n\n1. Leave granted.\n\n2...","{'lakhs': 1.0, 'sum': 0.985, 'deposited': 0.93..."
98,"\n\nAnil R. Dave, J.\n\n1 This appeal has been...","{'document': 1.0, 'property': 0.85, 'late': 0...."
99,"\n\n1. By this interlocutory application, Mr. ...","{'kerala': 1.0, 'mr': 0.674, 'dogs': 0.591, 's..."
