In [31]:
# import libraries
import nltk
from nltk.corpus import stopwords
# English stop-word set, built once when this cell runs; remove_stopwords()
# checks lower-cased tokens against it.
# NOTE(review): this lookup presumably requires the NLTK "stopwords" corpus to
# be installed already — the download calls below are commented out; confirm.
stop_words = set(stopwords.words("english"))
# nltk.download()
# # download in specific location
# nltk.download('all', download_dir='./stopwords')
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import OrderedDict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import PorterStemmer
import re
import inspect

In [20]:
# read/write json

import json
import os

def JsonUploader(localpath, filename):
    """Load and return the JSON document stored at ``localpath/filename``.

    Args:
        localpath: Directory that contains the file.
        filename: Name of the JSON file inside ``localpath``.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; relying on the platform's default
    # encoding breaks non-ASCII content on systems whose locale is not UTF-8.
    with open(os.path.join(localpath, filename), encoding="utf-8") as json_file:
        return json.load(json_file)

def JsonSaver(dictfile, localpath, filename):
    """Serialise ``dictfile`` as JSON into ``localpath/filename``.

    An existing file at that location is overwritten.

    Args:
        dictfile: JSON-serialisable object to write.
        localpath: Directory in which the file is created.
        filename: Name of the file to write.
    """
    target_path = os.path.join(localpath, filename)
    with open(target_path, 'w') as json_out:
        json.dump(dictfile, json_out)

# # write json
# temp = copy.deepcopy(json1)
# JsonSaver(dictfile=temp, localpath="./", filename="temp.json")
# # read json
# json2 = JsonUploader(localpath="./", filename="temp.json")

In [28]:
# functions to clean data

import unicodedata
import mistune
from bs4 import BeautifulSoup

# 'data_raw' to be created in this format
# temp = {}
# temp['pck1'] = {}
# temp['pck1']['txt1'] = txt1
# data_raw = copy.deepcopy(temp)

# removes stopwords and special chars also
def remove_stopwords(sentence):
    """Return ``sentence`` lower-cased with stop words and non-alphabetic tokens dropped.

    The sentence is split with ``word_tokenize``; a token is kept only when its
    lower-cased form is not in ``stop_words`` and the token is purely
    alphabetic. Kept tokens are joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(sentence):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        if not token.isalpha():
            continue
        kept_tokens.append(lowered)
    return " ".join(kept_tokens)

def generic_clean(sentence):
    """Reduce arbitrary input to lower-case ASCII letters separated by single spaces.

    Steps, in order: convert to ``str``; replace every non-word character and
    every underscore with a space; drop non-ASCII characters; drop the digits
    0-9; collapse runs of whitespace; lower-case.

    Args:
        sentence: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string (empty if nothing survives the cleaning).
    """
    # "\W" treats "_" as a word character, so underscores are replaced separately.
    document = re.sub(r'\W', ' ', str(sentence)).replace("_", " ")
    # Drop non-ASCII characters. Decoding the bytes yields the text directly,
    # instead of slicing the b'...' wrapper off the bytes repr.
    document = document.encode('ascii', 'ignore').decode('ascii')
    # Remove all digits in one pass.
    document = re.sub(r'[0-9]', '', document)
    # Collapse whitespace runs (also trims both ends), then lower-case.
    return " ".join(document.split()).lower()

# need to create 
def clean_web_data(text_raw):
    """Turn README-style markdown into cleaned plain text.

    The markdown is rendered to HTML with mistune and parsed with
    BeautifulSoup. Anchors whose text starts with "http" are removed, other
    anchors are replaced by their contents, and all ``<img>`` and ``<code>``
    elements are removed. The remaining text goes through ``generic_clean``.

    Args:
        text_raw: Markdown (optionally containing inline HTML) as a string.

    Returns:
        The cleaned text produced by ``generic_clean``.
    """
    # escape=False keeps any inline HTML in the source as real tags.
    html = mistune.markdown(text_raw, escape=False)
    soup = BeautifulSoup(html, "html.parser")

    # find_all/unwrap are the current Beautiful Soup names for the legacy
    # findAll/replaceWithChildren aliases.
    for link in soup.find_all('a'):
        if link.text.startswith('http'):
            # The visible text is just a URL: drop the whole anchor.
            link.extract()
        else:
            # Keep the anchor's content, discard only the <a> tag itself.
            link.unwrap()

    # Images carry no text worth keeping.
    for image in soup.find_all('img'):
        image.extract()

    # Code snippets (inline and fenced) are removed entirely.
    for code_block in soup.find_all('code'):
        code_block.extract()

    return generic_clean(soup.text)

def clean_normal_text(text_raw):
    """Clean plain (non-markdown) text by delegating to ``generic_clean``."""
    return generic_clean(text_raw)

def remove_words(text_raw, word):
    """Delete every occurrence of ``word`` from ``text_raw`` and tidy the spacing.

    Matching is by substring, not whole word, so occurrences inside longer
    words are removed too. Whitespace runs left behind are collapsed to single
    spaces and both ends are trimmed.
    """
    remaining_tokens = text_raw.replace(word, "").split()
    return " ".join(remaining_tokens)

def clean_unicode(text_raw):
    """Return an ASCII-only version of ``text_raw``.

    The text is NFKD-normalised and encoded to ASCII with characters that
    cannot be encoded ignored, so an accented letter such as the "î" in
    "Sîne klâwen durh die" comes out as its base letter.

    Args:
        text_raw: Unicode string to convert.

    Returns:
        The ASCII text as a ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text_raw)
    # Decode the bytes instead of slicing str(bytes): the repr escapes newlines,
    # tabs, backslashes and some quotes, which would leak literal "\n" etc.
    # into the result.
    return decomposed.encode('ascii', 'ignore').decode('ascii')
        
# Note: data_clean1 was not declared global inside the function, yet changes to it still took effect (mutating an object in place does not require `global`).

In [25]:
# Stem/Lemmatize with POS Tag

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first letter of the tag selects the constant: N -> noun, J -> adjective,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    tag_initial = tagged[0][1][0].upper()
    if tag_initial == "J":
        return wordnet.ADJ
    if tag_initial == "V":
        return wordnet.VERB
    if tag_initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN


def lemmatize(text1):
    """Lemmatize every whitespace-separated word of every string in ``text1``.

    Each word is lemmatized with ``WordNetLemmatizer`` using the POS returned
    by ``get_wordnet_pos`` for that word.

    Args:
        text1: Iterable of strings (one document or sentence per item).

    Returns:
        A list with one string per input item: its lemmas joined by single
        spaces.
    """
    lemmatizer = WordNetLemmatizer()
    documents = []
    for sen in text1:
        # Collect the lemmas and join once, instead of re-concatenating and
        # re-stripping the growing string for every word.
        lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(word))
                  for word in sen.split()]
        documents.append(" ".join(lemmas))
    return documents


def stem(text1):
    """Stem every token of every string in ``text1`` with the Porter stemmer.

    Works on an iterable of strings. Each string is tokenised with
    ``word_tokenize`` and its stemmed tokens are joined with single spaces.
    Other stemming algorithms could be tried to improve accuracy.

    Returns:
        A list with one stemmed string per input item.
    """
    stemmer = PorterStemmer()
    return [
        ' '.join(stemmer.stem(token) for token in word_tokenize(sentence))
        for sentence in text1
    ]

In [16]:
# Rouge - function to find rouge score between 2 strings

import rouge

def rouge_strings(hypothesis_1, references_1):
    """Compute ROUGE precision, recall and F1 between a hypothesis and a reference.

    Scores are computed for ROUGE-N (up to n=3), ROUGE-L and ROUGE-W with
    averaging enabled, stemming on, and inputs limited to 100 words.

    Args:
        hypothesis_1: Candidate text as a string.
        references_1: Reference text as a string.

    Returns:
        A ``(precision, recall, f1_score)`` tuple of lists, with one entry per
        metric in sorted metric-name order.
        NOTE(review): with max_n=3 the names are presumably rouge-1, rouge-2,
        rouge-3, rouge-l, rouge-w — confirm against the installed rouge package.
    """
    precision = []
    recall = []
    f1_score = []

    # Only the averaged aggregation is used; 'Best' and 'Individual' are the
    # other modes this loop was written to support.
    for aggregator in ['Avg']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        # Defined so the check below cannot raise NameError if another
        # aggregator is enabled later.
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                                max_n=3,
                                limit_length=True,
                                length_limit=100,
                                length_limit_type='words',
                                apply_avg=apply_avg,
                                apply_best=apply_best,
                                alpha=0.5,  # default F1 weighting between precision and recall
                                weight_factor=1.2,
                                stemming=True)

        scores = evaluator.get_scores([hypothesis_1], [references_1])

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best:
                # Individual mode yields a list per metric (each summary vs.
                # each reference); those results are not collected.
                continue
            precision.append(results['p'])
            recall.append(results['r'])
            f1_score.append(results['f'])

    return precision, recall, f1_score

In [44]:
# Generic function:

# retrieve name of variable
def retrieve_name(var):
    """Return the name under which ``var`` is bound in the caller's local scope.

    The caller's frame locals are scanned for entries whose value is the very
    same object as ``var`` (identity, not equality); the first match wins.

    Raises:
        IndexError: If no local variable of the caller refers to ``var``.
    """
    caller_frame = inspect.currentframe().f_back
    matching_names = []
    for candidate_name, candidate_value in caller_frame.f_locals.items():
        if candidate_value is var:
            matching_names.append(candidate_name)
    return matching_names[0]
# # Usage:
# var_name = {}
# retrieve_name(var_name)