In [4]:
# !pip3.6 install python-Levenshtein

In [5]:
import requests
import re
import pandas as pd
import datetime
import Levenshtein

In [6]:
# GLOBAL PARAMETERS
# Minimum length, in characters, that a comment must reach before it is scored:
# perc_copied checks the raw comment against it and perc_copied_from_article
# checks the combined length of the comment's sentences; both return 0.0 below it.
CMT_LENGTH_THRESHOLD = 100

In [7]:
def get_entity_detail(msid, timeout=10):
    """
    returns detail of the entity as json retrieved from cmsgraphread1 API call

    msid    : id of the article; it is placed in the query as a string
    timeout : seconds to wait for the server before giving up on the request

    Returns the first element of the response's 'entities' list, or None when
    the request fails or times out, or when the response does not have that
    shape. Failures are printed, not raised.
    """
    url = 'http://cmsgraphread1.indiatimes.com/multicontentdetailswohost?q={"type":"msid","id":"' + str(msid) + '","activationStatus":{"status":[0,1,2,3,4]}}'
    try:
        # without an explicit timeout requests waits indefinitely on a host
        # that accepts the connection but never answers
        response = requests.get(url, timeout=timeout)
        return response.json()['entities'][0]
    except Exception as ex:
        print('Exception in retrieving entity detail for msid : {}'.format(msid), ex)
        return None

In [8]:
# entity_detail['story'], entity_detail['subject']

In [9]:
def remove_tags(text):
    """
    replaces every <...> markup tag found in the text with a single space
    """
    return re.sub(r'<[^>]+>', ' ', text)

In [10]:
def clean_text(string):
    """
    clean text function used to clean article content and comment text

    Steps, in order: newlines and <br> tags become " . ", remaining tags are
    replaced by spaces, html entities are removed, tabs become spaces, a fixed
    set of punctuation/symbol characters is removed, runs of two or more
    whitespace characters collapse to one space, and the result is stripped.
    """
    # break signifiers are turned into full stops so the text later splits
    # into sentences properly
    for break_pattern in (r"\n", r"<br[ /]*>"):
        string = re.sub(break_pattern, " . ", string)

    string = clean_html_entities(remove_tags(string))

    string = re.sub(r"\t", " ", string)
    string = re.sub(r"[()!?\'\`\"\”\“\‘\’\′\″\\\/*$«»°@#≈≠≤≥<>]", "", string)
    return re.sub(r"\s{2,}", " ", string).strip()

In [11]:
def clean_html_entities(string):
    """
    removes html entities which are sometimes copied when trying to copy from the template

    string : text that may contain named entities such as &amp; or &nbsp;

    Returns the text with every listed entity deleted (not decoded). The
    groups are applied one after another, so an entity that only appears once
    an earlier group has been deleted is still caught by a later group.
    """
    entity_groups = (
        r"&ldquo;|&rdquo;|&lsquo;|&rsquo;|&mdash;|&ndash;|&plusmn;|&deg;|&laquo;|&raquo;|&quot;",
        r"&ne;|&le;|&ge;|&lt;|&gt;|&asymp;|&prime;|&Prime;|&bull;",
        r"&zwj;|&zwnj;|&lrm;|&rlm;|&bdquo;|&hellip;|&permil;|&lsaquo;|&rsaquo;|&oline;|&frasl;|&ensp;|&emsp;|&thinsp;",
        # "&cent;" added: the list only had the misspelt "&:cent;" (kept for
        # compatibility); the dangling "|" that ended this group is dropped
        r"&amp;|&nbsp;|&cent;|&:cent;|&brvbar;|&brkbar;|&sect;|&uml;|&copy;|&laqu;|&not;|&reg;|&hibar;|&sup1;|&sup2;|&sup3;",
    )
    for group in entity_groups:
        string = re.sub(group, "", string)
    return string

In [12]:
def perc_copied_from_article(cmt, raw_art_body, LD_based=True, debug=False):
    """
    returns the fraction of the comment that was copied from the given article body
    process - the comment is split into sentences, each sentence contributes its
    matched length (see LD_factor) against the cleaned article body, and the sum
    is divided by the combined length of the comment's sentences

    Returns 0.0 when the comment or the cleaned article body is empty, or when
    the combined sentence length is below CMT_LENGTH_THRESHOLD.
    """
    cleaned_art_body = clean_text(raw_art_body)

    # nothing to compare when either side is empty
    if len(cmt) == 0 or len(cleaned_art_body) == 0:
        return 0.0

    # sentences are delimited by full stop or purnaviram
    sentences = split_into_sentences(cmt)

    # the denominator is the length of the kept sentences, not len(cmt), so
    # fragments dropped by the splitter do not dilute the score
    tot_len = sum(len(sentence) for sentence in sentences)
    if tot_len < CMT_LENGTH_THRESHOLD:
        return 0.0

    sum_match_len = sum(
        LD_factor(sentence, cleaned_art_body, LD_based=LD_based, debug=debug)
        for sentence in sentences
    )
    return sum_match_len / tot_len

In [13]:
def split_into_sentences(text):
    """
    returns a list of sentences from the given text

    The text is cut at every '.', '|' or '।'; each piece is stripped and pieces
    shorter than 5 characters are discarded.
    """
    MIN_SENTENCE_LENGTH = 5
    sentences = []
    for piece in re.split('[.|।]', text):
        piece = piece.strip()
        if len(piece) >= MIN_SENTENCE_LENGTH:
            sentences.append(piece)
    return sentences

In [14]:
def test_split_sentences_in_cmt(cmt):
    """
    splits the given text into sentences by logic and prints them
    note - not actually used in code.. for testing purposes

    The text is passed through clean_text first; each sentence is printed as
    "<1-based position> : <sentence>". Nothing is returned, so there is no
    point in printing the result of this call.
    """
    # the minimum sentence length is applied inside split_into_sentences; the
    # unused local copy that used to live here was removed
    for position, sentence in enumerate(split_into_sentences(clean_text(cmt)), start=1):
        print("{} : {}".format(position, sentence))

In [15]:
def LD_factor(sent, art_body, LD_based=True, debug=False):
    """
    returns the contributing length of the given sentence to the overall match length

    sent     : one sentence of the comment
    art_body : article text the sentence is looked up in
    LD_based : when True, a sentence that is not found verbatim is compared by
               Levenshtein distance against each sentence of the article
    debug    : print details of the match that was found

    Returns an int:
      - len(sent) when the sentence occurs verbatim in art_body
      - len(sent) minus the smallest Levenshtein distance to any article
        sentence, when that is at least 80% of len(sent)
      - 0 otherwise (including when the article yields no sentences at all)
    """
    MIN_MATCH_RATIO = 0.8

    # if completely found in article body
    if sent in art_body:
        if debug:
            print('exact match found for sentence')
            print('sent : {}'.format(sent))
        return len(sent)

    if not LD_based:
        return 0

    art_split_sentences = split_into_sentences(art_body)
    if not art_split_sentences:
        # an article with no sentence long enough to keep leaves nothing to
        # compare against; min() over an empty list would raise ValueError
        return 0

    # lowest LD for the given sentence among the sentences in the article body
    # TODO - optimize this for the case when LD found to be 0
    list_ld_values = [Levenshtein.distance(sent, x) for x in art_split_sentences]
    min_LD_value = min(list_ld_values)
    match_len = max(len(sent) - min_LD_value, 0)

    # to exclude the cases where multiple contextually similar sentences
    # contribute to a good overall score
    # (sent cannot be empty here: an empty string is always "in" art_body)
    if match_len / len(sent) < MIN_MATCH_RATIO:
        return 0

    if debug:
        # show the closest article sentence
        print(min_LD_value, match_len, art_split_sentences[list_ld_values.index(min_LD_value)])
        print("sent: {}".format(sent))
        print()
    return match_len

In [16]:
# get_entity_detail(72018567)['subject']

In [17]:
# clean_text(get_entity_detail(71470538)['subject'] + get_entity_detail(71470538)['story'])

In [14]:
def perc_copied(cmt, msid, LD_based=True, debug=False):
    """ 
    returns the copied fraction given comment text and msid of the article

    cmt      : raw comment text
    msid     : id of the article the comment was posted on
    LD_based : passed on to perc_copied_from_article
    debug    : passed on to perc_copied_from_article

    Returns a float: 0.0 for comments shorter than CMT_LENGTH_THRESHOLD, 1.0
    when the whole comment (raw or cleaned) occurs in the article text, and
    otherwise the value computed by perc_copied_from_article.

    Raises ValueError when the article detail could not be retrieved.
    """
    if len(cmt) < CMT_LENGTH_THRESHOLD:
        return 0.0

    entity_detail = get_entity_detail(msid)
    if entity_detail is None:
        # get_entity_detail reports failures by returning None; fail with a
        # clear message instead of a TypeError on the membership test below
        raise ValueError('no entity detail available for msid : {}'.format(msid))

    # a missing or null field counts as empty text
    raw_art_body = (entity_detail.get('subject') or "") + " " + (entity_detail.get('story') or "")

    cleaned_cmt = clean_text(cmt)
    # an empty string is "in" every string, so a comment that cleans down to
    # nothing must not be treated as a full copy
    if cmt in raw_art_body or (cleaned_cmt and cleaned_cmt in clean_text(raw_art_body)):
        return 1.0
    return perc_copied_from_article(cleaned_cmt, raw_art_body, LD_based=LD_based, debug=debug)

In [18]:
# Fetch one article and print its raw text (subject, a space, then story) so
# the markup that clean_text has to deal with can be inspected.
entity_detail = get_entity_detail(71644780)
raw_art_body = " ".join([entity_detail.get('subject', ""), entity_detail.get('story', "")])
print(raw_art_body)

The Tata Sons' Chairman Emeritus recently shared that he was an accidental startup investor.  <em>The Tata Sons' Chairman Emeritus recently shared that he was an accidental startup investor. This is what he told the BOHECO co-founder in whose company he has invested.<br /></em><br />Earlier in the week during an interview, Ratan Tata, Chairman Emeritus of Tata Sons shared that he began startup investment by accident. After retiring, he said that he "made small token investments" from his pocket "in what he considered to be exciting companies."<br /><br />One of these companies was the Bombay Hemp Company. <br /><br />Yash Kotak, one of the co-founders of BOHECO told ET Panache, "People like Mr. Tata don’t just build businesses, they build legacies. He had once said, 'I don’t believe in taking the right decisions. I take decisions then make them right.' This statement has quite an ironic representation of what we’ve tried to do with BOHECO."<br /><br />Kotak went on to share that at the

In [None]:
# Bare names only: evaluating them checks that both scoring functions are
# defined in the running kernel. No call is made here.
perc_copied
perc_copied_from_article

In [15]:
# Sanity check: score one Hindi comment against article msid_1, then show how
# the comment is split into sentences.
cmt_1 = "निजाम मीर उस्मान अली खान ने लंदन स्थित नेटवेस्ट बैंक में 1,007,940 पाउंड (करीब 8 करोड़ 87 लाख रुपये) जमा कराए थे"
msid_1 = 71408304
print(perc_copied(cmt_1, msid_1, True))
# test_split_sentences_in_cmt prints the sentences itself and returns None, so
# it is called directly; wrapping it in print() only added a stray "None"
test_split_sentences_in_cmt(cmt_1)

1.0
1 : निजाम मीर उस्मान अली खान ने लंदन स्थित नेटवेस्ट बैंक में 1,007,940 पाउंड करीब 8 करोड़ 87 लाख रुपये जमा कराए थे

 None


In [None]:
# test_split_sentences_in_cmt(clean_text(cmt_1))

In [25]:
# entity_detail = get_entity_detail(72021252)

In [16]:
# entity_detail = get_entity_detail(72021252)
# raw_art_body = (entity_detail['subject'] if ('subject' in entity_detail) else "") + " " + (entity_detail['story'] if ('story' in entity_detail) else "")
# print(raw_art_body)

In [None]:
import urllib.request, json 
import math

# Not assigned a real value here and not read by any of the cells visible in
# this notebook.
BASE_ELASTIC_URL = None
# Default page size (size_param) used by generator_url_result.
BATCH_SIZE_PARAM = 250
# ITER_BUFFER = 2
# Default file_path of generator_url_result and createNewCsv; None means the
# caller is expected to pass a path explicitly.
CSV_FILE_NAME = None
# Default list of comment fields extracted by getCsvListFromListComments.
DEFAULT_FIELDS = ['C_T']

In [None]:
# Placeholders for the bounds of the query time window; the driver loop at the
# end of the notebook assigns the actual values on every iteration.
startEpoch = None
endEpoch = None

In [None]:
def getJsonFromUrl(url_param):
    """
    fetches the given url and returns the response body parsed as json
    """
    with urllib.request.urlopen(url_param) as response:
        raw_body = response.read()
    return json.loads(raw_body.decode())

In [None]:
def getCsvListFromListComments(list_comments, fields_to_take = DEFAULT_FIELDS):
    """
    turns a list of comment dicts into a list of rows, one row per comment

    list_comments  : iterable of comment dicts; comments without a 'C_T' key are skipped
    fields_to_take : names of the fields to extract, in output order. A name
                     containing "." is read as "<outer>.<inner>" and looked up
                     one level deep (only the first two dot-separated parts
                     are used).

    A field that is absent from a comment contributes "" to its row.
    """
    def pick(comment, field_name):
        # value of one field for one comment, "" when it is not there
        if "." in field_name:
            parts = field_name.split(".")
            outer, inner = parts[0], parts[1]
            if outer in comment and inner in comment[outer]:
                return comment[outer][inner]
            return ""
        return comment[field_name] if field_name in comment else ""

    return [
        [pick(comment, field_name) for field_name in fields_to_take]
        for comment in list_comments
        if 'C_T' in comment
    ]

In [51]:
def getModifiedUrl(url, from_param, size_param, startEpoch=None, endEpoch=None):
    """
    returns url with the paging parameters appended, plus the date-range
    parameters when both startEpoch and endEpoch are given
    """
    pieces = [url, "&from=", str(from_param), "&size=", str(size_param)]
    if startEpoch is not None and endEpoch is not None:
        pieces += ["&sDateEpoch=", str(startEpoch), "&eDateEpoch=", str(endEpoch)]
    return "".join(pieces)

In [52]:
def getModifiedUrlDate(url, startEpoch=None, endEpoch=None):
    """
    returns url with the date-range parameters appended when both startEpoch
    and endEpoch are given; otherwise returns url unchanged
    """
    if startEpoch is None or endEpoch is None:
        return url
    return "".join([url, "&sDateEpoch=", str(startEpoch), "&eDateEpoch=", str(endEpoch)])

In [None]:
def generator_url_result(url, fields_to_take = DEFAULT_FIELDS, size_param = BATCH_SIZE_PARAM, file_path = CSV_FILE_NAME, force = False, to_download_till = None):
    """
    generator that pages through the search results of `url` and yields one
    list of rows (see getCsvListFromListComments) per page

    url              : query url; "&from=" / "&size=" are appended per page
    fields_to_take   : fields extracted from each hit's '_source'
    size_param       : page size
    file_path        : not used by this function; kept so existing calls keep working
    force            : when the result count exceeds 10000, False aborts (nothing
                       is yielded) and True downloads only the first 10000
    to_download_till : optional cap on the number of results to download

    The result count is read from ['hits']['total'] of the first response and
    is used as a number.
    """
    ELASTIC_LIMIT = 10000

    json_raw_response = getJsonFromUrl(getModifiedUrl(url, 0, size_param))
    total_comment_count = json_raw_response['hits']['total']

    if to_download_till is not None and to_download_till < total_comment_count:
        total_comment_count = to_download_till

    if total_comment_count > ELASTIC_LIMIT:
        print("total_comment_count greater than 10000: %d" %(total_comment_count))
        # boolean `not`, not bitwise `~`: ~True is -2 and ~False is -1, both
        # truthy, so `if ~force` aborted even when force=True was passed
        if not force:
            print("aborting")
            return
        total_comment_count = ELASTIC_LIMIT

    num_iters = math.ceil(total_comment_count/size_param)

    print("total_count : %d" %(total_comment_count))
    print("num_iters : %d" %(num_iters))

    for iter_val in range(num_iters):
        print("iteration : %d" %(iter_val))
        if iter_val > 0:
            # page 0 is the response already fetched above for the count, so
            # only later pages need a request
            from_val = iter_val * size_param
            json_raw_response = getJsonFromUrl(getModifiedUrl(url, from_val, size_param))
        list_comments = [x['_source'] for x in json_raw_response['hits']['hits']]
        yield getCsvListFromListComments(list_comments, fields_to_take)

In [None]:
def createNewCsv(file_path = CSV_FILE_NAME):
    """
    creates an empty file at file_path; an existing file is truncated
    """
    open(file_path, 'w+').close()

In [None]:
def get_counts_identified_and_total_V2(csv_list, file_path):
    """
    scores every comment row against its article and appends the rows that
    score above the threshold to a csv file

    csv_list  : list of rows, each indexed as [c_id, comment text, msid]
    file_path : csv file to append to; the header row is written only when the
                file is still empty

    Returns (identified_count, total_count): the number of rows written and the
    number of rows received. A row whose scoring raises is reported on stdout
    and skipped.
    """
    PERC_THRESHOLD = 0.6
    total_count = len(csv_list)

    df_list = []
    for elem in csv_list:
        try:
            perc = perc_copied(elem[1], elem[2])
            if perc > PERC_THRESHOLD:
                print(elem[0], perc, elem[2], elem[1])
                df_list.append([elem[0], perc, elem[2], elem[1]])
        except Exception as ex:
            print('Exception for c_id : {}'.format(elem[0]), ex)

    identified_count = len(df_list)
    df_csv = pd.DataFrame(df_list, columns = ['c_id', 'perc', 'msid', 'C_T'])

    # the encoding has to be set on open(): to_csv ignores its own `encoding`
    # argument for an already-open text handle, and the platform default may
    # not be able to represent the comment text. newline='' stops the text
    # layer from translating the line endings pandas writes.
    with open(file_path, 'a', encoding='utf-8', newline='') as f:
        df_csv.to_csv(f, index = False, header=f.tell()==0)

    return identified_count, total_count

In [None]:
def get_datetime_epoch(epoch_val):
    """
    returns datetime according to GMT+5:30

    epoch_val is handed to datetime.utcfromtimestamp, which reads it as seconds
    since the epoch. NOTE(review): the epochs used by the driver cell of this
    notebook are in milliseconds and would need dividing by 1000 first.
    """
    ist_offset = datetime.timedelta(hours=5, minutes=30)
    utc_time = datetime.datetime.utcfromtimestamp(epoch_val)
    return utc_time + ist_offset

In [None]:
# Driver: for every channel, walk backwards from NET_END_TIME_EPOCH in
# fixed-size time windows, score every comment fetched for each window and
# append the comments identified as copied to a per-channel csv file.

# channel_list_str = "NBTO,MTO,ET,GTech,TOI"
channel_list_str = "NBTO,MTO"
channel_list = channel_list_str.split(",")

# settings shared by all channels
# the order of fields_to_take fixes the row layout [c_id, C_T, msid] that
# get_counts_identified_and_total_V2 indexes into
fields_to_take = ['c_id', 'C_T', 'msid']
batch_write_size = 500
date_file = "1_15_oct_V9"

NET_END_TIME_EPOCH = 1571164200000   # epoch in milliseconds
NUM_DAYS_IN_ONE_ITER = 0.25          # length of one window, in days
NUM_ITERS = 15*4                     # 60 windows of 0.25 day = 15 days

for channels in channel_list:
    print(channels)
    base_url = "http://commentmoderator.indiatimes.com/mytimes/elasticCommentQuery?sort=desc&appKey={}&filterCommentStatus=APPROVED,REJECTED,UNVERIFIED".format(channels)

    csv_file_path = '{}_{}.csv'.format(channels, date_file)
    createNewCsv(file_path = csv_file_path)

    # each window ends where the previous (later) one started
    startEpoch = NET_END_TIME_EPOCH

    total_count = 0
    identified_count = 0

    for iter_val in range(0, NUM_ITERS, 1):
        print("\nIter_val : %d" %(iter_val))
        endEpoch = startEpoch
        startEpoch = endEpoch - int(NUM_DAYS_IN_ONE_ITER*24*60*60*1000)

        url = getModifiedUrlDate(base_url, startEpoch, endEpoch)
        print("url : " + url)
        for csv_list in generator_url_result(url, fields_to_take = fields_to_take, size_param = batch_write_size, file_path = csv_file_path):
            n_identified_count, n_total_count = get_counts_identified_and_total_V2(csv_list, csv_file_path)
            total_count = total_count + n_total_count
            identified_count = identified_count + n_identified_count

    print("\nnet startEpoch : {}".format(startEpoch))
    print("net endEpoch : {}".format(NET_END_TIME_EPOCH))
    # a channel with no comments in the whole period must not stop the run
    # with a ZeroDivisionError; its ratio is reported as 0.0
    identified_ratio = identified_count/total_count if total_count else 0.0
    print(channels, identified_count, total_count, identified_ratio)
    print("\n\n================================================================================\n\n")