### Previous workflow

1. Read the latest news articles from the RSS source (fetch_data)
2. Convert the news articles into a pandas dataframe, open the previous stored dataset, convert that into another pandas dataframe, and merge them, remove duplicates. 
3. Save the compiled file
4. Read the compiled file
5. Drop where url or content is null (for an extreme case)
6. Perform level 1 analysis by adding the supporting keywords into a column as list
7. Continue level 1 analysis by counting the rows with lists with len greater than 0
8. Check level 2 and level 3 continuation by validating if the articles have certain phrases
9. If condition met for level 2, for level 2 perform level 2 analysis by adding the supporting keywords into a column as list
10. Continue level 2 analysis by counting the rows with lists with len greater than 0
11. If condition met for level 3, for level 3 perform level 3 analysis by adding the supporting keywords into a column as list
10. Continue level 3 analysis by counting the rows with lists with len greater than 0
14. Store results (not here)

### Current Workflow

1. Read the latest news articles from the RSS source (fetch_data)
2. Convert the news articles into a pandas dataframe, open the previous stored dataset, convert that into another pandas dataframe, and merge them, remove duplicates. 
6. Perform level 1 analysis if not already previouly done by adding the supporting keywords into a column as list
7. Continue level 1 analysis by counting the rows with lists with len greater than 0
8. Check level 2 and level 3 continuation by validating if the articles have certain phrases
9. If condition met for level 2, for level 2 perform level 2 analysis by adding the supporting keywords into a column as list
10. Continue level 2 analysis by counting the rows with lists with len greater than 0
11. If condition met for level 3, for level 3 perform level 3 analysis by adding the supporting keywords into a column as list
10. Continue level 3 analysis by counting the rows with lists with len greater than 0
14. Store results (not here)

### Why is this fast?

1. Instead of performing the analysis for all the newspapers, this only analyses new dataset
2. articles are cleaned as they are parsed
3. avoiding multiple read writes

In [73]:
from bs4 import BeautifulSoup
import feedparser
from datetime import date
import pandas as pd
import numpy as np
import re  
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import time

stop_words = stopwords.words('english')

In [74]:
data_level1=['arabl land','avail data','bureau statist','busi survey','cens publi','cens pop','children employ','civil registr','collect method','commerci export','complet rate','consum electr','consum energ','data access','data collect','data compil','data entri','data manag','data releas','data standard','data user','demograph data','densit popul','develop data','difusion dat','direct statist','disaggreg data','electr access','electr consumpt','energi consumpt','establish survey','exchang rate','extern debt','fertil rate','food import','food product','gender gap','govern debt','govern statist','gross domest','gross nation','health expenditur','health survey','import marchandis','improv data','improv statist','indic measur','indic preci','inflat rate','institut statist','interest payment','intern tourism','irrig land','land use','life expect','livestock product','merchandis export','merchandis trade','model statist','mortal rate','multilater debt','nation account','nation statist','nation survey','national brut','national statist','open data','part revenus','pay gap','popul census','popul growth','popul rate','price index','produccion aliment','purchas power','qualiti data','receit fiscal','releas data','revenu fiscal','rural popul','servic export','statist agenc','statist author','statist avail','statist committe','statist data','statist depart','statist national','statist offic','statist servic','statist studi','survey catalogu','tax payment','tax revenu','trade balanc','unemploy rate','use data','water suppli','youth unemploy']

data_level2=['accur','adequ','ambigu','ambígu','apropi','bancal','bias','confiabl','correct','deceit','deceiv','decept','defectu','delud','engan','equivoc','erreur','erro','erron','errone','error','exact','exat','fake','fallaci','faux','fiabl','generaliz','illus','imparcial','impartial','imprecis','improp','inaccur','incorrect','inexact','invalid','limit','manipul','mislead','mistaken','parcial','prec','precis','proper','reliabl','rigor','rigour','scientif','sol','solid','som','son','sound','spurious','tromp','trompeur','unbias','unreli','unscientif','unsound','vag','vagu','val','valid',]

data_level3=['data manipul','lead question','manipul dat','report bias','sampl select','sampl size']

data_level_indicator = [' cpi ', ' fdi ', ' gdp ', ' gnp ', ' hdi ', ' wdi ']

filter_list=[' data ',' record ',' research ',' statistics ',' study ']

In [81]:
def lengths_of_keywords():
    return len(data_level1), len(data_level2), len(data_level3)

def cleanHTML(raw_html):
    text = BeautifulSoup(raw_html, "lxml").text
    word_tokens = word_tokenize(text.lower().rstrip()) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    return ' '.join(filtered_sentence)

def performRSSNew(url, newspaper):
    all_links = []
    NewsFeed = feedparser.parse(url)
    entries = NewsFeed.entries
    for entry in entries:
            content =  cleanHTML(entry.content[0].value)
            published = entry.published
            temp_dict = ({'url': entry.link, 
            'content': content , 
            'newspaper': newspaper,
            'published_date': published})
            all_links.append(temp_dict)    
    return all_links

def level1_count(article):
    if article:
        keyword_list = []
        for word in data_level1:
            search_ = (r"\b"+word.split()[0]+r"[a-zA-Z]*\s\b"+word.split()[1]+"[a-zA-Z]*")
            if re.search(search_, article):
                keyword_list.append(word)
        for word in data_level_indicator:
            if (word in article):
                keyword_list.append(word)
        return keyword_list
    return []

def level2_count(article):
    if article:
        keyword_list = []
        for word in data_level2:
            search_ = (r"\b"+word+r"[a-zA-Z]*")
            if re.search(search_, article):
                keyword_list.append(word)
        return keyword_list
    return []


def level3_count(article):
    if article:
        word_tokens = word_tokenize(article.lower().rstrip()) 
        filtered_sentence = [w for w in word_tokens if not w in stop_words] 

        article = ' '.join(filtered_sentence)  
        keyword_list = []
        for word in data_level3:
            search_ = (r"\b"+word+r"[a-zA-Z]*")
            if re.search(search_, article):
                keyword_list.append(word)
        return keyword_list
    return []



def level_2_3_filter(article):
    if article:
        article = article.lower().rstrip()
        for word in filter_list:
            if word in article:
                return 1
        return 0
    return 0

def level_len(count_list):
    return 1 if len(count_list)>0 else 0

In [66]:
def fetch_merge_analyze_data_new(reset_analysis = False):
    start = time.time()
    prev_HT = pd.read_csv('newspaper/static/datasets/all.csv')
    
#     prev_HT['content'] = prev_HT.content.apply(cleanHTML)
#     prev_HT.drop('category', inplace=True, axis=1)
    
    himalayan_times_url = "https://thehimalayantimes.com/feed/"
    raw_HT = pd.DataFrame(performRSSNew(himalayan_times_url)) 
    print ('fetch_data', time.time() - start)
    df_HT = pd.concat([prev_HT, raw_HT], sort=False).drop_duplicates(subset='url', keep="first").reset_index(drop=True) 

    print ('compile_data', time.time() - start)
    
    if (reset_analysis):
        df_HT['level1'] = np.nan
        df_HT['level2'] = np.nan
        df_HT['level3'] = np.nan
        print ('reset_data', time.time() - start)
 
    df_HT['level1'] = df_HT.apply(lambda x: level1_count(x['content']) if pd.isnull(x.level1) else x.level1, axis=1)
    df_HT['level_len'] = df_HT['level1'].apply(level_len)
    
    print ('level_1_analysis', time.time() - start)
    
    df_HT['level_2_3_valid'] = df_HT['content'].apply(level_2_3_filter)
    print ('level_2_filter ', time.time() - start)
    
    df_HT['level2'] =  df_HT.apply(lambda x: level2_count(x['content']) if pd.isnull(x.level2) else x.level2, axis=1)
    df_HT['level2_len'] = df_HT.level2.apply(level_len)
    print ('level_2_analysis ', time.time() - start)
    
    df_HT['level3'] =  df_HT.apply(lambda x: level3_count(x['content']) if pd.isnull(x.level3) else x.level3, axis=1)
    df_HT['level3_len'] = df_HT.level3.apply(level_len)
    print ('level_3_analysis ', time.time() - start)
    
#     df_HT.to_csv('newspaper/static/datasets/ht.csv', index=False)
    return df_HT

In [69]:
df = fetch_merge_analyze_data_new(reset_analysis=False)

fetch_data 0.9882938861846924
compile_data 0.9962596893310547
level_1_analysis 1.2743253707885742
level_2_filter  1.397298812866211
level_2_analysis  1.6033315658569336
level_3_analysis  1.8226597309112549


In [68]:
df

Unnamed: 0,url,content,published_date,level1,level_len,level_2_3_valid,level2,level3,level2_len,level3_len
0,https://thehimalayantimes.com/finance/analysis...,foreign aid plays critical role infrastructure...,"Sun, 27 Sep 2015 14:12:37 +0000",[ gdp ],1,0,"[proper, reliabl]",[],1,0
1,https://thehimalayantimes.com/finance/analysis...,experts opine loss incurred country calculated...,"Sat, 26 Sep 2015 19:45:19 +0000",[inflat rate],1,0,"[prec, precis, sol]",[],1,0
2,https://thehimalayantimes.com/finance/analysis...,"kathmandu : coca-cola recently came campaign ,...","Sun, 20 Sep 2015 14:08:57 +0000",[],0,0,"[proper, som]",[],1,0
3,https://thehimalayantimes.com/finance/analysis...,clarity government ’ plans programmes hydropow...,"Sun, 20 Sep 2015 13:41:56 +0000",[ fdi ],1,0,[proper],[],1,0
4,https://thehimalayantimes.com/finance/analysis...,"working 66 days year , able prosper ? kathmand...","Sun, 13 Sep 2015 11:15:42 +0000",[ gdp ],1,0,"[limit, proper, sol]",[],1,0
...,...,...,...,...,...,...,...,...,...,...
4892,https://thehimalayantimes.com/nepal/majority-m...,"kathmandu , november 9 five members nepal comm...","Tue, 10 Nov 2020 01:45:47 +0000",[],0,0,[],[],0,0
4893,https://thehimalayantimes.com/nepal/three-apf-...,birgunj : government set three armed police fo...,"Mon, 09 Nov 2020 16:20:58 +0000",[],0,0,[],[],0,0
4894,https://thehimalayantimes.com/nepal/one-held-i...,hetauda : police sunday arrested one person co...,"Mon, 09 Nov 2020 16:08:43 +0000",[],0,0,[],[],0,0
4895,https://thehimalayantimes.com/nepal/dao-prohib...,"lamjung : owing surge covid-19 transmission , ...","Mon, 09 Nov 2020 15:57:43 +0000",[],0,0,[],[],0,0


### Fetch all

In [82]:
newspapers = [('https://thehimalayantimes.com/feed/', 'ht'), 
              ('https://english.onlinekhabar.com/feed', 'ok'),
              ('https://www.nepalitimes.com/feed/', 'nt'),
              ('https://kathmandutribune.com/feed/', 'kt'),
              ('http://english.lokaantar.com/feed/', 'lk'),
              ('https://www.nepalisansar.com/feed/', 'ns'),
              ('http://telegraphnepal.com/feed/', 'tn'),
              ]

In [91]:
def fetch_merge_analyze_data_new(reset_analysis = False):
    start = time.time()
    prev_all = pd.read_csv('newspaper/static/datasets/all.csv')

    for newspaper in newspapers:
        temp = pd.DataFrame(performRSSNew(newspaper[0], newspaper[1]))
        df_ALL = pd.concat([prev_all, temp], sort=False)
    
    df_ALL = df_ALL.drop_duplicates(subset='url', keep="first").reset_index(drop=True) 
     
    print ('fetch_data and compile_data', time.time() - start)
    
    if (reset_analysis):
        df_ALL['level1'] = np.nan
        df_ALL['level2'] = np.nan
        df_ALL['level3'] = np.nan
        print ('reset_data', time.time() - start)
 
    df_ALL['level1'] = df_ALL.apply(lambda x: level1_count(x['content']) if pd.isnull(x.level1) else x.level1, axis=1)
    df_ALL['level_len'] = df_ALL['level1'].apply(level_len)
    
    print ('level_1_analysis', time.time() - start)
    
    df_ALL['level_2_3_valid'] = df_ALL['content'].apply(level_2_3_filter)
    print ('level_2_filter ', time.time() - start)
    
    df_ALL['level2'] =  df_ALL.apply(lambda x: level2_count(x['content']) if pd.isnull(x.level2) else x.level2, axis=1)
    df_ALL['level2_len'] = df_ALL.level2.apply(level_len)
    print ('level_2_analysis ', time.time() - start)
    
    df_ALL['level3'] =  df_ALL.apply(lambda x: level3_count(x['content']) if pd.isnull(x.level3) else x.level3, axis=1)
    df_ALL['level3_len'] = df_ALL.level3.apply(level_len)
    print ('level_3_analysis ', time.time() - start)
    
#     df_ALL.to_csv('newspaper/static/datasets/ht.csv', index=False)
    return df_ALL

In [92]:
df = fetch_merge_analyze_data_new(reset_analysis=False)

fetch_data and compile_data 10.597536087036133
level_1_analysis 10.762979745864868
level_2_filter  10.908011198043823
level_2_analysis  11.077815771102905
level_3_analysis  11.251969575881958
