                                        README:

                                      Instructions:

1) For the first prompt, please respond ‘wall street journal’ or ‘Wall Street Journal’.

2) For the second prompt, please input a keyword you are interested in. 

3) For the third prompt, please enter a START date in the form YYYY/MM/DD, as shown in the prompt (e.g. 2019/01/01).

4) For the fourth prompt, please enter an END date in the form YYYY/MM/DD, as shown in the prompt (e.g. 2020/01/01).

5) For the fifth prompt, asked after the dataframes are computed, please answer ‘A’, ‘B’, or ‘C’ according to the prompt. 

It is normal for the code to take some time to compute the dataframes (it took 40 seconds for my computer to compute 1 month’s worth of ‘trump’ articles, which ended up being 913 articles. It took more than a minute to compute 1 year’s worth of ‘egg’ articles). I advise you to choose start and end dates that are close to one another (<1 year apart), especially with topics that might yield many results (ex. ‘trump’). Popular topics, such as ‘trump’, are so densely covered that they will yield an extremely large number of articles, which will take a very long time to compute. My computer times out (or reaches the return limit) after around 1500 articles. 

It is a good idea to restart the Kernel after each run. 

Also, please keep in mind that my code does not account for duplicates, or the re-publication of the same article in different subsections (ex. Travel, Business, etc.). This might cause some dates to look out of place in bigger (multi-year) datasets, and is explained more in the document below. 



In [1]:
import urllib.request
import bs4
import lxml
import re
import datetime
import pprint
from textblob import TextBlob
import pandas as pd
import seaborn
import plotly.express as px

In [2]:
def bias_report():
    """Interactively run the Wall Street Journal bias report.

    Prompts for a newspaper name, a keyword and a start/end date, fetches the
    matching search results with ``wsj_bias``, scores them with
    ``sentiment_analyzer`` and passes the scores to ``graph_bias``.  Any
    newspaper name that does not contain "wall street" (in any letter case)
    is rejected with a message and nothing else happens.
    """
    # Local import so this block does not rely on the file-level import list.
    from urllib.parse import quote_plus

    print('Welcome to the Bias Report.')
    newspaper=input('Enter Newspaper Name:')   #ask for user inputs
    # URL-encode the keyword once, here: it is only ever used to build search
    # URLs, and a raw keyword containing a space would make the URL invalid.
    keyword=quote_plus(input('Enter a Keyword:'))
    time1=input('Enter START date (YYYY/MM/DD):')
    time2=input('Enter END date (YYYY/MM/DD):')

    # Case-insensitive match, so "WALL STREET JOURNAL" etc. are accepted too.
    if 'wall street' not in newspaper.lower():
        print('Please input "wall street journal" or "Wall Street Journal" as the newspaper. If confused, look at instructions.')
        return

    url_temp=(
        'https://www.wsj.com/search/term.html?KEYWORDS='+keyword
        +'&min-date='+time1+'&max-date='+time2
        +'&daysback=4y&isAdvanced=true&andor=AND&sort=date-desc'
        +'&source=wsjarticle,wsjblogs,wsjvideo,interactivemedia,sitesearch,wsjpro&page='+'1'
    )
    # Start on page '1' with both empty-page counters at zero.
    dict_title,dict_summary,answer,construction=wsj_bias(url_temp,keyword,time1,time2,'1',0,0)
    sentiment,sentiment_titles,sentiment_summaries=sentiment_analyzer(dict_summary)
    graph_bias(sentiment,sentiment_titles,sentiment_summaries,dict_title,construction)
        
        
        
        
       
        
        
        

In [3]:
def sentiment_analyzer(summary_library):
    """Score each article's title and summary with TextBlob polarity.

    ``summary_library`` maps an article title to its summary text.  Three
    dictionaries keyed by title are returned, in this order: the mean of the
    title and summary polarity, the title polarity alone, and the summary
    polarity alone.
    """
    combined_scores = {}
    title_scores = {}
    summary_scores = {}

    for headline, blurb in summary_library.items():
        headline_polarity = TextBlob(headline).sentiment.polarity
        blurb_polarity = TextBlob(blurb).sentiment.polarity

        title_scores[headline] = headline_polarity
        summary_scores[headline] = blurb_polarity
        # Net score is the plain average of the two polarities.
        combined_scores[headline] = (headline_polarity + blurb_polarity) / 2

    return combined_scores, title_scores, summary_scores


In [9]:
def _count_polarity(scores):
    """Return ``(positive, negative, neutral)`` counts for a list of polarity scores."""
    positive = sum(1 for score in scores if score > 0)
    negative = sum(1 for score in scores if score < 0)
    # Everything that is neither > 0 nor < 0 is counted as neutral.
    return positive, negative, len(scores) - positive - negative


def graph_bias(sentiment,sentiment_titles,sentiment_summaries,dict_title,construction):
    """Print the sentiment table, then plot the score series the user picks.

    Parameters
    ----------
    sentiment, sentiment_titles, sentiment_summaries:
        Dicts keyed by article title holding the averaged, title-only and
        summary-only polarity scores (the three values returned by
        ``sentiment_analyzer``).
    dict_title:
        Dict mapping article title to its date; used as the x axis and as the
        DataFrame index.
    construction:
        Count of empty result pages reported by the scraper.

    Returns
    -------
    None.  The function prints a DataFrame, asks on stdin which series to plot
    ('A', 'B' or 'C'), shows a plotly scatter plot and prints a one-line tally.
    """
    titles = list(sentiment)
    # Look each date up by title instead of relying on dict_title having the
    # same insertion order as the sentiment dicts, so rows cannot be misaligned.
    dates = [dict_title.get(title) for title in titles]

    sentiment_final = list(sentiment.values())
    sentiment_title_final = list(sentiment_titles.values())
    sentiment_summary_final = list(sentiment_summaries.values())

    df = pd.DataFrame(titles, index=dates, columns=['Title'])
    df['Net Sentiment'] = sentiment_final
    df['Title Score'] = sentiment_title_final
    df['Summary Score'] = sentiment_summary_final

    print(df)

    ans=input('Would you like a graph of the (A) Title and Summary Average Sentiment (A) , (B) Only Title Sentiment (B), or (C) Only Summary Sentiment (C) ?')
    # Compare by value, not identity (`is` on str literals only works by an
    # interning accident), and tolerate lower case / stray whitespace.
    choice = ans.strip().upper()
    length = len(titles)

    #the formatting of the graph (title, axis names, hover formatting) were taken from plotly's online manual,
    #on sites such as https://plot.ly/python/axes/, https://plot.ly/python/hover-text-and-formatting/,
    #https://plot.ly/python/text-and-annotations/, https://plot.ly/python/line-and-scatter/.

    # choice -> (scores to plot, figure title, y-axis label)
    views = {
        'A': (sentiment_final, "Title and Summary Combined Sentiment", "Combined Sentiment"),
        'B': (sentiment_title_final, "Title Sentiment", "Title Sentiment"),
        'C': (sentiment_summary_final, "Summary Sentiment", "Summary Sentiment"),
    }

    if choice not in views:
        print('Try again! Please only use CAPITAL A, B or C.')
        return

    scores, heading, axis_label = views[choice]
    positive, negative, neutral = _count_polarity(scores)

    # wsj_bias adds one to `construction` for every empty page, including the
    # five consecutive empty pages that make it stop.  The original code
    # removed those five only for choice 'B'; apply the same correction to
    # every choice, and never report a negative number.
    trailing_empty_pages = 5
    updating = max(int(construction) - trailing_empty_pages, 0)

    figure = px.scatter(x=dates, y=scores, hover_name=titles)
    figure.update_yaxes(range=[-1,1])     # polarity scores lie in [-1, 1]
    figure.update_layout(
        title=heading,
        xaxis_title="Date",
        yaxis_title=axis_label,
        font=dict(
            family="Times New Roman, monospace",
            size=18,
            color="#d62728"
        )
    )
    figure.show()

    print('In',length,'articles,',positive,'are positive,',negative,'are negative, and',neutral,'are neutral. ',updating,'articles are being updated.')
        

        
    

In [11]:
def _wsj_parse_timestamp(raw, today):
    """Turn one WSJ date stamp into a ``datetime`` at midnight of its calendar day.

    ``raw`` is the text of the ``<time>`` element; ``today`` is the reference
    "now" used for relative stamps ("... hours ago", "... min ago").
    Raises ``ValueError`` when the stamp matches none of the handled formats.
    """
    import datetime
    import re

    val = raw
    if 'Just' in val:
        val = val[5:]                      # drop the 5-character prefix

    # WSJ abbreviates month names only some of the time; expand the first
    # abbreviation found so strptime's %B can parse the result.
    month_names = (
        ('Jan.', 'January'), ('Feb.', 'February'), ('Mar.', 'March'),
        ('Apr.', 'April'), ('Jun.', 'June'), ('Jul.', 'July'),
        ('Aug.', 'August'), ('Sep.', 'September'), ('Oct.', 'October'),
        ('Nov.', 'November'), ('Dec.', 'December'),
        # Also accept the four-letter form; harmless if the site never uses it.
        ('Sept.', 'September'),
    )
    for abbreviation, full_name in month_names:
        if abbreviation in val:
            val = val.replace(abbreviation, full_name)
            break

    if 'hour' in val or 'min' in val:
        # Relative stamp: join every digit in the text into one number.
        amount = int(''.join(re.findall(r'\d', val)))
        if 'hour' in val:
            moment = today - datetime.timedelta(hours=amount)
        else:
            moment = today - datetime.timedelta(minutes=amount)
    else:
        # Absolute stamp: cut the trailing two-character time zone, keeping
        # the space before it (the format string ends with a space).
        moment = datetime.datetime.strptime(val[0:-2], '%B %d, %Y %I:%M %p ')

    # Only the calendar day matters for the range comparison.
    return moment.replace(hour=0, minute=0, second=0, microsecond=0)


def wsj_bias(url_temp,keyword,time1,time2,pagenum_str,end,construction):
    """Scrape one WSJ advanced-search results page, then recurse to the next.

    Parameters
    ----------
    url_temp : URL of the results page to fetch.
    keyword : search term; inserted verbatim into the next page's URL, so it
        must already be URL-safe.
    time1, time2 : start / end date strings with the year in characters 0-3,
        the month in 5-6 and the day from 8 on (e.g. ``YYYY/MM/DD``).
    pagenum_str : number of the page ``url_temp`` points at, as a string.
    end : count of consecutive empty pages seen so far.
    construction : count of all empty pages seen so far.

    Returns
    -------
    ``(final_date, final_summary, answer, construction)`` where the two dicts
    map title -> 'YYYY-MM-DD' date and title -> summary text ('n/a' when the
    entry has no summary) for the in-range articles of this page and all later
    pages, ``answer`` is 'no' once scraping has stopped, and ``construction``
    is the updated empty-page count.

    Scraping stops after five consecutive pages without results.  Each page
    adds one recursion level.
    """
    import urllib.request
    import bs4
    import datetime

    time1_d=datetime.datetime(int(time1[0:4]),int(time1[5:7]),int(time1[8:]))
    time2_d=datetime.datetime(int(time2[0:4]),int(time2[5:7]),int(time2[8:]))
    today=datetime.datetime.now()

    # Close the HTTP response as soon as the page has been parsed.
    with urllib.request.urlopen(url_temp) as response:
        soup_wsj = bs4.BeautifulSoup(response, 'lxml', from_encoding='utf-8')

    #find headline containers instead of headlines or summaries themselves, as not all entries have
    #summaries while all entries do have headline containers.
    headline_container_wsj=soup_wsj.find_all('div',attrs={"class":"headline-container"})

    wsj={}
    wsj_summary={}
    for container in headline_container_wsj:
        title_tag=container.find('h3',attrs={"class":"headline"})
        # The original dict literal listed "class" twice; only this last value
        # was ever used.
        date_tag=container.find('time',attrs={"class":"date-stamp-container"})
        if title_tag is None or date_tag is None:
            continue                       # malformed entry: no title or no date to record
        summary_tag=container.find('div',attrs={"class":"summary-container"})
        title=title_tag.text
        wsj[title]=date_tag.text
        wsj_summary[title]=summary_tag.text if summary_tag is not None else 'n/a'

    final_date={}
    final_summary={}
    for title, raw_stamp in list(wsj.items()):
        day=_wsj_parse_timestamp(raw_stamp, today)
        wsj[title]=day.strftime('%Y-%m-%d')
        if time1_d<=day<=time2_d:   #check if article is in date range, add to pile if it is
            final_date[title]=wsj[title]
            final_summary[title]=wsj_summary[title]

    if not wsj:
        end=end+1
        construction=construction+1
    else:
        end=0          #only stop after five empty pages right after one another

    # Progress output: consecutive-empty count, then total-empty count.
    print(end)
    print(construction)

    answer='no' if end>=5 else 'yes'

    if answer == 'yes':  #move to the next page: the function calls itself again
        print(wsj)
        pagenum_str=str(int(pagenum_str)+1)
        url_temp='https://www.wsj.com/search/term.html?KEYWORDS='+keyword+'&min-date='+time1+'&max-date='+time2+'&daysback=4y&isAdvanced=true&andor=AND&sort=date-desc&source=wsjarticle,wsjblogs,wsjvideo,interactivemedia,sitesearch,wsjpro&page='+pagenum_str
        add_date,add_summary,answer,construction=wsj_bias(url_temp,keyword,time1,time2,pagenum_str,end,construction)

        # Merge the date-filtered dicts.  The original merged the unfiltered
        # `wsj` / `wsj_summary` here, which threw the range check away.
        final_date={**final_date, **add_date}
        final_summary={**final_summary, **add_summary}

    return final_date,final_summary, answer, construction
                    
    #return final_date, final_summary, answer yields an answer to whether you should keep going or not 

In [12]:
# Start the interactive report; it prompts on stdin for newspaper, keyword and dates.
bias_report()  

Welcome to the Bias Report.
Enter Newspaper Name:wall street
Enter a Keyword:accenture
Enter START date (YYYY/MM/DD):2019/01/01
Enter END date (YYYY/MM/DD):2020/01/01
0
0
{' PG&E: Wired to Fail ': '2019-12-28', ' Cyber Daily Year Ahead: CISOs, Who Is Your Boss? ': '2019-12-24', ' CISOs Emerge From CIOs’ Shadow ': '2019-12-24', ' The Newest Trend in Holiday Gifting? Old Stuff ': '2019-12-21', ' Tech, Media & Telecom Roundup: Market Talk ': '2019-10-31', ' Cyber Daily: Goodbye, 2019—A Year of Security Angst and Innovation ': '2019-12-20', ' The Tricky Role of the CEO in a New Era of Social Responsibility ': '2019-12-12', ' Virtual Travel Could Change the World—If It Gets Off the Ground ': '2019-12-12', ' VC Daily: Rocky Mountains Lure Investors, Startups; Elder Care Startup Bet; Genius-Google Battle Over Lyrics ': '2019-12-04', ' Risk Managers Grapple With Potential Downsides of AI ': '2019-12-02', ' The Generational Divide in Holiday Shopping ': '2019-11-25', ' Data and Deregulation Fue

1
1
{}
2
2
{}
3
3
{}
4
4
{}
5
5
                                                        Title  Net Sentiment  \
2019-12-28                               PG&E: Wired to Fail       -0.351667   
2019-12-24   Cyber Daily Year Ahead: CISOs, Who Is Your Bo...       0.083333   
2019-12-24                    CISOs Emerge From CIOs’ Shadow        0.000000   
2019-12-21    The Newest Trend in Holiday Gifting? Old Stuff        0.152500   
2019-07-12        Tech, Media & Telecom Roundup: Market Talk        0.250000   
...                                                       ...            ...   
2019-01-28               Yum Brands Opens Search for New CFO        0.068182   
2019-01-28   VC Daily: Getting a Stake in ‘Ghost’ Kitchens...       0.000000   
2019-01-21                     The Catch-22 of Globalization        0.150000   
2019-01-16   After a Strong 2018 for Retail Sales, Caution...       0.209722   
2019-01-10   America’s Electric Grid Has a Vulnerable Back...      -0.218750   

       

In 108 articles, 28 are positive, 13 are negative, and 67 are neutral.  0 articles are being updated.


Below is the code for an older iteration of the web scraper. It only works with WSJ's Quick Search, which means that its range is limited to around 1 year before today, and even less for some densely populated topics. However, it does work for the search results that Quick Search returns. You can try it out as well if you'd like to.

In [None]:
def wsj_bias_old(url_temp,keyword,time1,time2,pagenum_str):
    """Older scraper: walk WSJ quick-search result pages, newest first.

    Parameters
    ----------
    url_temp : URL of the results page to fetch.
    keyword : search term; inserted verbatim into the next page's URL, so it
        must already be URL-safe.
    time1, time2 : start / end date strings with the year in characters 0-3,
        the month in 5-6 and the day from 8 on (e.g. ``YYYY/MM/DD``).
    pagenum_str : number of the page ``url_temp`` points at, as a string.

    Returns
    -------
    ``(final_date, final_summary, answer)``: title -> 'YYYY-MM-DD' date and
    title -> summary text ('n/a' when missing) for the in-range articles of
    this page and all later pages, plus 'no' once scraping has stopped.

    Scraping stops when a page has no results or contains an article older
    than ``time1``.  Each page adds one recursion level.
    """
    import urllib.request
    import bs4
    import re
    import datetime

    time1_d=datetime.datetime(int(time1[0:4]),int(time1[5:7]),int(time1[8:]))
    time2_d=datetime.datetime(int(time2[0:4]),int(time2[5:7]),int(time2[8:]))
    today=datetime.datetime.now()

    # Abbreviated -> full month names, so strptime's %B can parse the stamp.
    month_names = (
        ('Jan.', 'January'), ('Feb.', 'February'), ('Mar.', 'March'),
        ('Apr.', 'April'), ('Jun.', 'June'), ('Jul.', 'July'),
        ('Aug.', 'August'), ('Sep.', 'September'), ('Oct.', 'October'),
        ('Nov.', 'November'), ('Dec.', 'December'),
        # Also accept the four-letter form; harmless if the site never uses it.
        ('Sept.', 'September'),
    )

    # Close the HTTP response as soon as the page has been parsed.
    with urllib.request.urlopen(url_temp) as response:
        soup_wsj = bs4.BeautifulSoup(response, 'lxml', from_encoding='utf-8')
    headline_container_wsj=soup_wsj.find_all('div',attrs={"class":"headline-container"})

    wsj={}
    wsj_summary={}
    for container in headline_container_wsj:
        title_tag=container.find('h3',attrs={"class":"headline"})
        # The original dict literal listed "class" twice; only this last value
        # was ever used.
        date_tag=container.find('time',attrs={"class":"date-stamp-container"})
        if title_tag is None or date_tag is None:
            continue                       # malformed entry: no title or no date to record
        summary_tag=container.find('div',attrs={"class":"summary-container"})
        title=title_tag.text
        wsj[title]=date_tag.text
        wsj_summary[title]=summary_tag.text if summary_tag is not None else 'n/a'

    final_date={}
    final_summary={}
    answer='yes'

    for title in list(wsj):
        val=wsj[title]

        if 'Just' in val:
            val=val[5:]                    # drop the 5-character prefix

        for abbreviation, full_name in month_names:
            if abbreviation in val:
                val=val.replace(abbreviation, full_name)
                break

        if 'hour' in val or 'min' in val:
            # Relative stamp: join every digit in the text into one number.
            number=int(''.join(re.findall(r'\d', val)))
            if 'hour' in val:
                moment=today - datetime.timedelta(hours=number)
            else:
                moment=today - datetime.timedelta(minutes=number)
        else:
            # Absolute stamp: cut the trailing two-character time zone.
            moment=datetime.datetime.strptime(val[0:-2], '%B %d, %Y %I:%M %p ')

        # Keep only the calendar day, both as text and as a comparable datetime.
        val=moment.replace(hour=0, minute=0, second=0, microsecond=0)
        wsj[title]=val.strftime('%Y-%m-%d')

        if time1_d<=val<=time2_d:   #check if article is in date range, add to pile if it is
            final_date[title]=wsj[title]
            final_summary[title]=wsj_summary[title]

        if val<time1_d:             #if we have passed the 'start' date we have gone too far back
            answer='no'             #stop the function

    #if there is nothing on the page, this means that there are no more search results, and the function should stop.
    if not headline_container_wsj:
        answer='no'

    if answer == 'yes':
        pagenum_str=str(int(pagenum_str)+1)
        url_temp='https://www.wsj.com/search/term.html?KEYWORDS='+keyword+'&mod=searchresults_viewallresults&page='+pagenum_str
        # Recurse into this function.  The original called wsj_bias with an
        # undefined `loopmax` argument and unpacked three values from its
        # four-value result, so it failed as soon as a second page was needed.
        add_date,add_summary,answer=wsj_bias_old(url_temp,keyword,time1,time2,pagenum_str)

        final_date={**final_date, **add_date}  ####### combine dicts
        final_summary={**final_summary, **add_summary}

    return final_date,final_summary, answer
                    
    #return final_date, final_summary, answer yields an answer to whether you should keep going or not 

In [None]:
def bias_report_old():
    """Interactively run the bias report with the older quick-search scraper.

    Same prompts and flow as ``bias_report``, but the articles come from
    ``wsj_bias_old``.  Any newspaper name that does not contain "wall street"
    (in any letter case) is rejected with a message.
    """
    # Local import so this block does not rely on the file-level import list.
    from urllib.parse import quote_plus

    print('Welcome to the Bias Report.')
    newspaper=input('Enter Newspaper Name:')
    # URL-encode the keyword once, here: it is only ever used to build search
    # URLs, and a raw keyword containing a space would make the URL invalid.
    keyword=quote_plus(input('Enter a Keyword:'))
    time1=input('Enter START date (YYYY/MM/DD):')
    time2=input('Enter END date (YYYY/MM/DD):')

    # Case-insensitive match, so "WALL STREET JOURNAL" etc. are accepted too.
    if 'wall street' not in newspaper.lower():
        print('Please input "wall street journal" or "Wall Street Journal" as the newspaper. If confused, look at instructions.')
        return

    # Page 1 of the same quick-search listing that wsj_bias_old paginates.
    # The original started from the advanced-search URL, so page 1 and the
    # following pages came from two different result lists.
    url_temp='https://www.wsj.com/search/term.html?KEYWORDS='+keyword+'&mod=searchresults_viewallresults&page='+'1'
    dict_title,dict_summary,_=wsj_bias_old(url_temp,keyword,time1,time2,'1')
    sentiment,sentiment_titles,sentiment_summaries=sentiment_analyzer(dict_summary)
    # graph_bias requires an empty-page count as its fifth argument; the old
    # scraper does not track one, so report zero.
    graph_bias(sentiment,sentiment_titles,sentiment_summaries,dict_title,0)
        
        