# Analyse server stored data

In [36]:
# Imports
import json
import random
import string
import time
from collections import Counter
from datetime import datetime, timedelta

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from src.media import Outlet, Article

In [37]:
# Define functions
def getDateRange(start_date, end_date):
    """Return every day from start_date through end_date, both ends included.

    Each entry is start_date shifted by a whole number of days, so the entries
    keep start_date's type and time of day. The list is empty when start_date
    is later than end_date.
    """
    day_span = (end_date - start_date).days  # Whole days between the two ends (floored)
    return [start_date + timedelta(days=offset) for offset in range(day_span + 1)]

In [38]:
# Make matplotlib show graphs in a new window
%matplotlib qt 

In [39]:
# Constants
# Articles are pulled from the server for the window between these two dates
# (written as day/month/year and parsed straight into datetime objects)
startScrapeDate = datetime.strptime("15/09/2021", "%d/%m/%Y")
endScrapeDate = datetime.strptime("15/09/2022", "%d/%m/%Y")

# The maximum number of articles to pull from the server (set to -1 for uncapped scraping)
collectionCap = -1

# Text-processing helpers: a lemmatizer and the English stopword list held as a set
lemmatizer = WordNetLemmatizer()
stopwordsSet = {*stopwords.words('english')}

In [40]:
# Read the notebook settings from the JSON file in the working directory
with open("./settings.json", "r") as setupFile:
    setupData = json.loads(setupFile.read())

In [41]:
# Pull the articles published inside the scrape window from the database
articleList = []
startScanTime = time.time() # Track the time elapsed
DBClient = MongoClient(setupData["DB_URI"], server_api = ServerApi('1')) # Connect to the database

# Build the aggregation pipeline. The window is half-open, [start, end): the lower bound is
# inclusive so that articles stamped exactly at midnight on the first day are not dropped.
articlePipeline = [{'$match': {'publishDate': {
                '$gte': startScrapeDate,
                '$lt': endScrapeDate
        }}}]
if collectionCap > 0: # Apply the cap; -1 (or any non-positive value) means uncapped
    articlePipeline.append({'$limit': collectionCap})

# Get articles from DBClient
articleCollection = DBClient[setupData["DB_NAME"]]['newsData']
articleCursor = articleCollection.aggregate(articlePipeline)

for articleIndex, article in enumerate(articleCursor):
    # Wrap each returned document in an Article built from its stored fields
    articleList.append(Article(
                article["outletName"],
                article["headline"],
                article["description"],
                article["author"],
                article["publishDate"],
                article["sentimentScore"]
        ))

    print(f"\r Collected article number {articleIndex + 1}", end="") # \r keeps the counter on one line
print(f"\n Collected a total of {len(articleList)} articles in {round((time.time() - startScanTime), 3)} seconds")

 Collected article number 288177
 Collected a total of 288177 articles in 87.939 seconds


In [42]:
# Sort articles by outlet
outletList = []

for articleIndex, article in enumerate(articleList):
    foundOutlet = False # If the outlet has been found within `outletList`
    for outlet in outletList:
        if outlet.name == article.outlet:
            outlet.addArticle(article)
            foundOutlet = True
            break
    if not foundOutlet: # Make new outlet
        newOutlet = Outlet(article.outlet)
        outletList.append(newOutlet)    
print(f"Found {len(outletList)} outlets in total")

Found 29 outlets in total


In [43]:
# Text outputs
for outlet in outletList:
    # Get the average sentiment
    avgSentiment = sum(list(article.sentimentScore for article in outlet.articleList)) / len(outlet.articleList)
    avgSentiment = avgSentiment 
        
    print(f"{'=' * 3} {outlet.name} {'=' * 3}")
    print(f"Published a total of {len(outlet.articleList)} articles")
    print(f"Has an average sentiment of {round(avgSentiment, 3)}")
    print("\n")

=== ABC News ===
Published a total of 17638 articles
Has an average sentiment of -0.089


=== 9 News ===
Published a total of 8415 articles
Has an average sentiment of -0.159


=== Sydney Morning Herald ===
Published a total of 26608 articles
Has an average sentiment of -0.023


=== SBS Australia ===
Published a total of 2277 articles
Has an average sentiment of -0.085


=== Independent Australia ===
Published a total of 1203 articles
Has an average sentiment of -0.097


=== Daily Telegraph ===
Published a total of 3176 articles
Has an average sentiment of -0.089


=== The Age ===
Published a total of 26587 articles
Has an average sentiment of -0.022


=== Michael West ===
Published a total of 1087 articles
Has an average sentiment of -0.007


=== The Guardian ===
Published a total of 40154 articles
Has an average sentiment of -0.063


=== Crikey ===
Published a total of 2664 articles
Has an average sentiment of -0.04


=== CNN ===
Published a total of 13550 articles
Has an average sen

In [51]:
# Display parameters
# Topics to use (leave empty for all articles). Each topic is a list of LOWERCASE
# keywords; the first keyword doubles as the topic's name.
topicList = [
    ['shooting', 'gun', 'control', 'uvalde', 'buffalo', 'shooter', 'gunman'],
]
# Outlets to be shown (leave empty for all)
showOutletsList = list()

In [52]:
# Collect data to plot
plotArticles = {} # Stores the articles by the topic 
if topicList == []:
    plotArticles['total'] = [] # If there are no topic, total is used to show the total amt of articles
    for article in articleList:
        if article.outlet in showOutletsList or showOutletsList == []:
            plotArticles['total'].append(article)
else:            
    for topic in topicList: 
        plotArticles[topic[0]] = [] # Each topic stores corresponding articles in a list
        for article in articleList:
            if article.outlet in showOutletsList or showOutletsList == []: # If the article is from the specified outlet
                for word in article.headline.split(" "):
                    if word.lower() in topic: # If a given word from the article is in the topic searchlist
                        plotArticles[topic[0]].append(article)
                        break


In [57]:
# Plot daily average (or total output) for any attribute over time, as a total/avg of all outlets
plotAttribute = "publishCount" # The article attribute to be avg'd and plotted over time (set to `publishCount` for daily TOTAL output)
plotDates = getDateRange(startScrapeDate, endScrapeDate)

for topic in plotArticles.keys():
    # One bucket per plotted day; the final date of the range gets no bucket
    plotData = {date: [] for date in plotDates[:-1]}
    for article in plotArticles[topic]:
        # Truncate the timestamp to midnight. Microseconds must be cleared as well,
        # otherwise a timestamp with sub-second precision never equals a bucket key.
        dayKey = article.date.replace(hour=0, minute=0, second=0, microsecond=0)
        if plotAttribute == "publishCount":
            plotData[dayKey].append(1)
        else:
            plotData[dayKey].append(getattr(article, plotAttribute))

    # Plot the data
    xVals = list(plotData.keys())
    yVals = []
    for val in xVals:
        dayValues = plotData[val]
        if plotAttribute == "publishCount":
            yVals.append(len(dayValues)) # Daily total
        elif dayValues:
            yVals.append(sum(dayValues) / len(dayValues)) # Daily average
        else:
            yVals.append(0) # No datapoints for the day
    if plotAttribute == "publishCount":
        print(f"In total, {sum(yVals)} articles got published about {topic}")
    plt.plot(xVals, yVals, label=topic)

plt.title(f"{plotAttribute} Over time")
plt.legend()
plt.show()

In total, 5382 articles got published about shooting


In [25]:
# Plot any attribute over time but broken down by outlet
plotAttribute = "publishCount" # The article attribute to be avg'd and plotted over time (set to `publishCount` for daily TOTAL output)
plotDates = getDateRange(startScrapeDate, endScrapeDate) # A list of all the dates that will be plotted

# The outlets to display: all of them if the user has not specified any, else only those specified
if not showOutletsList:
    displayOutlets = [outlet.name for outlet in outletList]
else:
    displayOutlets = showOutletsList

for topic in plotArticles.keys():
    # Group the topic's articles by outlet once, instead of rescanning them for every outlet
    articlesByOutlet = {}
    for article in plotArticles[topic]:
        articlesByOutlet.setdefault(article.outlet, []).append(article)

    for outlet in displayOutlets: # Show how each media outlet reports each topic
        # Dict containing each display date as key, and the list of scores for that day as value
        # (the final date of the range gets no bucket)
        plotData = {date: [] for date in plotDates[:-1]}

        for article in articlesByOutlet.get(outlet, []):
            # Truncate the timestamp to midnight. Microseconds must be cleared as well,
            # otherwise a timestamp with sub-second precision never equals a bucket key.
            dayKey = article.date.replace(hour=0, minute=0, second=0, microsecond=0)
            if plotAttribute == "publishCount": # Counting published articles: add 1 per article
                plotData[dayKey].append(1)
            else:
                plotData[dayKey].append(getattr(article, plotAttribute)) # Append the score to the daily list

        # Plot the data
        xVals = list(plotData.keys())
        yVals = []
        for val in xVals:
            dayValues = plotData[val]
            if plotAttribute == "publishCount":
                yVals.append(len(dayValues)) # Plot the daily count (total)
            elif dayValues:
                yVals.append(sum(dayValues) / len(dayValues)) # Plot the daily average
            else:
                yVals.append(0) # If there are no datapoints for the day, display 0
        plt.plot(xVals, yVals, label=f"{outlet} - {topic}")

        if plotAttribute == "publishCount":
            print(f"In total, {outlet} published {sum(yVals)} articles about {topic}")
plt.title(f"{plotAttribute} Over time by Outlet")
plt.legend()
plt.show()

In total, BuzzFeed published 2 articles about anthony
In total, Sydney Morning Herald published 81 articles about anthony
In total, The Guardian published 136 articles about anthony
In total, CNN published 3 articles about anthony
In total, CBS News published 3 articles about anthony
In total, ABC News published 71 articles about anthony
In total, 9 News published 28 articles about anthony
In total, Crikey published 53 articles about anthony
In total, Fox News published 0 articles about anthony
In total, Huffington Post published 5 articles about anthony
In total, Independent Australia published 7 articles about anthony
In total, Daily Telegraph published 11 articles about anthony
In total, The Age published 79 articles about anthony
In total, Michael West published 12 articles about anthony
In total, BBC News published 4 articles about anthony
In total, The Conversation published 47 articles about anthony
In total, Wall Street Journal published 1 articles about anthony
In total, Amerc

In [56]:
# Find keywords for each day by topic
dailyDisplay = 4 # The number of keywords that gets displayed for each date
displayTopic = "shooting" # The topic that gets graphed
minTextScore = 2 # The minimum number of times a keyword needs to be mentioned in order to get its text displayed
if displayTopic not in plotArticles:
    # Stop here with a clear message; carrying on would only fail further down with a bare KeyError
    raise KeyError(f"Topic {displayTopic} is not available, the possible topics are {list(plotArticles.keys())}")

# Characters removed from both ends of a headline word, so that "shooting:" and "shooting"
# are counted as the same keyword
edgePunctuation = string.punctuation + "‘’“”…–—"

keywordColors = {} # Dict containing the color for each keyword
plotDates = getDateRange(startScrapeDate, endScrapeDate) # A list of all the dates that will be plotted

# A dict containing all the keywords in articles from a given date about the topic
# (the final date of the range gets no bucket)
datedKeywords = {date: [] for date in plotDates[:-1]}

for article in plotArticles[displayTopic]:
    if showOutletsList and article.outlet not in showOutletsList:
        continue # The article is not from one of the specified outlets
    # Truncate the timestamp to midnight. Microseconds must be cleared as well,
    # otherwise a timestamp with sub-second precision never equals a bucket key.
    dayKey = article.date.replace(hour=0, minute=0, second=0, microsecond=0)
    for word in article.headline.split():
        word = word.strip(edgePunctuation).lower()
        if word not in stopwordsSet and len(word) > 2:
            datedKeywords[dayKey].append(lemmatizer.lemmatize(word)) # Append the (lemmatized) word to the list for the given date

lastDate = None # Only read once `lastKeywords` is non-empty, i.e. from the second date onwards
lastKeywords = []
for date, dayWords in datedKeywords.items():
    keywords = Counter(dayWords).most_common(dailyDisplay) # (keyword, count) pairs for this date
    lastCounts = dict(lastKeywords) # keyword -> count on the previous date
    for keyword, count in keywords:
        keywordColor = keywordColors.get(keyword) # Reuse the color if the keyword already has one
        if keywordColor is None:
            # First time this keyword is seen: generate a random color and remember it
            keywordColor = '#%02X%02X%02X' % tuple(random.randint(0, 255) for _ in range(3))
            keywordColors[keyword] = keywordColor

        plt.scatter(date, count, color=keywordColor, label=keyword) # Put the point on the graph

        if keyword in lastCounts:
            # The previous date contains the same keyword: draw a line between the two points
            plt.plot([lastDate, date], [lastCounts[keyword], count], color=keywordColor)
        elif count >= minTextScore: # Only display text if the point is at the start of a 'chain'
            plt.text(date, count, keyword)

    # Save the last date and keywords to plot lines in the next date
    lastDate = date
    lastKeywords = keywords

plt.title(f"Keywords over time for topic {displayTopic}")
plt.show()