# Analyse server stored data

In [1]:
# Imports
from datetime import datetime, timedelta
import json
import time
from collections import Counter
import random

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.style
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import operator

from src.media import Outlet, Article

In [2]:
# Define functions
def getDateRange(start_date, end_date):  # Return list of datetime.date objects between start_date and end_date (inclusive).
    date_list = []
    curr_date = start_date
    while curr_date <= end_date:
        date_list.append(curr_date)
        curr_date += timedelta(days=1)
    return date_list

In [3]:
# Make maptlotlib show graphs in new window
%matplotlib qt 

In [4]:
# Constants
# Date range for articles being scraped from the server
startScrapeDate = "01/08/2022"
endScrapeDate = "31/12/2022"

collectionCap = -1 # The maximum amount of articles to get pulled from the server (set to -1 for uncaped scraping)

startScrapeDate = datetime.strptime(startScrapeDate, "%d/%m/%Y")
endScrapeDate = datetime.strptime(endScrapeDate, "%d/%m/%Y")
stopwordsSet = set(stopwords.words('english'))
exclusionList = ["say", "new", "news", "day", "days"]
lemmatizer = WordNetLemmatizer()
plt.style.use('default')

In [5]:
# Load setup data
with open("./settings.json", "r") as setupFile:
    setupData = json.load(setupFile)

In [6]:
articleList = []
startScanTime = time.time() # Track the time elapsed 
DBClient = MongoClient(setupData["DB_URI"], server_api = ServerApi('1')) # Connect to the database

# Get articles from DBClient
articleCollection = DBClient[setupData["DB_NAME"]]['newsData']
articleCursor = articleCollection.aggregate([{'$match': {'publishDate': {
                '$gt': startScrapeDate, 
                '$lt': endScrapeDate
        }}}])

for articleIndex, article in enumerate(articleCursor):
    articleList.append(Article(
                article["outletName"],
                article["headline"],
                article["description"],
                article["author"],
                article["publishDate"],
                article["sentimentScore"]
        ))
    
    print(f"\r Collected article number {articleIndex + 1}", end="")
print(f"\n Collected a total of {len(articleList)} articles in {round((time.time() - startScanTime), 3)} seconds")

 Collected article number 171712
 Collected a total of 171712 articles in 51.999 seconds


In [7]:
# Sort articles by outlet
outletList = []

for articleIndex, article in enumerate(articleList):
    foundOutlet = False # If the outlet has been found within `outletList`
    for outlet in outletList:
        if outlet.name == article.outlet:
            outlet.addArticle(article)
            foundOutlet = True
            break
    if not foundOutlet: # Make new outlet
        newOutlet = Outlet(article.outlet)
        outletList.append(newOutlet)    
print(f"Found {len(outletList)} outlets in total")

Found 26 outlets in total


In [8]:
outletList.sort(key = lambda x : len(x.articleList), reverse=True)
# Text outputs for each outlet
for outlet in outletList:
    # Get the average sentiment
    avgSentiment = sum(list(article.sentimentScore for article in outlet.articleList)) / len(outlet.articleList)
    avgSentiment = avgSentiment 
        
    print(f"{'=' * 3} {outlet.name} {'=' * 3}")
    print(f"Published a total of {len(outlet.articleList)} articles")
    print(f"Has an average sentiment of {round(avgSentiment, 3)}")
    print("\n")
#     print(f"{outlet.name},{len(outlet.articleList)}")

=== Daily Mail ===
Published a total of 31734 articles
Has an average sentiment of -0.206


=== The Guardian ===
Published a total of 18424 articles
Has an average sentiment of -0.066


=== CNN ===
Published a total of 10848 articles
Has an average sentiment of -0.092


=== BuzzFeed ===
Published a total of 10825 articles
Has an average sentiment of 0.091


=== South China Morning Post ===
Published a total of 9894 articles
Has an average sentiment of -0.092


=== CBS News ===
Published a total of 8561 articles
Has an average sentiment of -0.054


=== ABC News ===
Published a total of 8488 articles
Has an average sentiment of -0.112


=== Huffington Post ===
Published a total of 8007 articles
Has an average sentiment of -0.067


=== Sydney Morning Herald ===
Published a total of 7537 articles
Has an average sentiment of -0.073


=== The Age ===
Published a total of 7466 articles
Has an average sentiment of -0.075


=== Fox News ===
Published a total of 7227 articles
Has an average sent

In [9]:
# Text outputs for each journalist
journalistList = []
journalistOutput = {}
for article in articleList:
    journalistList.append(article.author)
    try:
        if article.outlet not in journalistOutput[article.author]:
           journalistOutput[article.author].append(article.outlet)
        
    except KeyError:
        journalistOutput[article.author] = [article.outlet]

print(f"The 10 most prolific journalists are:")
for journalist in Counter(journalistList).most_common(10):
    print(f"- {journalist[0]} - {journalist[1]} | {journalistOutput[journalist[0]]}")

The 10 most prolific journalists are:
- None - 95199 | ['Daily Mail', 'Huffington Post', 'BuzzFeed', 'CNN', 'CBS News', 'Amercian ABC', 'The Conversation', 'Sky News', 'BBC News', 'Alja Zeera', 'CNBC', 'ABC News', 'Wall Street Journal', 'Fox News', 'New York Times']
-  - 9699 | ['The Guardian', 'The Age', 'Sydney Morning Herald', 'Daily Telegraph', 'South China Morning Post', 'Independent Australia']
- 9News - 3716 | ['9 News']
- AAP - 1665 | ['Crikey', 'Michael West', 'The Guardian']
- Associated Press - 1347 | ['South China Morning Post', 'The Guardian', 'USA Today', 'The Washington Post', 'Sydney Morning Herald', 'The Age']
- Reuters - 1161 | ['South China Morning Post', 'Crikey', 'The Guardian', 'New York Times', 'The Age', 'Sydney Morning Herald']
- Agence France-Presse - 977 | ['South China Morning Post', 'The Guardian']
- Australian Associated Press - 660 | ['The Guardian']
- Bloomberg - 468 | ['South China Morning Post']
- Fox News Staff - 375 | ['Fox News']


In [12]:
# Display parameters
topicList = [["mar-a-lago","trump","donald","fbi","raid","seize","documents"]] # Which topics to use (leave blank for all) (MUST BE LOWERCASE)
showOutletsList = [] # Which outlets to be shown (leave blank for all)

In [13]:
# Collect data to plot
plotArticles = {} # Stores the articles by the topic 
if topicList == []:
    plotArticles['total'] = [] # If there are no topic, total is used to show the total amt of articles
    for article in articleList:
        if article.outlet in showOutletsList or showOutletsList == []:
            plotArticles['total'].append(article)
else:            
    for topic in topicList: 
        plotArticles[topic[0]] = [] # Each topic stores corresponding articles in a list
        for article in articleList:
            if article.outlet in showOutletsList or showOutletsList == []: # If the article is from the specified outlet
                for word in article.headline.split(" "):
                    if word.lower() in topic: # If a given word from the article is in the topic searchlist
                        plotArticles[topic[0]].append(article)
                        break


In [14]:
# Plot daily average (or total output) for any attribute over time, as a total/avg of all outlets
plotAttribute = "publishCount" # The article attribute to be avg'd and plotted over time (set to `publishCount` for daily TOTAL output)
plotDates = getDateRange(startScrapeDate, endScrapeDate)

for topic in plotArticles.keys():
    plotData = {}
    for dateIndex, date in enumerate(plotDates):
        if dateIndex + 1 != len(plotDates):
            plotData[date] = []
    for article in plotArticles[topic]:
        articleDate = article.date
        if plotAttribute == "publishCount":
            plotData[articleDate.replace(hour=0, minute=0, second=0)].append(1)
        else:
            plotData[articleDate.replace(hour=0, minute=0, second=0)].append(getattr(article, plotAttribute))
    
    # Plot the data
    xVals = list(plotData.keys())
    yVals = []
    for val in xVals:
        try:
            if plotAttribute == "publishCount":
                yVals.append(len(plotData[val]))
            else:
                yVals.append(sum(plotData[val]) / len(plotData[val]))
        except ZeroDivisionError:
            yVals.append(0)
    if plotAttribute == "publishCount":
        print(f"In total, {sum(yVals)} articles got published about {topic}")
    else:
        print(f"The average {plotAttribute} for {topic} was {sum(yVals) / len(yVals)}")
    plt.plot(xVals, yVals, label=topic)
    
plt.title(f"{plotAttribute} over time")
plt.legend()
# plt.ylim((-1, 1))
plt.show()

In total, 4518 articles got published about mar-a-lago


In [20]:
# Plot any attribute over time but broken down by outlet
plotAttribute = "publishCount" # The article attribute to be avg'd and plotted over time (set to `publishCount` for daily TOTAL output)
plotDates = getDateRange(startScrapeDate, endScrapeDate) # A list of all the dates that will be plotted

displayOutlets = [] # The list that keeps track of which outlets to display
if showOutletsList == []: # If the user has not specified which outlets to show, show all of them
    for outlet in outletList:
        displayOutlets.append(outlet.name) 
else:
    displayOutlets = showOutletsList # Else show only those specified

for topic in plotArticles.keys():
    for outlet in displayOutlets: # Show how each media outlet reports each topic
        plotData = {} # Dict containing each display date as key, and the list of scores for that day as value
        for dateIndex, date in enumerate(plotDates):
            if dateIndex + 1 != len(plotDates):
                plotData[date] = []
                
        for article in plotArticles[topic]:
            if article.outlet == outlet:
                articleDate = article.date
                if plotAttribute == "publishCount": # If the user is trying to find how many articles have been published on a given day, add 1 per article
                    plotData[articleDate.replace(hour=0, minute=0, second=0)].append(1)
                else:
                    plotData[articleDate.replace(hour=0, minute=0, second=0)].append(getattr(article, plotAttribute)) # Append the score to the daily list
        
        # Plot the data
        xVals = list(plotData.keys())
        yVals = []
        for val in xVals:
            try:
                if plotAttribute == "publishCount":
                    yVals.append(len(plotData[val])) # Plot the daily count (total)
                else:
                    yVals.append(sum(plotData[val]) / len(plotData[val])) # plot the daily average 
            except ZeroDivisionError:
                yVals.append(0) # If there are no datapoints for the day, display 0
        plt.plot(xVals, yVals, label=f"{outlet} - {topic}")
        
        if plotAttribute == "publishCount":
#             print(f"In total, {outlet} published {sum(yVals)} articles about {topic}")
            print(f"{outlet},{sum(yVals)}")
        else:
            print(f"For {outlet} the overall average for {topic} was {round(sum(yVals) / len(yVals), 4)}")
plt.title(f"{plotAttribute} Over time by Outlet")
plt.legend()
plt.show()

Daily Mail,37
The Guardian,17
BuzzFeed,2
CNN,30
South China Morning Post,3
ABC News,1
Sydney Morning Herald,4
The Age,4
CBS News,10
Huffington Post,14
Fox News,20
Alja Zeera,3
USA Today,17
BBC News,4
New York Times,1
9 News,0
Amercian ABC,4
Daily Telegraph,0
CNBC,4
Crikey,0
The Conversation,2
Sky News,2
Michael West,0
Wall Street Journal,0
Independent Australia,0
Daily Mail,2
The Guardian,0
BuzzFeed,0
CNN,14
South China Morning Post,0
ABC News,17
Sydney Morning Herald,0
The Age,0
CBS News,0
Huffington Post,152
Fox News,0
Alja Zeera,0
USA Today,0
BBC News,1
New York Times,1
9 News,2
Amercian ABC,69
Daily Telegraph,0
CNBC,6
Crikey,0
The Conversation,0
Sky News,2
Michael West,0
Wall Street Journal,0
Independent Australia,8


In [15]:
# Find keywords for each day by topic
dailyDisplay = 4 # The number of keywords that gets displayed for each date
displayTopic = "mar-a-lago" # The topic that gets graphed
minTextScore = 2 # The minimum number of a times a keyword needs to be mentioned in order to get it's text displayed


try:
    plotArticles[displayTopic]
except KeyError:
    print(f"Topic {displayTopic} is not available, the possible topics are {list(plotArticles.keys())}")
    

keywordColors = {} # Dict containing the color for each keyword
plotDates = getDateRange(startScrapeDate, endScrapeDate) # A list of all the dates that will be plotted
totalKeywords = [] # All the keywords and their freqency

datedKeywords = {} # A dict containg all the keywords in articles from a given date about the topic
for dateIndex, date in enumerate(plotDates):
    if dateIndex + 1 != len(plotDates):
        datedKeywords[date] = []

for article in plotArticles[displayTopic]:
    if article.outlet in showOutletsList or showOutletsList == []: # If the article is from the specified outlet
        articleDate = article.date
        for word in article.headline.split(" "):
            word = word.strip().lower()
            if word not in stopwordsSet and len(word) > 2 and word not in exclusionList:
                datedKeywords[articleDate.replace(hour=0, minute=0, second=0)].append(lemmatizer.lemmatize(word)) # Append the (lemmatized) word to the dict for the given date
lastKeywords = []
for date in datedKeywords.keys():
    keywords = Counter(datedKeywords[date]).most_common(dailyDisplay)
    for keyword in keywords:
        try:
            keywordColor = keywordColors[keyword[0]] # If the keyword already has a color for itself
        except KeyError:
            r = lambda: random.randint(0,255) # Else, generate a new color for the keyword
            keywordColor = '#%02X%02X%02X' % (r(),r(),r())
            keywordColors[keyword[0]] = keywordColor # If this is the first time 

        totalKeywords.append(keyword[0])
        plt.scatter(date, keyword[1], color=keywordColor, label=keyword[0]) # Put the point on the graph

        # Draw lines between points with the same keyword
        foundPrior = False # Tracks whether the date before contains the same keyword
        for lastKeyword in lastKeywords:
            if lastKeyword[0] == keyword[0]:
                plt.plot([lastDate, date], [lastKeyword[1], keyword[1]], color=keywordColor)
                foundPrior = True
                break

        if not foundPrior and keyword[1] >= minTextScore: # Only display text if the point is at the start of a 'chain'
            plt.text(date, keyword[1], keyword[0])

    # Save the last date and keywords to plot lines in the next date
    lastDate = date 
    lastKeywords = keywords

print(f"The most common keywords for {displayTopic} were")
for keyword, keyFreq in Counter(totalKeywords).most_common(15):
    print(f"- {keyword} - {keyFreq}")
    
plt.title(f"Keywords over time for topic {displayTopic}")
plt.show()

The most common keywords for mar-a-lago were
- trump - 152
- mar-a-lago - 41
- fbi - 37
- donald - 36
- document - 22
- say - 20
- 2024 - 10
- raid - 10
- search - 9
- special - 9
- tax - 9
- gop - 7
- judge - 7
- committee - 7
- court - 6


In [17]:
# Create a histogram for the number of articles with each sentiment scores
displayTopic = "mar-a-lago" # The topic that gets graphed
incrementCount = 50
counts = [0, 0]

try:
    plotArticles[displayTopic]
except KeyError:
    print(f"Topic {displayTopic} is not available, the possible topics are {list(plotArticles.keys())}")
    
sentimentData = []
for article in plotArticles[displayTopic]:
    if(article.sentimentScore > 0):
        counts[1] += 1
    elif(article.sentimentScore < 0):
        counts[0] += 1
    sentimentData.append(article.sentimentScore)
    
plt.hist(sentimentData, incrementCount)
plt.title(f"Number of articles with each sentiment for topic {displayTopic}")
plt.show()

In [43]:
for article in plotArticles[displayTopic]:
    articleDate = article.date
    articleDate = articleDate.replace(hour=0, minute=0, second=0)
#     print(str(articleDate))
    if (str(articleDate)) == "2022-12-19 00:00:00":
        print(article.headline.strip())


Trump is accused of using copyrighted images in his NFT collection
Trump Still Acting Like Imperious President In Mar-A-Lago: Report
Emails show the FBI 'repeatedly grilled' Twitter execs over 'state propaganda' on the app
Hong Kong police arrest 3, seize HK$700,000 worth of drugs after boy, 14, found dead in Chungking Mansions
Three Jordanian police killed during raid on hideout in Maan
Youth gang accused granted bail after city hotel raid
Pakistani Taliban overpower guards, seize police center
FBI warns of explosion of ‘sextortion’ cases targeting boys, teens
Canada moves to seize US$26 million from Russian oligarch Roman Abramovich
FBI issues alert on 'explosion' in child 'sextortion' schemes
Jan. 6 Committee Refers Four Criminal Charges Against Trump to DOJ
Trump should face criminal charges over Capitol riots, committee recommends
January 6 committee recommends insurrection, obstruction charges against Donald Trump
Pentagon Officials Feared Trump Would Try To Use Troops In His Jan