# Analyse server stored data

In [None]:
# Imports
from datetime import datetime, timedelta
import json
import time
from collections import Counter
import random
import csv

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.style
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import operator

from src.media import Outlet, Article
from src.data import getNames

In [None]:
# Define functions
def getDateRange(start_date, end_date):
    """Return every date from start_date to end_date (inclusive), one day apart.

    An empty list is returned when start_date is later than end_date.
    """
    # Number of whole-day steps that still land on or before end_date
    day_total = (end_date - start_date).days + 1
    return [start_date + timedelta(days=offset) for offset in range(day_total)]

In [None]:
# Make matplotlib show graphs in a new window
%matplotlib qt 

In [None]:
# Constants
# Date range (day/month/year) for articles being scraped from the server
startScrapeDate = datetime.strptime("01/03/2023", "%d/%m/%Y")
endScrapeDate = datetime.strptime("30/04/2023", "%d/%m/%Y")

# The maximum amount of articles to get pulled from the server (set to -1 for uncapped scraping)
collectionCap = -1

# Text-processing helpers: words to ignore and the lemmatizer applied to headline keywords
exclusionList = ["say", "new", "news", "day", "days"]
stopwordsSet = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

plt.style.use('default')

In [None]:
# Load setup data (database URI and database name) from the local settings file
# JSON is read as UTF-8 explicitly so the result does not depend on the platform's default encoding
with open("./settings.json", "r", encoding="utf-8") as setupFile:
    setupData = json.load(setupFile)

In [None]:
# Pull the articles published inside the scrape window from the database
articleList = []
startScanTime = time.time() # Track the time elapsed
client = MongoClient(setupData["DB_URI"], server_api=ServerApi('1'))
# Get articles from DBClient
articleCollection = client[setupData["DB_NAME"]]['newsData']

# Both date bounds are exclusive
pipeline = [{'$match': {'publishDate': {
                '$gt': startScrapeDate,
                '$lt': endScrapeDate
        }}}]
# Honour the collection cap; -1 (or any non-positive value) leaves the query uncapped
if collectionCap > 0:
    pipeline.append({'$limit': collectionCap})
articleCursor = articleCollection.aggregate(pipeline)

for articleIndex, article in enumerate(articleCursor):
    # Legacy documents are given an empty link and no categories instead of reading those fields
    isLegacy = article["isLegacy"]
    articleList.append(Article(
        article["outletName"],
        article["headline"],
        article["description"],
        article["author"],
        article["publishDate"],
        article["sentimentScore"],
        "" if isLegacy else article["linkToArticle"],
        [] if isLegacy else article["catergories"]
    ))

    print(f"\r Collected article number {articleIndex + 1}", end="")
print(f"\n Collected a total of {len(articleList)} articles in {round((time.time() - startScanTime), 3)} seconds")

In [None]:
# Sort articles by outlet
outletList = []

for article in articleList:
    # Reuse the outlet already created for this article's outlet name, if any
    matchingOutlet = next((outlet for outlet in outletList if outlet.name == article.outlet), None)
    if matchingOutlet is None: # Make new outlet
        matchingOutlet = Outlet(article.outlet)
        outletList.append(matchingOutlet)
    # Always attach the article, including the one that caused the outlet to be created
    matchingOutlet.addArticle(article)
print(f"Found {len(outletList)} outlets in total")

In [None]:
# Text outputs for each outlet, ordered from most to fewest articles
outletList.sort(key=lambda outlet: len(outlet.articleList), reverse=True)
for outlet in outletList:
    articleTotal = len(outlet.articleList)

    print(f"{'=' * 3} {outlet.name} {'=' * 3}")
    print(f"Published a total of {articleTotal} articles")
    if articleTotal:
        # Get the average sentiment
        avgSentiment = sum(article.sentimentScore for article in outlet.articleList) / articleTotal
        print(f"Has an average sentiment of {round(avgSentiment, 3)}")
    else:
        # An outlet without articles has no average; avoid dividing by zero
        print("Has no articles to average a sentiment over")
    print("\n")

In [13]:
# Display parameters
# Which topics to use: one inner list of search words per topic (leave the outer list empty for all) (MUST BE LOWERCASE)
topicList = [
    ["france", "french", "pension", "retirement"],
    ["aukus", "AUKUS"],
]
# Which outlets to be shown (leave the list empty for all)
showOutletsList = [
    "ABC News",
    "Al Arabiya",
    "Alja Zeera",
    "BBC News",
    "DW News",
    "Islamic Republic News Agency",
    "Russia Today",
]

In [14]:
# Collect data to plot
plotArticles = {} # Stores the articles by the topic (keyed by the topic's first search word)

# Articles from the outlets being shown (every outlet when no outlets were specified)
shownArticles = [
    article for article in articleList
    if not showOutletsList or article.outlet in showOutletsList
]

if not topicList:
    plotArticles['total'] = shownArticles # No topics given, so include the total
else:
    for topic in topicList:
        matchAll = "" in topic # An empty search word makes the topic match every article
        # Each article is stored at most once per topic, however many of its headline
        # words match, so per-topic article counts are not inflated
        plotArticles[topic[0]] = [
            article for article in shownArticles
            if matchAll or any(word.lower() in topic for word in article.headline.split(" "))
        ]



In [None]:
# Plot daily average (or total output) for any attribute over time, as a total/avg of all outlets
plotAttribute = "publishCount" # The article attribute to be avg'd and plotted over time (set to `publishCount` for daily TOTAL output)
isCountPlot = plotAttribute == "publishCount"
plotDates = getDateRange(startScrapeDate, endScrapeDate)

for topic in plotArticles:
    # One bucket per plotted day; the final date of the range is not plotted
    plotData = {date: [] for date in plotDates[:-1]}
    for article in plotArticles[topic]:
        # Truncate to midnight. replace() keeps microseconds unless they are reset too,
        # and a leftover sub-second part would make the article miss its day bucket
        dayKey = article.date.replace(hour=0, minute=0, second=0, microsecond=0)
        if dayKey not in plotData:
            continue # The article falls outside the plotted days
        plotData[dayKey].append(1 if isCountPlot else getattr(article, plotAttribute))

    # Plot the data
    xVals = list(plotData)
    if isCountPlot:
        yVals = [len(plotData[day]) for day in xVals]
    else:
        # Days without datapoints are shown as 0
        yVals = [sum(plotData[day]) / len(plotData[day]) if plotData[day] else 0 for day in xVals]

    if isCountPlot:
        print(f"In total, {sum(yVals)} articles got published about {topic}")
    else:
        overallAvg = sum(yVals) / len(yVals) if yVals else 0
        print(f"The average {plotAttribute} for {topic} was {overallAvg}")
    plt.plot(xVals, yVals, label=topic)

plt.title("Daily Article output" if isCountPlot else f"Daily average {plotAttribute}")
plt.xlabel("Date")
plt.ylabel("Number Of Articles" if isCountPlot else f"Average {plotAttribute}")
# Optional toggles: uncomment to show the legend / fix the y-axis range
# plt.legend()
# plt.ylim((-1, 1))
plt.show()

In [None]:
# Plot any attribute over time but broken down by outlet
plotAttribute = "publishCount" # The article attribute to be avg'd and plotted over time (set to `publishCount` for daily TOTAL output)
isCountPlot = plotAttribute == "publishCount"
plotDates = getDateRange(startScrapeDate, endScrapeDate) # A list of all the dates that will be plotted

# Show only the specified outlets, or all of them if the user has not specified any
if showOutletsList:
    displayOutlets = list(showOutletsList) # Copy so the user's list is never aliased
else:
    displayOutlets = [outlet.name for outlet in outletList]

outletData = {} # topic -> outlet name -> article count (only tallied in publishCount mode, 0 otherwise)
for topic in plotArticles:
    outletData[topic] = {}
    for outlet in displayOutlets: # Show how each media outlet reports each topic
        # Dict containing each display date as key, and the list of scores for that day as value
        # (the final date of the range is not plotted)
        plotData = {date: [] for date in plotDates[:-1]}
        articleCount = 0
        for article in plotArticles[topic]:
            if article.outlet != outlet:
                continue
            # Truncate to midnight. replace() keeps microseconds unless they are reset too,
            # and a leftover sub-second part would make the article miss its day bucket
            dayKey = article.date.replace(hour=0, minute=0, second=0, microsecond=0)
            if dayKey not in plotData:
                continue # The article falls outside the plotted days
            if isCountPlot: # Counting published articles: add 1 per article
                plotData[dayKey].append(1)
                articleCount += 1
            else:
                plotData[dayKey].append(getattr(article, plotAttribute)) # Append the score to the daily list
        outletData[topic][outlet] = articleCount

        # Plot the data
        xVals = list(plotData)
        if isCountPlot:
            yVals = [len(plotData[day]) for day in xVals] # Plot the daily count (total)
        else:
            # Plot the daily average; if there are no datapoints for the day, display 0
            yVals = [sum(plotData[day]) / len(plotData[day]) if plotData[day] else 0 for day in xVals]
        plt.plot(xVals, yVals, label=f"{outlet} - {topic}")

        if isCountPlot:
            print(f"In total, {outlet} published {sum(yVals)} articles about {topic}")
        else:
            overallAvg = sum(yVals) / len(yVals) if yVals else 0
            print(f"For {outlet} the overall average for {topic} was {round(overallAvg, 4)}")

plt.title(f"{plotAttribute} Over time by Outlet")
plt.legend()
plt.show()

In [None]:
# Turn the per-topic outlet counts into CSV rows: a header of topic names,
# then one row per outlet holding its count for each topic
topicNames = list(plotArticles.keys())
outputData = [[''] + topicNames]
for outletName in outletData[topicNames[0]]:
    outputData.append(
        [outletName] + [outletData[topicName][outletName] for topicName in topicNames]
    )

with open("./output.csv", "w", encoding="utf-8", newline="") as outputFile:
    csv.writer(outputFile).writerows(outputData)

In [None]:
# Text outputs for each journalist by topic
displayTopic = "total" # The topic whose journalists get listed
journalistList = [] # One entry per (article, author name) pair, used for counting
journalistOutput = {} # Author name -> outlets the author appeared under
for article in plotArticles[displayTopic]:
    for name in getNames(str(article.author), pos_tag, word_tokenize):
        journalistList.append(name)
        knownOutlets = journalistOutput.setdefault(name, [])
        if article.outlet not in knownOutlets:
            knownOutlets.append(article.outlet)

print("The 10 most prolific journalists are:")
for journalistName, articleTotal in Counter(journalistList).most_common(10):
    print(f"- {journalistName} - {articleTotal} | {journalistOutput[journalistName]}")

In [None]:
# Find keywords for each day by topic
dailyDisplay = 4 # The number of keywords that gets displayed for each date
displayTopic = "total" # The topic that gets graphed
minTextScore = 2 # The minimum number of times a keyword needs to be mentioned in order to get its text displayed

try:
    topicArticles = plotArticles[displayTopic]
except KeyError:
    print(f"Topic {displayTopic} is not available, the possible topics are {list(plotArticles.keys())}")
    raise # Nothing can be plotted for an unknown topic, so stop here rather than fail further down

keywordColors = {} # Dict containing the color for each keyword
plotDates = getDateRange(startScrapeDate, endScrapeDate) # A list of all the dates that will be plotted
totalKeywords = [] # Every keyword occurrence, used for the overall frequency

# A dict containing all the keywords in articles from a given date about the topic
# (the final date of the range is not plotted)
datedKeywords = {date: [] for date in plotDates[:-1]}

for article in topicArticles:
    if showOutletsList and article.outlet not in showOutletsList:
        continue # The article is not from one of the specified outlets
    # Truncate to midnight. replace() keeps microseconds unless they are reset too,
    # and a leftover sub-second part would make the article miss its day bucket
    dayKey = article.date.replace(hour=0, minute=0, second=0, microsecond=0)
    if dayKey not in datedKeywords:
        continue # The article falls outside the plotted days
    for word in article.headline.split(" "):
        word = word.strip().lower()
        if word not in stopwordsSet and len(word) > 2 and word not in exclusionList:
            lemmatizedWord = lemmatizer.lemmatize(word)
            totalKeywords.append(lemmatizedWord)
            datedKeywords[dayKey].append(lemmatizedWord) # Append the (lemmatized) word to the dict for the given date

lastKeywords = []
lastDate = None
for date, dayWords in datedKeywords.items():
    keywords = Counter(dayWords).most_common(dailyDisplay)
    for keyword, keyFreq in keywords:
        if keyword not in keywordColors:
            # First time this keyword is seen: generate a random color for it
            keywordColors[keyword] = '#%02X%02X%02X' % tuple(random.randint(0, 255) for _ in range(3))
        keywordColor = keywordColors[keyword]

        plt.scatter(date, keyFreq, color=keywordColor, label=keyword) # Put the point on the graph

        # Draw a line from the previous date's point when it had the same keyword
        priorFreq = next((freq for prior, freq in lastKeywords if prior == keyword), None)
        if priorFreq is not None:
            plt.plot([lastDate, date], [priorFreq, keyFreq], color=keywordColor)
        elif keyFreq >= minTextScore: # Only display text if the point is at the start of a 'chain'
            plt.text(date, keyFreq, keyword)

    # Save the last date and keywords to plot lines in the next date
    lastDate = date
    lastKeywords = keywords

print(f"The most common keywords for {displayTopic} were")
for keyword, keyFreq in Counter(totalKeywords).most_common(15):
    print(f"- {keyword} - {keyFreq}")

plt.title(f"Keywords over time for topic {displayTopic}")
plt.show()

In [None]:
# Create a histogram for the number of articles with each sentiment score
displayTopic = "france" # The topic that gets graphed
incrementCount = 50 # Number of bars to get shown

try:
    topicArticles = plotArticles[displayTopic]
except KeyError:
    print(f"Topic {displayTopic} is not available, the possible topics are {list(plotArticles.keys())}")
    raise # Nothing can be plotted for an unknown topic, so stop here rather than fail further down

sentimentData = []

# Per-outlet tally of negative / neutral / positive articles
outletData = {}
for outlet in showOutletsList:
    outletData[outlet] = {"negative": 0, "neutral": 0, "positive": 0}

for article in topicArticles:
    score = article.sentimentScore
    sentimentData.append(score)
    # Outlets not listed in showOutletsList (e.g. when it is empty, meaning "all outlets")
    # get their tally created on first sight instead of raising KeyError
    tally = outletData.setdefault(article.outlet, {"negative": 0, "neutral": 0, "positive": 0})
    if score < -0.25:
        tally["negative"] += 1
    elif score < 0.25:
        tally["neutral"] += 1
    else:
        tally["positive"] += 1

for outlet, tally in outletData.items():
    print(f"{outlet} - {tally['negative']} - {tally['neutral']} - {tally['positive']}")

plt.hist(sentimentData, incrementCount, range=[-1,1])
plt.title(f"Number of articles with each sentiment for topic {displayTopic}")
plt.xlabel("Sentiment Value")
plt.ylabel("Number of Articles")
plt.show()



In [None]:
# Find the most common verb, adjective and so on
# (NOTE, stopwords are only excluded when wordTypes contains "", i.e. when every word type is accepted)
displayTopic = "france" # The topic that gets graphed
wordTypes = ["JJ", "JJS", "JJR"] # Word types (https://www.guru99.com/pos-tagging-chunking-nltk.html#:~:text=POS%20Tagging%20in%20NLTK%20is,each%20word%20of%20the%20sentence.)
words = []
tagList = []
maxCount = -1 # The maximum amount of articles to be checked (-1 for no limit)

topicArticles = plotArticles[displayTopic]
acceptAnyType = "" in wordTypes
for articleIndex, article in enumerate(topicArticles):
    # Stop before processing more than maxCount articles
    if maxCount != -1 and articleIndex >= maxCount:
        break
    for word in article.headline.split(" "):
        if len(word) < 2:
            continue
        if acceptAnyType:
            if word not in stopwordsSet:
                words.append(word)
        else:
            posTag = pos_tag([word])[0][1] # Each word is tagged on its own
            tagList.append(posTag)
            if posTag in wordTypes:
                words.append(word)

    print(f"\r Loading {articleIndex+1}/{len(topicArticles)}", end="")
print("")
wordCount = Counter(words).most_common(10)
tagCount = Counter(tagList).most_common(10)
print("WORDS")
print(*[f"-{word[0]} - {word[1]}\n" for word in wordCount])
print("TAGS")
print(*[f"{tag[0]}-{tag[1]}\n" for tag in tagCount])

In [15]:
# Manually classify every "aukus" headline, outlet by outlet: each headline is printed,
# a category code is read from the user, and the running tallies are saved to sources.json
catergories = {
    "1a": "Government For",
    "1b": "Government Against",
    "2a": "Individual For",
    "2b": "Individual Against",
    "3a": "None For",
    "3b": "None Against",
    "4": "NA"
}

catergoriesList = [f"{key.upper()} {label}  |" for key, label in catergories.items()]
output = {"count": 0}


def _saveSourceCounts(counts):
    # Write the current tallies to sources.json, replacing the previous contents
    with open("sources.json", "w", encoding="utf-8") as outputFile:
        json.dump(counts, outputFile, indent=4)


for outlet in showOutletsList:
    output[outlet] = {label: 0 for label in catergories.values()}
    print(f"{outlet}\n{''.join(catergoriesList)}")
    for article in plotArticles["aukus"]:
        if article.outlet != outlet:
            continue
        print(article.headline.strip())
        sourceType = input().lower()
        # Keep asking until a known category code is entered
        while sourceType not in catergories:
            print("WRONG")
            sourceType = input().lower()
        output[outlet][catergories[sourceType]] += 1
        output["count"] += 1
        _saveSourceCounts(output) # Save after every answer so the file always holds the latest tallies
    _saveSourceCounts(output) # Also record outlets that had no matching articles

ABC News
1A Government For  |1B Government Against  |2A Individual For  |2B Individual Against  |3A None For  |3B None Against  |4 NA  |
AUKUS deal may be cold comfort to Aussies struggling with the cost of living. Can Labor make it about jobs?
3a
Ahead of AUKUS unveiling, Port Kembla emerges as preferred site for new submarine base
3a
Live: Australian nuclear submarine program to cost up to $368b as AUKUS details set to be unveiled
1a
Australian nuclear submarine program to cost up to $368b as AUKUS details unveiled
1a
No briefing required: China already knows AUKUS is about them. So what happens next?
1b
Nuclear submarines needed as region has become 'less stable', AUKUS task force head says
1a
The AUKUS deal is done but, in American politics, anything can be undone
1a
What these six words in Joe Biden's AUKUS speech told us about his next political battle
2a
Where will Australia dump its waste from the AUKUS nuclear submarines?
3b
Australia has to dispose of nuclear waste under the 