# Analyse data stored on the server

In [1]:
import json
from datetime import datetime

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import matplotlib.pyplot as plt

from src.media import Article, Outlet

In [2]:
# Select matplotlib's Qt backend so figures open in their own windows instead of inline.
# NOTE(review): presumably needs a Qt binding and a desktop session - confirm before running headless.
%matplotlib qt

In [3]:
# Constants
# Date window to pull from the DB, written as day/month/year and parsed
# straight into datetime objects.
startScanDate = datetime.strptime("01/03/2022", "%d/%m/%Y")
endScanDate = datetime.strptime("21/05/2022", "%d/%m/%Y")

# The max amount of articles collected from the DB (set to -1 for uncapped collection)
collectionCap = 50000

In [4]:
# Load the notebook settings from settings.json into a dict.
# Later cells read the "DB_URI" and "DB_NAME" keys from it.
with open("./settings.json", "r") as setupFile:
    setupData = json.loads(setupFile.read())


In [5]:
# Read data from MongoDB
# Walks every document in the 'newsData' collection and keeps those published
# strictly between startScanDate and endScanDate, wrapping each one in an Article.
# Collection stops early once collectionCap articles are held (-1 means no cap).
# Documents that cannot be converted are discarded, but they are counted and the
# first failure is reported so a systematic problem is not silently hidden.
articleList = []
discardedCount = 0        # Number of documents that raised while being processed
firstDiscardError = None  # repr() of the first such error, shown in the summary
DBClient = MongoClient(setupData["DB_URI"], server_api=ServerApi('1')) # Connect to DB

outletCollection = DBClient[setupData["DB_NAME"]]['newsData']
outletCursor = outletCollection.find({})
print("Starting Scan")

try:
    for document in outletCursor:
        try:
            # If the article is from the correct time period
            if startScanDate < document["publishDate"] < endScanDate:
                articleList.append(Article(
                    document["outletName"],
                    document["headline"],
                    document["description"],
                    document["author"],
                    document["publishDate"],
                    document["sentimentScore"]
                ))
                print(f"\r Collecting article number {len(articleList)}", end="")
        # If an article is missing a data point (or holds an unusable value), it is discarded
        except Exception as discardError:
            discardedCount += 1
            if firstDiscardError is None:
                # Store the repr rather than the exception to avoid keeping its traceback alive
                firstDiscardError = repr(discardError)

        # Stop collecting articles if the cap is reached
        if collectionCap != -1 and len(articleList) >= collectionCap:
            break
finally:
    # Release the server-side cursor even when the loop stops early (cap reached or error)
    outletCursor.close()

print(f"\nCollected {len(articleList)} Articles")
if discardedCount:
    print(f"Discarded {discardedCount} documents (first error: {firstDiscardError})")

Starting Scan
 Collecting article number 50000
Collected 50000 Articles


In [17]:
# Group the collected articles by outlet, then order the outlets by how many
# articles each one holds (largest first).
outletList = []
# Index of outlet.name -> Outlet so each article is matched with one dict lookup
# instead of a scan over outletList.
# NOTE(review): assumes outlet names are hashable (e.g. strings) - confirm against Article/Outlet.
outletsByName = {}
print("Sorting articles")
for article in articleList:
    knownOutlet = outletsByName.get(article.outlet)
    if knownOutlet is not None:
        knownOutlet.addArticle(article)
        continue

    # First article seen for this outlet name: create the outlet and register it
    newOutlet = Outlet(article.outlet)
    newOutlet.addArticle(article)
    outletList.append(newOutlet)
    # Keyed by outlet.name (not article.outlet) so a lookup succeeds exactly when
    # outlet.name == article.outlet; setdefault keeps the first outlet registered
    # under a given name.
    outletsByName.setdefault(newOutlet.name, newOutlet)

# list.sort is stable, so outlets with equal counts stay in first-seen order
outletList.sort(key=lambda x: len(x.articleList), reverse=True)
print(f"Done sorting, {len(outletList)} outlets found")
        

Sorting articles
Done sorting, 26 outlets found


In [38]:
# Print out basic stats
# For each outlet: article count, mean sentiment score (rounded to 3 decimals)
# and the number of distinct author values.
for outlet in outletList:
    articleCount = len(outlet.articleList)

    # Gather data
    distinctAuthors = []
    meanSentiment = 0
    for article in outlet.articleList:
        # Each article contributes its share of the mean
        meanSentiment += article.sentimentScore / articleCount
        if article.author in distinctAuthors:
            continue
        distinctAuthors.append(article.author)

    print(f"{'=' * 3} {outlet.name} {'=' * 3}")
    print(f"Published a total of {articleCount} articles")
    print(f"Has an average sentiment of {round(meanSentiment, 3)}")
    print(f"And has had articles from {len(distinctAuthors)} different authors\n")

=== Daily Mail ===
Published a total of 7839 articles
Has an average sentiment of -0.202
And has had articles from 1 different authors

=== The Guardian ===
Published a total of 4711 articles
Has an average sentiment of -0.086
And has had articles from 1881 different authors

=== The Age ===
Published a total of 3775 articles
Has an average sentiment of -0.016
And has had articles from 400 different authors

=== Sydney Morning Herald ===
Published a total of 3771 articles
Has an average sentiment of -0.019
And has had articles from 400 different authors

=== BuzzFeed ===
Published a total of 3267 articles
Has an average sentiment of 0.132
And has had articles from 1 different authors

=== CBS News ===
Published a total of 3091 articles
Has an average sentiment of -0.079
And has had articles from 1 different authors

=== CNN ===
Published a total of 2611 articles
Has an average sentiment of -0.11
And has had articles from 1 different authors

=== South China Morning Post ===
Published a