# Analyse data stored on the server

In [1]:
import json
from datetime import datetime, timedelta
import random
import csv

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import matplotlib.pyplot as plt

from src.media import Article, Outlet

In [2]:
%matplotlib qt

In [3]:
# Constants
# Date window to fetch from the DB, written as day/month/year and parsed straight into datetimes
startScanDate = datetime.strptime("15/09/2021", "%d/%m/%Y")
endScanDate = datetime.strptime("12/09/2022", "%d/%m/%Y")

# Upper bound on the number of articles collected from the DB; -1 disables the cap
collectionCap = -1

In [4]:
# Define functions
def getDateRange(start_date, end_date):
    """Return start_date, start_date + 1 day, ... for as long as the value is <= end_date.

    Both bounds are inclusive. The result is an empty list when end_date lies
    before start_date.
    """
    # timedelta.days is floored, so any reversed range yields step_count <= 0 (empty result)
    step_count = (end_date - start_date).days + 1
    return [start_date + timedelta(days=offset) for offset in range(step_count)]

def getDailyAverages(articleList, value) -> dict:
    """Average one numeric article attribute per calendar day.

    Args:
        articleList: Objects with a ``date`` attribute (a datetime) and the
            attribute named by ``value``. The list itself is not modified.
        value: Name of the numeric attribute to average.

    Returns:
        A dict keyed by midnight of every day from the earliest to the latest
        article (inclusive), in chronological order. Each value is that day's
        mean of the attribute, or 0 for days without articles. An empty
        ``articleList`` gives an empty dict.
    """
    if not articleList:
        return {}

    def startOfDay(moment):
        # Zero the microseconds too, otherwise timestamps from the same day map to different keys
        return moment.replace(hour=0, minute=0, second=0, microsecond=0)

    # Build the buckets from midnight to midnight so the last day is never cut off
    firstDay = startOfDay(min(article.date for article in articleList))
    lastDay = startOfDay(max(article.date for article in articleList))
    dayData = {}
    for offset in range((lastDay - firstDay).days + 1):
        dayData[firstDay + timedelta(days=offset)] = {"count": 0, "total": 0}

    # Get the total score for each date
    for article in articleList:
        bucket = dayData[startOfDay(article.date)]
        bucket["total"] += getattr(article, value)
        bucket["count"] += 1

    # Calculate average for each day (days without articles stay at 0)
    return {
        day: (bucket["total"] / bucket["count"] if bucket["count"] else 0)
        for day, bucket in dayData.items()
    }
            

In [5]:
# Load the notebook settings; the "DB_URI" and "DB_NAME" entries are read when connecting to the DB.
# The encoding is explicit so the file is decoded the same way on every platform.
with open("./settings.json", "r", encoding="utf-8") as setupFile:
    setupData = json.load(setupFile)

In [None]:
# Read data from MongoDB
articleList = []
discardedCount = 0        # Documents that could not be turned into an Article
firstDiscardError = None  # Kept so the summary can show why documents were dropped
DBClient = MongoClient(setupData["DB_URI"], server_api=ServerApi('1')) # Connect to DB

outletCollection = DBClient[setupData["DB_NAME"]]['newsData']
# Let the server apply the date window (both bounds exclusive) so only matching documents are transferred
outletCursor = outletCollection.find({"publishDate": {"$gt": startScanDate, "$lt": endScanDate}})
print("Starting Scan")

for document in outletCursor:
    # Stop collecting articles if the cap is reached
    if collectionCap != -1 and len(articleList) >= collectionCap:
        break

    # If an article is missing a data point, it is discarded (and counted instead of silently ignored)
    try:
        article = Article(
            document["outletName"],
            document["headline"],
            document["description"],
            document["author"],
            document["publishDate"],
            document["sentimentScore"]
        )
    except Exception as error:
        discardedCount += 1
        if firstDiscardError is None:
            firstDiscardError = error
        continue

    articleList.append(article)
    print(f"\r Collecting article number {len(articleList)}", end="")

print(f"\nCollected {len(articleList)} Articles")
if discardedCount:
    print(f"Discarded {discardedCount} documents (first error: {firstDiscardError!r})")

Starting Scan
 Collecting article number 261660

In [None]:
# Group the collected articles by outlet, then order outlets by how many articles they published
outletList = []
outletsByName = {}  # outlet name -> Outlet, replaces a linear scan of outletList per article
print("Sorting articles")
for article in articleList:
    outlet = outletsByName.get(article.outlet)
    if outlet is None:
        outlet = Outlet(article.outlet)
        outletList.append(outlet)
        # Keyed by the outlet's own name and first one wins, matching what a scan on outlet.name finds
        outletsByName.setdefault(outlet.name, outlet)
    outlet.addArticle(article)

# Most prolific outlets first
outletList.sort(key=lambda x: len(x.articleList), reverse=True)
print(f"Done sorting, {len(outletList)} outlets found")
        

In [None]:
# Print out basic stats
for outlet in outletList:
    # Gather data: distinct authors in first-seen order, plus the mean sentiment
    outletArticles = outlet.articleList
    articleTotal = len(outletArticles)
    authorList = []
    avgSentiment = 0
    for article in outletArticles:
        # Every article adds an equal share, so the running value ends up as the mean
        avgSentiment += article.sentimentScore / articleTotal
        if article.author in authorList:
            continue
        authorList.append(article.author)

    print(f"{'=' * 3} {outlet.name} {'=' * 3}")
    print(f"Published a total of {len(outlet.articleList)} articles")
    print(f"Has an average sentiment of {round(avgSentiment, 3)}")
    print(f"And has had articles from {len(authorList)} different authors\n")

In [None]:
# Sort into topics
# Each entry is one topic's search words; the first word doubles as the topic name
topicKeywords = [["Joe", "Biden", "Democrat", "Democrats"], ["Donald", "Trump"]]
topicList = []
for topic in topicKeywords:
    articleCount = 0
    topicOutlets = []
    for outlet in outletList:
        # Per-topic copy of the outlet that only holds the matching articles
        thisOutlet = Outlet(outlet.name)
        for article in outlet.articleList:
            headlineWords = article.headline.split(" ")
            # An article counts once, however many of the search words its headline contains
            if any(searchWord in headlineWords for searchWord in topic):
                thisOutlet.addArticle(article)
                articleCount += 1
        topicOutlets.append(thisOutlet)

    print(f"Topic '{topic[0]}' has {articleCount} articles about it")
    topicData = {"topicName": topic[0], "outletList": topicOutlets}
    topicList.append(topicData)

In [None]:
# Plot specific trait against the other
# NOTE(review): this cell looks unfinished. trait1 is assigned but never used in the visible part
# of the notebook, and "puvb" matches none of the article attributes referenced elsewhere here
# (date, outlet, headline, author, sentimentScore). TODO confirm the intended trait name.
trait1 = "puvb"



In [10]:
# Output total data into a .csv file
# Column position of every outlet in the per-day rows
outletIndexes = {outlet.name: outletIndex for outletIndex, outlet in enumerate(outletList)}

# Header row; the first column holds the date, so it gets an empty title
outletNames = [""] + [outlet.name for outlet in outletList]

# One row of zeros per day in the scan window for both tables
dailySentiment = {}
dailyOutput = {}
for date in getDateRange(startScanDate, endScanDate):
    dailySentiment[date] = [0] * len(outletList)
    dailyOutput[date] = [0] * len(outletList)

for outlet in outletList:
    outletColumn = outletIndexes[outlet.name]

    # Get daily sentiment
    sentimentVals = getDailyAverages(outlet.articleList, "sentimentScore")
    for date, average in sentimentVals.items():
        # Normalise to exact midnight (microseconds included) so the key matches the table rows
        dayKey = date.replace(hour=0, minute=0, second=0, microsecond=0)
        dailySentiment[dayKey][outletColumn] = average

    # Get daily output
    for article in outlet.articleList:
        dayKey = article.date.replace(hour=0, minute=0, second=0, microsecond=0)
        dailyOutput[dayKey][outletColumn] += 1

# Write both tables; each row is built fresh so the stored per-day lists are not modified
for csvPath, dailyValues in (
    ("data/dailyAvgSentiment.csv", dailySentiment),
    ("data/dailyOutput.csv", dailyOutput),
):
    with open(csvPath, "w", encoding="UTF8", newline="") as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(outletNames)
        for date, values in dailyValues.items():
            writer.writerow([date.strftime("%d/%m/%Y"), *values])

In [11]:
# Output topic specific data
# Column position of every topic in the per-day rows
topicIndexes = {topic["topicName"]: topicIndex for topicIndex, topic in enumerate(topicList)}

# Header row; the first column holds the date, so it gets an empty title
topicNames = [""] + [topic["topicName"] for topic in topicList]

# One row of zeros per day in the scan window for both tables
dailySentiment = {}
dailyOutput = {}
for date in getDateRange(startScanDate, endScanDate):
    dailySentiment[date] = [0] * len(topicList)
    dailyOutput[date] = [0] * len(topicList)

for topic in topicList:
    topicColumn = topicIndexes[topic["topicName"]]

    # Get list containing all published articles AND get the number of published articles each day.
    # A dedicated name is used so the notebook-wide articleList loaded from the DB is not overwritten.
    topicArticles = []
    for outlet in topic["outletList"]:
        for article in outlet.articleList:
            topicArticles.append(article)
            dayKey = article.date.replace(hour=0, minute=0, second=0, microsecond=0)
            dailyOutput[dayKey][topicColumn] += 1

    # Get daily sentiment (a topic without any articles simply keeps its zeros)
    sentimentVals = getDailyAverages(topicArticles, "sentimentScore") if topicArticles else {}
    for date, average in sentimentVals.items():
        # Normalise to exact midnight (microseconds included) so the key matches the table rows
        dayKey = date.replace(hour=0, minute=0, second=0, microsecond=0)
        dailySentiment[dayKey][topicColumn] = average

# Write both tables; each row is built fresh so the stored per-day lists are not modified
for csvPath, dailyValues in (
    ("./data/topicAvgDailySentiment.csv", dailySentiment),
    ("./data/topicDailyOutput.csv", dailyOutput),
):
    with open(csvPath, "w", encoding="UTF-8", newline="") as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(topicNames)
        for date, values in dailyValues.items():
            writer.writerow([date.strftime("%d/%m/%Y"), *values])