# Analyse server stored data

In [5]:
# Imports
from datetime import datetime, timedelta
import json
import time
from collections import Counter
import random

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.style
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import operator

from src.media import Outlet, Article
from src.data import getNames

In [6]:
# Define functions
def getDateRange(start_date, end_date):  # Return list of datetime.date objects between start_date and end_date (inclusive).
    date_list = []
    curr_date = start_date
    while curr_date <= end_date:
        date_list.append(curr_date)
        curr_date += timedelta(days=1)
    return date_list

In [7]:
# Make maptlotlib show graphs in new window
%matplotlib qt 

In [38]:
# Constants
# Date range for articles being scraped from the server
startScrapeDate = "01/03/2023"
endScrapeDate = "26/03/2023"

collectionCap = -1 # The maximum amount of articles to get pulled from the server (set to -1 for uncaped scraping)

startScrapeDate = datetime.strptime(startScrapeDate, "%d/%m/%Y")
endScrapeDate = datetime.strptime(endScrapeDate, "%d/%m/%Y")
stopwordsSet = set(stopwords.words('english'))
exclusionList = ["say", "new", "news", "day", "days"]
lemmatizer = WordNetLemmatizer()
plt.style.use('default')

In [39]:
# Load setup data
with open("./settings.json", "r") as setupFile:
    setupData = json.load(setupFile)

In [40]:
articleList = []
startScanTime = time.time() # Track the time elapsed 
DBClient = MongoClient(setupData["DB_URI"], server_api = ServerApi('1')) # Connect to the database

# Get articles from DBClient
articleCollection = DBClient[setupData["DB_NAME"]]['newsData']
articleCursor = articleCollection.aggregate([{'$match': {'publishDate': {
                '$gt': startScrapeDate, 
                '$lt': endScrapeDate
        }}}])

for articleIndex, article in enumerate(articleCursor):
    articleList.append(Article(
                article["outletName"],
                article["headline"],
                article["description"],
                article["author"],
                article["publishDate"],
                article["sentimentScore"]
        ))
    
    print(f"\r Collected article number {articleIndex + 1}", end="")
print(f"\n Collected a total of {len(articleList)} articles in {round((time.time() - startScanTime), 3)} seconds")

 Collected article number 30555
 Collected a total of 30555 articles in 11.896 seconds


In [41]:
# Sort articles by outlet
outletList = []

for articleIndex, article in enumerate(articleList):
    foundOutlet = False # If the outlet has been found within `outletList`
    for outlet in outletList:
        if outlet.name == article.outlet:
            outlet.addArticle(article)
            foundOutlet = True
            break
    if not foundOutlet: # Make new outlet
        newOutlet = Outlet(article.outlet)
        outletList.append(newOutlet)    
print(f"Found {len(outletList)} outlets in total")

Found 32 outlets in total


In [42]:
outletList.sort(key = lambda x : len(x.articleList), reverse=True)
# Text outputs for each outlet
for outlet in outletList:
    # Get the average sentiment
    avgSentiment = sum(list(article.sentimentScore for article in outlet.articleList)) / len(outlet.articleList)
    avgSentiment = avgSentiment 
        
    print(f"{'=' * 3} {outlet.name} {'=' * 3}")
    print(f"Published a total of {len(outlet.articleList)} articles")
    print(f"Has an average sentiment of {round(avgSentiment, 3)}")
    print("\n")
#     print(f"{outlet.name},{len(outlet.articleList)}")

=== Daily Mail ===
Published a total of 5582 articles
Has an average sentiment of -0.205


=== The Guardian ===
Published a total of 2985 articles
Has an average sentiment of -0.066


=== CNN ===
Published a total of 1881 articles
Has an average sentiment of -0.098


=== BuzzFeed ===
Published a total of 1834 articles
Has an average sentiment of 0.085


=== South China Morning Post ===
Published a total of 1798 articles
Has an average sentiment of -0.067


=== CBS News ===
Published a total of 1416 articles
Has an average sentiment of -0.049


=== ABC News ===
Published a total of 1393 articles
Has an average sentiment of -0.099


=== Huffington Post ===
Published a total of 1310 articles
Has an average sentiment of -0.084


=== Fox News ===
Published a total of 1250 articles
Has an average sentiment of -0.158


=== The Age ===
Published a total of 1134 articles
Has an average sentiment of -0.048


=== Sydney Morning Herald ===
Published a total of 1119 articles
Has an average sentimen

In [46]:
# Display parameters
topicList = [] # Which topics to use (leave blank for all) (MUST BE LOWERCASE)
showOutletsList = ["Islamic Republic News Agency"] # Which outlets to be shown (leave blank for all)

In [47]:
# Collect data to plot
plotArticles = {} # Stores the articles by the topic 
if topicList == []:
    plotArticles['total'] = [] # If there are no topic, total is used to show the total amt of articles
    for article in articleList:
        if article.outlet in showOutletsList or showOutletsList == []:
            plotArticles['total'].append(article)
else:            
    for topic in topicList: 
        plotArticles[topic[0]] = [] # Each topic stores corresponding articles in a list
        for article in articleList:
            if article.outlet in showOutletsList or showOutletsList == []: # If the article is from the specified outlet
                for word in article.headline.split(" "):
                    if word.lower() in topic: # If a given word from the article is in the topic searchlist
                        plotArticles[topic[0]].append(article)
                        break


In [48]:
# Plot daily average (or total output) for any attribute over time, as a total/avg of all outlets
plotAttribute = "publishCount" # The article attribute to be avg'd and plotted over time (set to `publishCount` for daily TOTAL output)
plotDates = getDateRange(startScrapeDate, endScrapeDate)

for topic in plotArticles.keys():
    plotData = {}
    for dateIndex, date in enumerate(plotDates):
        if dateIndex + 1 != len(plotDates):
            plotData[date] = []
    for article in plotArticles[topic]:
        articleDate = article.date
        if plotAttribute == "publishCount":
            plotData[articleDate.replace(hour=0, minute=0, second=0)].append(1)
        else:
            plotData[articleDate.replace(hour=0, minute=0, second=0)].append(getattr(article, plotAttribute))
    
    # Plot the data
    xVals = list(plotData.keys())
    yVals = []
    for val in xVals:
        try:
            if plotAttribute == "publishCount":
                yVals.append(len(plotData[val]))
            else:
                yVals.append(sum(plotData[val]) / len(plotData[val]))
        except ZeroDivisionError:
            yVals.append(0)
    if plotAttribute == "publishCount":
        print(f"In total, {sum(yVals)} articles got published about {topic}")
    else:
        print(f"The average {plotAttribute} for {topic} was {sum(yVals) / len(yVals)}")
    plt.plot(xVals, yVals, label=topic)
    
plt.title(f"{plotAttribute} over time")
plt.legend()
# plt.ylim((-1, 1))
plt.show()

In total, 185 articles got published about total


In [49]:
# Plot any attribute over time but broken down by outlet
plotAttribute = "publishCount" # The article attribute to be avg'd and plotted over time (set to `publishCount` for daily TOTAL output)
plotDates = getDateRange(startScrapeDate, endScrapeDate) # A list of all the dates that will be plotted

displayOutlets = [] # The list that keeps track of which outlets to display
if showOutletsList == []: # If the user has not specified which outlets to show, show all of them
    for outlet in outletList:
        displayOutlets.append(outlet.name) 
else:
    displayOutlets = showOutletsList # Else show only those specified

for topic in plotArticles.keys():
    for outlet in displayOutlets: # Show how each media outlet reports each topic
        plotData = {} # Dict containing each display date as key, and the list of scores for that day as value
        for dateIndex, date in enumerate(plotDates):
            if dateIndex + 1 != len(plotDates):
                plotData[date] = []
                
        for article in plotArticles[topic]:
            if article.outlet == outlet:
                articleDate = article.date
                if plotAttribute == "publishCount": # If the user is trying to find how many articles have been published on a given day, add 1 per article
                    plotData[articleDate.replace(hour=0, minute=0, second=0)].append(1)
                else:
                    plotData[articleDate.replace(hour=0, minute=0, second=0)].append(getattr(article, plotAttribute)) # Append the score to the daily list
        
        # Plot the data
        xVals = list(plotData.keys())
        yVals = []
        for val in xVals:
            try:
                if plotAttribute == "publishCount":
                    yVals.append(len(plotData[val])) # Plot the daily count (total)
                else:
                    yVals.append(sum(plotData[val]) / len(plotData[val])) # plot the daily average 
            except ZeroDivisionError:
                yVals.append(0) # If there are no datapoints for the day, display 0
        plt.plot(xVals, yVals, label=f"{outlet} - {topic}")
        
        if plotAttribute == "publishCount":
            print(f"In total, {outlet} published {sum(yVals)} articles about {topic}")
        else:
            print(f"For {outlet} the overall average for {topic} was {round(sum(yVals) / len(yVals), 4)}")
plt.title(f"{plotAttribute} Over time by Outlet")
plt.legend()
plt.show()

In total, Islamic Republic News Agency published 185 articles about total


In [51]:
# Text outputs for each journalist by topic
displayTopic = "total" # The topic that gets graphed
journalistList = []
journalistOutput = {}
for article in plotArticles[displayTopic]:
    for name in getNames(str(article.author), pos_tag, word_tokenize):
        journalistList.append(name)
        try:
            if article.outlet not in journalistOutput[name]:
               journalistOutput[name].append(article.outlet)

        except KeyError:
            journalistOutput[name] = [article.outlet]

print(f"The 10 most prolific journalists are:")
for journalist in Counter(journalistList).most_common(1000):
    print(f"- {journalist[0]} - {journalist[1]} | {journalistOutput[journalist[0]]}")

The 10 most prolific journalists are:
- IRNA English - 185 | ['Islamic Republic News Agency']


In [53]:
# Find keywords for each day by topic
dailyDisplay = 4 # The number of keywords that gets displayed for each date
displayTopic = "total" # The topic that gets graphed
minTextScore = 2 # The minimum number of a times a keyword needs to be mentioned in order to get it's text displayed


try:
    plotArticles[displayTopic]
except KeyError:
    print(f"Topic {displayTopic} is not available, the possible topics are {list(plotArticles.keys())}")
    

keywordColors = {} # Dict containing the color for each keyword
plotDates = getDateRange(startScrapeDate, endScrapeDate) # A list of all the dates that will be plotted
totalKeywords = [] # All the keywords and their freqency

datedKeywords = {} # A dict containg all the keywords in articles from a given date about the topic
for dateIndex, date in enumerate(plotDates):
    if dateIndex + 1 != len(plotDates):
        datedKeywords[date] = []

for article in plotArticles[displayTopic]:
    if article.outlet in showOutletsList or showOutletsList == []: # If the article is from the specified outlet
        articleDate = article.date
        for word in article.headline.split(" "):
            word = word.strip().lower()
            if word not in stopwordsSet and len(word) > 2 and word not in exclusionList:
                datedKeywords[articleDate.replace(hour=0, minute=0, second=0)].append(lemmatizer.lemmatize(word)) # Append the (lemmatized) word to the dict for the given date
lastKeywords = []
for date in datedKeywords.keys():
    keywords = Counter(datedKeywords[date]).most_common(dailyDisplay)
    for keyword in keywords:
        try:
            keywordColor = keywordColors[keyword[0]] # If the keyword already has a color for itself
        except KeyError:
            r = lambda: random.randint(0,255) # Else, generate a new color for the keyword
            keywordColor = '#%02X%02X%02X' % (r(),r(),r())
            keywordColors[keyword[0]] = keywordColor # If this is the first time 

        totalKeywords.append(keyword[0])
        plt.scatter(date, keyword[1], color=keywordColor, label=keyword[0]) # Put the point on the graph

        # Draw lines between points with the same keyword
        foundPrior = False # Tracks whether the date before contains the same keyword
        for lastKeyword in lastKeywords:
            if lastKeyword[0] == keyword[0]:
                plt.plot([lastDate, date], [lastKeyword[1], keyword[1]], color=keywordColor)
                foundPrior = True
                break

        if not foundPrior and keyword[1] >= minTextScore: # Only display text if the point is at the start of a 'chain'
            plt.text(date, keyword[1], keyword[0])

    # Save the last date and keywords to plot lines in the next date
    lastDate = date 
    lastKeywords = keywords

print(f"The most common keywords for {displayTopic} were")
for keyword, keyFreq in Counter(totalKeywords).most_common(15):
    print(f"- {keyword} - {keyFreq}")
    
plt.title(f"Keywords over time for topic {displayTopic}")
plt.show()

The most common keywords for total were
- iran - 12
- iranian - 5
- tie - 2
- iran, - 2
- official - 2
- iran’s - 2
- iran's - 2
- belarus - 1
- comprehensive - 1
- improve - 1
- iran: - 1
- trade - 1
- envoy - 1
- uae - 1
- trip - 1


In [54]:
# Create a histogram for the number of articles with each sentiment scores
displayTopic = "total" # The topic that gets graphed
incrementCount = 50

try:
    plotArticles[displayTopic]
except KeyError:
    print(f"Topic {displayTopic} is not available, the possible topics are {list(plotArticles.keys())}")
    
sentimentData = []
for article in plotArticles[displayTopic]:
    sentimentData.append(article.sentimentScore)
    
plt.hist(sentimentData, incrementCount)
plt.title(f"Number of articles with each sentiment for topic {displayTopic}")
plt.show()



In [59]:
# Find the most common verb, adjective and so on (NOTE, does not exclude stopwords)
displayTopic = "total" # The topic that gets graphed
words = []
for article in plotArticles["total"]:
    for word in article.headline.split(" "):
        words.append(word)
wordCount = Counter(words).most_common()

In [60]:
wordCount

[('in', 72),
 ('Iran', 61),
 ('to', 55),
 ('of', 49),
 ('Iranian', 26),
 ('on', 21),
 ('FM', 18),
 ('with', 17),
 ('for', 15),
 ('Iran,', 12),
 ('ties', 12),
 ('Nowruz', 12),
 ('Leader', 11),
 ('Iran’s', 11),
 ("Iran's", 11),
 ('Supreme', 10),
 ('at', 9),
 ('Iraq', 8),
 ('official', 7),
 ('Tehran', 7),
 ('UAE', 7),
 ('regional', 7),
 ('security', 6),
 ('Tehran-Riyadh', 6),
 ('agreement', 6),
 ('as', 6),
 ('New', 6),
 ('discuss', 6),
 ('Ramadan', 6),
 ('states', 5),
 ('Headlines', 5),
 ('English-language', 5),
 ('dailies', 5),
 ('March', 5),
 ('by', 5),
 ('talks', 5),
 ('trade', 5),
 ('Raisi', 5),
 ('attack', 5),
 ('Year', 5),
 ('welcomes', 4),
 ('Iran-Saudi', 4),
 ('says', 4),
 ('US', 4),
 ('be', 4),
 ('visit', 4),
 ('FMs', 4),
 ('president', 4),
 ('envoy', 4),
 ('speech', 4),
 ('new', 4),
 ('football', 4),
 ('growth', 4),
 ('condemns', 4),
 ('felicitates', 4),
 ('cooperation', 3),
 ('national', 3),
 ('economic', 3),
 ('Afghanistan', 3),
 ('Iran:', 3),
 ('not', 3),
 ('over', 3),
 ('top