# Media Analysis using MongoDB
## By Gabriel Hanich

In [44]:
# Imports
import json
from datetime import datetime

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import pymongo.errors

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as corpusStopwords
from nltk import pos_tag


In [45]:
# Load DB constants
with open("./settings.json", "r") as constantsFile:
    DBConstants = json.load(constantsFile)

In [46]:
# User settings

# Dates
doSetDates = True # Whether to use user-specified dates
startingDate = "13/9/2021"
endDate = "23/01/2022"

# Outlets
doSetOutlets = False # Whether to limit search to a set of outlets
setOutlets = [] # Only include a set of outlets
doTotalOutlets = True # Whether to take the outlets as a total

# Title searching
titleSearchWords = [] # Only include articles with headlines containing these words (Leave BLANK to disable)

# Similar Words settings
doSimilarWords = False # Whether to scan reddit for similar words to the title search words
searchPostsCount = 500 # How many posts to scan per word
searchWords = 10 # How many words from reddit are added per keyword
minOccurenceCount = 5 # Minimum amount of times a word has to appear to be registered as 'similar'

# Chart Settings
commonWordCount = 1 # How many common words per day are displayed
chartType = "articleCount" # Type of chart to make'
saveAsCSV = False


# Convert string userDates to datetime objs
startingDate = datetime.strptime(startingDate, "%d/%m/%Y")
endDate = datetime.strptime(endDate, "%d/%m/%Y")

In [24]:
# Using above user settings, generate a aggregation string
aggregateString = "{\n"
if doSetDates:
    aggregateString += "publishDate: {$gt: " + startingDate.strftime("%Y-%m-%d") + " },\n"
    aggregateString += " publishDate: {$lt: " + endDate.strftime("%Y-%m-%d") +"}"

print(aggregateString)


{
publishDate: {$gt: 2021-09-13 }


In [21]:
# Connect to DB and sort data by selected aggregation
DBClient = MongoClient(DBConstants["DB_URI"], server_api=ServerApi('1'))

db = DBClient.get_database(DBConstants["DB_NAME"])
newsCollection = db["newsData"]

DBClient.close()