## Topic modelling with the Sunday Times archives (2013-21)

#### Task 1) Provide recommendations to improve the research design outlined above

### Imports

In [41]:
#standard

import numpy as np
import pandas as pd
from sys import exit
import time
import re
import random
import codecs
from datetime import datetime, timedelta
import traceback
import pickle
import itertools
from requests.exceptions import Timeout
import traceback

#web-scraping-tools
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import requests
from bs4 import BeautifulSoup
import urllib.parse
from lxml import html

#analysis
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Functions - Section 1

In [2]:
headers = {
    'authority': 'www.google.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}

In [3]:
def init_driver():
    from webdriver_manager.chrome import ChromeDriverManager

In [4]:
def get_sundays():
    start_date = datetime(2013, 1, 6)  
    end_date = datetime(2021, 1, 10) 

    sundays = []

    current_date = start_date
    while current_date <= end_date:
        sundays.append(current_date.strftime("%y%m%d"))
        current_date += timedelta(weeks=1)

    return sundays

In [5]:
def get_all_pages():
    
    base_url = "https://www.sundaytimes.lk/X/news/"
    dates = get_sundays()
    updated_urls = [base_url.replace("X", sunday) for sunday in dates]
    
    return updated_urls

In [6]:
def scrape_single_page(url):
    main_list = []

    # Make soup and scrape
    with webdriver.Chrome(ChromeDriverManager().install()) as driver:
        driver.get(url)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        # Extract URLs
        article_divs = soup.find_all("h2", class_='entry_title')
        urls = [re.search("(?P<url>https?://[^\s]+)", str(div)).group("url") for div in article_divs]

        main_list.append(urls)
        driver.close()

    return main_list

In [7]:
def main_engine():
    try:
        archive_urls_full = get_all_pages()
        no_of_editions = len(archive_urls_full)

        # Sanity check
        print(f"Total number of {no_of_editions} digitized editions of the Sunday Times from 2013-21.")

        final_list = []

        for i, url in enumerate(archive_urls_full):
            try:
                edition_result = scrape_single_page(url)
                final_list.append(edition_result)
                print(f"Iteration {i + 1} of {no_of_editions} complete. Moving on ...")

                # Pickle dump the final_list after each iteration
                with open(f'final_list_iteration_{i + 1:03d}.pkl', 'wb') as pickle_file:
                    pickle.dump(final_list, pickle_file)
                    print(f"Final list saved to final_list_iteration_{i + 1:03d}.pkl")

            except Exception as e:
                print(f"Error in iteration {i + 1}: {str(e)}")
                traceback.print_exc()

        return final_list

    except Exception as main_engine_error:
        print(f"Main engine error: {str(main_engine_error)}")
        traceback.print_exc()


In [8]:
def append_to_dataframe(existing_df, pickle_filename):
    try:
        # Call the main_engine function
        result = main_engine()

        # Create a new DataFrame with the results and append it to the original DataFrame
        for i, edition in enumerate(result):
            try:
                edition_df = pd.DataFrame(edition, columns=[f"Article{i + 1}_URL"])
                existing_df = pd.concat([existing_df, edition_df], axis=1)
                print(f"Article {i + 1} successfully appended to DataFrame.")
                length = len(existing_df)
                
                # Dump the dataframe to a pickle file for backup
                with open(pickle_filename, 'wb') as pickle_file:
                    pickle.dump(existing_df, pickle_file)
                    print(f"DataFrame saved to {pickle_filename}")

            except Exception as edition_error:
                print(f"Error appending article {i + 1} to DataFrame: {str(edition_error)}")
                traceback.print_exc()

        # This line should be outside the inner try block
        print(f"Total entries at {length}")

        print("Appending to DataFrame completed.")
        return existing_df

    except Exception as append_error:
        print(f"DataFrame appending error: {str(append_error)}")
        traceback.print_exc()
        return existing_df  # Return the existing dataframe in case of an error


### Functions - Section 2

In [9]:
def extract_elements(url, timeout=5):
    try:
        response = requests.get(url, timeout=timeout, headers=headers)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)

        soup = BeautifulSoup(response.text, 'html.parser')

        headings = [h1.get_text() for h1 in soup.find_all('h1')]
        paragraphs = [p.get_text() for p in soup.find_all('p')]
        return headings, paragraphs
        
    except Timeout:
        print(f"Connection timed out while trying to retrieve the webpage {url}")
    except requests.RequestException as e:
        print(f"Failed to retrieve the webpage {url}. Error: {e}")
        
    return None, None  # Return None values in case of errors


In [10]:
def create_dataframe(webpage_urls):
    data = {'Heading': [], 'Paragraph': []}

    try:
        for i, url in enumerate(webpage_urls, start=1):
            headings, paragraphs = extract_elements(url)
            time.sleep(5)
            
            if headings is not None and paragraphs is not None:
                data['Heading'].extend(headings)
                data['Paragraph'].extend(paragraphs)
            
            print(f"Processed {i} out of {len(webpage_urls)} URLs. Current URL: {url}")
        
        df = pd.DataFrame(data)
    except: 
        print(f"Failed to processed article {i} out of {len(webpage_urls)} URLs. Current URL: {url}")
    return df

### Functions - Section 3 

In [17]:
def clean_text(text):
    """
    Perform basic text cleaning:
    - Convert to lowercase
    - Remove special characters, numbers, and punctuation
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

In [18]:
def tokenize_and_remove_stopwords(text):
    """
    Tokenize the text and remove stopwords using NLTK
    """
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

In [19]:
def preprocess_data(data):
    """
    Apply text cleaning and tokenization to a list of documents
    """
    cleaned_data = [clean_text(doc) for doc in data]
    tokenized_data = [tokenize_and_remove_stopwords(doc) for doc in cleaned_data]
    return tokenized_data

In [20]:
def vectorize_data(tokenized_data):
    """
    Convert tokenized data into a document-term matrix using CountVectorizer
    """
    flattened_data = [' '.join(tokens) for tokens in tokenized_data]
    
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(flattened_data)
    
    return dtm, vectorizer

In [42]:
def perform_lda(dtm, num_topics=5, random_state=42):
    """
    Perform Latent Dirichlet Allocation (LDA) on the document-term matrix
    """
    lda = LatentDirichletAllocation(
        n_components=num_topics,
        random_state=random_state
    )

    lda_result = lda.fit_transform(dtm)

    return lda, lda_result

In [43]:
def display_topics(model, feature_names, num_top_words=10):
    """
    Display the top words for each topic in the LDA model
    """
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))
        print()

In [44]:
def no_of_topics():
    number_of_topics = input(r"Enter number of topics to test LDA with")

    return number_of_topics

### Execution - Section 1 - Scrape URLs

In [11]:
# initial_dataframe = pd.DataFrame()
# pickle_filename = 'backup_dataframe.pkl'

# final_dataframe = append_to_dataframe(initial_dataframe, pickle_filename)

## error with list dimensions, issue with concatting to df, used pickle to recover file 

In [13]:
def read_and_clean_urls():
    # Read pickled DataFrame
    obj = pd.read_pickle(r'final_list_iteration_419.pkl')

    # Flatten the nested lists and return the result
    url_list =  list(itertools.chain.from_iterable(itertools.chain.from_iterable(obj)))
    cleaned_urls = [url.rstrip('"') for url in url_list]
    return cleaned_urls


In [14]:
articles = read_and_clean_urls()

In [15]:
articles

['http://www.sundaytimes.lk/130106/news/committee-to-probe-lankan-smuggling-to-us-nato-bases-27659.html',
 'http://www.sundaytimes.lk/130106/news/ex-basl-secretary-upul-jayasuriya-to-contest-for-basl-presidency-27649.html',
 'http://www.sundaytimes.lk/130106/news/explain-or-withdraw-order-to-corrupted-antibiotic-supplier-27645.html',
 'http://www.sundaytimes.lk/130106/news/now-canada-lanka-powwow-on-illegal-entry-27643.html',
 'http://www.sundaytimes.lk/130106/news/monkey-tricks-results-in-anxious-hours-at-us-consulate-27638.html',
 'http://www.sundaytimes.lk/130106/news/motorcycle-gunmen-kill-kelaniya-ps-member-27719.html',
 'http://www.sundaytimes.lk/130106/news/more-chinese-funds-sought-for-mattala-airport-27716.html',
 'http://www.sundaytimes.lk/130106/news/war-women-against-rape-27711.html',
 'http://www.sundaytimes.lk/130106/news/ministries-in-fuel-row-rs-8-b-worth-stocks-may-go-to-waste-27707.html',
 'http://www.sundaytimes.lk/130106/news/left-parties-wavering-on-impeachment-vot

### Execution - Section 2 - Scrape articles

In [None]:
df = create_dataframe(articles)

Processed 1 out of 13662 URLs. Current URL: http://www.sundaytimes.lk/130106/news/committee-to-probe-lankan-smuggling-to-us-nato-bases-27659.html
Processed 2 out of 13662 URLs. Current URL: http://www.sundaytimes.lk/130106/news/ex-basl-secretary-upul-jayasuriya-to-contest-for-basl-presidency-27649.html
Processed 3 out of 13662 URLs. Current URL: http://www.sundaytimes.lk/130106/news/explain-or-withdraw-order-to-corrupted-antibiotic-supplier-27645.html
Processed 4 out of 13662 URLs. Current URL: http://www.sundaytimes.lk/130106/news/now-canada-lanka-powwow-on-illegal-entry-27643.html
Processed 5 out of 13662 URLs. Current URL: http://www.sundaytimes.lk/130106/news/monkey-tricks-results-in-anxious-hours-at-us-consulate-27638.html
Processed 6 out of 13662 URLs. Current URL: http://www.sundaytimes.lk/130106/news/motorcycle-gunmen-kill-kelaniya-ps-member-27719.html
Processed 7 out of 13662 URLs. Current URL: http://www.sundaytimes.lk/130106/news/more-chinese-funds-sought-for-mattala-airport

#### Task 2(a): Describe how you would scale up the data engineering organization; what type of tools would you use? How would you query the data? assume that there are no resource constraints (e.g., storage limits, cloud infrastructure access) 


#### High-level overview of changes I would make

1) Convert to script (.py) instead of using a notebook. Notebook was used as building scrapers are generally fragile and require a lot of testing which JN is ideal for. But would re-write in script format when scaling, after all code has been cleaned, documented correctly, and optimized.

2) Selenium Grid:
SG allows us to distribute the scrape--we can break the urls we scraped into chunks and distribute it across multiple machines, which can be beneficial for parallel processing and load balancing. Also would allow us to bypass the issue of newspaper sites or other commercials blocking IP due to repeated querying. Therefore; Create a Selenium Grid setup with a hub and multiple nodes. The hub manages the distribution of tasks to the available nodes.

3) Docker:
Containerize the web scraper, including the Selenium WebDriver and any necessary dependencies, into a Docker image. Have done this previously with a colleague, close to 3 years ago. See https://github.com/dataXdevelopment for more info.

4) AWS Fargate:
Allows us to run Docker containers without having to manage the underlying infrastructure

   >Fargate abstracts the infrastructure management, enabling easy scaling and resource optimization
   
   >Utilize Auto Scaling groups in combination with Fargate to automatically adjust the number of tasks based on demand. This ensures efficient resource utilization

   >Set up networking properly to allow communication between your Fargate tasks and the Selenium Grid nodes. Ensure that the necessary ports are open, and the security groups are configured appropriately
   
   >Logging and Monitoring:Implement proper logging within your web scraper application. Utilize AWS CloudWatch for centralized logging and monitoring. This helps in identifying and troubleshooting issues. For example, if the Sunday Times changes it's HTML page structure or dynamic Javascript in the future to hold its archives, we will be notified, and therefore can ammend the scraper accordingly
   
   >Task Scheduling: Depending on when the ST uploads the arhive, we can set Fargate to run automatically to scrape as necessary without requiring human prompt

## IMPORTANT NOTE: SCRAPER DID NOT HAVE ENOUGH TIME TO COMPLETE; 

I was being IP blocked by Sunday times so had to slow down the request rate. Therefore, in liue of the Sunday times data, I have used data I previous scraped in R from multiple newspapers and summarized for Verite Research (Not their IP, so I am free to use it). See this set of GitHub repos (https://github.com/dayanadithyan/VR-C5-C6-Analytics). This data will be used for the topic modelling below. 

### Execution - Section 3 - Topic Modelling 

##### Rationale for LDA over other algorithms

Given news articles are short, and the topics may vary across a plethora of topics, it allows for both the discovery of latent topics as well as allows for domain specific interpretation by media researchers (it is flexibile). This particular example has only utilized 6000 articles, however, the full Sunday Times archive for a decade would be closer to 20,000--in the case that we decide to scale the scraper, LDA would be ideal as it performs well with sparse data representation and batch processing.

##### Other options considered and would have tested if I had the time:
1) Vader (Architecture: Lexicon)
2) DitilBert or BERTopic (Architecture: Transformer)
3) Stanza CNN (Architecture: Convolutional NN)

This will be further expounded on during the presentation on Tuesday.

In [7]:
df1 = pd.read_csv("C5-CLEANDATA.csv")
df2 = pd.read_csv("C6-CLEANDATA.csv")

In [10]:
df3 = pd.concat([df1,df2])

In [11]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11543 entries, 0 to 8526
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           11543 non-null  object 
 1   Newspaper      11543 non-null  object 
 2   TypeOfArticle  11543 non-null  object 
 3   View.Grade     11543 non-null  object 
 4   News.Grade     11543 non-null  object 
 5   ArticleName    11542 non-null  object 
 6   Coresspondent  6289 non-null   object 
 7   Summary        11036 non-null  object 
 8   Tag1           0 non-null      float64
 9   Tag2           0 non-null      float64
 10  Language       11543 non-null  object 
 11  VG.Score       11543 non-null  int64  
 12  NG.Score       11543 non-null  int64  
 13  Total.Score    11543 non-null  int64  
dtypes: float64(2), int64(3), object(9)
memory usage: 1.3+ MB


In [12]:
df3.head()

Unnamed: 0,Date,Newspaper,TypeOfArticle,View.Grade,News.Grade,ArticleName,Coresspondent,Summary,Tag1,Tag2,Language,VG.Score,NG.Score,Total.Score
0,2020-07-31T00:00:00Z,Mawbima,Long Opinion,Neutral,Neutral,6 General Elections,Susil Suraweera,Author describes the various notable events th...,,,Sinhala,0,0,0
1,2020-07-31T00:00:00Z,Aruna,Long Opinion,Neutral,Neutral,The JVP that Worships,Nuwan Ballanthudawe,Author mentions the Janatha Vimukthi Peramuna ...,,,Sinhala,0,0,0
2,2020-07-31T00:00:00Z,Daily Mirror,Feature,Positive,Neutral,SLASSCOM boosts start-ups with OIW Accelerate ...,,The Royal Norwegian Embassy in Sri Lanka has h...,,,English,1,0,1
3,2020-07-30T00:00:00Z,Daily FT,Feature,Positive,Neutral,SLASSCOM boosts start-ups with OIW Accelerate ...,,The Royal Norwegian Embassy in Sri Lanka has h...,,,English,1,0,1
4,2020-07-30T00:00:00Z,Daily FT,Long Opinion,Neutral,Neutral,Tactical voting can thwart the two-thirds threat,Dr. Dayan Jayatilleka,Author describes the harms of a singular party...,,,English,0,0,0


##### Prepare for LDA

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/dayan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
# Clean  data

df3.dropna(subset=['Summary'], inplace=True)

data = df3['Summary'].tolist()

In [35]:
# Begin preprocessing for analysis

tokenized_data = preprocess_data(data)

In [38]:
# sanity check on tokenization 
tokenized_data 

[['author',
  'describes',
  'various',
  'notable',
  'events',
  'occurred',
  'within',
  'various',
  'governments',
  'sri',
  'lanka',
  'across',
  'years',
  'oslohosted',
  'ceasefire',
  'agreement',
  'one',
  'instance'],
 ['author',
  'mentions',
  'janatha',
  'vimukthi',
  'peramuna',
  'partys',
  'stance',
  'religious',
  'matters',
  'mentions',
  'certain',
  'rights',
  'christians',
  'norway',
  'given',
  'constitution'],
 ['royal',
  'norwegian',
  'embassy',
  'sri',
  'lanka',
  'hosted',
  'competition',
  'nurture',
  'startups',
  'within',
  'sri',
  'lanka'],
 ['royal',
  'norwegian',
  'embassy',
  'sri',
  'lanka',
  'hosted',
  'competition',
  'nurture',
  'startups',
  'within',
  'sri',
  'lanka'],
 ['author',
  'describes',
  'harms',
  'singular',
  'party',
  'owning',
  'rds',
  'parliament',
  'oslohosted',
  'cease',
  'fire',
  'agreement',
  'came',
  'fire',
  'nationalist',
  'rightwing',
  'parties'],
 ['royal',
  'norwegian',
  'embassy

In [36]:
# Vectorize the data
dtm, vectorizer = vectorize_data(tokenized_data)

In [37]:
# check document-term matrix
print("Document-Term Matrix:")
print(dtm.toarray())

Document-Term Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [40]:
#check feature names

feature_names = vectorizer.get_feature_names_out()
print("Feature Names (Terms):")
print(feature_names)

Feature Names (Terms):
['aaa' 'aac' 'aadaraneeya' ... 'zweingenthal' 'zwinglians' 'zyklonb']


##### Prompt: Enter number of topics; use this to experiment with the LDA

In [52]:
num_topics = no_of_topics()

Enter number of topics to test LDA with 20


#### Run the LDA 

In [53]:
lda_model, lda_result = perform_lda(dtm, num_topics=int(num_topics))

In [54]:
# Display the topics and top words
print(f"Displaying {num_topics} topics:")
display_topics(lda_model, feature_names)

Displaying 20 topics:
Topic #1:
huawei chinese canada us canadian sri meng arrest china arrested

Topic #2:
us trade canada trump president states united said agreement mexico

Topic #3:
canada coronavirus cases pandemic covid disease countries reported spread virus

Topic #4:
canada countries article norway minister justin also cabinet trudeau uk

Topic #5:
sri lanka rights human canada united resolution countries council nations

Topic #6:
canada canadian said article cannabis plastic two one arctic use

Topic #7:
canada said countries sri lanka states country united iran including

Topic #8:
sri lanka canada university development norway international norwegian article also

Topic #9:
bank article central oil canada sri government countries foreign best

Topic #10:
canadian police canada said prize toronto people years woman nobel

Topic #11:
tamil sri government canada ltte lanka people war countries political

Topic #12:
sri lanka lankan ltte article canada agreement author attack

### END