# Scraping op-eds/editorials from the University of Chicago Maroon's [Viewpoints](https://www.chicagomaroon.com/viewpoints/) page

In [None]:
import collections
import csv
import json
import queue
import re
import sys
import urllib
import urllib.parse
import urllib.request

import bs4
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### 1. Crawl the Viewpoints page and identify metadata of each article (e.g. name of the author, link to the article, title of the article and the type of the article (op-ed, letter, editorial etc.))

In [203]:
def page_crawl(starting_url, parser="html5lib"):
    '''
    Downloads a webpage and parses it into a soup object.

    Inputs:
        starting_url: string, the URL of the page to download
        parser: string, the name of the BeautifulSoup parser to use.
            Naming a parser explicitly avoids BeautifulSoup's
            "no parser was explicitly specified" warning. The default is
            the parser that warning reported as selected when this
            notebook was run, so the parse tree stays the same.
    Returns:
        a soup object
    Raises:
        urllib.error.URLError (or a subclass) if the page cannot be fetched
        bs4.FeatureNotFound if the requested parser is not installed
    '''
    # urlopen either returns a response or raises, so there is no None case
    # to guard against. The with-block closes the connection as soon as the
    # page has been parsed.
    with urllib.request.urlopen(starting_url) as page:
        return bs4.BeautifulSoup(page, parser)

In [204]:
# Download and parse the Viewpoints landing page
full_doc = page_crawl("https://www.chicagomaroon.com/viewpoints/")



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html5lib")

  markup_type=markup_type))


In [29]:
# Collect every tag on the landing page whose CSS class matches one of these strings.
# get_article_attributes tells them apart by tag name: h2/h4 headings, p contributor lines, small type labels.
full_doc_text = full_doc.find_all(class_=["media-heading", "text-muted  media-heading", "inline section-footer", " media-heading"])

In [418]:
def get_article_attributes(all_doc_text):
    '''
    Pulls the per-article metadata out of the tags collected from the Viewpoints home webpage.

    Each tag is handled according to its name:
        - h2 / h4: if it contains a "plain-link" element, that element gives
          the article's href and its title (stripped of surrounding whitespace)
        - p: the text of its "plain-link" elements gives the contributor name(s)
        - small: its text is the article type (op-ed, letter to the editor,
          editorial board etc.)
    Tags with any other name are ignored.

    Inputs:
        - all_doc_text: the tags collected from the Viewpoints home webpage

    Returns:
        - four lists, in this order: article links, article titles,
          contributors and article types. A contributors entry is a plain
          string when there is exactly one contributor and a tuple of names
          otherwise.
    '''
    links = []
    titles = []
    contributors_per_article = []
    types = []

    for tag in all_doc_text:
        tag_name = tag.name

        if tag_name in ("h2", "h4"):
            # headings without a link contribute nothing
            anchor = tag.find(class_="plain-link")
            if anchor is not None:
                links.append(anchor['href'])
                titles.append(anchor.text.strip())

        elif tag_name == "p":
            names = [anchor.text for anchor in tag.find_all(class_="plain-link")]
            # a lone contributor is stored as a string, any other count as a tuple
            if len(names) == 1:
                contributors_per_article.append(names[0])
            else:
                contributors_per_article.append(tuple(names))

        elif tag_name == "small":
            # assumption: all articles are treated as viewpoints expressed by
            # students of the University of Chicago in this project
            types.append(tag.text)

    return links, titles, contributors_per_article, types

Check whether the article data (author name, article type, etc.) is complete: the four lists should all have the same length.

In [419]:
# Split the collected tags into four lists: links, titles, contributors and article types
article_link_list, article_title_list, contributor_names_list, article_type_list = get_article_attributes(full_doc_text)

In [420]:
len(article_link_list)

52

In [421]:
len(article_title_list)

52

In [422]:
len(contributor_names_list)

52

In [424]:
len(article_type_list)

36

The article type list is incomplete because the first 16 articles did not have a type tag associated with them. On a quick inspection, all of them appear to be op-eds, so I'll create an 'OP-EDS' entry for each of them and prepend these entries to the article type list.

In [425]:
# One 'OP-EDS' placeholder per article without a type tag (number of contributor entries minus number of scraped types).
# NOTE: despite the variable name, the recorded run gives 52 - 36 = 16 placeholders, not 26.
first_26_article_types = ['OP-EDS']*(len(contributor_names_list)-len(article_type_list))

In [426]:
# Put the placeholders first so the type list has one entry per article
# (assumes the untagged articles are the first ones on the page - TODO confirm)
full_article_type_list = first_26_article_types + article_type_list

In [427]:
len(full_article_type_list)

52

### 2. Scrape the texts of the articles and organize the metadata into a dataframe

In [428]:
# first convert the relative url into the absolute url

In [429]:
# Resolve every scraped href against the site root. Unlike plain string
# concatenation, urljoin leaves an already-absolute link untouched and adds
# the "/" separator when an href does not start with one.
new_article_link_list = [
    urllib.parse.urljoin('https://www.chicagomaroon.com', article_link)
    for article_link in article_link_list
]

In [433]:
# show only top 5
new_article_link_list[:5]

['https://www.chicagomaroon.com/article/2018/5/11/hasty-calculations-reclaiming-math/',
 'https://www.chicagomaroon.com/article/2018/5/15/violence-words/',
 'https://www.chicagomaroon.com/article/2018/5/8/dear-bartlett-tale-dining-fall/',
 'https://www.chicagomaroon.com/article/2018/5/4/defense-criticizing-uchicago/',
 'https://www.chicagomaroon.com/article/2018/5/1/uchicago-place-people-color/']

In [431]:
# now crawl through every webpage and find the relevant text

def get_text_and_byline(full_articles_links):
    '''
    Get the text of the articles and their bylines.

    Each page is fetched with page_crawl, and the <p>, <em> and <i> tags inside
    its "article-content" div are collected in document order. If the last
    collected tag is an <em> or <i>, its text is taken as the byline and left
    out of the article text; otherwise the byline is recorded as
    "unidentified byline".

    A page without an "article-content" div, or without any matching tag in
    it, yields an empty text and "unidentified byline" instead of raising, so
    the three returned lists always hold one entry per input link.

    Input:
        - the links of the articles

    Returns:
        - a list of lists where each list inside contains text corresponding to a particular article
        - a list of strings where each string is text corresponding to a particular article
        - a list of bylines
    '''

    texts_list_final_without_join = []
    text_list_of_lists = []
    byline_list = []

    for article_link in full_articles_links:

        article_soup = page_crawl(article_link)
        content = article_soup.find("div", class_="article-content")

        # find() returns None when the div is missing; treat that page as empty
        # rather than letting one odd page abort the whole crawl
        # NOTE(review): find_all searches all descendants, so an <em>/<i> nested
        # inside a <p> appears to be collected in addition to that <p>. That
        # would repeat the italic text and let an italicised word in the last
        # paragraph be taken as the byline - confirm against the page markup.
        tags = content.find_all(["p", "em", "i"]) if content is not None else []
        paragraphs = [tag.text.strip() for tag in tags]

        # the byline is only recognised when it is the very last collected tag
        if tags and tags[-1].name in ("em", "i"):
            byline_list.append(paragraphs[-1])
            paragraphs = paragraphs[:-1]
        else:
            byline_list.append("unidentified byline")

        texts_list_final_without_join.append(paragraphs)
        text_list_of_lists.append(" ".join(paragraphs))

    return texts_list_final_without_join, text_list_of_lists, byline_list

In [432]:
# Fetches every article page in turn (one request per link), so this cell may take a while to run
texts_list_final_without_join, text_list_of_lists, byline_list = get_text_and_byline(new_article_link_list)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html5lib")

  markup_type=markup_type))


In [454]:
byline_list

['Meera Santhanam is a first-year in the College.',
 'Lucas Du is a first-year in the College.',
 'Katia Kukucka is a first-year in the College.',
 'unidentified byline',
 'Soulet Ali is a second-year in the College.',
 'Zahra Nassar is a first-year in the College.',
 'Natalie Denby is a third-year in the College.',
 'Alexa Perlmutter is a first-year in the College.',
 'Natalie Denby is a third year in the College majoring in public policy studies.',
 'Lucas Du is a first year in the College.',
 'Meera Santhanam is a first-year in the College.',
 'Soulet Ali is a second-year in the College.',
 'unidentified byline',
 'unidentified byline',
 'Atlantic',
 'Natalie Denby is a third-year in the College majoring in public policy studies.',
 'David Mihalyfy (Ph.D. ’17) studied in the Divinity School’s History of Christianity program.',
 'Isaac Tannenbaum is a student in the College.',
 'Kavia Khosla,\xa0Gena Lenti, and\xa0Katie Ellis are first-years at the Pritzker School of Medicine.',
 'Al

In [442]:
def dict_to_df(some_list):
    '''
    Collects meta data and the text of the articles into a dataframe, one row per article.

    Inputs:
        - some_list: list of article types, one per article. Its length sets
          the number of rows and its values fill the article_type column.

    The other columns are read, position by position, from the notebook-level
    lists article_link_list, article_title_list, contributor_names_list,
    texts_list_final_without_join, text_list_of_lists and byline_list. An
    IndexError is raised if any of them is shorter than some_list.

    Returns:
        - a dataframe with the columns article_link, title, author,
          article_type, article_text_in_list, article_text_str and byline
          (an empty dataframe with these columns if some_list is empty)
    '''
    column_names = ['article_link', 'title', 'author', 'article_type', 'article_text_in_list', 'article_text_str', 'byline']

    rows = [
        [
            article_link_list[index],
            article_title_list[index],
            contributor_names_list[index],
            # use the value that was passed in, not the global article_type_list
            article_type,
            texts_list_final_without_join[index],
            text_list_of_lists[index],
            byline_list[index],
        ]
        for index, article_type in enumerate(some_list)
    ]

    # dtype=object keeps every cell as the Python object that was scraped
    # (strings, tuples of names, lists of paragraphs)
    return pd.DataFrame(rows, columns=column_names, dtype=object)

In [446]:
# NOTE(review): article_type_list holds only 36 entries while the other lists hold 52
# (see the len() checks earlier in the notebook), so on a fresh run only 36 rows are built
# and their types start from the first *tagged* article. full_article_type_list is the
# padded 52-entry list - confirm which one is meant to be passed here.
maroon_df = dict_to_df(article_type_list)
maroon_df.head()

Unnamed: 0,article_link,title,author,article_type,article_text_in_list,article_text_str,byline
0,/article/2018/5/11/hasty-calculations-reclaimi...,Hasty Calculations: Reclaiming Math,Meera Santhanam,OP-EDS,[“I am not a math person.” I think many of us ...,“I am not a math person.” I think many of us c...,Meera Santhanam is a first-year in the College.
1,/article/2018/5/15/violence-words/,The Violence of Words,Lucas Du,OP-EDS,"[Content warning: sexual violence, Content war...",Content warning: sexual violence Content warni...,Lucas Du is a first-year in the College.
2,/article/2018/5/8/dear-bartlett-tale-dining-fall/,Dear Bartlett: A Tale Of a Dining Fall,Katia Kukucka,OP-EDS,"[We’ve all heard the hot (or, should I say, th...","We’ve all heard the hot (or, should I say, the...",Katia Kukucka is a first-year in the College.
3,/article/2018/5/4/defense-criticizing-uchicago/,In Defense of Criticizing UChicago,Alexa Perlmutter,OP-EDS,[A few weeks ago when prospective students flo...,A few weeks ago when prospective students floo...,unidentified byline
4,/article/2018/5/1/uchicago-place-people-color/,UChicago: No Place for People of Color,Soulet Ali,OP-EDS,[Prospective PoC (people of color) undergradua...,Prospective PoC (people of color) undergraduat...,Soulet Ali is a second-year in the College.


In [453]:
# Saves the dataframe as CSV. The file is named "maroon_df" (no ".csv" extension)
# and the path is relative to the current working directory.
maroon_df.to_csv("maroon_df")