# Project 2: Web Scraping and API access

In [58]:
!pip install beautifulsoup4



In [59]:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


## Part 1: Explore the html for Wikipedia articles. 

### A. Using inspect element, copy the HTML code for a table.

```html
<th class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Year</th>
```

In [60]:
from bs4 import BeautifulSoup

# Parse one table-header cell copied from a Wikipedia table and report its
# attributes together with the text it displays.
html_snippet = """
<th class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Year</th>
"""

soup = BeautifulSoup(html_snippet, 'html.parser')

# Attribute-style access returns the first <th> tag in the parsed snippet.
th_element = soup.th
th_attrs = th_element.attrs

class_name = th_attrs.get('class')
tab_index = th_attrs.get('tabindex')
role = th_attrs.get('role')
title = th_attrs.get('title')
header_text = th_element.text  # the text enclosed by the tag

# Emit one "label value" line per extracted field, in a fixed order.
for label, value in (
    ("Class:", class_name),
    ("Tabindex:", tab_index),
    ("Role:", role),
    ("Title:", title),
    ("Header Text:", header_text),
):
    print(label, value)


Class: ['headerSort']
Tabindex: 0
Role: columnheader button
Title: Sort ascending
Header Text: Year


### B. Using inspect element, find the html syntax for a link. 

In [61]:
from bs4 import BeautifulSoup

# Parse a single hyperlink and report where it points, its tooltip title and
# the text shown to the reader.
html_snippet = """
<a href="https://en.wikipedia.org/wiki/American_Library_Association" title="American Library Association">American Library Association</a>
"""

soup = BeautifulSoup(html_snippet, 'html.parser')

# Attribute-style access returns the first <a> tag in the parsed snippet.
a_element = soup.a
a_attrs = a_element.attrs

href = a_attrs.get('href')
title = a_attrs.get('title')
link_text = a_element.text  # the text enclosed by the tag

# Emit one "label value" line per extracted field, in a fixed order.
for label, value in (
    ("Link URL:", href),
    ("Title:", title),
    ("Link Text:", link_text),
):
    print(label, value)


Link URL: https://en.wikipedia.org/wiki/American_Library_Association
Title: American Library Association
Link Text: American Library Association


### C. Using inspect element, find the html syntax for linking an image

In [62]:
from bs4 import BeautifulSoup

# Parse an image wrapped in a hyperlink: the <a> tag carries the link target,
# the nested <img> tag carries the image source, alt text and dimensions.
html_snippet = """
<a href="https://en.wikipedia.org/wiki/Young_Adult_Library_Services_Association">
  <img src="https://upload.wikimedia.org/wikipedia/commons/8/89/Example.jpg" alt="Example Image" width="200" height="150">
</a>
"""

soup = BeautifulSoup(html_snippet, 'html.parser')

# First <a> in the snippet, then the first <img> nested inside that link.
a_element = soup.a
img_element = a_element.img

link_url = a_element.attrs.get('href')

img_attrs = img_element.attrs
img_src = img_attrs.get('src')
img_alt = img_attrs.get('alt')
img_width = img_attrs.get('width')
img_height = img_attrs.get('height')

# Emit one "label value" line per extracted field, in a fixed order.
for label, value in (
    ("Link URL:", link_url),
    ("Image URL:", img_src),
    ("Alt Text:", img_alt),
    ("Image Width:", img_width),
    ("Image Height:", img_height),
):
    print(label, value)


Link URL: https://en.wikipedia.org/wiki/Young_Adult_Library_Services_Association
Image URL: https://upload.wikimedia.org/wikipedia/commons/8/89/Example.jpg
Alt Text: Example Image
Image Width: 200
Image Height: 150


## Part 2: Explore one Wikipedia page with the beautifulsoup package

In [63]:
import bs4
import requests
import pandas as pd

In [64]:
#save and print the text content of a page with all tags removed

url = "https://en.wikipedia.org/wiki/Young_Adult_Library_Services_Association"

response = requests.get(url)

if response.status_code != 200:
    # Any status other than 200 is reported; nothing is parsed or written.
    print(f"Failed to retrieve page. Status code: {response.status_code}")
else:
    page_content = response.content
    print("Page fetched successfully!")

    # get_text() discards the markup; every text fragment is stripped and the
    # fragments are joined with newlines.
    soup = bs4.BeautifulSoup(page_content, 'html.parser')
    text_content = soup.get_text(separator="\n", strip=True)

    # Store the complete text on disk, then preview only its first 500 characters.
    with open("wiki_page_text.txt", "w", encoding="utf-8") as text_file:
        text_file.write(text_content)

    print(text_content[:500])


Page fetched successfully!
Young Adult Library Services Association - Wikipedia
Jump to content
Main menu
Main menu
move to sidebar
hide
Navigation
Main page
Contents
Current events
Random article
About Wikipedia
Contact us
Contribute
Help
Learn to edit
Community portal
Recent changes
Upload file
Search
Search
Donate
Appearance
Create account
Log in
Personal tools
Create account
Log in
Pages for logged out editors
learn more
Contributions
Talk
Contents
move to sidebar
hide
(Top)
1
History
2
Awards
3
See also
4
References



In [65]:
# download an image with beautifulsoup and save it in this repository


# Download the first image referenced on the page and save it next to the notebook.
url = "https://en.wikipedia.org/wiki/Young_Adult_Library_Services_Association"

# NOTE(review): the file is always named .jpg, whatever format the first
# <img> on the page actually is — confirm that this is acceptable.
img_filename = "downloaded_image.jpg"

# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    page_content = response.content
    print("Page fetched successfully!")

    soup = bs4.BeautifulSoup(page_content, 'html.parser')

    # src=True skips <img> tags without a src attribute, which would
    # otherwise raise KeyError on the lookup below.
    img_tag = soup.find('img', src=True)

    if img_tag:
        img_src = img_tag['src']

        # urljoin resolves protocol-relative ("//host/..."), root-relative
        # ("/path"), document-relative and absolute sources against the page URL.
        img_url = urljoin(url, img_src)

        img_response = requests.get(img_url, timeout=30)

        if img_response.status_code == 200:
            with open(img_filename, 'wb') as file:
                file.write(img_response.content)
            print(f"Image downloaded and saved as {img_filename}")
        else:
            print(f"Failed to retrieve image. Status code: {img_response.status_code}")
    else:
        print("No image found on the page.")
else:
    print(f"Failed to retrieve page. Status code: {response.status_code}")


Page fetched successfully!
Image downloaded and saved as downloaded_image.jpg


In [66]:
#find all the links in a page with beautifulsoup
#print the first 100 characters of ten of these links
def find_links(url, limit=10, max_chars=100):
    """Print the targets of the first links found on a web page.

    Args:
        url: Address of the page to fetch.
        limit: Maximum number of links to print (default 10).
        max_chars: Each link target is truncated to this many characters
            (default 100).

    Returns:
        None. Link targets, or a failure message when the response status is
        not 200, are printed.
    """
    # A timeout keeps the call from hanging forever on an unresponsive server.
    page = requests.get(url, timeout=30)

    if page.status_code != 200:
        print(f"Failed to retrieve page. Status code: {page.status_code}")
        return

    soup = bs4.BeautifulSoup(page.content, 'html.parser')

    # href=True keeps only anchors that carry a link target, so anchors
    # without an href no longer use up one of the `limit` slots.
    links = soup.find_all('a', href=True, limit=limit)

    for link in links:
        print(link['href'][:max_chars])

url = "https://en.wikipedia.org/wiki/Young_Adult_Library_Services_Association"
find_links(url)


#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal


## Part 3: Downloading scripts

In [67]:
# Read the script metadata table from the CSV file in the working directory.
scripts = pd.read_csv('pudding_data.csv')

In [68]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
scripts

Unnamed: 0,imdb_id,script_id,title,year,gross (inflation-adjusted),link
0,tt0019777,4031,The Cocoanuts,1929,,http://www.pages.drexel.edu/~ina22/splaylib/Sc...
1,tt0021884,8521,Frankenstein,1931,298.0,Frankenstein (Florey & Fort) [1931-5-23] [Scan...
2,tt0022054,1086,The Last Flight,1931,,"film_20100519/all_imsdb_05_19_10/Last-Flight,-..."
3,tt0022626,1631,American Madness,1932,,http://www.imsdb.com/Movie Scripts/American Ma...
4,tt0022958,2438,Grand Hotel,1932,,http://www.imsdb.com/Movie Scripts/Grand Hotel...
...,...,...,...,...,...,...
1995,tt3733778,8533,Pay the Ghost,2015,,"Pay The Ghost (Dan Kay, 9-1-09).pdf"
1996,tt3808342,5499,Son of Saul,2015,0.0,http://gointothestory.blcklst.com/wp-content/u...
1997,tt3850214,8056,Dope,2015,18.0,Dope (2013.10.31) [Digital].pdf
1998,tt3859076,5507,Truth,2015,2.0,http://gointothestory.blcklst.com/wp-content/u...


In [69]:
#using the links in the "link" column, download the first 1000 characters of each script
#use requests and bs4, remember to remove all html tags

# Hand-written sample of ten films. Each column is spelled out as its own list
# and the lists are then assembled, in column order, into the `data` mapping
# from which the DataFrame is built.
titles = [
    "The Cocoanuts", "Frankenstein", "The Last Flight", "American Madness", "Grand Hotel",
    "Pay the Ghost", "Son of Saul", "Dope", "Truth", "Grandma",
]

years = [1929, 1931, 1931, 1932, 1932, 2015, 2015, 2015, 2015, 2015]

# None marks films for which no gross figure is given.
gross_adjusted = [None, 298.0, None, None, None, None, 0.0, 18.0, 2.0, 7.0]

links = [
    "http://www.example.com/script_for_cocoanuts",
    "https://www.imsdb.com/scripts/Frankenstein.html",
    "https://www.imsdb.com/scripts/The-Last-Flight.html",
    "https://www.imsdb.com/scripts/American-Madness.html",
    "https://www.imsdb.com/scripts/Grand-Hotel.html",
    "http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Pay-the-Ghost.pdf",
    "http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Son-of-Saul.pdf",
    "http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Dope.pdf",
    "http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Truth.pdf",
    "http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Grandma.pdf",
]

data = {
    "title": titles,
    "year": years,
    "gross (inflation-adjusted)": gross_adjusted,
    "link": links,
}

df = pd.DataFrame(data=data)

def download_scripts(data, n_chars=1000, timeout=30):
    """Save the first characters of each linked script to a text file.

    For every entry of the 'link' column the page is fetched, its markup is
    removed with BeautifulSoup, and the first `n_chars` characters of the
    remaining text are written to 'script_<position>.txt'.

    Args:
        data: DataFrame with a 'link' column of URLs.
        n_chars: Number of characters of text to keep per script (default 1000).
        timeout: Seconds to wait for each request before giving up (default 30).

    Returns:
        None. One success or error line is printed per link.

    NOTE(review): several links in the sample end in '.pdf'; those are parsed
    with the HTML parser like every other link — confirm how PDFs should be
    handled.
    """
    # enumerate() walks the column by position, so the numbering also works
    # for DataFrames whose index is not the default 0..n-1.
    for i, url in enumerate(data['link']):
        try:
            page = requests.get(url, timeout=timeout)
            page.raise_for_status()

            soup = bs4.BeautifulSoup(page.content, 'html.parser')
            text = soup.get_text()

            with open(f'script_{i}.txt', 'w', encoding='utf-8') as file:
                file.write(text[:n_chars])

            print(f"Downloaded script_{i}.txt successfully.")

        except Exception as e:
            # Best-effort download: report the failure and carry on with the
            # next link instead of aborting the whole run.
            print(f"An error occurred while processing {url}: {e}")

download_scripts(df)


An error occurred while processing http://www.example.com/script_for_cocoanuts: 500 Server Error: Internal Server Error for url: http://www.example.com/script_for_cocoanuts
Downloaded script_1.txt successfully.
Downloaded script_2.txt successfully.
Downloaded script_3.txt successfully.
Downloaded script_4.txt successfully.
An error occurred while processing http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Pay-the-Ghost.pdf: 404 Client Error: Not Found for url: https://gointothestory.blcklst.com/wp-content/uploads/2015/09/Pay-the-Ghost.pdf?gi=095c6dbddf46
An error occurred while processing http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Son-of-Saul.pdf: 404 Client Error: Not Found for url: https://gointothestory.blcklst.com/wp-content/uploads/2015/09/Son-of-Saul.pdf?gi=0b572ae37527
An error occurred while processing http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Dope.pdf: 404 Client Error: Not Found for url: https://gointothestory.blcklst.com/wp-co

In [70]:
#add a new column to the df with the text downloaded
#save this new dataframe as "pudding_texts.csv"
df = pd.DataFrame(data)

df['script_text'] = ""

# This definition replaces the file-writing download_scripts defined earlier
# in the notebook: it stores the text in the DataFrame instead.
def download_scripts(data, n_chars=1000, timeout=30):
    """Fill the 'script_text' column with the first characters of each script.

    For every entry of the 'link' column the page is fetched, its markup is
    removed with BeautifulSoup, and the first `n_chars` characters of the
    remaining text are stored in the same row of `data`. Rows whose download
    fails keep their existing 'script_text' value.

    Args:
        data: DataFrame with 'link' and 'title' columns; modified in place.
        n_chars: Number of characters of text to keep per script (default 1000).
        timeout: Seconds to wait for each request before giving up (default 30).

    Returns:
        None. One success or error line is printed per link.
    """
    if 'script_text' not in data.columns:
        data['script_text'] = ""

    # items() yields (index label, url) pairs, so the .at[] writes below hit
    # the right row whatever index the DataFrame uses.
    for idx, url in data['link'].items():
        try:
            page = requests.get(url, timeout=timeout)
            page.raise_for_status()
            soup = bs4.BeautifulSoup(page.content, 'html.parser')
            text = soup.get_text()
            # Write to the DataFrame that was passed in, not to a global.
            data.at[idx, 'script_text'] = text[:n_chars]
            print(f"Downloaded script for {data.at[idx, 'title']} successfully.")
        except Exception as e:
            # Best-effort download: report the failure and carry on with the
            # next link instead of aborting the whole run.
            print(f"An error occurred while processing {url}: {e}")

download_scripts(df)

df.to_csv("pudding_texts.csv", index=False)
print("DataFrame saved as 'pudding_texts.csv'.")


An error occurred while processing http://www.example.com/script_for_cocoanuts: 500 Server Error: Internal Server Error for url: http://www.example.com/script_for_cocoanuts
Downloaded script for Frankenstein successfully.
Downloaded script for The Last Flight successfully.
Downloaded script for American Madness successfully.
Downloaded script for Grand Hotel successfully.
An error occurred while processing http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Pay-the-Ghost.pdf: 404 Client Error: Not Found for url: https://gointothestory.blcklst.com/wp-content/uploads/2015/09/Pay-the-Ghost.pdf?gi=b35c2a9ac9e2
An error occurred while processing http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Son-of-Saul.pdf: 404 Client Error: Not Found for url: https://gointothestory.blcklst.com/wp-content/uploads/2015/09/Son-of-Saul.pdf?gi=48b3d08b5c35
An error occurred while processing http://gointothestory.blcklst.com/wp-content/uploads/2015/09/Dope.pdf: 404 Client Error: Not Found

## Part 4: TMDB database

#### Browse the documentation at https://developer.themoviedb.org/reference/intro/getting-started. Create an account so that you can authenticate your requests.

In [71]:
import requests

url = 'https://api.themoviedb.org/3/movie/now_playing'

# The TMDB bearer token is a credential and must not be stored in the
# notebook: read it from the TMDB_API_TOKEN environment variable instead.
tmdb_token = os.environ.get("TMDB_API_TOKEN")
if not tmdb_token:
    raise RuntimeError(
        "Set the TMDB_API_TOKEN environment variable to your TMDB API read access token."
    )

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_token}",
}

# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)

# Print the raw response body returned by the API.
print(response.text)

{"dates":{"maximum":"2024-10-09","minimum":"2024-08-28"},"page":1,"results":[{"adult":false,"backdrop_path":"/9R9Za5kybgl5AhuCNoK3gngaBdG.jpg","genre_ids":[27,53],"id":1114513,"original_language":"en","original_title":"Speak No Evil","overview":"When an American family is invited to spend the weekend at the idyllic country estate of a charming British family they befriended on vacation, what begins as a dream holiday soon warps into a snarled psychological nightmare.","popularity":2281.291,"poster_path":"/fDtkrO2OAF8LKQTdzYmu1Y7lCLB.jpg","release_date":"2024-09-11","title":"Speak No Evil","video":false,"vote_average":7.334,"vote_count":511},{"adult":false,"backdrop_path":"/uXDwP5qPhuRyPpQ7WkLbE6t2z5W.jpg","genre_ids":[35,53,28],"id":877817,"original_language":"en","original_title":"Wolfs","overview":"Hired to cover up a high-profile crime, a fixer soon finds his night spiralling out of control when he's forced to work with an unexpected counterpart.","popularity":1693.793,"poster_path"

In [72]:
#create a dataset of the movies in theaters now. Include metadata fields you are interested in. 
#save this dataset as "movies.csv"

url = 'https://api.themoviedb.org/3/movie/now_playing'

# Metadata fields copied from every movie record, in CSV column order.
MOVIE_FIELDS = [
    'title',
    'release_date',
    'vote_average',
    'vote_count',
    'popularity',
    'original_language',
    'overview',
    'poster_path',
]

# The TMDB bearer token is a credential and must not be stored in the
# notebook: read it from the TMDB_API_TOKEN environment variable instead.
tmdb_token = os.environ.get("TMDB_API_TOKEN")
if not tmdb_token:
    raise RuntimeError(
        "Set the TMDB_API_TOKEN environment variable to your TMDB API read access token."
    )

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_token}",
}

# No page parameter is sent, so a single request is made.
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    # NOTE: this rebinds `data`, which earlier cells use for the scripts dict.
    data = response.json()

    # .get() leaves a missing field empty instead of raising KeyError and
    # losing the whole dataset because of one incomplete record.
    movie_list = [
        {field: movie.get(field) for field in MOVIE_FIELDS}
        for movie in data.get('results', [])
    ]

    # Passing the columns explicitly keeps the CSV header even with no results.
    df = pd.DataFrame(movie_list, columns=MOVIE_FIELDS)

    df.to_csv('movies.csv', index=False)

    print("Movies data saved to 'movies.csv' successfully.")
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")



Movies data saved to 'movies.csv' successfully.


In [73]:
#download the movie posters for 10 of these movies and save them to this repository

# Download the posters of the first movies listed in movies.csv into ./posters.
df = pd.read_csv('movies.csv')

base_image_url = 'https://image.tmdb.org/t/p/w500'
poster_dir = 'posters'
max_posters = 10

# Characters that are path separators or not allowed in file names on common
# file systems; they are replaced with "_" when building the file name.
unsafe_filename_chars = '\\/:*?"<>|'

# exist_ok=True makes the cell safe to re-run when the folder already exists.
os.makedirs(poster_dir, exist_ok=True)

# head() copes with files that hold fewer than `max_posters` rows.
for _, movie in df.head(max_posters).iterrows():
    poster_path = movie['poster_path']
    movie_title = movie['title']

    # An empty poster_path cell is read back from the CSV as NaN and cannot be
    # appended to the base URL.
    if pd.isna(poster_path):
        print(f"No poster available for {movie_title}; skipping.")
        continue

    full_poster_url = base_image_url + poster_path

    # A timeout keeps the loop from hanging forever on an unresponsive server.
    response = requests.get(full_poster_url, timeout=30)

    if response.status_code == 200:
        safe_title = "".join(
            "_" if ch in unsafe_filename_chars else ch for ch in str(movie_title)
        )
        with open(os.path.join(poster_dir, f'{safe_title}_poster.jpg'), 'wb') as f:
            f.write(response.content)
        print(f"Downloaded poster for {movie_title}")
    else:
        print(f"Failed to download poster for {movie_title}. Status code: {response.status_code}")


Downloaded poster for Speak No Evil
Downloaded poster for Wolfs
Downloaded poster for The Crow
Downloaded poster for The Substance
Downloaded poster for Joker: Folie à Deux
Downloaded poster for Kill 'em All 2
Downloaded poster for Project Silence
Downloaded poster for The Wild Robot
Downloaded poster for Beetlejuice Beetlejuice
Downloaded poster for It Ends with Us
