# Scrape Transcripts of Every X-Files Episode and Send to MongoDB

## Import packages 

In [1]:
import requests
import time
import random
from bs4 import BeautifulSoup
import re
from pymongo import MongoClient

## Build functions for web scraper to MongoDB pipeline

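The functions below form the pipeline: download and parse the transcript index page, collect a link and metadata for each episode, scrape each episode's transcript, pull out Scully's and Mulder's lines, and insert the resulting document into MongoDB.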

### Web scraper 

#### Scrape the page with the episode transcript links 

In [4]:
def get_soup(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Prints whether the request came back with HTTP status 200. On any other
    status the code is printed and the response body is still parsed, so the
    caller receives a soup of whatever the server sent back.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout, a stalled server would block
            the scrape indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        ``lxml`` parser.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code == 200:
        print("Request successful\n")
    else:
        print("Error code\n", response.status_code)

    return BeautifulSoup(response.text, "lxml")

#### Make a dictionary of episode transcript links, episode number, and season number 

In [14]:
def make_link_dict(soup):
    """Build one metadata document per episode link found in the index page.

    Every element with class ``fusion-text`` except the first is treated as
    one season, in page order, and every anchor inside it as one episode.
    Anchors that carry no ``href`` are skipped, because they cannot point to
    a transcript and would otherwise abort the scrape with a ``KeyError``.

    Args:
        soup: Parsed index page exposing ``find_all`` (e.g. BeautifulSoup).

    Returns:
        A list of dicts, each with the keys ``episode_number`` (running
        count across all seasons, starting at 1), ``season`` (1-based
        position of the season element), ``episode_name`` (anchor text) and
        ``link`` (anchor ``href``).
    """
    episode_list = []
    episode_num = 1

    # The first "fusion-text" element is dropped; presumably it is page
    # intro text rather than a season listing -- confirm against the page.
    seasons = soup.find_all(class_="fusion-text")[1:]

    for season_num, season in enumerate(seasons, start=1):
        for anchor in season.find_all('a'):
            link = anchor.get('href')
            if not link:
                # Not a usable transcript link; do not consume a number.
                continue
            episode_list.append({
                'episode_number': episode_num,
                'season': season_num,
                'episode_name': anchor.text,
                'link': link,
            })
            episode_num += 1

    return episode_list

#### Scrape transcript

In [18]:
def get_transcript(episode):
    """Fetch an episode's transcript page and attach its text to the episode.

    The page at ``episode['link']`` is downloaded with ``get_soup`` and the
    text of its first ``<p>`` element is stored under
    ``episode['transcript']``. The dict is modified in place and returned.
    """
    transcript_page = get_soup(episode['link'])
    first_paragraph = transcript_page.find('p')
    episode['transcript'] = first_paragraph.getText()
    return episode

### Get Scully & Mulder lines 

In [15]:
def get_lines(document):
    """Extract Scully's and Mulder's spoken lines from an episode transcript.

    Text inside parentheses is removed first (stage directions, not
    dialogue). A line is then captured when it follows a blank line and
    starts with ``SCULLY: `` or ``MULDER: ``; the captured text runs to the
    end of that line.

    Args:
        document: Episode dict containing the full text under
            ``'transcript'``.

    Returns:
        The same dict, modified in place, with ``'scully_lines'`` and
        ``'mulder_lines'`` set to lists of the captured strings.
    """
    # Raw strings so the backslash escapes reach the regex engine untouched;
    # "\(" in a normal string literal is an invalid escape sequence.
    regex_scully = r'\n\nSCULLY: (.*)'
    regex_mulder = r'\n\nMULDER: (.*)'
    parens_regex = r'\(.*?\)'

    # remove all text inside parentheses - they are stage directions, not dialogue
    dialogue = re.sub(parens_regex, '', document['transcript'])
    document['scully_lines'] = re.findall(regex_scully, dialogue)
    document['mulder_lines'] = re.findall(regex_mulder, dialogue)

    return document

### Add document to MongoDB 

In [3]:
def send_to_mongodb(document, client_connection):
    """Insert a single document through an open MongoDB handle.

    Args:
        document: Dictionary to store.
        client_connection: Already-opened handle pointing at the target
            location; it must provide ``insert_one``.

    Raises:
        ValueError: If the insert result is not acknowledged.
    """
    insert_result = client_connection.insert_one(document)

    if insert_result.acknowledged:
        return

    raise ValueError("Failed to add document to MongoDB. Check connection and document.")

### Web to MongoDB pipeline 

In [28]:
def web_to_mongo(url, database, collection):
    """Scrape every episode linked from ``url`` and store each in MongoDB.

    Connects with a default ``MongoClient()``, builds the episode list from
    the index page, then for each episode scrapes the transcript, extracts
    the Scully and Mulder lines, inserts the document, and sleeps for a
    random 0-5 seconds before the next request.

    Args:
        url: Address of the index page that links to the transcripts.
        database: Name of the MongoDB database to write to.
        collection: Name of the collection inside ``database``.
    """
    print("Connecting to Mongo database...\n")
    client = MongoClient()

    # try/finally so the client is closed even when a request, parse step
    # or insert fails part-way through the episode list.
    try:
        mongo_loc = client[database][collection]

        print("Scraping episode meta-data...\n")
        link_soup = get_soup(url)
        episode_list = make_link_dict(link_soup)

        for episode in episode_list:
            print("Scraping transcript...\n")
            episode_with_transcript = get_transcript(episode)
            print("Cleaning and sending to Mongo...\n")
            cleaned_episode = get_lines(episode_with_transcript)
            send_to_mongodb(cleaned_episode, mongo_loc)
            # Random pause between requests to avoid hammering the site.
            seconds = random.uniform(0, 5)
            print(f"Sleeping {int(seconds)} seconds until episode {episode['episode_number']}.\n")
            time.sleep(seconds)
    finally:
        client.close()

    print('Done.\n')

## Get, clean and store the dataset

In [None]:
# Index page that links to the individual episode transcripts.
url = 'https://scifi.media/x-files/transcripts/'
# Run the full pipeline, writing to database 'x-files', collection 'transcripts'.
web_to_mongo(url, 'x-files', 'transcripts')

# Next steps 

## Improving pipeline 

* The links for all the transcripts are just the episode number at the end of the link - we could simplify the code by generating the link list from that, but then we wouldn't get the season and episode name as easily

## Error messages and other production aspects to add 

* docstrings
* error messages/type checking
* for `get_lines()`, do some more checking on the input - maybe allow for checking with the user whether they mean Dana Scully, Fox Mulder, or any other character
* add a database size check to `web_to_mongo`