In [5]:
import os
import urllib
import urllib.parse

import fsspec
import joblib
import lxml
import lxml.etree
import lxml.html
import pandas as pd
import requests

In [6]:
# Directory (relative to the working directory) handed to fsspec below as its
# cache location.
session_storage = 'downloads'

# Create the directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(session_storage, exist_ok=True)

# fsspec filesystem created with the 'simplecache' protocol on top of 'https',
# with cache_storage pointing at `session_storage`.
# NOTE(review): `fs_dataset` is not referenced by any other cell visible here —
# confirm it is still needed.
fs_dataset = fsspec.filesystem(
    'simplecache', 
    target_protocol='https', 
    cache_storage=session_storage,
)

In [7]:
def get_all_links(url, timeout=30):
    """Download the page at `url` and return all of its `<a>` elements.

    Parameters
    ----------
    url : str
        Address of the HTML page to fetch.
    timeout : float or tuple, optional
        Forwarded to `requests.get`. Without a timeout a stalled connection
        would block forever, which is fatal when many pages are fetched in
        parallel. Defaults to 30 seconds.

    Returns
    -------
    list
        The elements matched by the XPath expression `//a`.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, including when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # HTTP error pages are parsed like any other page (no raise_for_status),
    # matching the best-effort behaviour callers already rely on.
    return lxml.html.fromstring(response.text).xpath('//a')

def get_all_hrefs(url, default=None):
    """Lazily yield the `href` attribute of every link found at `url`.

    Links that carry no `href` attribute contribute `default` instead.
    The page itself is fetched immediately; only the attribute lookup is
    deferred until the returned iterator is consumed.
    """
    def _href_of(anchor):
        return anchor.attrib.get('href', default)

    anchors = get_all_links(url)
    return map(_href_of, anchors)

def get_script(script_link, timeout=30):
    """Download one script page from imsdb.com and return its text.

    Parameters
    ----------
    script_link : str
        Site-relative link to the script page (e.g. a value produced by
        `get_script_links`).
    timeout : float or tuple, optional
        Forwarded to `requests.get` so a stalled connection cannot hang the
        scrape. Defaults to 30 seconds.

    Returns
    -------
    str
        Concatenated text content of every `<pre>` directly under a
        `<td class="scrtext">`; an empty string when the page has none.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, including when the timeout expires.
    """
    # urljoin avoids the 'https://imsdb.com//scripts/...' double slash that
    # plain concatenation produces for links starting with '/'.
    url = urllib.parse.urljoin('https://imsdb.com/', script_link)
    page = lxml.html.fromstring(requests.get(url, timeout=timeout).text)
    return ''.join(
        block.text_content()
        for block in page.xpath('//td[@class="scrtext"]/pre')
    )

def get_movie_links():
    """Collect the movie pages listed on imsdb.com's all-scripts index.

    Returns
    -------
    dict
        Maps each link whose directory part is exactly '/Movie Scripts' to
        the text content of its `<a>` element. If the same href occurs more
        than once, the last occurrence wins.
    """
    movie_links = {}
    for anchor in get_all_links('https://imsdb.com/all-scripts.html'):
        href = anchor.attrib.get('href')
        # Anchors without an href (e.g. named anchors) yield None, which
        # os.path.dirname would reject with a TypeError — skip them.
        if href is not None and os.path.dirname(href) == '/Movie Scripts':
            movie_links[href] = anchor.text_content()
    return movie_links

def get_movie_details(movie_links):
    """Stub with no implementation yet.

    Raises
    ------
    NotImplementedError
        On every call, regardless of `movie_links`.
    """
    raise NotImplementedError()

def get_script_links(movie_link):
    """Return the script links found on a movie's imsdb.com page.

    Parameters
    ----------
    movie_link : str
        Site-relative link to the movie page (e.g. a key of the dict
        returned by `get_movie_links`).

    Returns
    -------
    set
        Every distinct href on the page whose directory part is exactly
        '/scripts'. Anchors without an href are ignored.
    """
    # urljoin avoids the 'https://imsdb.com//Movie Scripts/...' double slash
    # that plain concatenation produces for links starting with '/'.
    page_url = urllib.parse.urljoin('https://imsdb.com/', movie_link)
    return {
        href
        for href in get_all_hrefs(page_url)
        if href is not None and os.path.dirname(href) == '/scripts'
    }

def get_movie_name(movie_link):
    """Return the last path component of `movie_link` minus its extension."""
    filename = os.path.split(movie_link)[1]
    return os.path.splitext(filename)[0]
    
def get_movie_script_links():
    """Look up the script links of every movie on the index, in parallel.

    Returns the joblib result: one `(movie_name, script_links)` tuple per
    entry of `get_movie_links()`, where `script_links` is the set returned
    by `get_script_links` for that movie's page.
    """
    def _links_for(link, title):
        return (title, get_script_links(link))

    runner = joblib.Parallel(n_jobs=-1)
    tasks = (
        joblib.delayed(_links_for)(link, title)
        for link, title in get_movie_links().items()
    )
    return runner(tasks)

def get_scripts(movie_script_links):
    """Download the script text behind every link, one parallel task per movie.

    `movie_script_links` is an iterable of `(movie_name, links)` pairs, as
    produced by `get_movie_script_links`. The result holds one
    `(movie_name, texts)` tuple per pair, where `texts` is a list with the
    `get_script` output for each of that movie's links.
    """
    def _download(title, links):
        texts = [get_script(link) for link in links]
        return (title, texts)

    runner = joblib.Parallel(n_jobs=-1)
    return runner(
        joblib.delayed(_download)(title, links)
        for title, links in movie_script_links
    )

In [8]:
# Run the full scrape (every request happens here) and tabulate the result,
# one row per (movie_name, texts) tuple returned by get_scripts.
# Note: each 'script_text' cell holds a *list* of strings — one per script
# link found for that movie — not a single string.
df = pd.DataFrame(
    get_scripts(
        get_movie_script_links()
    ),
    columns=['movie_name', 'script_text']
)

In [9]:
# Print a summary of the scraped frame: row count, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movie_name   1209 non-null   object
 1   script_text  1209 non-null   object
dtypes: object(2)
memory usage: 19.0+ KB


In [10]:
# Persist the frame as a gzip-compressed pickle. The '.pkl' file name does not
# reveal the compression, so it has to be passed explicitly when reading back.
df.to_pickle('./dataset.pkl', compression='gzip')

In [11]:
# Load the file back with the same explicit compression; as the last
# expression in the cell, the resulting frame is what gets displayed.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that were produced by a trusted source such as the cell above.
pd.read_pickle('./dataset.pkl', compression='gzip')

Unnamed: 0,movie_name,script_text
0,Reservoir Dogs,[\r\n\r\n\r\n\r\n\r\n\r\n ...
1,How to Train Your Dragon,[\r\n\r\n\r\n HOW T...
2,Scream,[ \r\n ...
3,Groundhog Day,[\r\n\r\n \r\n ...
4,Black Panther,[\r\n\r\n\r\n \r\n BL...
...,...,...
1204,You've Got Mail,[\r\n\r\nYou've Got Mail\r\n\r\n\r\n\r\n\t\t\t...
1205,Youth in Revolt,[ \r\n \r\n\r\n ...
1206,Zero Dark Thirty,[\r\n\r\n \r\n ...
1207,Zerophilia,[\r\n\r\n\r\n\r\n ...
