In [1]:
%%python - --url https://stackoverflow.com/questions/55457370/how-to-avoid-valueerror-separator-is-not-found-and-chunk-exceed-the-limit --tags HTML title head body p b a

import argparse, bs4, requests, re, textwrap

# url = 'https://www.twitch.tv/directory'

# tags = ['HTML', 'title', 'head', 'body', 'p', 'b', 'a']

'''
Method: extract
Input:
    url: The url to be parsed
    tags: a list of tags to be parsed from the url; 
    tags are treated as case insensitive, and stripped of '<!/>' characters
Output:
    list_of_extracted_data: 
        A list of strings found within each tag; 
        Contains all text found within tags nested inside the requested tag;
        
        Assumptions:
            If a tag does not contain text within the tag 
                or within any of the tags nested inside of the requested tag,
                the value stored inside the list of extracted data will be ''.
            If a tag is not found in the html document, 
                no content will be added to the list_of_extracted_data for that tag.

'''

def extract(url, tags, timeout=30):
    """Download ``url`` and collect the text of every element named in ``tags``.

    Args:
        url: Address of the HTML page to fetch.
        tags: Iterable of tag names. Leading/trailing '<', '!', '/' and '>'
            characters are stripped from each name, and names are matched
            case-insensitively against the parsed document.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so an unresponsive server cannot block the call
            indefinitely.

    Returns:
        A list of strings: for each requested tag, in the order given, the
        ``.text`` of every matching element (which includes the text of any
        nested elements). An element without text contributes ``''``; a tag
        that does not occur in the document contributes nothing.

    Raises:
        requests.HTTPError: raised by ``raise_for_status`` when the response
            carries an error status code.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    list_of_extracted_data = []
    for raw_tag in tags:
        tag_name = raw_tag.strip('<!/>')
        if not tag_name:
            # Nothing left after stripping the markup characters; an empty
            # name cannot match any element, so skip the search entirely.
            continue
        # Escape the name so it is matched literally: regex metacharacters in
        # user input must neither raise re.error nor widen the match.
        name_pattern = re.compile('^' + re.escape(tag_name) + '$', re.I)
        list_of_extracted_data.extend(
            element.text for element in soup.find_all(name_pattern)
        )
    return list_of_extracted_data


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Extract', formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-u','--url', dest='url', type=str, required=True)
    parser.add_argument('-t','--tags', dest='tags', nargs='+', required=True, 
       help=textwrap.dedent('''\
       Method: extract
Input:
    url: The url to be parsed
    tags: a list of tags to be parsed from the url; 
    tags are treated as case insensitive, and stripped of '<!/>' characters
Output:
    list_of_extracted_data: 
        A list of strings found within each tag; 
        Contains all text found within tags nested inside the requested tag;
        
        Assumptions:
            If a tag does not contain text within the tag 
                or within any of the tags nested inside of the requested tag,
                the value stored inside the list of extracted data will be ''.
            If a tag is not found in the html document, 
                no content will be added to the list_of_extracted_data for that tag.'''))
    args = parser.parse_args()

    result_l = extract(args.url, args.tags);
    print(result_l)



['\n\npython 3.x - How to avoid "ValueError: Separator is not found, and chunk exceed the limit" - Stack Overflow\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nStack Overflow\n\n\n\nAbout\n\n\n\r\n                        Products\r\n                    \n\n\nFor Teams\n\n\n\n\n\n\n\nStack Overflow\nPublic questions & answers\n\n\n\n\nStack Overflow for Teams\nWhere developers & technologists share private knowledge with coworkers\n\n\n\n\nTalent\n\r\n                                Build your employer brand\r\n                            \n\n\n\n\nAdvertising\nReach developers & technologists worldwide\n\n\n\nAbout the company\n\n\n\n\n\n\n\n\n\n\n\nLoading…\n\n\n\n\n\n\n\n\n\n\n\n\n\ncurrent community\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n            Stack Overflow\r\n        \n\n\n\nhelp\nchat\n\n\n\n\n\n\n\n\n\r\n            Meta Stack Overflow\r\n        \n\n\n\n\n\n\r\nyour communities            \n\n\n\nSign up or log in to custom

In [2]:
# Small inline HTML fixture: a <head> with a <title>, then a <body> holding
# three <p> elements (one class="title", two class="story") and three
# <a class="sister"> links with ids link1..link3.
# Note: the literal never closes <body> or <html>.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""