More changes to accommodate str and abs file paths for Windows
1 parent 07fd31f · commit 2d25981
Showing 1 changed file with 119 additions and 77 deletions.
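
The crux of the change: building file URIs by string concatenation breaks on Windows, where absolute paths use drive letters and backslashes. A minimal sketch of the before/after, using a path from the tests below:

import os
from pathlib import Path

page = "tests/data/crawl_testing/test_content.html"

# Before: on Windows, os.path.abspath() returns a backslash path such as
# C:\repo\tests\..., so the concatenation produces the malformed URI
# "file://C:\repo\tests\..." that the crawler cannot fetch.
old_uri = "file://" + os.path.abspath(page)

# After: as_uri() yields a well-formed file URI on every platform,
# e.g. "file:///C:/repo/tests/..." on Windows. It requires an absolute
# path, hence the .absolute() call.
new_uri = Path(page).absolute().as_uri()
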
@@ -1,134 +1,176 @@
import os
+import platform
+from pathlib import Path

import pandas as pd
import pytest

from advertools.spider import crawl

+system = platform.system()

links_columns = {
-    'links_url': 14,
-    'links_text': 14,
-    'links_nofollow': 14,
-    'nav_links_url': 3,
-    'nav_links_text': 3,
-    'header_links_url': 3,
-    'header_links_text': 3,
-    'footer_links_url': 3,
-    'footer_links_text': 3,
+    "links_url": 14,
+    "links_text": 14,
+    "links_nofollow": 14,
+    "nav_links_url": 3,
+    "nav_links_text": 3,
+    "header_links_url": 3,
+    "header_links_text": 3,
+    "footer_links_url": 3,
+    "footer_links_text": 3,
}

-links_file = os.path.abspath('tests/data/crawl_testing/test_content.html')
-crawl('file://' + links_file, 'links_crawl.jl',
-      custom_settings={'ROBOTSTXT_OBEY': False})
-crawl_df = pd.read_json('links_crawl.jl', lines=True)
-os.remove('links_crawl.jl')
+links_filepath = "tests/data/crawl_testing/test_content.html"
+if system == "Windows":
+    links_filepath = links_filepath.replace("/", "\\")
+
+links_file = Path(links_filepath).absolute()
+crawl(links_file.as_uri(), "links_crawl.jl", custom_settings={"ROBOTSTXT_OBEY": False})
+crawl_df = pd.read_json("links_crawl.jl", lines=True)
+os.remove("links_crawl.jl")

-crawl('file://' + links_file, 'follow_url_params.jl',
-      allowed_domains=[links_file, 'example.com'],
-      custom_settings={'ROBOTSTXT_OBEY': False},
-      follow_links=True)
-follow_url_params_df = pd.read_json('follow_url_params.jl', lines=True)
-os.remove('follow_url_params.jl')
+crawl(
+    str(links_file.as_uri()),
+    "follow_url_params.jl",
+    allowed_domains=[str(links_file), "example.com"],
+    custom_settings={"ROBOTSTXT_OBEY": False},
+    follow_links=True,
+)
+follow_url_params_df = pd.read_json("follow_url_params.jl", lines=True)
+os.remove("follow_url_params.jl")


def test_follow_url_params_followed():
-    assert follow_url_params_df['url'].str.contains('?', regex=False).any()
+    assert follow_url_params_df["url"].str.contains("?", regex=False).any()


-crawl('file://' + links_file, 'dont_follow_url_params.jl',
-      allowed_domains=[links_file, 'example.com'],
-      custom_settings={'ROBOTSTXT_OBEY': False},
-      follow_links=True, exclude_url_params=True)
-dont_follow_url_params_df = pd.read_json('dont_follow_url_params.jl',
-                                         lines=True)
+crawl(
+    str(links_file.as_uri()),
+    "dont_follow_url_params.jl",
+    allowed_domains=[str(links_file), "example.com"],
+    custom_settings={"ROBOTSTXT_OBEY": False},
+    follow_links=True,
+    exclude_url_params=True,
+)
+dont_follow_url_params_df = pd.read_json("dont_follow_url_params.jl", lines=True)


def test_dont_follow_url_params_not_followed():
-    assert not dont_follow_url_params_df['url'].str.contains('?',
-                                                             regex=False).all()
-    os.remove('dont_follow_url_params.jl')
+    assert not dont_follow_url_params_df["url"].str.contains("?", regex=False).all()


+os.remove("dont_follow_url_params.jl")


-file_path = 'tests/data/crawl_testing/duplicate_links.html'
-dup_links_file = os.path.abspath(file_path)
-crawl('file://' + dup_links_file, 'dup_links_crawl.jl',
-      custom_settings={'ROBOTSTXT_OBEY': False})
-dup_crawl_df = pd.read_json('dup_links_crawl.jl', lines=True)
-os.remove('dup_links_crawl.jl')
+file_path = "tests/data/crawl_testing/duplicate_links.html"
+if system == "Windows":
+    file_path = file_path.replace("/", "\\")
+
+dup_links_file = Path(file_path).absolute()
+crawl(
+    str(dup_links_file.as_uri()),
+    "dup_links_crawl.jl",
+    custom_settings={"ROBOTSTXT_OBEY": False},
+)
+dup_crawl_df = pd.read_json("dup_links_crawl.jl", lines=True)
+os.remove("dup_links_crawl.jl")


def test_link_columns_all_exist():
    assert set(links_columns).difference(crawl_df.columns.tolist()) == set()


@pytest.mark.parametrize("colname,count", links_columns.items())
def test_links_extracted_at_correct_number(colname, count):
-    assert crawl_df[colname].str.split('@@').str.len().values[0] == count
+    assert crawl_df[colname].str.split("@@").str.len().values[0] == count


def test_extract_h_tags():
-    assert crawl_df['h2'].str.split('@@').str.len().values[0] == 3
-    assert crawl_df['h2'].str.split('@@').explode().iloc[1] == ''
+    assert crawl_df["h2"].str.split("@@").str.len().values[0] == 3
+    assert crawl_df["h2"].str.split("@@").explode().iloc[1] == ""


def test_all_links_have_nofollow():
-    assert (crawl_df
-            .filter(regex='nofollow')
-            .apply(lambda s: s.str.contains("True"))
-            .all().all())
+    assert (
+        crawl_df.filter(regex="nofollow")
+        .apply(lambda s: s.str.contains("True"))
+        .all()
+        .all()
+    )


def test_image_tags_available():
-    assert [col in crawl_df for col in ['img_src', 'img_alt',
-                                        'img_height', 'img_width']]
+    assert [
+        col in crawl_df for col in ["img_src", "img_alt", "img_height", "img_width"]
+    ]


def test_all_img_attrs_have_same_length():
-    assert (crawl_df
-            .filter(regex='img_')
-            .apply(lambda s: s.str.split('@@').str.len())
-            .apply(set, axis=1)[0].__len__()) == 1
+    assert (
+        crawl_df.filter(regex="img_")
+        .apply(lambda s: s.str.split("@@").str.len())
+        .apply(set, axis=1)[0]
+        .__len__()
+    ) == 1


def test_img_src_has_abs_path():
-    assert crawl_df['img_src'].str.startswith('http').all()
+    assert crawl_df["img_src"].str.startswith("http").all()

-dup_links_test = (['https://example_a.com' for i in range(5)] +
-                  ['https://example.com'])
-
-dup_text_test = ['Link Text A',
-                 'Link Text A',
-                 'Link Text A',
-                 'Link Text B',
-                 'Link Text C',
-                 'Link Other']
-
-dup_nf_test = ['True'] + ['False' for i in range(5)]
+dup_links_test = ["https://example_a.com" for i in range(5)] + ["https://example.com"]
+
+dup_text_test = [
+    "Link Text A",
+    "Link Text A",
+    "Link Text A",
+    "Link Text B",
+    "Link Text C",
+    "Link Other",
+]
+
+dup_nf_test = ["True"] + ["False" for i in range(5)]


def test_duplicate_links_counted_properly():
-    assert dup_crawl_df['links_url'].str.split('@@')[0] == dup_links_test
-    assert dup_crawl_df['links_text'].str.split('@@')[0] == dup_text_test
-    assert dup_crawl_df['links_nofollow'].str.split('@@')[0] == dup_nf_test
+    assert dup_crawl_df["links_url"].str.split("@@")[0] == dup_links_test
+    assert dup_crawl_df["links_text"].str.split("@@")[0] == dup_text_test
+    assert dup_crawl_df["links_nofollow"].str.split("@@")[0] == dup_nf_test


def test_non_existent_links_are_NA():
-    assert 'nav_links_url' not in dup_crawl_df
-    assert 'nav_links_text' not in dup_crawl_df
-    assert 'header_links_url' not in dup_crawl_df
-    assert 'footer_links_url' not in dup_crawl_df
+    assert "nav_links_url" not in dup_crawl_df
+    assert "nav_links_text" not in dup_crawl_df
+    assert "header_links_url" not in dup_crawl_df
+    assert "footer_links_url" not in dup_crawl_df


-broken_links_file = os.path.abspath('tests/data/crawl_testing/broken_links.html')
-crawl(['file://' + broken_links_file, 'wrong_url'], 'broken_links_crawl.jl',
-      follow_links=True)
+broken_links_path = "tests/data/crawl_testing/broken_links.html"
+if system == "Windows":
+    broken_links_path = broken_links_path.replace("/", "\\")
+
+broken_links_file = Path(broken_links_path).absolute()
+
+crawl(
+    [str(broken_links_file.as_uri()), "wrong_url"],
+    "broken_links_crawl.jl",
+    follow_links=True,
+)


def test_broken_links_are_reported():
-    broken_links_df = pd.read_json('broken_links_crawl.jl', lines=True)
-    assert 'errors' in broken_links_df
-    assert 'wrong_url' not in broken_links_df['url']
-    os.remove('broken_links_crawl.jl')
+    broken_links_df = pd.read_json("broken_links_crawl.jl", lines=True)
+    assert "errors" in broken_links_df
+    assert "wrong_url" not in broken_links_df["url"]
+    os.remove("broken_links_crawl.jl")


def test_crawling_bad_url_directly_is_handled():
-    crawl(['wrong_url', 'https://example.com'], 'bad_url.jl')
-    bad_url_df = pd.read_json('bad_url.jl', lines=True)
+    crawl(["wrong_url", "https://example.com"], "bad_url.jl")
+    bad_url_df = pd.read_json("bad_url.jl", lines=True)
    assert len(bad_url_df) == 1
-    assert bad_url_df['url'][0] == 'https://example.com'
-    os.remove('bad_url.jl')
+    assert bad_url_df["url"][0] == "https://example.com"
+    os.remove("bad_url.jl")