
Commit

More changes to accommodate str and abs file paths for Windows
eliasdabbas committed Oct 30, 2023
1 parent 07fd31f commit 2d25981
Showing 1 changed file with 119 additions and 77 deletions.
196 changes: 119 additions & 77 deletions tests/test_crawl.py
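The diff below switches the test fixtures from concatenating 'file://' with os.path.abspath() to building pathlib.Path objects and calling as_uri(), so the same relative test paths resolve to valid file URIs on POSIX and on Windows. A minimal sketch of that conversion, separate from the commit itself (the helper name to_file_uri is illustrative, not part of the test suite):

from pathlib import Path

def to_file_uri(relative_path: str) -> str:
    # Path.absolute() resolves against the current working directory;
    # as_uri() requires an absolute path and yields, e.g.,
    # file:///home/user/tests/... on Linux or file:///C:/Users/.../tests/... on Windows.
    return Path(relative_path).absolute().as_uri()

# Hypothetical usage: crawl(to_file_uri("tests/data/crawl_testing/test_content.html"), "output.jl")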
@@ -1,134 +1,176 @@
import os
import platform
from pathlib import Path

import pandas as pd
import pytest

from advertools.spider import crawl

system = platform.system()

links_columns = {
-    'links_url': 14,
-    'links_text': 14,
-    'links_nofollow': 14,
-    'nav_links_url': 3,
-    'nav_links_text': 3,
-    'header_links_url': 3,
-    'header_links_text': 3,
-    'footer_links_url': 3,
-    'footer_links_text': 3,
+    "links_url": 14,
+    "links_text": 14,
+    "links_nofollow": 14,
+    "nav_links_url": 3,
+    "nav_links_text": 3,
+    "header_links_url": 3,
+    "header_links_text": 3,
+    "footer_links_url": 3,
+    "footer_links_text": 3,
}

-links_file = os.path.abspath('tests/data/crawl_testing/test_content.html')
-crawl('file://' + links_file, 'links_crawl.jl',
-      custom_settings={'ROBOTSTXT_OBEY': False})
-crawl_df = pd.read_json('links_crawl.jl', lines=True)
-os.remove('links_crawl.jl')
+links_filepath = "tests/data/crawl_testing/test_content.html"
+if system == "Windows":
+    links_filepath = links_filepath.replace("/", r"\\")
+
+links_file = Path(links_filepath).absolute()
+crawl(links_file.as_uri(), "links_crawl.jl", custom_settings={"ROBOTSTXT_OBEY": False})
+crawl_df = pd.read_json("links_crawl.jl", lines=True)
+os.remove("links_crawl.jl")

-crawl('file://' + links_file, 'follow_url_params.jl',
-      allowed_domains=[links_file, 'example.com'],
-      custom_settings={'ROBOTSTXT_OBEY': False},
-      follow_links=True)
-follow_url_params_df = pd.read_json('follow_url_params.jl', lines=True)
-os.remove('follow_url_params.jl')
+crawl(
+    str(links_file.as_uri()),
+    "follow_url_params.jl",
+    allowed_domains=[str(links_file), "example.com"],
+    custom_settings={"ROBOTSTXT_OBEY": False},
+    follow_links=True,
+)
+
+follow_url_params_df = pd.read_json("follow_url_params.jl", lines=True)
+os.remove("follow_url_params.jl")


def test_follow_url_params_followed():
-    assert follow_url_params_df['url'].str.contains('?', regex=False).any()
+    assert follow_url_params_df["url"].str.contains("?", regex=False).any()


-crawl('file://' + links_file, 'dont_follow_url_params.jl',
-      allowed_domains=[links_file, 'example.com'],
-      custom_settings={'ROBOTSTXT_OBEY': False},
-      follow_links=True, exclude_url_params=True)
-dont_follow_url_params_df = pd.read_json('dont_follow_url_params.jl',
-                                          lines=True)
+crawl(
+    str(links_file.as_uri()),
+    "dont_follow_url_params.jl",
+    allowed_domains=[str(links_file), "example.com"],
+    custom_settings={"ROBOTSTXT_OBEY": False},
+    follow_links=True,
+    exclude_url_params=True,
+)
+dont_follow_url_params_df = pd.read_json("dont_follow_url_params.jl", lines=True)


def test_dont_follow_url_params_not_followed():
-    assert not dont_follow_url_params_df['url'].str.contains('?',
-                                                              regex=False).all()
-os.remove('dont_follow_url_params.jl')
+    assert not dont_follow_url_params_df["url"].str.contains("?", regex=False).all()
+
+
+os.remove("dont_follow_url_params.jl")


file_path = "tests/data/crawl_testing/duplicate_links.html"
if platform == "Windows":
file_path = links_filepath.replace("/", r"\\")


file_path = 'tests/data/crawl_testing/duplicate_links.html'
dup_links_file = os.path.abspath(file_path)
crawl('file://' + dup_links_file, 'dup_links_crawl.jl',
custom_settings={'ROBOTSTXT_OBEY': False})
dup_crawl_df = pd.read_json('dup_links_crawl.jl', lines=True)
os.remove('dup_links_crawl.jl')
dup_links_file = Path(file_path).absolute()
crawl(
str(dup_links_file.as_uri()),
"dup_links_crawl.jl",
custom_settings={"ROBOTSTXT_OBEY": False},
)
dup_crawl_df = pd.read_json("dup_links_crawl.jl", lines=True)
os.remove("dup_links_crawl.jl")


def test_link_columns_all_exist():
    assert set(links_columns).difference(crawl_df.columns.tolist()) == set()


@pytest.mark.parametrize("colname,count", links_columns.items())
def test_links_extracted_at_correct_number(colname, count):
-    assert crawl_df[colname].str.split('@@').str.len().values[0] == count
+    assert crawl_df[colname].str.split("@@").str.len().values[0] == count


def test_extract_h_tags():
-    assert crawl_df['h2'].str.split('@@').str.len().values[0] == 3
-    assert crawl_df['h2'].str.split('@@').explode().iloc[1] == ''
+    assert crawl_df["h2"].str.split("@@").str.len().values[0] == 3
+    assert crawl_df["h2"].str.split("@@").explode().iloc[1] == ""


def test_all_links_have_nofollow():
-    assert (crawl_df
-            .filter(regex='nofollow')
-            .apply(lambda s: s.str.contains("True"))
-            .all().all())
+    assert (
+        crawl_df.filter(regex="nofollow")
+        .apply(lambda s: s.str.contains("True"))
+        .all()
+        .all()
+    )


def test_image_tags_available():
-    assert [col in crawl_df for col in ['img_src', 'img_alt',
-                                        'img_height', 'img_width']]
+    assert [
+        col in crawl_df for col in ["img_src", "img_alt", "img_height", "img_width"]
+    ]


def test_all_img_attrs_have_same_length():
-    assert (crawl_df
-            .filter(regex='img_')
-            .apply(lambda s: s.str.split('@@').str.len())
-            .apply(set, axis=1)[0].__len__()) == 1
+    assert (
+        crawl_df.filter(regex="img_")
+        .apply(lambda s: s.str.split("@@").str.len())
+        .apply(set, axis=1)[0]
+        .__len__()
+    ) == 1


def test_img_src_has_abs_path():
-    assert crawl_df['img_src'].str.startswith('http').all()
+    assert crawl_df["img_src"].str.startswith("http").all()

-dup_links_test = (['https://example_a.com' for i in range(5)] +
-                  ['https://example.com'])
-
-dup_text_test = ['Link Text A',
-                 'Link Text A',
-                 'Link Text A',
-                 'Link Text B',
-                 'Link Text C',
-                 'Link Other']
-
-dup_nf_test = ['True'] + ['False' for i in range(5)]
+dup_links_test = ["https://example_a.com" for i in range(5)] + ["https://example.com"]
+
+dup_text_test = [
+    "Link Text A",
+    "Link Text A",
+    "Link Text A",
+    "Link Text B",
+    "Link Text C",
+    "Link Other",
+]
+
+dup_nf_test = ["True"] + ["False" for i in range(5)]


def test_duplicate_links_counted_propery():
-    assert dup_crawl_df['links_url'].str.split('@@')[0] == dup_links_test
-    assert dup_crawl_df['links_text'].str.split('@@')[0] == dup_text_test
-    assert dup_crawl_df['links_nofollow'].str.split('@@')[0] == dup_nf_test
+    assert dup_crawl_df["links_url"].str.split("@@")[0] == dup_links_test
+    assert dup_crawl_df["links_text"].str.split("@@")[0] == dup_text_test
+    assert dup_crawl_df["links_nofollow"].str.split("@@")[0] == dup_nf_test


def test_non_existent_links_are_NA():
-    assert 'nav_links_url' not in dup_crawl_df
-    assert 'nav_links_text' not in dup_crawl_df
-    assert 'header_links_url' not in dup_crawl_df
-    assert 'footer_links_url' not in dup_crawl_df
+    assert "nav_links_url" not in dup_crawl_df
+    assert "nav_links_text" not in dup_crawl_df
+    assert "header_links_url" not in dup_crawl_df
+    assert "footer_links_url" not in dup_crawl_df


-broken_links_file = os.path.abspath('tests/data/crawl_testing/broken_links.html')
-crawl(['file://' + broken_links_file, 'wrong_url'], 'broken_links_crawl.jl',
-      follow_links=True)
+broken_links_path = "tests/data/crawl_testing/broken_links.html"
+if system == "Windows":
+    broken_links_path = broken_links_path.replace("/", r"\\")
+
+broken_links_file = Path(broken_links_path).absolute()
+
+crawl(
+    [str(broken_links_file.as_uri()), "wrong_url"],
+    "broken_links_crawl.jl",
+    follow_links=True,
+)

def test_broken_links_are_reported():
-    broken_links_df = pd.read_json('broken_links_crawl.jl', lines=True)
-    assert 'errors' in broken_links_df
-    assert 'wrong_url' not in broken_links_df['url']
-    os.remove('broken_links_crawl.jl')
+    broken_links_df = pd.read_json("broken_links_crawl.jl", lines=True)
+    assert "errors" in broken_links_df
+    assert "wrong_url" not in broken_links_df["url"]
+    os.remove("broken_links_crawl.jl")


def test_crawling_bad_url_directly_is_handled():
-    crawl(['wrong_url', 'https://example.com'], 'bad_url.jl')
-    bad_url_df = pd.read_json('bad_url.jl', lines=True)
+    crawl(["wrong_url", "https://example.com"], "bad_url.jl")
+    bad_url_df = pd.read_json("bad_url.jl", lines=True)
    assert len(bad_url_df) == 1
-    assert bad_url_df['url'][0] == 'https://example.com'
-    os.remove('bad_url.jl')
+    assert bad_url_df["url"][0] == "https://example.com"
+    os.remove("bad_url.jl")
