
Commit

More changes to accommodate str and abs file paths for Windows
eliasdabbas committed Oct 30, 2023
1 parent 07fd31f commit 2d25981
Showing 1 changed file with 119 additions and 77 deletions.
196 changes: 119 additions & 77 deletions tests/test_crawl.py
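The diff below switches the test fixtures from concatenating 'file://' with os.path.abspath() to building pathlib.Path objects and calling as_uri(), so the same relative test paths resolve to valid file URIs on POSIX and on Windows. A minimal sketch of that conversion, separate from the commit itself (the helper name to_file_uri is illustrative, not part of the test suite):

from pathlib import Path

def to_file_uri(relative_path: str) -> str:
    # Path.absolute() resolves against the current working directory;
    # as_uri() requires an absolute path and yields, e.g.,
    # file:///home/user/tests/... on Linux or file:///C:/Users/.../tests/... on Windows.
    return Path(relative_path).absolute().as_uri()

# Hypothetical usage: crawl(to_file_uri("tests/data/crawl_testing/test_content.html"), "output.jl")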
@@ -1,134 +1,176 @@
import os
import platform
from pathlib import Path

import pandas as pd
import pytest

from advertools.spider import crawl

system = platform.system()

links_columns = {
-    'links_url': 14,
-    'links_text': 14,
-    'links_nofollow': 14,
-    'nav_links_url': 3,
-    'nav_links_text': 3,
-    'header_links_url': 3,
-    'header_links_text': 3,
-    'footer_links_url': 3,
-    'footer_links_text': 3,
+    "links_url": 14,
+    "links_text": 14,
+    "links_nofollow": 14,
+    "nav_links_url": 3,
+    "nav_links_text": 3,
+    "header_links_url": 3,
+    "header_links_text": 3,
+    "footer_links_url": 3,
+    "footer_links_text": 3,
}

-links_file = os.path.abspath('tests/data/crawl_testing/test_content.html')
-crawl('file://' + links_file, 'links_crawl.jl',
-      custom_settings={'ROBOTSTXT_OBEY': False})
-crawl_df = pd.read_json('links_crawl.jl', lines=True)
-os.remove('links_crawl.jl')
+links_filepath = "tests/data/crawl_testing/test_content.html"
+if system == "Windows":
+    links_filepath = links_filepath.replace("/", r"\\")
+
+links_file = Path(links_filepath).absolute()
+crawl(links_file.as_uri(), "links_crawl.jl", custom_settings={"ROBOTSTXT_OBEY": False})
+crawl_df = pd.read_json("links_crawl.jl", lines=True)
+os.remove("links_crawl.jl")

-crawl('file://' + links_file, 'follow_url_params.jl',
-      allowed_domains=[links_file, 'example.com'],
-      custom_settings={'ROBOTSTXT_OBEY': False},
-      follow_links=True)
-follow_url_params_df = pd.read_json('follow_url_params.jl', lines=True)
-os.remove('follow_url_params.jl')
+crawl(
+    str(links_file.as_uri()),
+    "follow_url_params.jl",
+    allowed_domains=[str(links_file), "example.com"],
+    custom_settings={"ROBOTSTXT_OBEY": False},
+    follow_links=True,
+)
+
+follow_url_params_df = pd.read_json("follow_url_params.jl", lines=True)
+os.remove("follow_url_params.jl")


def test_follow_url_params_followed():
-    assert follow_url_params_df['url'].str.contains('?', regex=False).any()
+    assert follow_url_params_df["url"].str.contains("?", regex=False).any()


-crawl('file://' + links_file, 'dont_follow_url_params.jl',
-      allowed_domains=[links_file, 'example.com'],
-      custom_settings={'ROBOTSTXT_OBEY': False},
-      follow_links=True, exclude_url_params=True)
-dont_follow_url_params_df = pd.read_json('dont_follow_url_params.jl',
-                                          lines=True)
+crawl(
+    str(links_file.as_uri()),
+    "dont_follow_url_params.jl",
+    allowed_domains=[str(links_file), "example.com"],
+    custom_settings={"ROBOTSTXT_OBEY": False},
+    follow_links=True,
+    exclude_url_params=True,
+)
+dont_follow_url_params_df = pd.read_json("dont_follow_url_params.jl", lines=True)


def test_dont_follow_url_params_not_followed():
-    assert not dont_follow_url_params_df['url'].str.contains('?',
-                                                              regex=False).all()
-os.remove('dont_follow_url_params.jl')
+    assert not dont_follow_url_params_df["url"].str.contains("?", regex=False).all()
+
+
+os.remove("dont_follow_url_params.jl")


file_path = "tests/data/crawl_testing/duplicate_links.html"
if platform == "Windows":
file_path = links_filepath.replace("/", r"\\")


file_path = 'tests/data/crawl_testing/duplicate_links.html'
dup_links_file = os.path.abspath(file_path)
crawl('file://' + dup_links_file, 'dup_links_crawl.jl',
custom_settings={'ROBOTSTXT_OBEY': False})
dup_crawl_df = pd.read_json('dup_links_crawl.jl', lines=True)
os.remove('dup_links_crawl.jl')
dup_links_file = Path(file_path).absolute()
crawl(
str(dup_links_file.as_uri()),
"dup_links_crawl.jl",
custom_settings={"ROBOTSTXT_OBEY": False},
)
dup_crawl_df = pd.read_json("dup_links_crawl.jl", lines=True)
os.remove("dup_links_crawl.jl")


def test_link_columns_all_exist():
    assert set(links_columns).difference(crawl_df.columns.tolist()) == set()


@pytest.mark.parametrize("colname,count", links_columns.items())
def test_links_extracted_at_correct_number(colname, count):
-    assert crawl_df[colname].str.split('@@').str.len().values[0] == count
+    assert crawl_df[colname].str.split("@@").str.len().values[0] == count


def test_extract_h_tags():
-    assert crawl_df['h2'].str.split('@@').str.len().values[0] == 3
-    assert crawl_df['h2'].str.split('@@').explode().iloc[1] == ''
+    assert crawl_df["h2"].str.split("@@").str.len().values[0] == 3
+    assert crawl_df["h2"].str.split("@@").explode().iloc[1] == ""


def test_all_links_have_nofollow():
-    assert (crawl_df
-            .filter(regex='nofollow')
-            .apply(lambda s: s.str.contains("True"))
-            .all().all())
+    assert (
+        crawl_df.filter(regex="nofollow")
+        .apply(lambda s: s.str.contains("True"))
+        .all()
+        .all()
+    )


def test_image_tags_available():
-    assert [col in crawl_df for col in ['img_src', 'img_alt',
-                                        'img_height', 'img_width']]
+    assert [
+        col in crawl_df for col in ["img_src", "img_alt", "img_height", "img_width"]
+    ]


def test_all_img_attrs_have_same_length():
-    assert (crawl_df
-            .filter(regex='img_')
-            .apply(lambda s: s.str.split('@@').str.len())
-            .apply(set, axis=1)[0].__len__()) == 1
+    assert (
+        crawl_df.filter(regex="img_")
+        .apply(lambda s: s.str.split("@@").str.len())
+        .apply(set, axis=1)[0]
+        .__len__()
+    ) == 1


def test_img_src_has_abs_path():
-    assert crawl_df['img_src'].str.startswith('http').all()
+    assert crawl_df["img_src"].str.startswith("http").all()

-dup_links_test = (['https://example_a.com' for i in range(5)] +
-                  ['https://example.com'])
-
-dup_text_test = ['Link Text A',
-                 'Link Text A',
-                 'Link Text A',
-                 'Link Text B',
-                 'Link Text C',
-                 'Link Other']
-
-dup_nf_test = ['True'] + ['False' for i in range(5)]
+dup_links_test = ["https://example_a.com" for i in range(5)] + ["https://example.com"]
+
+dup_text_test = [
+    "Link Text A",
+    "Link Text A",
+    "Link Text A",
+    "Link Text B",
+    "Link Text C",
+    "Link Other",
+]
+
+dup_nf_test = ["True"] + ["False" for i in range(5)]


def test_duplicate_links_counted_propery():
-    assert dup_crawl_df['links_url'].str.split('@@')[0] == dup_links_test
-    assert dup_crawl_df['links_text'].str.split('@@')[0] == dup_text_test
-    assert dup_crawl_df['links_nofollow'].str.split('@@')[0] == dup_nf_test
+    assert dup_crawl_df["links_url"].str.split("@@")[0] == dup_links_test
+    assert dup_crawl_df["links_text"].str.split("@@")[0] == dup_text_test
+    assert dup_crawl_df["links_nofollow"].str.split("@@")[0] == dup_nf_test


def test_non_existent_links_are_NA():
-    assert 'nav_links_url' not in dup_crawl_df
-    assert 'nav_links_text' not in dup_crawl_df
-    assert 'header_links_url' not in dup_crawl_df
-    assert 'footer_links_url' not in dup_crawl_df
+    assert "nav_links_url" not in dup_crawl_df
+    assert "nav_links_text" not in dup_crawl_df
+    assert "header_links_url" not in dup_crawl_df
+    assert "footer_links_url" not in dup_crawl_df


-broken_links_file = os.path.abspath('tests/data/crawl_testing/broken_links.html')
-crawl(['file://' + broken_links_file, 'wrong_url'], 'broken_links_crawl.jl',
-      follow_links=True)
+broken_links_path = "tests/data/crawl_testing/broken_links.html"
+if system == "Windows":
+    broken_links_path = broken_links_path.replace("/", r"\\")
+
+broken_links_file = Path(broken_links_path).absolute()
+
+crawl(
+    [str(broken_links_file.as_uri()), "wrong_url"],
+    "broken_links_crawl.jl",
+    follow_links=True,
+)

def test_broken_links_are_reported():
-    broken_links_df = pd.read_json('broken_links_crawl.jl', lines=True)
-    assert 'errors' in broken_links_df
-    assert 'wrong_url' not in broken_links_df['url']
-    os.remove('broken_links_crawl.jl')
+    broken_links_df = pd.read_json("broken_links_crawl.jl", lines=True)
+    assert "errors" in broken_links_df
+    assert "wrong_url" not in broken_links_df["url"]
+    os.remove("broken_links_crawl.jl")


def test_crawling_bad_url_directly_is_handled():
-    crawl(['wrong_url', 'https://example.com'], 'bad_url.jl')
-    bad_url_df = pd.read_json('bad_url.jl', lines=True)
+    crawl(["wrong_url", "https://example.com"], "bad_url.jl")
+    bad_url_df = pd.read_json("bad_url.jl", lines=True)
    assert len(bad_url_df) == 1
-    assert bad_url_df['url'][0] == 'https://example.com'
-    os.remove('bad_url.jl')
+    assert bad_url_df["url"][0] == "https://example.com"
+    os.remove("bad_url.jl")
