Webgraph construction (cc-main-2022-may-jun-aug):
- avoid multiple extraction of host names from source and base URLs
  - implement method get_links(...) in class ExtractHostLinksJob
  - pass extracted source and base host names to method yield_links(...)
- update IANA TLD list
- consistent naming of source nodes (src_url, src_host instead of
  from_url, from_host)
sebastian-nagel committed Sep 21, 2022
1 parent 0c17f69 commit 54918e8
Showing 2 changed files with 110 additions and 58 deletions.
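The central refactoring: extract each host name once and pass it down as a parameter instead of recomputing it for every group of links. A minimal sketch of the pattern (simplified stand-alone functions, not the actual cc-pyspark classes):

from urllib.parse import urlparse

def get_surt_host(url):
    # simplified stand-in for ExtractHostLinksJob.get_surt_host(url)
    host = urlparse(url).hostname
    if not host:
        return None
    parts = host.split('.')
    parts.reverse()             # example.com -> ['com', 'example']
    return '.'.join(parts)      # -> 'com.example'

def yield_links(src_url, links, src_host=None):
    # before: src_host was recomputed from src_url on every call;
    # after: the caller may pass a precomputed value
    if not src_host:
        src_host = get_surt_host(src_url)
    if not src_host:
        return
    for link in links:
        thost = get_surt_host(link)
        if thost and thost != src_host:
            yield src_host, thost

def get_links(url, link_groups):
    # extract the source host once per record and thread it through
    src_host = get_surt_host(url)
    for links in link_groups:
        yield from yield_links(url, links, src_host=src_host)

Since a WAT record typically yields several link groups (head links, metas, scripts, body anchors), this saves one URL parse and host reversal per group.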
64 changes: 32 additions & 32 deletions iana_tld.py
@@ -106,38 +106,38 @@
    'jnj', 'jo', 'jobs', 'joburg', 'jot', 'joy', 'jp', 'jpmorgan',
    'jprs', 'juegos', 'juniper', 'kaufen', 'kddi', 'ke',
    'kerryhotels', 'kerrylogistics', 'kerryproperties', 'kfh',
-    'kg', 'kh', 'ki', 'kia', 'kim', 'kinder', 'kindle', 'kitchen',
-    'kiwi', 'km', 'kn', 'koeln', 'komatsu', 'kosher', 'kp', 'kpmg',
-    'kpn', 'kr', 'krd', 'kred', 'kuokgroup', 'kw', 'ky', 'kyoto',
-    'kz', 'la', 'lacaixa', 'ladbrokes', 'lamborghini', 'lamer',
-    'lancaster', 'lancia', 'lancome', 'land', 'landrover',
-    'lanxess', 'lasalle', 'lat', 'latino', 'latrobe', 'law',
-    'lawyer', 'lb', 'lc', 'lds', 'lease', 'leclerc', 'lefrak',
-    'legal', 'lego', 'lexus', 'lgbt', 'li', 'liaison', 'lidl',
-    'life', 'lifeinsurance', 'lifestyle', 'lighting', 'like',
-    'lilly', 'limited', 'limo', 'lincoln', 'linde', 'link',
-    'lipsy', 'live', 'living', 'lixil', 'lk', 'llc', 'llp', 'loan',
-    'loans', 'locker', 'locus', 'loft', 'lol', 'london', 'lotte',
-    'lotto', 'love', 'lpl', 'lplfinancial', 'lr', 'ls', 'lt',
-    'ltd', 'ltda', 'lu', 'lundbeck', 'lupin', 'luxe', 'luxury',
-    'lv', 'ly', 'ma', 'macys', 'madrid', 'maif', 'maison',
-    'makeup', 'man', 'management', 'mango', 'map', 'market',
-    'marketing', 'markets', 'marriott', 'marshalls', 'maserati',
-    'mattel', 'mba', 'mc', 'mcd', 'mcdonalds', 'mckinsey', 'md',
-    'me', 'med', 'media', 'meet', 'melbourne', 'meme', 'memorial',
-    'men', 'menu', 'meo', 'merckmsd', 'metlife', 'mg', 'mh',
-    'miami', 'microsoft', 'mil', 'mini', 'mint', 'mit',
-    'mitsubishi', 'mk', 'ml', 'mlb', 'mls', 'mm', 'mma', 'mn',
-    'mo', 'mobi', 'mobile', 'mobily', 'moda', 'moe', 'moi', 'mom',
-    'monash', 'money', 'monster', 'montblanc', 'mopar', 'mormon',
-    'mortgage', 'moscow', 'moto', 'motorcycles', 'mov', 'movie',
-    'movistar', 'mp', 'mq', 'mr', 'ms', 'msd', 'mt', 'mtn', 'mtpc',
-    'mtr', 'mu', 'museum', 'music', 'mutual', 'mv', 'mw', 'mx',
-    'my', 'mz', 'na', 'nab', 'nadex', 'nagoya', 'name',
-    'nationwide', 'natura', 'navy', 'nba', 'nc', 'ne', 'nec',
-    'net', 'netbank', 'netflix', 'network', 'neustar', 'new',
-    'newholland', 'news', 'next', 'nextdirect', 'nexus', 'nf',
-    'nfl', 'ng', 'ngo', 'nhk', 'ni', 'nico', 'nike', 'nikon',
+    'kg', 'kh', 'ki', 'kia', 'kids', 'kim', 'kinder', 'kindle',
+    'kitchen', 'kiwi', 'km', 'kn', 'koeln', 'komatsu', 'kosher',
+    'kp', 'kpmg', 'kpn', 'kr', 'krd', 'kred', 'kuokgroup', 'kw',
+    'ky', 'kyoto', 'kz', 'la', 'lacaixa', 'ladbrokes',
+    'lamborghini', 'lamer', 'lancaster', 'lancia', 'lancome',
+    'land', 'landrover', 'lanxess', 'lasalle', 'lat', 'latino',
+    'latrobe', 'law', 'lawyer', 'lb', 'lc', 'lds', 'lease',
+    'leclerc', 'lefrak', 'legal', 'lego', 'lexus', 'lgbt', 'li',
+    'liaison', 'lidl', 'life', 'lifeinsurance', 'lifestyle',
+    'lighting', 'like', 'lilly', 'limited', 'limo', 'lincoln',
+    'linde', 'link', 'lipsy', 'live', 'living', 'lixil', 'lk',
+    'llc', 'llp', 'loan', 'loans', 'locker', 'locus', 'loft',
+    'lol', 'london', 'lotte', 'lotto', 'love', 'lpl',
+    'lplfinancial', 'lr', 'ls', 'lt', 'ltd', 'ltda', 'lu',
+    'lundbeck', 'lupin', 'luxe', 'luxury', 'lv', 'ly', 'ma',
+    'macys', 'madrid', 'maif', 'maison', 'makeup', 'man',
+    'management', 'mango', 'map', 'market', 'marketing', 'markets',
+    'marriott', 'marshalls', 'maserati', 'mattel', 'mba', 'mc',
+    'mcd', 'mcdonalds', 'mckinsey', 'md', 'me', 'med', 'media',
+    'meet', 'melbourne', 'meme', 'memorial', 'men', 'menu', 'meo',
+    'merckmsd', 'metlife', 'mg', 'mh', 'miami', 'microsoft', 'mil',
+    'mini', 'mint', 'mit', 'mitsubishi', 'mk', 'ml', 'mlb', 'mls',
+    'mm', 'mma', 'mn', 'mo', 'mobi', 'mobile', 'mobily', 'moda',
+    'moe', 'moi', 'mom', 'monash', 'money', 'monster', 'montblanc',
+    'mopar', 'mormon', 'mortgage', 'moscow', 'moto', 'motorcycles',
+    'mov', 'movie', 'movistar', 'mp', 'mq', 'mr', 'ms', 'msd',
+    'mt', 'mtn', 'mtpc', 'mtr', 'mu', 'museum', 'music', 'mutual',
+    'mv', 'mw', 'mx', 'my', 'mz', 'na', 'nab', 'nadex', 'nagoya',
+    'name', 'nationwide', 'natura', 'navy', 'nba', 'nc', 'ne',
+    'nec', 'net', 'netbank', 'netflix', 'network', 'neustar',
+    'new', 'newholland', 'news', 'next', 'nextdirect', 'nexus',
+    'nf', 'nfl', 'ng', 'ngo', 'nhk', 'ni', 'nico', 'nike', 'nikon',
    'ninja', 'nissan', 'nissay', 'nl', 'no', 'nokia',
    'northwesternmutual', 'norton', 'now', 'nowruz', 'nowtv', 'np',
    'nr', 'nra', 'nrw', 'ntt', 'nu', 'nyc', 'nz', 'obi',
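The iana_tld.py hunk is a routine registry refresh: the block is reflowed and the newly delegated kids gTLD is added. A hedged sketch of how such a list is typically consulted when filtering extracted hosts, assuming the module exposes the TLDs as a set-like collection named iana_tld_list (name assumed here):

from iana_tld import iana_tld_list  # assumed name for the TLD collection above

def has_known_tld(host):
    """Return True if the last label of host is a registered IANA TLD."""
    tld = host.rsplit('.', 1)[-1].lower()
    return tld in iana_tld_list

# hosts with unregistered suffixes can then be dropped before graph construction:
# has_known_tld('example.kids')    -> True (with the updated list)
# has_known_tld('example.notatld') -> False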
104 changes: 78 additions & 26 deletions wat_extract_links.py
@@ -168,10 +168,10 @@ def yield_http_header_links(self, url, headers):
            for m in ExtractLinksJob.http_link_pattern.finditer(headers['Link']):
                yield url, m.group(1)

-    def yield_links(self, from_url, base_url, links, url_attr, opt_attr=None):
+    def yield_links(self, src_url, base_url, links, url_attr, opt_attr=None):
        # base_url = urlparse(base)
        if not base_url:
-            base_url = from_url
+            base_url = src_url
        has_links = False
        for l in links:
            link = None
@@ -187,11 +187,11 @@ def yield_links(self, from_url, base_url, links, url_attr, opt_attr=None):
            except ValueError:
                continue
            has_links = True
-            yield from_url, lurl
+            yield src_url, lurl
        if not has_links:
            # ensure that every page is a node in the graph
            # even if it has no outgoing links
-            yield from_url, from_url
+            yield src_url, src_url

    def yield_link(self, src, target):
        yield src, target
@@ -314,7 +314,7 @@ class ExtractHostLinksJob(ExtractLinksJob):
"""Extract links from WAT files, redirects from WARC files,
and sitemap links from robots.txt response records.
Extract the host names, reverse the names (example.com -> com.example)
and save the pairs <from_host, to_host>."""
and save the pairs <source_host, target_host>."""

name = "ExtrHostLinks"
output_schema = StructType([
@@ -394,18 +394,16 @@ def get_surt_host(url):
        parts.reverse()
        return '.'.join(parts)

-    def yield_links(self, from_url, base_url, links, url_attr, opt_attr=None):
-        from_host = ExtractHostLinksJob.get_surt_host(from_url)
-        base_host = None
-        if not from_host:
-            if base_url:
-                base_host = ExtractHostLinksJob.get_surt_host(base_url)
-                if base_host:
-                    from_host = base_host
-                else:
-                    return
-            else:
-                return
+    def yield_links(self, src_url, base_url, links, url_attr, opt_attr=None,
+                    src_host=None, base_host=None):
+        if not src_host:
+            src_host = ExtractHostLinksJob.get_surt_host(src_url)
+        if base_url and not base_host:
+            base_host = ExtractHostLinksJob.get_surt_host(base_url)
+        if base_host and not src_host:
+            src_host = base_host
+        if not src_host:
+            return
        target_hosts = set()
        inner_host_links = 0
        for l in links:
@@ -422,7 +420,7 @@ def yield_links(self, from_url, base_url, links, url_attr, opt_attr=None):
                thost = ExtractHostLinksJob.get_surt_host(link)
                if not thost:
                    pass # no host, e.g., http:///abc/, file:///C:...
-                elif thost == from_host:
+                elif thost == src_host:
                    pass # global link to same host
                else:
                    target_hosts.add(thost)
@@ -431,36 +429,90 @@ def yield_links(self, from_url, base_url, links, url_attr, opt_attr=None):
            else:
                inner_host_links += 1
        for t in target_hosts:
-            yield from_host, t
+            yield src_host, t
        if inner_host_links > 0 and base_url is not None:
            if not base_host:
                base_host = ExtractHostLinksJob.get_surt_host(base_url)
-            if base_host and base_host != from_host:
+            if base_host and base_host != src_host:
                # any internal link becomes an external link
-                yield from_host, base_host
+                yield src_host, base_host

    def yield_link(self, src, target):
        src_host = ExtractHostLinksJob.get_surt_host(src)
        thost = ExtractHostLinksJob.get_surt_host(target)
        if thost and src_host:
            yield src_host, thost

-    def yield_http_header_links(self, url, headers):
+    def yield_http_header_links(self, url, headers, src_host=None):
        links = []
        if 'Content-Location' in headers:
            links.append(headers['Content-Location'])
        if 'Link' in headers:
            for m in ExtractLinksJob.http_link_pattern.finditer(headers['Link']):
                links.append(m.group(1))
        if links:
-            src_host = ExtractHostLinksJob.get_surt_host(url)
-            if not src_host:
-                return
+            if not src_host:
+                src_host = ExtractHostLinksJob.get_surt_host(url)
+            if not src_host:
+                return
            for link in links:
                host = ExtractHostLinksJob.get_surt_host(link)
                if host is not None and src_host != host:
                    yield src_host, host

+    def get_links(self, url, record):
+        try:
+            response_meta = record['Envelope']['Payload-Metadata']['HTTP-Response-Metadata']
+            src_host = ExtractHostLinksJob.get_surt_host(url)
+            if src_host:
+                if 'Headers' in response_meta:
+                    # extract links from HTTP header
+                    for l in self.yield_http_header_links(url, response_meta['Headers'],
+                                                          src_host=src_host):
+                        yield l
+            if 'HTML-Metadata' not in response_meta:
+                self.records_non_html.add(1)
+                return
+            html_meta = response_meta['HTML-Metadata']
+            base = None
+            base_host = None
+            if 'Head' in html_meta:
+                head = html_meta['Head']
+                if 'Base' in head:
+                    try:
+                        base = urljoin(url, head['Base'])
+                        base_host = ExtractHostLinksJob.get_surt_host(base)
+                    except ValueError:
+                        pass
+                if 'Link' in head:
+                    # <link ...>
+                    for l in self.yield_links(url, base, head['Link'], 'url',
+                                              src_host=src_host, base_host=base_host):
+                        yield l
+                if 'Metas' in head:
+                    for m in head['Metas']:
+                        if (('property' in m and m['property']
+                             in ExtractLinksJob.html_meta_property_links)
+                            or ('name' in m and m['name']
+                                in ExtractLinksJob.html_meta_links)
+                            or ('content' in m
+                                and ExtractLinksJob.url_abs_pattern.match(m['content']))):
+                            for l in self.yield_links(url, base, [m], 'content',
+                                                      src_host=src_host, base_host=base_host):
+                                yield l
+                if 'Scripts' in head:
+                    for l in self.yield_links(url, base, head['Scripts'], 'url',
+                                              src_host=src_host, base_host=base_host):
+                        yield l
+            if 'Links' in html_meta:
+                for l in self.yield_links(url, base, html_meta['Links'],
+                                          'url', 'href',
+                                          src_host=src_host, base_host=base_host):
+                    yield l
+
+        except KeyError as e:
+            self.get_logger().error("Failed to parse record for {}: {}".format(
+                url, e))
+            self.records_failed.add(1)

    def process_robotstxt(self, record, stream, _http_status_line):
        """Process robots.txt and yield sitemap links"""
        line = stream.readline()
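A subtlety behind the base_host plumbing in get_links: when a page declares <base href=...>, relative links resolve against that base rather than the page URL. The base is therefore resolved and its host extracted once per record, and yield_links emits an extra src_host -> base_host edge whenever host-internal links were seen. A small standard-library illustration of the resolution behavior:

from urllib.parse import urljoin

page_url = 'http://www.example.com/path/page.html'
base_url = urljoin(page_url, 'https://cdn.example.org/assets/')

# a relative link on the page resolves against <base>, not the page URL:
print(urljoin(base_url, 'img/logo.png'))
# -> https://cdn.example.org/assets/img/logo.png

# every "internal" link on this page thus leaves www.example.com, so the host
# graph needs an edge between the two (reversed) host names, e.g.
# com.example.www -> org.example.cdn, modulo any normalization the job applies.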
