# Minhash Deduplication Workload 

In this notebook, we will cover how to perform the minhash deduplication algorithm to X over the Y on common crawl data.


Common Crawl is 

In [None]:
import daft
from daft.io import IOConfig, S3Config
import os
# from daft.io.s3 import S3Config  # adjust import if needed for your project

s3_config = S3Config(
    region_name="us-east-1",
    requester_pays=True,
    profile_name=os.environ.get("AWS_PROFILE", "default"),
    anonymous=False,
)

io_config = IOConfig(s3=s3_config)

In [None]:
df = daft.from_glob_path("https://data.commoncrawl.org/crawl-data/CC-MAIN-2018-17/segments/1524125937193.1/warc/CC-MAIN-*.warc.gz").limit(10)
df.show()

FileNotFoundError: No files found at https://data.commoncrawl.org/crawl-data/CC-MAIN-2018-17/segments/1524125937193.1/warc/CC-MAIN-*.warc.gz

In [16]:
# Lets inspect a single WARC file - Note how daft can read gzipped WARC files directly
df_warc = daft.read_warc("https://data.commoncrawl.org/crawl-data/CC-MAIN-2018-17/segments/1524125937193.1/warc/CC-MAIN-20180420081400-20180420101400-00000.warc.gz")
df_warc.show()

WARC-Record-ID Utf8,WARC-Type Utf8,"WARC-Date Timestamp(Nanoseconds, Some(""Etc/UTC""))",Content-Length Int64,WARC-Identified-Payload-Type Utf8,warc_content Binary,warc_headers Utf8
bc13aa27-5dbc-4f25-aeab-c802a3ab7ef3,warcinfo,2018-04-20 08:14:00 UTC,324,,"b""robots: classic\r\nhostname: ip-10""...","{""Content-Type"":""application/warc-fields"",""WARC-Filename"":""CC-MAIN-20180420081400-20180420101400-00000.warc.gz""}"
c23396b2-da8f-44ad-8e28-c72d2d152129,request,2018-04-20 08:32:21 UTC,286,,"b""GET /news-ed-eventi/news-seat HTTP""...","{""Content-Type"":""application/http; msgtype=request"",""WARC-IP-Address"":""89.118.107.132"",""WARC-Target-URI"":""http://00064.dealerseat.com/news-ed-eventi/news-seat"",""WARC-Warcinfo-ID"":""<urn:uuid:bc13aa27-5dbc-4f25-aeab-c802a3ab7ef3>""}"
295bd657-a3ef-4c28-8027-ec136a026ceb,response,2018-04-20 08:32:21 UTC,385901,text/html,"b""HTTP/1.1 200 Apple\r\nServer: ngin""...","{""Content-Type"":""application/http; msgtype=response"",""WARC-Block-Digest"":""sha1:POWDQGDGUYEFWFCP5TLO5VL67EY7BH6K"",""WARC-Concurrent-To"":""<urn:uuid:c23396b2-da8f-44ad-8e28-c72d2d152129>"",""WARC-IP-Address"":""89.118.107.132"",""WARC-Payload-Digest"":""sha1:ACTPVYHET7H243TRUCUNM2FUI7PIY6MD"",""WARC-Target-URI"":""http://00064.dealerseat.com/news-ed-eventi/news-seat"",""WARC-Truncated"":""length"",""WARC-Warcinfo-ID"":""<urn:uuid:bc13aa27-5dbc-4f25-aeab-c802a3ab7ef3>""}"
777d0888-fecc-4a6b-9ed7-167c8611cf0c,metadata,2018-04-20 08:32:21 UTC,21,,"b""fetchTimeMs: 1307\r\n\r\n""","{""Content-Type"":""application/warc-fields"",""WARC-Concurrent-To"":""<urn:uuid:295bd657-a3ef-4c28-8027-ec136a026ceb>"",""WARC-Target-URI"":""http://00064.dealerseat.com/news-ed-eventi/news-seat"",""WARC-Warcinfo-ID"":""<urn:uuid:bc13aa27-5dbc-4f25-aeab-c802a3ab7ef3>""}"
c5a31119-976e-4c4e-939a-9dbfb040e3de,request,2018-04-20 08:32:40 UTC,269,,"b""GET /kouchi/goiken/ HTTP/1.0\r\nHo""...","{""Content-Type"":""application/http; msgtype=request"",""WARC-IP-Address"":""218.224.226.141"",""WARC-Target-URI"":""http://00monthly.com/kouchi/goiken/"",""WARC-Warcinfo-ID"":""<urn:uuid:bc13aa27-5dbc-4f25-aeab-c802a3ab7ef3>""}"
5e643323-ea8e-4a4a-a1c3-6fc61791dd08,response,2018-04-20 08:32:40 UTC,10835,application/xhtml+xml,"b""HTTP/1.1 200 OK\r\nDate: Fri, 20 A""...","{""Content-Type"":""application/http; msgtype=response"",""WARC-Block-Digest"":""sha1:FD5A7ID6JPOT3XUAMFX3XNWDD2APIVBE"",""WARC-Concurrent-To"":""<urn:uuid:c5a31119-976e-4c4e-939a-9dbfb040e3de>"",""WARC-IP-Address"":""218.224.226.141"",""WARC-Payload-Digest"":""sha1:WEHISROIBSQ5IM2BKJDKJ3FRG4GSDZ5Y"",""WARC-Target-URI"":""http://00monthly.com/kouchi/goiken/"",""WARC-Truncated"":""length"",""WARC-Warcinfo-ID"":""<urn:uuid:bc13aa27-5dbc-4f25-aeab-c802a3ab7ef3>""}"
16de518d-0ae1-4f61-8cfb-97c11b7e86c5,metadata,2018-04-20 08:32:40 UTC,20,,"b""fetchTimeMs: 620\r\n\r\n""","{""Content-Type"":""application/warc-fields"",""WARC-Concurrent-To"":""<urn:uuid:5e643323-ea8e-4a4a-a1c3-6fc61791dd08>"",""WARC-Target-URI"":""http://00monthly.com/kouchi/goiken/"",""WARC-Warcinfo-ID"":""<urn:uuid:bc13aa27-5dbc-4f25-aeab-c802a3ab7ef3>""}"
13cd06fc-beb5-4285-a903-c1fa531b3f85,request,2018-04-20 08:35:13 UTC,236,,"b""GET /pissysexy21-video HTTP/1.0\r\n""...","{""Content-Type"":""application/http; msgtype=request"",""WARC-IP-Address"":""144.217.113.16"",""WARC-Target-URI"":""http://00secretsweety.ozforex.info/pissysexy21-video"",""WARC-Warcinfo-ID"":""<urn:uuid:bc13aa27-5dbc-4f25-aeab-c802a3ab7ef3>""}"


The Minhash Deduplication algorithm

