In [6]:
import requests
import argparse
import time
import json
import gzip
import csv
import codecs
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
import nltk
import re
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [685]:
def search_domain(domain):
    record_list = []
    print ("[*] Trying target domain: %s" % domain)
    cc_url  = "http://index.commoncrawl.org/CC-MAIN-%s-index?" % "2019-13"
    cc_url += "url=%s&matchType=prefix&output=json&filter=status:200&limit=1000" % domain
    response = requests.get(cc_url)
    if response.status_code == 200:
        records = response.content.splitlines()
        for record in records:
            record_list.append(json.loads(record))
        print ("[*] Added %d results." % len(records))
    print ("[*] Found a total of %d hits." % len(record_list))
    return record_list      

In [720]:
def download_page(record, genre):
    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    resp = requests.get(prefix + record['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
    raw_data = gzip.decompress(resp.content)
    data = raw_data.decode()   
    content = ""
    if len(data):
        try:
            warc, header, response = data.strip().split('\r\n\r\n', 2)
            doc = BeautifulSoup(response , 'lxml')
            if genre in ("music", "country"):
                content = doc.find('meta', {"name" : "body"})['content']
                if ("<" in content):
                    content = ""
            elif genre in "jazz":
                content = " ".join([asd.getText() for asd in doc.find_all('p')[:-4]])
            elif genre in "rap":
                content = " ".join([asd.getText() for asd in doc.find_all('p')[2:-2]])
            elif genre in "rock":
                content = " ".join([asd.getText() for asd in doc.find_all('p')[:]])
            elif genre in "classical":
                content = re.sub(".*?\n\n\n", "", re.sub("See more Latest features.*", "", " ".join([asd.getText() for asd in doc.find_all('p')])), flags=re.DOTALL)
#                 content = re.sub("Additional notes.*", "",  content)
            elif genre in "pop":
                content = " ".join([asd.getText() for asd in doc.find_all('p')[:-4]])
            else:
                content = response
        except:
            pass   
    return  content

In [731]:
domainPrefix = [["rollingstone.com/music/", "music"], ["jazztimes.com/", "jazz"], ["allhiphop.com", "rap"], ["rollingstone.com/music/music-country/", "country"], ["loudwire.com", "rock"], ["popjustice.com/", "pop"], ["classicfm.com/discover-music/", "classical"] ]
for domain, genre  in domainPrefix:
    cnt = 0
    extractedText = []
    urlList = []
    pageList = []
    pages = search_domain(domain)
    for page in pages:
        if (cnt == 200):
            break
        downloadedPage = download_page(page, genre)
        if (len(re.sub("\s", "", downloadedPage)) > 0 and page['url'] not in urlList and downloadedPage not in pageList):
            extractedText.append([domain, genre, page['url'], downloadedPage, page['filename']])
            urlList.append(page['url'])
            pageList.append(downloadedPage)
            cnt+=1
    if (extractedText):
        pd.DataFrame(extractedText).to_csv("Extracted Raw Articles/Raw_Articles_"+genre+".csv", header=["Domain/Prefix", "Genre", "URL", "Text", "Warc file"])

[*] Trying target domain: rollingstone.com/music/
[*] Added 1000 results.
[*] Found a total of 1000 hits.
[*] Trying target domain: jazztimes.com/
[*] Added 1000 results.
[*] Found a total of 1000 hits.
[*] Trying target domain: allhiphop.com
[*] Added 1000 results.
[*] Found a total of 1000 hits.
[*] Trying target domain: rollingstone.com/music/music-country/
[*] Added 1000 results.
[*] Found a total of 1000 hits.
[*] Trying target domain: loudwire.com
[*] Added 1000 results.
[*] Found a total of 1000 hits.
[*] Trying target domain: popjustice.com/
[*] Added 807 results.
[*] Found a total of 807 hits.
[*] Trying target domain: classicfm.com/discover-music/
[*] Added 1000 results.
[*] Found a total of 1000 hits.


In [3]:
domainPrefix = [["rollingstone.com/music/", "music"], ["jazztimes.com/", "jazz"], ["allhiphop.com", "rap"], ["rollingstone.com/music/music-country/", "country"], ["loudwire.com", "rock"], ["popjustice.com/", "pop"], ["classicfm.com/discover-music/", "classical"] ]
ccDFs = []
for domain, genre in domainPrefix:
    ccDFs.append(pd.read_csv("Extracted Raw Articles/Raw_Articles_"+genre+".csv"))

In [756]:
ccDFs[0]

Unnamed: 0.1,Unnamed: 0,Domain/Prefix,Genre,URL,Text,Warc file
0,0,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,Die Antwoord are rap&apos;s new aliens: a trio...,crawl-data/CC-MAIN-2019-13/segments/1552912202...
1,1,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,"The sound of hip-hop in 2014 is the rubbery, s...",crawl-data/CC-MAIN-2019-13/segments/1552912201...
2,2,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,In the four years since Australian powerhouse ...,crawl-data/CC-MAIN-2019-13/segments/1552912203...
3,3,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,"Inconsistently brilliant, Graham Parker has so...",crawl-data/CC-MAIN-2019-13/segments/1552912202...
4,4,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,"We&apos;re still singing that same song,"" the ...",crawl-data/CC-MAIN-2019-13/segments/1552912203...
5,5,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,Moby may look like an unlikely candidate for t...,crawl-data/CC-MAIN-2019-13/segments/1552912203...
6,6,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,"p>Nowadays, Jerry Lee Lewis is a respected cou...",crawl-data/CC-MAIN-2019-13/segments/1552912202...
7,7,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,Like fellow crooners Amy Winehouse and Kate Na...,crawl-data/CC-MAIN-2019-13/segments/1552912205...
8,8,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,"""You used to strip for a nigga,"" Terius Nash r...",crawl-data/CC-MAIN-2019-13/segments/1552912203...
9,9,rollingstone.com/music/,music,https://www.rollingstone.com/music/music-album...,"When Taylor Swift decides to do something, the...",crawl-data/CC-MAIN-2019-13/segments/1552912203...


In [4]:
for ccDF in ccDFs:
    if (ccDF.shape[0] != ccDF['URL'].nunique()):
        print("Duplicate URLs found!")
print("No duplicate URLs found!")
print("Total number of unique URLs: ", pd.concat(ccDFs)['URL'].nunique())

No duplicate URLs found!
Total number of unique URLs:  1400


In [47]:
stopWordsSet = set(stopwords.words('english')) 
stopWordsSet.update(("going","get","thing","also","really","would","know","say","way","got","lot","said","like","could","allhiphop","chuck","creekmur","he"))

for ccDF in ccDFs:
    keyword = ccDF["Genre"][0]
    with open("Pre-Processed Articles/commoncrawl_text_"+keyword+".txt", "a+") as text_file:
        for index, row in ccDF.iterrows():
            ccText = str.strip(re.sub('\s?https?://.*?[\s|\n]', '', row["Text"]+" "))
            ccText = re.sub('&.*?;', '', ccText)
            ccText = re.sub("[\n|\r]+", "\n", ccText)
            for text in ccText.split("\n"):
                text = " ".join(re.findall("[a-zA-Z]+",text))
                tokenizedText = word_tokenize(text.lower())
                finalText = " ".join([WordNetLemmatizer().lemmatize(w) for w in tokenizedText if not w in stopWordsSet and len(WordNetLemmatizer().lemmatize(w)) >= 3])
                text_file.write(str.strip(finalText) + "\n")