In [None]:
import os
import cloudscraper as cs
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import string
from google.cloud import language_v1

# Point the Google Cloud client libraries at a service-account key file.
# setdefault keeps a credentials path that is already exported in the
# environment (shell, CI, another notebook cell) instead of overriding it
# with this notebook-local file name, which is only used as a fallback.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', "raw-data-392614-6e7d6f0fa9e1.json")

In [None]:
# Let NLTK look for corpora in the notebook's working directory as well.
# The membership check keeps this cell idempotent: re-running it no longer
# appends the same directory to the search path again and again.
if os.getcwd() not in nltk.data.path:
    nltk.data.path.append(os.getcwd())
nltk.data.path

In [None]:
# One-time setup: uncomment the next line to download the stopword corpus.
#nltk.download('stopwords')
stopwords.words('english')[0:10] # Show some stop words to verify the install

In [None]:
class Website:
    """Fetch a web page and reduce it to a cleaned string of its main text.

    Attributes:
        domain: URL that is passed verbatim to the scraper's ``get`` call.
    """

    def __init__(self, domain):
        self.domain = domain

    def grab_details(self):
        """Request ``self.domain`` through a fresh cloudscraper session.

        Returns:
            The response object returned by ``scraper.get``. ``grab_content``
            reads its ``status_code`` and ``text`` attributes.
        """
        scraper = cs.create_scraper()
        return scraper.get(self.domain)

    def text_process(self, content):
        """Strip punctuation and English stopwords from a string.

        Steps:
        1. Remove every character listed in ``string.punctuation``.
        2. Split the remainder on whitespace.
        3. Drop words whose lowercase form is an NLTK English stopword.

        Args:
            content: The text to clean (a ``str``).

        Returns:
            A list of the remaining words, in their original order and case.
        """
        # Delete all punctuation characters in a single pass.
        nopunc = content.translate(str.maketrans('', '', string.punctuation))

        # Load the stopword list once and use a set for O(1) membership tests;
        # calling stopwords.words() inside the filter would reload it per word.
        stop_words = set(stopwords.words('english'))

        return [word for word in nopunc.split() if word.lower() not in stop_words]

    def grab_content(self, max_words: int = 999):
        """Download the page and return its cleaned text as a single string.

        The text is assembled from the page title, the ``description`` meta
        tag, and every ``h1``, ``h2``, ``h3`` and ``p`` element (in that
        order), then passed through ``text_process`` and truncated.

        Args:
            max_words: Maximum number of cleaned words to keep. Defaults to
                999, the limit that was previously hard-coded.

        Returns:
            The cleaned words joined by single spaces, or ``None`` when the
            response status is not 200 or any error occurs while fetching or
            parsing (the error is printed).
        """
        try:
            # Fetch exactly once: the status check and the parsed body must
            # come from the same response.
            response = self.grab_details()
            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.text, 'html.parser')

            # A page without <title> contributes an empty title instead of
            # aborting the whole extraction.
            title_tag = soup.find('title')
            title = title_tag.text if title_tag is not None else ""

            # Read the attribute directly; a meta tag lacking ``content``
            # yields an empty description rather than None.
            meta = soup.find('meta', attrs={'name': 'description'})
            description = (meta.get("content") or "") if meta is not None else ""

            sections = [title, description]
            for tag_name in ('h1', 'h2', 'h3', 'p'):
                sections.append(" ".join(tag.text for tag in soup.find_all(tag_name)))

            words = self.text_process(" ".join(sections))
            return " ".join(words[:max_words])

        except Exception as e:
            # Best-effort scraping: report the problem and signal failure
            # with None so callers can skip this site.
            print(e)
            return None

In [None]:
def classify(urlList:list):
    """Classify the text content of each URL with the Cloud Natural Language API.

    For every URL the page text is extracted with ``Website.grab_content`` and
    sent to ``classify_text``. One output row is produced per returned
    category.

    Args:
        urlList: URLs to fetch and classify.

    Returns:
        A DataFrame with the columns ``URL``, ``Text``, ``Category``,
        ``Confidence`` and ``Length`` (``len`` of the extracted text). URLs
        for which no text could be extracted are skipped and contribute no
        rows.
    """
    columns = ['URL', "Text", "Category", "Confidence", "Length"]
    language_client = language_v1.LanguageServiceClient()

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []

    for url in urlList:
        content = Website(url).grab_content()

        # grab_content returns None on a failed fetch/parse; an empty string
        # has nothing to classify either.
        if not content:
            print(f"Skipping {url}: no content could be extracted")
            continue

        document = language_v1.Document(
            content=content, type_=language_v1.Document.Type.PLAIN_TEXT
        )
        response = language_client.classify_text(request={"document": document})

        for category in response.categories:
            records.append({
                "URL": url,
                "Text": content,
                "Category": category.name,
                "Confidence": category.confidence,
                "Length": len(content),
            })

    # Passing columns keeps the expected schema even when no rows were produced.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Sites to run through the classifier, one request per entry.
urls = [
    "https://www.disney.com",
    "https://wpengine.com",
    "https://deltiasgaming.com",
    "https://velocitize.com",
]

# One row per (URL, category) pair returned for the extracted page text.
classification = classify(urlList=urls)

In [None]:
# Display the full results table returned by classify().
classification

In [None]:
# Keep, for every URL, the single row with the highest confidence.
# groupby(...).max() would take the maximum of each column independently and
# so pair the top confidence with the alphabetically last category name.
clean = (
    classification
    .sort_values("Confidence", ascending=False, kind="mergesort")
    .drop_duplicates(subset="URL", keep="first")
    .sort_values("URL")
    .reset_index(drop=True)
)

# Only show sites whose best category is a confident match.
clean[clean["Confidence"] > .8]

In [None]:
# Total number of characters of extracted text across the classified sites.
# "Length" is repeated on every category row of a URL, so count each URL
# once; summing the raw column would weight a page by its category count.
classification.drop_duplicates(subset="URL")["Length"].sum()