## URLs

These are real URLs that we want to classify or extract keywords/topics from.

In [1]:
# Sample URL whose page points to data science / machine learning related content
url1 = "https://pytorch3d.readthedocs.io/en/latest/overview.html"

In [2]:
# Sample URL whose page points to Python and backend web related content
url2 = "https://flask-mqtt.readthedocs.io/en/latest/"

In [3]:
# Sample URL whose page points to Python, backend web and DevOps related content
url3 = "https://huey.readthedocs.io/en/latest/guide.html"

## Topics & Keywords

These are some topics and keywords we're interested in. Ideally, these are the topics and keywords to focus on, but each topic could have additional keywords we haven't identified yet. There might also be other topics to "discover".

In [4]:
# Maps a topic slug to a dict holding the topic's display "name" and the list
# of "keywords" associated with it.
# A keyword may be listed under more than one topic (e.g. "oauth" appears in
# both "security-privacy" and "devops"; "django" and "flask" appear in both
# "backend-web" and "python").
TOPICS = {
    "data-science": {
        "name": "Data science & machine learning",
        "keywords": [
            "data-science",
            "datascience",
            "ml",
            "training-data",
            "model-training",
            "machine-learning",
            "machinelearning",
            "sentiment-analysis",
            "ai",
            "artificial-intelligence",
            "neural-net",
            "neural-nets",
            "data-analytics",
            "visualization",
            "data-visualization",
            "nlp",
            "object-detection",
            "computer-vision",
            "jupyter",
            "matplotlib",
            "deep-learning",
            "pytorch",
            "pydata",
            "opencv-python-library",
            "pandas",
            "numpy",
            "tensor",
            "tensorflow",
        ],
    },
    "backend-web": {
        "name": "Backend web development",
        "keywords": [
            "backend",
            "backend-web",
            "flask",
            "django",
            "werkzeug",
            "wsgi",
            "celery",
            "jinja",
        ],
    },
    "frontend-web": {
        "name": "Frontend web development",
        "keywords": [
            "frontend",
            "frontend-web",
            "javascript",
            "react",
            "reactjs",
            "css",
            "angular",
            "angularjs",
            "jquery",
            "vuejs",
            "vue",
            "webpack",
            "node",
            "nodejs",
        ],
    },
    "security-privacy": {
        "name": "Security & privacy",
        "keywords": [
            "security",
            "privacy",
            "cryptography",
            "oauth",
            "authorization",
            "authentication",
        ],
    },
    "devops": {
        "name": "DevOps",
        "keywords": [
            "devops",
            "cloud",
            "docker",
            "kubernetes",
            "container",
            "containers",
            "ansible",
            "serverless",
            "openshift",
            "aws",
            "linux",
            "ubuntu",
            "monitoring",
            "openid-connect",
            "oauth",
            "redis",
            "rabbitmq",
            "nosql",
            "postgres",
            "postgresql",
            "mysql",
            "database",
            "elasticsearch",
            "lucene",
            "solr",
            "terraform",
            "nginx",
        ],
    },
    "python": {
        "name": "Python development",
        "keywords": ["python", "django", "flask"],
    },
    "game-dev": {
        "name": "Game development",
        "keywords": ["gamedev", "minecraft", "godot", "game"],
    },
    "blockchain": {
        "name": "Blockchain",
        "keywords": [
            "blockchain",
            "ethereum",
            "bitcoin",
            "cryptocurrency",
            "hyperledger",
            "solidity",
        ],
    },
    "techwriting": {
        "name": "Technical writing",
        "keywords": ["technical-writing", "sphinx", "sphinx-doc", "mkdocs"],
    },
}

## Analyzers

These are some different analyzers we are already using as well as some proposed ones.

In [5]:
import logging

import requests
import urllib3

In [7]:
class BaseAnalyzerBackend:

    """
    Base class that all analyzers should extend.

    Subclasses implement :meth:`analyze_response`; callers use :meth:`analyze`,
    which fetches ``self.url`` and hands a successful response to the subclass.
    """

    def __init__(self, url, **kwargs):
        """
        Base constructor.

        :param str url: the URL that will be fetched and analyzed
        :param kwargs: extra keyword arguments; accepted but not used here
        """
        self.url = url
        self.user_agent = "EthicalAds Analyzer"

    def fetch(self, **kwargs):
        """
        Performs a URL fetch on the analyzed URL.

        Any keyword arguments are passed through to ``requests.get``. Defaults
        are filled in for ``allow_redirects``, ``timeout`` and ``headers`` only
        when the caller did not supply them.

        :returns: the response object, or `None` if the request raised an error.
        """
        # Unless specifically following redirects, don't bother
        # Something is probably wrong if the request returns a redirect
        kwargs.setdefault("allow_redirects", False)
        kwargs.setdefault("timeout", 3)  # seconds
        kwargs.setdefault("headers", {"user-agent": self.user_agent})

        try:
            return requests.get(self.url, **kwargs)
        except (requests.exceptions.RequestException, urllib3.exceptions.HTTPError):
            # Best-effort fetch: record the failure instead of silently dropping it,
            # then report "no response" to the caller.
            logging.getLogger(__name__).info(
                "Error analyzing URL: %s", self.url, exc_info=True
            )
            return None

    def analyze(self):
        """
        Fetch the response and parse it for keywords.

        :returns list: a list of keywords or `None` if the URL doesn't respond.
        """
        resp = self.fetch()

        if resp and resp.ok:
            return self.analyze_response(resp)

        # A failed request results in `None`.
        return None

    def analyze_response(self, resp):
        """
        Analyze an HTTP response and return keywords/topics for the URL.

        This will only be passed a successful response (20x).
        All responses should return a list of keywords even if that list is empty.

        This needs to be defined by subclasses

        :param resp: the response returned by :meth:`fetch`
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Subclasses should define this.")


In [9]:
"""Naive keyword analyzer that is simply based on keyword counts."""
import collections

from bs4 import BeautifulSoup


class NaiveKeywordAnalyzerBackend(BaseAnalyzerBackend):

    """
    A very naive keyword analyzer.

    This mimics the results of our ad client.

    The page's main content is located with ``MAIN_CONTENT_SELECTORS`` and its
    words are counted against the keywords collected from ``TOPICS``.
    """

    # CSS selectors to select the "main" content of the page
    # The first of these to match anything is used
    MAIN_CONTENT_SELECTORS = (
        "[role='main']",
        "main",
        "body",
    )

    # At most this many whitespace-separated words are examined per page
    MAX_WORDS_ANALYZED = 1000
    # At most this many keywords are returned
    MAX_KEYWORDS = 3
    # A keyword must occur at least this many times to be returned
    MIN_KEYWORD_OCCURRENCES = 2

    def __init__(self, url, **kwargs):
        """
        Overrides to get the keyword corpus.

        :param str url: the URL that will be fetched and analyzed
        :param kwargs: forwarded unchanged to the base constructor
        """
        super().__init__(url, **kwargs)

        self.topics = TOPICS
        # Flatten every topic's keyword list into one set for O(1) membership tests
        self.keywords = set()
        for topic in self.topics.values():
            self.keywords.update(topic["keywords"])

    def analyze_response(self, resp):
        """
        Analyze an HTTP response and return a list of keywords/topics for the URL.

        :param resp: a response whose ``content`` is parsed as HTML
        :returns list: keywords found in the first matching main-content element,
            or an empty list when no selector matches.
        """
        keywords = []

        soup = BeautifulSoup(resp.content, features="html.parser")

        for selector in self.MAIN_CONTENT_SELECTORS:
            results = soup.select(selector, limit=1)

            # If no results, go to the next selector
            # If results are found, use these and stop looking at the selectors
            if results:
                text = results[0].get_text().replace("\n", " ")
                keywords = self.analyze_text(text)
                break

        return keywords

    def analyze_text(self, text):
        """
        Analyze a large string of text for keyword extraction.

        Only the first ``MAX_WORDS_ANALYZED`` whitespace-separated words are
        considered. Each word has punctuation (except hyphens) removed and is
        lowercased before being compared against the known keywords.

        :param str text: the text to scan
        :returns list: up to ``MAX_KEYWORDS`` keywords, most frequent first, each
            occurring at least ``MIN_KEYWORD_OCCURRENCES`` times.
        """
        # Differs from string.punctuation in that the hyphen is missing
        punctuation = r"""!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
        # Build the translation table once rather than once per word
        strip_punctuation = str.maketrans("", "", punctuation)

        word_counter = collections.Counter()

        # Slice to exactly MAX_WORDS_ANALYZED words; comparing an enumerate
        # index with ">" would let one extra word through.
        for word in text.split()[: self.MAX_WORDS_ANALYZED]:
            # Remove punctuation and make it lowercase
            word = word.translate(strip_punctuation).lower()

            if word in self.keywords:
                word_counter[word] += 1

        # Remove items with fewer than MIN_KEYWORD_OCCURRENCES
        word_counter = collections.Counter(
            {k: v for k, v in word_counter.items() if v >= self.MIN_KEYWORD_OCCURRENCES}
        )

        # Return the top MAX_KEYWORDS
        return [kw for kw, _ in word_counter.most_common(self.MAX_KEYWORDS)]


In [13]:
# Show five of the combined keywords; `keywords` is a set, so the order is arbitrary
list(NaiveKeywordAnalyzerBackend("").keywords)[:5]

['postgres', 'serverless', 'minecraft', 'reactjs', 'frontend-web']

In [14]:
# Run the naive analyzer on url1, which should probably be ML/data science related
analyzer = NaiveKeywordAnalyzerBackend(url1)
analyzer.analyze()

['pytorch', 'backend']

In [15]:
# Run the naive analyzer on url2, which should probably be Python/backend related
analyzer = NaiveKeywordAnalyzerBackend(url2)
analyzer.analyze()

['flask', 'python']

In [16]:
# Run the naive analyzer on url3, which should probably be Python/backend/DevOps related
analyzer = NaiveKeywordAnalyzerBackend(url3)
analyzer.analyze()

[]

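**NOTE:** This last URL wasn't classified correctly. The naive analyzer only looks at roughly the first 1,000 words of the main content (`MAX_WORDS_ANALYZED`) and only reports keywords that occur at least twice (`MIN_KEYWORD_OCCURRENCES`). No real identifying terms appeared in the first few paragraphs, so no keywords were returned and the page was misclassified.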

### A textacy-based analyzer?

In [24]:
import textacy
from textacy import preprocessing
# from textacy.extract import keyterms

In [45]:
class TextacyAnalyzerBackend(NaiveKeywordAnalyzerBackend):

    """Keyword analyzer that ranks key terms with textacy's TextRank."""

    def analyze_response(self, resp):
        """
        Extract key terms from the main content of an HTTP response.

        The selectors in ``MAIN_CONTENT_SELECTORS`` are tried in order and the
        text of the first element matched is analyzed; newlines in that text
        are left in place. Returns an empty list when no selector matches.
        """
        soup = BeautifulSoup(resp.content, features="html.parser")

        for selector in self.MAIN_CONTENT_SELECTORS:
            matches = soup.select(selector, limit=1)
            if not matches:
                # Nothing matched; fall through to the next selector
                continue

            # First selector that matches wins
            return self.analyze_text(matches[0].get_text())

        return []

    def analyze_text(self, text):
        """
        Return the top 10 TextRank key terms for a large string of text.

        The text is unicode-normalized, stripped of punctuation,
        whitespace-normalized and lowercased before being parsed.
        """
        clean = preprocessing.make_pipeline(
            preprocessing.normalize.unicode,
            preprocessing.remove.punctuation,
            preprocessing.normalize.whitespace,
        )

        # Followed tips from the Textacy official docs
        # https://textacy.readthedocs.io/en/latest/quickstart.html
        lowered = clean(text).lower()
        nlp = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",))
        doc = textacy.make_spacy_doc(lowered, lang=nlp)

        # Each result is a (phrase, weight) pair; the weight is currently unused.
        # NOTE: the ngrams extractor was also tried and didn't seem to work quite as well.
        ranked = textacy.extract.keyterms.textrank(doc, normalize="lemma", topn=10)
        return [phrase for phrase, _weight in ranked]


In [46]:
# Run the textacy analyzer on url1, which should probably be ML/data science related
analyzer = TextacyAnalyzerBackend(url1)
analyzer.analyze()

['pytorch3d https github com facebookresearch synsin',
 'pytorch3d codebase',
 'pytorch3d renderer',
 'mesh r cnn codebase',
 'fair pytorch3d',
 'pytorch3d useful',
 'pytorch3d documentation',
 'arxiv https arxiv org ab',
 'facebook ai research computer vision team',
 'differentiable mesh renderer']

In [47]:
# Run the textacy analyzer on url2, which should probably be Python/backend related
analyzer = TextacyAnalyzerBackend(url2)
analyzer.analyze()

['flask mqtt',
 'paho mqtt package',
 'mosquitto mqtt server',
 'mqtt s documentation',
 'multiple iot device',
 'mqtt client',
 'mqtt integration',
 'flask application',
 'multiple worker',
 'flask extension']

In [48]:
# Run the textacy analyzer on url3, which should probably be Python/backend/DevOps related
analyzer = TextacyAnalyzerBackend(url3)
analyzer.analyze()

['huey periodic task crontab minute= 0 hour=',
 'huey periodic task schedule',
 'task task',
 'def print signal args signal task exc',
 'huey task retries=2 retry',
 'signal = = signal error',
 'time sensitive task schedule',
 'huey lock task report lock',
 'huey lock task method',
 'huey periodic task decorator']