# Feature Extraction

After extracting the comments from the Hacker News posts, this notebook processes the data (headline and body), and attempts to extract relevant information:

* select only early-stage companies (exclude companies that have gone through Series A, B, C or D)
* company name
* location (using SpaCy and Flair, then data from GeoNames is used to determine if the location is in Europe or not)

In [1]:
import os
import warnings
import json

warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import spacy
from flair.models import SequenceTagger
from flair.data import Sentence
from tqdm.notebook import tqdm
import re

import tldextract
from Levenshtein import ratio

from string import punctuation

In [2]:
# LOAD DATA
loaded_data = []
with open("../data/hacker_news_comments.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        json_object = json.loads(line)
        loaded_data.append(json_object)

df = pd.DataFrame(loaded_data)

df["company_name"] = df['headline'].apply(lambda x: "|".join(x.split("|")[:1]).strip())
df["location"] = df['headline'].apply(lambda x: "|".join(x.split("|")[1:2]).strip().lower())
df["post_info"] = df['headline'].apply(lambda x: "|".join(x.split("|")[2:]).strip())

## Early-stage companies
Select only early-stage companies, by excluding companies that have already gone through a series A, B, C or D

In [3]:
def filter_early_stage_companies(df):
    series_pattern = r"series\s+[abcd]"
    df = df[
        (~df["headline"].str.contains(series_pattern, case=False, na=False))
        & (~df["body"].str.contains(series_pattern, case=False, na=False))
    ]
    return df

In [4]:
df = filter_early_stage_companies(df)

## Location Extraction

#### Geonames
Use Geonames to determine if a found location (using SpaCy) is in Europe or not

In [5]:
def load_geonames_loacations(filepath):
    geonames_df = pd.read_csv(
        filepath, delimiter=";", na_values=None, keep_default_na=False
    )

    geonames_df = geonames_df[geonames_df["Population"] >= 120000]
    geonames_df = geonames_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
    # Remove "city" from the city names. Eg: New York city -> New York
    geonames_df["ASCII Name"] = geonames_df["ASCII Name"].str.replace(" city", "")
    geonames_df["Country name EN"] = geonames_df["Country name EN"].apply(
        lambda x: x.split(",")[0].strip()
    )
    # For cities with the same name, keep only the biggest (population-wise)
    geonames_df_sorted = geonames_df.sort_values(
        by=["Name", "Population"], ascending=[True, False]
    )
    geonames_df = geonames_df_sorted.groupby("Name").first().reset_index()

    # Filter European locations
    eu_locations = geonames_df[geonames_df["Timezone"].str.startswith("europe")][
        ["ASCII Name", "Country name EN"]
    ].values
    eu_locations = set(element for sublist in eu_locations for element in sublist)

    # Filter USA locations
    usa_locations = set(
        geonames_df[geonames_df["Country name EN"] == "united states"]["Name"]
    )

    # Filter all other locations
    other_locations = geonames_df[
        (~geonames_df["Timezone"].str.startswith("europe"))
        & (geonames_df["Country name EN"] != "united states")
    ][["ASCII Name", "Country name EN"]].values
    other_locations = set(element for sublist in other_locations for element in sublist)

    return eu_locations, usa_locations, other_locations

In [6]:
# Load Geonames data
eu_locations, usa_locations, other_locations = load_geonames_loacations("../data/geonames_1000.csv")

eu_locations.update(["europe", "european", "emea", "€", "£"])
usa_locations.update(["united states", "bay area", "palo alto", "us remote", "america", "nyc", "usd", "$", "new york", "usa"])

In [7]:
def assign_continent(obj):
    continent = "unknown"

    if obj in eu_locations:
        continent = "europe"
    if obj in usa_locations:
        continent = "usa"
    elif obj in other_locations:
        continent = "other"

    return continent

In [8]:
# Basic string matching
df["region"] = df["location"].apply(assign_continent)

#### Name Entity Recognition with SpaCy
Try NER on the columns `location`, `post_info` and `body`, and assign the location to a continent using the GeoNames.

In [9]:
# Load the spaCy model
spacy_model = spacy.load("en_core_web_trf")

In [10]:
# Attempt to find location first in the location column, then in the post_info, and lastly in the post's body
def ner_location(df, nlp):
    locations = []
    for column_value in df:
        doc = nlp(column_value)
        locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
        # found location
        if len(locations) > 0:
            break
    return list(map(lambda x: x.lower(), set(locations)))

In [11]:
# Search location only for instances without a EU/US tag
no_location_index = df[(df["region"] == "unknown")].index

for i, row in tqdm(df.loc[no_location_index, ["location", "post_info", "body"]].iterrows(), total=len(no_location_index)):
    ner_location_list = ner_location(row, spacy_model)
    if ner_location_list:
        for location in ner_location_list:
            continent = assign_continent(location)
            if continent != "unknown":
                break
    df.loc[i, "region"] = continent

  0%|          | 0/9422 [00:00<?, ?it/s]

## Company Name Extraction
Extracts company names from a DataFrame using various methods:

    1. Preproces of the extracted company name (from Headline)
    2. Predicting names using NER models (Flair and spaCy).
    3. Reconciling predictions and choosing the best candidate.

In [12]:
# Cleans a company name string by removing extraneous text and formatting.
# Eg: removes text between parenthesis, removes text after "https" and after " - "

def clean_company_name(input_string):
    cleaned_string = re.sub(r"\(.*?\).*", "", input_string)
    # Split the text at " https" (with a space before)
    parts = cleaned_string.split(" https")

    # Clean each part separately
    cleaned_parts = []
    for part in parts:
        # Find the first non-escaped slash
        first_unescaped_slash = re.search(r"(?<!\\)/", part)  # Corrected regex

        # Remove everything after the first slash if not preceded by "https"
        if not part.startswith("https") and first_unescaped_slash:
            # Get the substring before the first non-escaped slash
            part = part[: first_unescaped_slash.start()]

        # Remove everything after " https" (with a space before)
        part = re.sub(r" https.*$", "", part)

        # Remove everything after " - " (with spaces and the hyphen)
        part = re.sub(r"(^|\s)+\-+[\>\s]+.*", "", part)

        # Remove trailing punctuation and leading/trailing whitespace
        cleaned_part = part.rstrip(punctuation).strip()

        # Add the cleaned part to the list
        cleaned_parts.append(cleaned_part)

    # Join the cleaned parts back together
    return " ".join(cleaned_parts)

In [13]:
df["cleaned_company_name"] = df["company_name"].apply(clean_company_name)
df.drop(["company_name"], axis=1, inplace=True)

In [14]:
def predict_company_name(company_names_list, ner_model):
    predicted_names = []
    for name in tqdm(company_names_list):
        if name:
            # SpaCy model
            if isinstance(ner_model, spacy.lang.en.English):
                doc = ner_model(name)
                entities = doc.ents
            # Flair model
            elif isinstance(ner_model, SequenceTagger):
                sentence = Sentence(name)
                ner_model.predict(sentence)
                entities = sentence.get_spans("ner")

            if entities:
                # If multiple spans found, return the first (higher likelihood)
                if len(entities) > 1:
                    predicted_names.append(entities[0].text)
                else:
                    for token in entities:
                        predicted_names.append(token.text)
            else:
                predicted_names.append("")
        else:
            predicted_names.append("")
    return [x.strip() for x in predicted_names]

In [15]:
# Load the spaCy model and the Flair SequenceTagger
flair_model = SequenceTagger.load("ner-fast")

2024-03-14 18:38:47,816 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [16]:
names2predict = df["cleaned_company_name"].tolist()

flair_ner = "flair_company_name"
spacy_ner = "spacy_company_name"

# Predict company names with Flair
df[flair_ner] = predict_company_name(names2predict, flair_model)
# Predict company names with spaCy
df[spacy_ner] = predict_company_name(names2predict, spacy_model)

  0%|          | 0/10274 [00:00<?, ?it/s]

  0%|          | 0/10274 [00:00<?, ?it/s]

In [17]:
# Reconcile company names if different

# Fill in missing values from the ner column with ner2 values
empty_ner1_index = df[(df[flair_ner] != df[spacy_ner]) & (df[flair_ner] == "")].index
df.loc[empty_ner1_index, flair_ner] = (
    df.loc[empty_ner1_index, flair_ner].replace("", pd.NA).fillna(df.loc[empty_ner1_index, spacy_ner])
)

# If contrasting entities found, chose the longest string
contrasting_ner_index = df[(df[flair_ner] != df[spacy_ner]) & (df[spacy_ner] != "")].index
df.loc[contrasting_ner_index, flair_ner] = df.loc[contrasting_ner_index, [flair_ner, spacy_ner]].apply(
    lambda x: max(x, key=len), axis=1
)

# If no NER was found, fill with cleaned company name
no_ner_index = df[(df[flair_ner] == "")].index
df.loc[no_ner_index, flair_ner] = (
    df.loc[no_ner_index, flair_ner].replace("", pd.NA).fillna(df.loc[no_ner_index, "cleaned_company_name"])
)

In [18]:
df.rename(columns={'flair_company_name': 'company_name'}, inplace=True)
df.drop([spacy_ner, "cleaned_company_name", "location", "post_info", "hash"], axis=1, inplace=True)

## URL Extraction
Extracts a company URL from text based on domain similarity to the company name.

In [19]:
def extract_company_url(company_name, text):
    regex = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
    matches = re.findall(regex, text)
    urls = set()
    for match in matches:
        url = tldextract.extract(match)
        urls.add((url.domain, url.suffix))

    ratios = [ratio(company_name, domain[0]) for domain in urls]
    if ratios:
        largest_ratio = max(zip(urls, ratios), key=lambda x: x[1])
        largest_ratio_domain = ".".join(largest_ratio[0])

        if largest_ratio[1] > 0.25:
            largest_ratio_domain = ".".join(largest_ratio[0])
            return largest_ratio_domain
        else:
            return ""
    else:
        return ""

In [20]:
df["company_url"] = df[["company_name", "headline", 'body']].apply(lambda x: extract_company_url(x[0], x[1]+x[2]), axis=1)

In [21]:
output_filename = os.path.join("../data/outputs/output.csv")
df.to_csv(output_filename, index=False)

## 🚀 Feature extraction completed.