In [1]:
import pandas as pd
import numpy as np
import regex as re

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
nltk.download("punkt")

import matplotlib.pyplot as plt

import spacy
import en_core_web_sm
nlp = spacy.load("en_core_web_sm")


from bs4 import BeautifulSoup

import torch
import transformers

from IPython.display import display

[nltk_data] Downloading package punkt to C:\Users\Daan
[nltk_data]     Brugmans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Define four helper functions

## 1. Function to get unique characters from a text column
def get_unique_characters(text_column):
    unique_chars = set()
    for text in text_column:
        if isinstance(text, str):  # Check if the value is a string and not NaN
            unique_chars.update(set(text))
    return list(unique_chars)

## 2. Cleaning/editing using RegEx
def regex_edits(df):
## An example cleaning operation using RegEx
  df['YOUR DATA COLUMN'] = [re.sub('YOUR REGEX', "YOUR SUBSTITUTE", str(x)) for x in df['YOUR DATA COLUMN']]
  return df

## 3. Descriptive statistics
def analyze_text_column(data_frame, column_name):
    # Ensure the column is of type string, and drop rows with missing values
    data_frame = data_frame.dropna(subset=[column_name])
    data_frame[column_name] = data_frame[column_name].astype(str)

    # Tokenize the text in the specified column
    tokens = [word_tokenize(text) for text in data_frame[column_name]]

    # Calculate token count
    token_count = sum(len(token_list) for token_list in tokens)

    # Calculate type count
    types = set(word for token_list in tokens for word in token_list)
    type_count = len(types)

    # Calculate type/token ratio
    type_token_ratio = type_count / token_count

    # Calculate frequency distribution
    flat_tokens = [word for token_list in tokens for word in token_list]
    freq_dist = FreqDist(flat_tokens)

    # Print the results
    print(f"Token Count: {token_count}")
    print(f"Type Count: {type_count}")
    print(f"Type/Token Ratio: {type_token_ratio:.4f}")

    print("\nFrequency List:")
    for word, freq in freq_dist.most_common(20):  # Change '20' to display more or fewer items
        print(f"{word}: {freq}")

## 4. Investigate rank/frequency distribution of the dataset.
def plot_frequency_rank_distribution(df, column_name, alpha=1.0):
    # Compute the frequency of each unique value in the specified column
    value_counts = df[column_name].value_counts()

    # Sort the frequencies in descending order and compute the rank
    sorted_counts = value_counts.sort_values(ascending=False)
    rank = np.arange(1, len(sorted_counts) + 1)

    # Plot the result as a line chart
    plt.figure(figsize=(10, 6))
    plt.plot(rank, sorted_counts, marker='.', linestyle='-', markersize=1, label='Actual Data')

    # Compute and plot the reference Zipf distribution
    # Calculate the Zipf distribution frequencies
    zipf_freq = [sorted_counts.iloc[0] / (i ** alpha) for i in rank]

    plt.plot(rank, zipf_freq, linestyle='--', label=f'Zipf (alpha={alpha})')

    # Add a legend for the top 5 most ranked tokens
    top_tokens = sorted_counts.index[:5]  # Get the top 5 tokens by rank

    # Create a legend with the top 5 tokens
    legend_labels = ['Actual Data', f'Zipf (alpha={alpha})'] + [f'Top 5: {token}' for token in top_tokens]

    # Add the legend with specified labels
    plt.legend(legend_labels)

    plt.xlabel('Rank')
    plt.ylabel('Frequency')
    plt.yscale('log')
    plt.xscale('log')
    plt.title('Frequency/Rank Distribution')
    plt.grid(True)
    plt.show()

In [12]:
df = pd.read_csv("../data/Webscraped raw English.csv")
df.iloc[8]["Webscraped"]

'Job Id: 21334722<br><br>The Project Senior Analyst is a seasoned professional role. Applies in-depth disciplinary knowledge, contributing to the development of new techniques and the improvement of processes and work-flow for the area or function. Integrates subject matter and industry expertise within a defined area. Requires in-depth understanding of how areas collectively integrate within the sub-function as well as coordinate and contribute to the objectives of the function and overall business. Evaluates moderately complex and variable issues with substantial potential impact, where development of an approach/taking of an action involves weighing various alternatives and balancing potentially conflicting situations using multiple sources of information. Requires good analytical skills in order to filter, prioritize and validate potentially complex and dynamic material from multiple sources. Strong communication and diplomacy skills are required. Regularly assumes informal/formal 

In [13]:
def remove_html_tags(text):
    """
    Remove HTML tags from the text,
    parse HTML entities (&nsbp, &gt, etc)
    """

    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(' ')

df['Webscraped'] = df['Webscraped'].apply(remove_html_tags)
df.iloc[8]["Webscraped"]

'Job Id: 21334722 The Project Senior Analyst is a seasoned professional role. Applies in-depth disciplinary knowledge, contributing to the development of new techniques and the improvement of processes and work-flow for the area or function. Integrates subject matter and industry expertise within a defined area. Requires in-depth understanding of how areas collectively integrate within the sub-function as well as coordinate and contribute to the objectives of the function and overall business. Evaluates moderately complex and variable issues with substantial potential impact, where development of an approach/taking of an action involves weighing various alternatives and balancing potentially conflicting situations using multiple sources of information. Requires good analytical skills in order to filter, prioritize and validate potentially complex and dynamic material from multiple sources. Strong communication and diplomacy skills are required. Regularly assumes informal/formal leaders

In [14]:
def remove_multiple_dashes(text):
    return re.sub(r'-{2,}', '-', text)

df['Webscraped'] = df['Webscraped'].apply(remove_multiple_dashes)
df.iloc[8]["Webscraped"]

'Job Id: 21334722 The Project Senior Analyst is a seasoned professional role. Applies in-depth disciplinary knowledge, contributing to the development of new techniques and the improvement of processes and work-flow for the area or function. Integrates subject matter and industry expertise within a defined area. Requires in-depth understanding of how areas collectively integrate within the sub-function as well as coordinate and contribute to the objectives of the function and overall business. Evaluates moderately complex and variable issues with substantial potential impact, where development of an approach/taking of an action involves weighing various alternatives and balancing potentially conflicting situations using multiple sources of information. Requires good analytical skills in order to filter, prioritize and validate potentially complex and dynamic material from multiple sources. Strong communication and diplomacy skills are required. Regularly assumes informal/formal leaders

In [15]:
def non_ascii_to_spaces(text):
    """"
    Replace non-ASCII characters with spaces.
    """
    return re.sub(r'[^\x00-\x7F]+', ' ', text)

df['Webscraped'] = df['Webscraped'].apply(non_ascii_to_spaces)
df.iloc[8]["Webscraped"]

'Job Id: 21334722 The Project Senior Analyst is a seasoned professional role. Applies in-depth disciplinary knowledge, contributing to the development of new techniques and the improvement of processes and work-flow for the area or function. Integrates subject matter and industry expertise within a defined area. Requires in-depth understanding of how areas collectively integrate within the sub-function as well as coordinate and contribute to the objectives of the function and overall business. Evaluates moderately complex and variable issues with substantial potential impact, where development of an approach/taking of an action involves weighing various alternatives and balancing potentially conflicting situations using multiple sources of information. Requires good analytical skills in order to filter, prioritize and validate potentially complex and dynamic material from multiple sources. Strong communication and diplomacy skills are required. Regularly assumes informal/formal leaders

In [16]:
def collapse_spaces(text):
    """"
    Replace multiple spaces with a single space.
    """
    return re.sub(r'\s+', ' ', text)

df['Webscraped'] = df['Webscraped'].apply(collapse_spaces)
df.iloc[8]["Webscraped"]

'Job Id: 21334722 The Project Senior Analyst is a seasoned professional role. Applies in-depth disciplinary knowledge, contributing to the development of new techniques and the improvement of processes and work-flow for the area or function. Integrates subject matter and industry expertise within a defined area. Requires in-depth understanding of how areas collectively integrate within the sub-function as well as coordinate and contribute to the objectives of the function and overall business. Evaluates moderately complex and variable issues with substantial potential impact, where development of an approach/taking of an action involves weighing various alternatives and balancing potentially conflicting situations using multiple sources of information. Requires good analytical skills in order to filter, prioritize and validate potentially complex and dynamic material from multiple sources. Strong communication and diplomacy skills are required. Regularly assumes informal/formal leaders

In [20]:
def remove_non_english_texts(webscraped_df: pd.DataFrame) -> pd.DataFrame:
  
  device = "cuda" if torch.cuda.is_available() else "cpu"
  language_detection_model = "papluca/xlm-roberta-base-language-detection"
  language_detection_pipeline = transformers.pipeline("text-classification", model=language_detection_model, device=device)

  webscraped_df["Language"] = webscraped_df["Webscraped"].apply(lambda text: language_detection_pipeline(text, top_k=1, truncation=True)[0]["label"])
  webscraped_df = webscraped_df[webscraped_df["Language"] == "en"]

  del webscraped_df["Language"]

  return webscraped_df

df = remove_non_english_texts(df)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  webscraped_df["Language"] = webscraped_df["Webscraped"].apply(lambda text: language_detection_pipeline(text, top_k=1, truncation=True)[0]["label"])


'Alter Domus Job Description Taking an active part in the initial setup of any new funds take-on and liaising with independent tax specialists, lawyers, and notaries as part of the set up process; Taking care of the day-to-day operations of the funds and following up with third parties, including investors and fund managers; You will take part in the setup of funds, and implementing complex international real estate structures as part of restructuring plans, mergers and acquisitions Handling the fund accounts and preparation of periodic reports and regulatory reports for the fund entities; Preparing the funds periodic Net Asset Value calculation and year-end financial statements, management of the audit, and reviewing other fund related reports; Reviewing tax returns and tax balances and maintaining relevant contacts with the tax authorities; Accurately record all time spent dealing with client matters to ensure the clients are correctly invoiced; Pro-actively take on additional respon

In [21]:
def stop_word_removal(text, nlp):
    doc = nlp(text)
    filtered_tokens = [token.text for token in doc if not token.is_stop]
    filtered_string = ' '.join(filtered_tokens)
    return filtered_string

df['Webscraped'] = df['Webscraped'].apply(lambda x: stop_word_removal(x, nlp))
df.iloc[8]["Webscraped"]

'Alter Domus Job Description Taking active initial setup new funds - liaising independent tax specialists , lawyers , notaries set process ; Taking care day - - day operations funds following parties , including investors fund managers ; setup funds , implementing complex international real estate structures restructuring plans , mergers acquisitions Handling fund accounts preparation periodic reports regulatory reports fund entities ; Preparing funds periodic Net Asset Value calculation year - end financial statements , management audit , reviewing fund related reports ; Reviewing tax returns tax balances maintaining relevant contacts tax authorities ; Accurately record time spent dealing client matters ensure clients correctly invoiced ; Pro - actively additional responsibilities administrational tasks required managers clients . Profile hold strong academic background , qualified Accountant ( ACCA / CIMA / ACA ) preferred . 1 3 years relevant experience fields accounting audit . exp

In [22]:
def lemmatizer(text, nlp):
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)

df['Webscraped'] = df['Webscraped'].apply(lambda x: lemmatizer(x, nlp))
df.iloc[8]["Webscraped"]

'Alter Domus Job Description take active initial setup new fund - liaise independent tax specialist , lawyer , notary set process ; take care day - - day operation fund follow party , include investor fund manager ; setup fund , implement complex international real estate structure restructure plan , merger acquisition handling fund account preparation periodic report regulatory report fund entity ; prepare fund periodic Net Asset Value calculation year - end financial statement , management audit , review fund relate report ; review tax return tax balance maintain relevant contact tax authority ; accurately record time spend deal client matter ensure client correctly invoice ; pro - actively additional responsibility administrational task require manager client . profile hold strong academic background , qualified Accountant ( ACCA / CIMA / ACA ) prefer . 1 3 year relevant experience field accounting audit . experience Real Estate consider plus . fluent English ( knowledge french Germ

In [23]:
for row in df['Webscraped']:
    print(row)
    print("\n")

Description Davy trust market leader wealth management capital market , build rewarding relationship . vision financial service partner trust client admire people . commit deliver world - class outcome client individual , business institution . develop people priority commit embrace diversity form recognise breadth think , perspective experience emerge diverse workforce essential deliver core value . city centre location , free access onsite gym fitness studio , impressive social calendar unique benefit employee enjoy work Davy . performance relate bonus , generous pension contribution investment education demonstrate value place develop rewarding staff . 800 people , manage 14bn+ client asset , office Dublin , Cork , Galway , Belfast , London Luxembourg . team , grow , put client . Client Information Management Team look strong administrator assist temporary assignment . busy team , involve ensure Davy Private client receive excellent service opening account Davy . Key Responsibilitie