In [None]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer, util

In [None]:
def load_csv(file_path, encoding='latin1', sep=',', usecols=None):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pandas.read_csv``.
    encoding : text encoding used to decode the file (default ``'latin1'``).
    sep : field delimiter (default ``','``).
    usecols : optional subset of columns to load; ``None`` loads every column.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    read_options = {'encoding': encoding, 'sep': sep, 'usecols': usecols}
    return pd.read_csv(file_path, **read_options)

In [None]:
def preprocess_cnae_codes(cnae_df):
    """Keep the detailed CNAE rows and split their code into sector and number.

    Rows whose ``CODINTEGR`` has more than 4 characters are kept. For those
    rows the first character becomes ``sector`` and the remainder becomes
    ``cnae_code``. ``TITULO_CNAE2009`` is renamed to ``defin`` and the
    original ``CODINTEGR`` column is dropped.

    The input frame is not modified; a new frame is returned. Building the
    result with ``assign`` (instead of assigning columns onto a filtered
    slice) avoids pandas' chained-assignment ``SettingWithCopyWarning``.

    Parameters
    ----------
    cnae_df : pandas.DataFrame
        Must contain a string column ``CODINTEGR``.

    Returns
    -------
    pandas.DataFrame
        Filtered frame with ``defin`` (when ``TITULO_CNAE2009`` was present),
        ``sector`` and ``cnae_code`` columns.
    """
    code_length = cnae_df['CODINTEGR'].str.len()
    # Missing codes give NaN lengths, which compare False and are filtered out.
    detailed = cnae_df.loc[code_length > 4]
    return (
        detailed
        .assign(
            sector=detailed['CODINTEGR'].str[0],
            cnae_code=detailed['CODINTEGR'].str[1:],
        )
        .rename(columns={'TITULO_CNAE2009': 'defin'})
        .drop(columns=['CODINTEGR'])
    )

In [None]:
def clean_text(text):
    """Remove BORME page-header boilerplate and trim surrounding whitespace.

    The header has the form "boletín oficial del registro mercantil núm. N
    <weekday> D de <month> de YYYY pág. N cve: borme-X-N-N[-N]". The pattern
    is written in lowercase, but this function runs before the text is
    lowercased, so it is matched case-insensitively; otherwise headers in
    their original capitalisation would never be removed.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        ``text`` with every page header removed and outer whitespace stripped.
    """
    page_title_pattern = (
        r'bolet[íi]n oficial del registro mercantil núm\. \d+ '
        r'(?:\w+ \d+ de \w+ de \d+|[a-z]+ \d+ [a-z]+ de \d+) '
        r'pág\. \d+ cve: borme-\w+-\d+-\d+(-\d+)?'
    )
    without_headers = re.sub(page_title_pattern, '', text, flags=re.IGNORECASE)
    return without_headers.strip()

def preprocess_text(text):
    """Normalise a description for matching.

    Strips page-header boilerplate and outer whitespace via ``clean_text``,
    then lowercases the result.
    """
    return clean_text(text).lower()

In [None]:
def _unique_nonempty(parts):
    """Strip each part, drop blanks, and de-duplicate keeping first-seen order."""
    stripped = (part.strip() for part in parts)
    return list(dict.fromkeys(part for part in stripped if part))


def split_text(description, valid_cnae_codes):
    """Break a free-text description into CNAE codes or text segments.

    Strategy, in order:

    1. If the (preprocessed) text mentions any code present in
       ``valid_cnae_codes`` -- either as a bare 4-digit number or in the
       dotted ``dd.dd`` form -- return just those codes.
    2. Otherwise, if the text contains enumeration markers (``1.``, ``a)``),
       return the text between the markers.
    3. Otherwise, split on newlines and full stops.

    The splits use non-capturing patterns: ``re.split`` returns the text of
    capturing groups as list items, which would otherwise put the markers
    themselves (``"1."``, ``"a)"``, ``"."``) into the result as segments.
    Blank segments are dropped and duplicates are removed while keeping
    first-occurrence order, so the result is deterministic.

    Parameters
    ----------
    description : any
        The description; anything that is not a ``str`` yields ``[]``.
    valid_cnae_codes : container of str
        Codes accepted in step 1 (membership is tested with ``in``).

    Returns
    -------
    list of str
        Unique codes or unique non-empty text segments.
    """
    if not isinstance(description, str):
        return []

    description = preprocess_text(description)

    # Step 1: explicit CNAE codes, written as "dddd" or "dd.dd".
    candidates = re.findall(r'\b\d{4}\b', description)
    candidates += [left + right for left, right in re.findall(r'(\d{2})\.(\d{2})', description)]
    codes = [code for code in candidates if code in valid_cnae_codes]
    if codes:
        return list(dict.fromkeys(codes))

    # Step 2: enumerated lists. More than one part means a marker was found.
    enumerated_parts = re.split(r'\d+\.|[a-zA-Z]\)', description)
    if len(enumerated_parts) > 1:
        return _unique_nonempty(enumerated_parts)

    # Step 3: fall back to sentence boundaries.
    return _unique_nonempty(re.split(r'\n|\.', description))

In [None]:
def encode_description(description, model):
    """Embed ``description`` with ``model``, or return ``None`` when it is empty.

    Parameters
    ----------
    description : str or None
        Text to embed; any falsy value short-circuits to ``None``.
    model : object exposing ``encode(text, convert_to_tensor=...)``.

    Returns
    -------
    The value of ``model.encode(description, convert_to_tensor=True)``,
    or ``None`` for a falsy description.
    """
    return model.encode(description, convert_to_tensor=True) if description else None

def process_row(row, cnae_codes, valid_cnae_codes, cnae_embeddings, model, sector_columns,
                similarity_threshold=0.5):
    """Determine which sectors a company's ``objeto_social`` text refers to.

    The text is broken up by ``split_text``. A segment that is itself a valid
    CNAE code is looked up directly in ``cnae_codes``. Any other segment is
    embedded and compared by cosine similarity against ``cnae_embeddings``;
    the sector of the best-matching row is accepted when the similarity
    reaches ``similarity_threshold``.

    Parameters
    ----------
    row : mapping-like with ``.get``
        One record; the text is read from the ``'objeto_social'`` key.
    cnae_codes : pandas.DataFrame
        Must have ``cnae_code`` and ``sector`` columns. Row order must match
        the order of ``cnae_embeddings``, because the best match is looked up
        by position.
    valid_cnae_codes : container of str
        Codes treated as direct matches.
    cnae_embeddings : tensor
        Embeddings to compare each text segment against.
    model : object exposing ``encode``
        Used to embed text segments.
    sector_columns : dict
        Maps a sector value to its output column name.
    similarity_threshold : float, default 0.5
        Minimum cosine similarity for an embedding match to count.

    Returns
    -------
    pandas.Series
        ``1`` under the column name of each sector found, plus
        ``sector_count`` holding the number of distinct sectors.
    """
    segments = split_text(row.get('objeto_social', ''), valid_cnae_codes)
    sectors_found = set()

    for segment in segments:
        if segment in valid_cnae_codes:
            matched = cnae_codes.loc[cnae_codes['cnae_code'] == segment, 'sector'].values
            # Test the length, not the truth value: bool() of a NumPy array is
            # ambiguous for several elements and deprecated/an error when empty.
            if len(matched):
                sectors_found.update(matched)
            continue

        split_embedding = encode_description(segment, model)
        if split_embedding is None:
            continue

        similarities = util.pytorch_cos_sim(split_embedding, cnae_embeddings)
        if similarities.max().item() >= similarity_threshold:
            best_match_index = similarities.argmax().item()
            sectors_found.add(cnae_codes['sector'].iloc[best_match_index])

    result = {sector_columns[sector]: 1 for sector in sectors_found}
    result['sector_count'] = len(sectors_found)
    return pd.Series(result)

In [None]:
def initialize_sector_columns(valencia_empresas, sector_columns):
    """Add any missing sector indicator columns to the frame, filled with 0.

    The frame is modified in place; columns that already exist are left
    untouched. Nothing is returned.

    Parameters
    ----------
    valencia_empresas : pandas.DataFrame
        Frame to extend.
    sector_columns : dict
        Its values are the column names that must exist.
    """
    # dict.fromkeys collapses repeated names so each column is created once.
    wanted = dict.fromkeys(sector_columns.values())
    missing = [name for name in wanted if name not in valencia_empresas.columns]
    for name in missing:
        valencia_empresas[name] = 0

def process_data(valencia_empresas, cnae_codes, valid_cnae_codes, model):
    """Classify every company row into sectors.

    Embeds all ``defin`` texts from ``cnae_codes`` once, derives one
    ``sector_<value>`` column name per distinct sector, makes sure those
    columns exist on ``valencia_empresas`` (this modifies the frame in
    place), and then runs ``process_row`` on each row.

    Parameters
    ----------
    valencia_empresas : pandas.DataFrame
        Company records; gains any missing sector columns as a side effect.
    cnae_codes : pandas.DataFrame
        Must have ``defin`` and ``sector`` columns.
    valid_cnae_codes : container of str
        Passed through to ``process_row``.
    model : object exposing ``encode``
        Used to embed the definitions and each row's text.

    Returns
    -------
    The result of applying ``process_row`` row-wise to ``valencia_empresas``.
    """
    definitions = cnae_codes['defin'].tolist()
    cnae_embeddings = model.encode(definitions, convert_to_tensor=True)

    # One output column name per distinct sector value.
    sector_columns = {}
    for sector in cnae_codes['sector'].unique():
        sector_columns[sector] = f'sector_{sector}'

    initialize_sector_columns(valencia_empresas, sector_columns)

    # Everything process_row needs besides the row itself.
    row_args = (cnae_codes, valid_cnae_codes, cnae_embeddings, model, sector_columns)
    return valencia_empresas.apply(process_row, axis=1, args=row_args)

In [None]:
def save_to_csv(df, file_path, encoding='latin1'):
    """Write ``df`` to ``file_path`` as CSV, without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    file_path : path or buffer accepted by ``DataFrame.to_csv``.
    encoding : text encoding of the output file (default ``'latin1'``).
    """
    write_options = {'index': False, 'encoding': encoding}
    df.to_csv(file_path, **write_options)

In [None]:
# Input/output locations. These are absolute paths on the author's machine;
# change them here when running elsewhere.
CNAE_CODES_PATH = r'/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Raw/CNAE_Codes_Spanish.csv'
EMPRESAS_PATH = r'/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Processed/valencia_geocoded_pob_paro_survival.csv'
OUTPUT_PATH = r'/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Processed/valencia_data_final_survival.csv'

# Load data
cnae_codes = load_csv(CNAE_CODES_PATH, usecols=['CODINTEGR', 'TITULO_CNAE2009'])
valencia_empresas = load_csv(EMPRESAS_PATH)

# Preprocess CNAE codes
cnae_codes = preprocess_cnae_codes(cnae_codes)

# Extract valid CNAE codes
valid_cnae_codes = set(cnae_codes['cnae_code'].astype(str))

# Load the Sentence-BERT model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Process the data (also adds the sector_* columns to valencia_empresas)
processed_results = process_data(valencia_empresas, cnae_codes, valid_cnae_codes, model)

# Merge the results back into the valencia_empresas dataframe.
# update() overwrites the existing sector_* columns with the non-missing
# results, leaving the 0 defaults wherever a sector was not found.
valencia_empresas.update(processed_results)

# update() only touches columns that already exist in the target, so a new
# 'sector_count' column would be dropped silently; attach it explicitly.
if 'sector_count' in processed_results.columns:
    valencia_empresas['sector_count'] = processed_results['sector_count'].astype(int)

# Save the final dataframe
save_to_csv(valencia_empresas, OUTPUT_PATH)