Connect to Colab, Google Cloud bucket. Change project ID

In [2]:
#colab code: mount to drive to import and export data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# install libaries and connect to Google Cloud account  to import and export data
!pip install google-cloud-storage transformers torch tqdm pandas
!pip install scikit-learn

# Authenticate with Google Cloud
from google.colab import auth
auth.authenticate_user()

# Set your project ID
!gcloud config set project tokyo-silicon-441818-f7  # Replace with your actual project ID

Install libaries

In [None]:
!python -m spacy download da_core_news_lg #Download the Danish language model for spacy

Data loading and test/train split.
Data is lemmatized. Only run on 1 file because of time constraints.
Change input and output paths if necessary.

In [None]:
#byt evt. lemmatized ud med ikke lemmatized og tokenize i stedet for
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
from typing import List, Dict
import random
random.seed(42)

class NewsDataPreparator:
    def __init__(self):
        self.file_paths = [
            #"/content/drive/MyDrive/NewsData/Lemmatized_2007_1M.txt",
            #"/content/drive/MyDrive/NewsData/Lemmatized_2008_300K.txt",
            #"/content/drive/MyDrive/NewsData/Lemmatized_2011_1M.txt",
            #"/content/drive/MyDrive/NewsData/Lemmatized_2015_1M.txt",
            #"/content/drive/MyDrive/NewsData/Lemmatized_2016_1M.txt",
            #"/content/drive/MyDrive/NewsData/Lemmatized_2017_1M.txt",
            #"/content/drive/MyDrive/NewsData/Lemmatized_2019_1M.txt",
            #"/content/drive/MyDrive/NewsData/Lemmatized_2020_1M.txt",
            "/content/drive/MyDrive/NewsData/Lemmatized_2021_1M.txt"
        ]

    def verify_file_access(self):
        """Verify access to all files and return accessible files"""
        accessible_files = []
        print("Verifying file access...")
        for file_path in self.file_paths:
            if os.path.exists(file_path):
                accessible_files.append(file_path)
                print(f"✓ Found: {os.path.basename(file_path)}")
            else:
                print(f"✗ Not found: {os.path.basename(file_path)}")

        if not accessible_files:
            raise FileNotFoundError("No input files are accessible. Please check file paths and permissions.")

        return accessible_files

    def load_single_file(self, file_path: str) -> pd.DataFrame:
        """Load a single file and convert to DataFrame"""
        try:
            # Extract year from filename
            year = file_path.split('_')[1]

            # Read the file line by line
            data = []
            with open(file_path, 'r', encoding='utf-8') as file:
                for line_num, line in enumerate(file, 1):
                    line = line.strip()
                    if line:  # Only process non-empty lines
                        data.append({
                            'year': year,
                            'sentence_id': line_num,
                            'sentence': line,
                            'file_source': os.path.basename(file_path)
                        })

                    if line_num % 100000 == 0:  # Progress update for large files
                        print(f"Processed {line_num} lines from {os.path.basename(file_path)}")

            if not data:
                print(f"Warning: No valid data extracted from {file_path}")
                return pd.DataFrame()

            df = pd.DataFrame(data)
            print(f"Successfully loaded {len(df)} sentences from {os.path.basename(file_path)}")
            return df

        except Exception as e:
            print(f"Error loading file {file_path}: {str(e)}")
            return pd.DataFrame()

    def load_all_files(self) -> pd.DataFrame:
        """Load all accessible files and combine into single DataFrame"""
        print("\nLoading files...")
        all_data = []

        # First verify which files are accessible
        accessible_files = self.verify_file_access()

        # Load each accessible file
        for file_path in tqdm(accessible_files):
            df = self.load_single_file(file_path)
            if not df.empty:
                all_data.append(df)

        if not all_data:
            raise ValueError("No data was successfully loaded from any files")

        # Combine all DataFrames
        combined_df = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal loaded: {len(combined_df)} sentences from {len(all_data)} files")

        # Basic data quality check
        print("\nSentences per year:")
        print(combined_df['year'].value_counts().sort_index())

        return combined_df

    def create_train_val_split(self, df: pd.DataFrame, val_size: float = 0.2) -> Dict[str, pd.DataFrame]:
        """Split data into training and validation sets"""
        train_df, val_df = train_test_split(
            df,
            test_size=val_size,
            random_state=42,
            stratify=df['year']
        )

        print(f"Training set: {len(train_df)} sentences")
        print(f"Validation set: {len(val_df)} sentences")

        return {
            'train': train_df,
            'validation': val_df
        }

    def save_processed_data(self, data_dict: Dict[str, pd.DataFrame], output_dir: str):
        """Save processed datasets"""
        os.makedirs(output_dir, exist_ok=True)

        for name, df in data_dict.items():
            output_path = os.path.join(output_dir, f'{name}_data.csv')
            df.to_csv(output_path, index=False, encoding='utf-8')
            print(f"Saved {name} data to {output_path}")
            print(f"Sample from {name} data:")
            print(df.head(2))

def main():
    # Initialize data preparator
    preparator = NewsDataPreparator()

    # Load all files
    print("Step 1: Loading and parsing files...")
    all_data = preparator.load_all_files()

    # Create train/validation split
    print("\nStep 2: Creating train/validation split...")
    data_splits = preparator.create_train_val_split(all_data)

    # Save processed data
    print("\nStep 3: Saving processed data...")
    output_dir = "/content/drive/MyDrive/NewsData/processed"
    preparator.save_processed_data(data_splits, output_dir)

    return data_splits

# Run the pipeline
if __name__ == "__main__":
    processed_data = main()

Pattern matching in batches seperately for train and validation set. Same script for train and validation set. Change input and output paths.

In [None]:

import pandas as pd
import numpy as np
import json
from collections import defaultdict
import spacy
from tqdm import tqdm
import os
from typing import Dict, List, Set, Tuple
from datetime import datetime
import re

class DanishPatternStructure:
    def __init__(self):
        self.market_categories = {
            'energy_and_green_transition': {
                'titles': [
                    'klimadirektør', 'klima direktør', 'klimachef', 'klima chef',
                    'bæredygtighedschef', 'bæredygtighed chef', 'miljøchef', 'miljø chef',
                    'energidirektør', 'energi direktør', 'vindmølle', 'kraft værk',
                    'energi chef', 'miljø direktør', 'bæredygtighed direktør'
                ],
                'orgs': [
                    'energiselskab', 'energi selskab', 'vindmølle', 'miljø',
                    'forsyning', 'energistyrelse', 'klima', 'miljøstyrelse'
                ],
                'keywords': [
                    'grøn', 'bæredygtig', 'vedvarende', 'energi', 'klima',
                    'miljø', 'vindkraft', 'power', 'omstilling'
                ]
            },
            'welfare': {
                'titles': [
                    'velfærdsdirektør', 'velfærd direktør', 'socialchef', 'social chef',
                    'ældrechef', 'ældre chef', 'børnechef', 'børne chef', 'ungechef',
                    'beskæftigelseschef', 'beskæftigelse chef', 'integrationschef'
                ],
                'orgs': [
                    'kommune', 'region', 'socialstyrelse', 'beskæftigelse',
                    'socialministerium', 'velfærd', 'ældrecenter', 'børnehus'
                ],
                'keywords': [
                    'velfærd', 'social', 'ældre', 'børn', 'unge',
                    'beskæftigelse', 'integration', 'kommune', 'omsorg'
                ]
            },
            'maritime_and_shipping': {
                'titles': [
                    'rederdirektør', 'reder', 'havnedirektør', 'havn chef',
                    'skibsreder', 'marinechef', 'marine chef', 'offshore chef',
                    'logistikchef', 'logistik direktør', 'fragtchef'
                ],
                'orgs': [
                    'rederi', 'havn', 'værft', 'søfart', 'maritime',
                    'offshore', 'shipping', 'container', 'fragt'
                ],
                'keywords': [
                    'søfart', 'shipping', 'maritim', 'offshore', 'havn',
                    'skib', 'container', 'logistik', 'fragt'
                ]
            },
            'agriculture_food': {
                'titles': [
                    'landbrugsdirektør', 'landbrug chef', 'fødevarechef', 'fødevare direktør',
                    'mejerichef', 'mejeri direktør', 'landbrugspræsident', 'fødevaredirektør',
                    'landmand', 'gårdejer'
                ],
                'orgs': [
                    'landbrug', 'fødevare', 'mejeri', 'slagteri',
                    'landbrugsorganisation', 'fødevarestyrelse', 'gård'
                ],
                'keywords': [
                    'landbrug', 'fødevare', 'mejeri', 'økologi',
                    'fødevaresikkerhed', 'eksport', 'gård', 'mark'
                ]
            },
            'union_labour': {
                'titles': [
                    'forbundsformand', 'forbund formand', 'fagforeningsformand',
                    'hovedkasserer', 'forhandlingsleder', 'forhandling chef',
                    'arbejdsmarkedschef', 'arbejdsmarked direktør'
                ],
                'orgs': [
                    'fagforening', 'forbund', 'hovedorganisation', 'akasse',
                    'arbejdsgiver', 'overenskomst', 'fagbevægelse'
                ],
                'keywords': [
                    'overenskomst', 'arbejdsmarked', 'fagbevægelse', 'medlem',
                    'forhandling', 'arbejdsret', 'faglig'
                ]
            },
            'real_estate': {
                'titles': [
                    'ejendomsmægler', 'developer', 'ejendomsadministrator',
                    'bygherrerådgiver', 'ejendom chef', 'bolig direktør'
                ],
                'orgs': [
                    'ejendomsselskab', 'developer', 'boligforening', 'administration',
                    'boligselskab', 'ejendomsadministration'
                ],
                'keywords': [
                    'ejendom', 'bolig', 'byggeri', 'udlejning', 'investering',
                    'developer', 'administration', 'bygherre'
                ]
            },
            'finance': {
                'titles': [
                    'bankdirektør', 'bank chef', 'finansdirektør', 'finans chef',
                    'investor', 'analytiker', 'økonom', 'fondforvalter',
                    'porteføljemanager', 'aktiestrateg', 'valutahandler'
                ],
                'orgs': [
                    'bank', 'børs', 'investering', 'kapitalfond', 'nationalbank',
                    'realkredit', 'pension', 'forsikring'
                ],
                'keywords': [
                    'finans', 'aktie', 'obligation', 'investering', 'rente',
                    'marked', 'valuta', 'børs', 'opkøb', 'fusion'
                ]
            },
            'industry': {
                'titles': [
                    'administrerende direktør', 'adm direktør', 'bestyrelsesformand',
                    'koncernchef', 'koncern direktør', 'fabriksdirektør',
                    'produktionschef', 'industri chef'
                ],
                'orgs': [
                    'virksomhed', 'koncern', 'industri', 'produktion',
                    'erhverv', 'fabrik', 'industri'
                ],
                'keywords': [
                    'produktion', 'industri', 'marked', 'fabrik', 'supply chain',
                    'erhverv', 'koncern', 'virksomhed'
                ]
            },
            'tech': {
                'titles': [
                    'udviklingschef', 'udvikling direktør', 'techekspert', 'tech chef',
                    'itdirektør', 'it chef', 'softwareudvikler', 'dataanalytiker',
                    'digital chef', 'innovation direktør'
                ],
                'orgs': [
                    'tech', 'startup', 'software', 'ai', 'teknologi',
                    'it', 'digital', 'data'
                ],
                'keywords': [
                    'teknologi', 'digital', 'innovation', 'data', 'kunstig intelligens',
                    'automatisering', 'software', 'it', 'tech'
                ]
            },
            'regulatory': {
                'titles': [
                    'minister', 'direktør', 'formand', 'tilsynschef',
                    'departementchef', 'styrelsesdirektør', 'afdelingschef'
                ],
                'orgs': [
                    'ministerium', 'styrelse', 'tilsyn', 'myndighed',
                    'domstol', 'departement', 'forvaltning'
                ],
                'keywords': [
                    'regulering', 'lovgivning', 'tilsyn', 'politik',
                    'beskatning', 'myndighed', 'forvaltning'
                ]
            },
            'education': {
                'titles': [
                    'rektor', 'professor', 'lektor', 'underviser',
                    'uddannelseschef', 'uddannelse direktør', 'skoleleder',
                    'dekan', 'institutleder'
                ],
                'orgs': [
                    'universitet', 'gymnasium', 'skole', 'uddannelse',
                    'institut', 'fakultet', 'akademi'
                ],
                'keywords': [
                    'uddannelse', 'forskning', 'læring', 'pædagogik',
                    'student', 'elev', 'undervisning'
                ]
            },
            'healthcare': {
                'titles': [
                    'læge', 'sygeplejerske', 'hospitaldirektør', 'hospital chef',
                    'specialist', 'sundhedsøkonom', 'forskningschef', 'overlæge',
                    'produktchef', 'regulatory affairs', 'medicinal direktør'
                ],
                'orgs': [
                    'hospital', 'klinik', 'sundhedsstyrelse', 'apotek',
                    'medicinal', 'biotek', 'sundhed', 'læge'
                ],
                'keywords': [
                    'sundhed', 'patient', 'medicin', 'behandling', 'sygdom',
                    'diabetes', 'biotek', 'klinisk', 'insulin'
                ]
            },
            'politics': {
                'titles': [
                    'politiker', 'minister', 'borgmester', 'rådmand',
                    'mfer', 'folketingsmedlem', 'regionsrådsformand',
                    'kommunalbestyrelsesmedlem'
                ],
                'orgs': [
                    'folketing', 'byråd', 'ministerium', 'parti',
                    'kommission', 'regering', 'kommunalbestyrelse'
                ],
                'keywords': [
                    'politik', 'valg', 'beslutning', 'samfund',
                    'demokrati', 'lovgivning', 'reform'
                ]
            },
            'aviation': {
                'titles': [
                    'pilot', 'flyveleder', 'luftfartsdirektør', 'luftfart chef',
                    'kabinechef', 'teknisk chef', 'lufthavnsdirektør'
                ],
                'orgs': [
                    'luftfart', 'lufthavn', 'flyproducent', 'flyselskab',
                    'aviation', 'airline'
                ],
                'keywords': [
                    'fly', 'luftfart', 'rejse', 'sikkerhed', 'transport',
                    'lufthavn', 'aviation', 'airline'
                ]
            },
            'design': {
                'titles': [
                    'designer', 'kreativ direktør', 'produktudvikler',
                    'modeekspert', 'designchef', 'art director'
                ],
                'orgs': [
                    'designstudie', 'modehus', 'designfirma',
                    'tegnestue', 'kreativ', 'mode'
                ],
                'keywords': [
                    'design', 'mode', 'produkt', 'æstetik',
                    'bruger', 'kreativ', 'kunst'
                ]
            },
            'architecture': {
                'titles': [
                    'arkitekt', 'bygningsdesigner', 'landskabsarkitekt',
                    'byplanlægger', 'partner', 'kreativ direktør'
                ],
                'orgs': [
                    'arkitektfirma', 'tegnestue', 'byplanlægning',
                    'arkitektur', 'design'
                ],
                'keywords': [
                    'arkitektur', 'byrum', 'byggeri', 'design',
                    'æstetik', 'byplanlægning', 'landskab'
                ]
            },
            'hospitality': {
                'titles': [
                    'hotelchef', 'hotel direktør', 'kok', 'restauratør',
                    'sommelier', 'restaurantchef', 'køkkenchef'
                ],
                'orgs': [
                    'hotel', 'restaurant', 'catering', 'gastronomi',
                    'hospitality'
                ],
                'keywords': [
                    'hotel', 'restaurant', 'mad', 'overnatning',
                    'oplevelse', 'service', 'gastronomi'
                ]
            },
            'tourism': {
                'titles': [
                    'turistchef', 'turist direktør', 'rejseleder',
                    'destinationschef', 'destination direktør'
                ],
                'orgs': [
                    'turistkontor', 'rejsebureau', 'destination',
                    'turisme', 'rejse'
                ],
                'keywords': [
                    'rejse', 'turisme', 'oplevelse', 'destination',
                    'attraktion', 'ferie', 'turist'
                ]
            },
            'appliances': {
                'titles': [
                    'produktchef', 'produkt direktør', 'ingeniør',
                    'udviklingschef', 'teknisk chef', 'salgschef'
                ],
                'orgs': [
                    'hvidevare', 'elektronik', 'distribution',
                    'producent', 'forhandler'
                ],
                'keywords': [
                    'hvidevare', 'elektronik', 'køkken', 'innovation',
                    'produkt', 'teknisk', 'udvikling'
                ]
            }
        }

        self.international_markers = {
            'eu': {
                'titles': [
                    'EU-kommissær', 'EU kommissær', 'EUkommissær',
                    'MEP', 'europaparlamentariker', 'europa parlamentariker',
                    'EU-direktør', 'EU direktør', 'EUdirektør',
                    'EU-chef', 'EU chef', 'EUchef',
                    'europa chef', 'europæisk direktør', 'europa direktør'
                ],
                'orgs': [
                    'EU-Kommission', 'EU Kommission', 'EUKommission',
                    'Europaparlament', 'Europa Parlament', 'Europa-Parlament',
                    'EU-Domstol', 'EU Domstol', 'EUDomstol',
                    'EU-agentur', 'EU agentur', 'EUagentur',
                    'EU-kontor', 'EU kontor', 'EUkontor',
                    'europæisk institution', 'europa institution'
                ]
            },
            'nordic': {
                'titles': [
                    'nordisk direktør', 'nordisk chef', 'nordisk leder',
                    'skandinavienchef', 'skandinavien chef', 'skandinavisk direktør',
                    'nordisk koordinator', 'skandinavisk leder', 'nordic director',
                    'nordisk ansvarlig', 'skandinavisk ansvarlig'
              ],
                'orgs': [
                    'Nordisk Råd', 'Nordisk Raad', 'Nordisk Ministerråd',
                    'Nordisk Ministerraad', 'skandinavisk afdeling',
                    'skandinavisk kontor', 'nordisk afdeling', 'nordisk kontor',
                    'nordic office', 'skandinavisk division'
                ]
            },
            'regional_danish': {
                'titles': [
                    'regionsdirektør', 'regions direktør', 'region direktør',
                    'borgmester', 'kommunaldirektør', 'kommunal direktør',
                    'regionsrådsformand', 'regionsraad formand', 'regionsråd formand',
                    'kommunal chef', 'regional chef', 'regions chef',
                    'regional direktør', 'lokal direktør'
                ],
                'orgs': [
                    'region', 'kommune', 'lokalråd', 'lokal råd',
                    'byråd', 'byraad', 'kommunal råd', 'kommunalråd',
                    'regional administration', 'kommunal forvaltning',
                    'regional myndighed', 'kommunal myndighed'
                ]
            }
        }

        self.elite_hierarchy = {
            'top_level': {
                'titles': [
                    'koncernchef', 'koncern chef', 'konsernchef',
                    'administrerende direktør', 'adm direktør', 'adm dir',
                    'administrerende dir', 'adm. direktør', 'topchef',
                    'top chef', 'bestyrelsesformand', 'bestyrelses formand',
                    'president', 'CEO', 'chief executive officer',
                    'grundlægger', 'grundlæger', 'stifter',
                    'ejer', 'direktionsformand', 'direktions formand',
                    'group ceo', 'koncerndirektør', 'koncern direktør'
                ],
                'indicators': [
                    'koncern', 'gruppe', 'holding', 'international',
                    'group', 'worldwide', 'global', 'nordic',
                    'skandinavisk', 'europæisk', 'executive'
                ]
            },
            'senior_level': {
                'titles': [
                    'direktør', 'områdechef', 'område chef',
                    'afdelingschef', 'afdelings chef', 'afdeling chef',
                    'regionsdirektør', 'regions direktør', 'region direktør',
                    'landechef', 'lande chef', 'land chef',
                    'divisionsdirektør', 'divisions direktør', 'division direktør',
                    'partner', 'senior manager', 'senior direktør'
                ],
                'indicators': [
                    'senior', 'chef', 'leder', 'manager',
                    'director', 'head', 'ansvarlig', 'lead'
                ]
            },
            'expert_level': {
                'titles': [
                    'chefanalytiker', 'chef analytiker', 'specialkonsulent',
                    'special konsulent', 'seniorøkonom', 'senior økonom',
                    'chefforsker', 'chef forsker', 'ekspert',
                    'specialist', 'seniorrådgiver', 'senior rådgiver',
                    'chief analyst', 'senior specialist', 'senior advisor'
                ],
                'indicators': [
                    'specialist', 'ekspert', 'forsker', 'analyst',
                    'researcher', 'advisor', 'consultant', 'expert',
                    'analytiker', 'rådgiver', 'konsulent'
                ]
            }
        }


        self.org_forms = {
            'private': [
                'A/S', 'AS', 'ApS', 'APS', 'I/S', 'IS',
                'K/S', 'KS', 'P/S', 'PS', 'IVS',
                'Holding', 'Group', 'Gruppen', 'Koncern',
                'Koncernen', 'Danmark', 'Danish', 'International',
                'Global', 'Nordic', 'Skandinavisk'
            ],
            # Additional organization forms omitted for brevity
            'public': [
                'Kommune', 'Kommunen', 'Region', 'Regionen',
                'Ministerium', 'Ministeriet', 'Styrelse',
                'Styrelsen', 'Direktorat', 'Direktoratet',
                'Institut', 'Instituttet', 'Center', 'Centret',
                'Forvaltning', 'Forvaltningen', 'Myndighed',
                'Myndigheden', 'Råd', 'Rådet'
            ],
            'associations': [
                'Forening', 'Foreningen', 'Forbund', 'Forbundet',
                'Organisation', 'Organisationen', 'Fond', 'Fonden',
                'Fagforening', 'Fagforeningen', 'Sammenslutning',
                'Sammenslutningen', 'Selskab', 'Selskabet',
                'Forening', 'NGO', 'Interesseorganisation'
            ]
        }





        # Add lemmatized forms of compound connectors
        self.compound_connectors = {
            's': ['erhverv', 'forbund', 'regering', 'arbejd', 'uddannelse', 'udvikling'],
            'e': ['børn', 'folk', 'kommun', 'skol', 'virksomhed'],
            'r': ['lær', 'led', 'arbejd', 'direktør', 'chef']
        }

        # Add lemmatized forms of prefixes and suffixes
        self.common_prefixes = [
            'over', 'under', 'mellem', 'chef', 'top', 'vice', 'først', 'senior', 'junior',
            'special', 'hoved', 'general', 'central', 'koncern', 'gruppe', 'region'
        ]

        self.common_suffixes = [
            'chef', 'direktør', 'leder', 'ansvarlig', 'koordinator', 'konsulent',
            'specialist', 'analytiker', 'rådgiver', 'formand', 'præsident'
        ]


    def get_all_patterns(self, term: str) -> List[str]:
        """Generate all possible patterns for a term including lemmatized forms"""
        patterns = [term]

        # Add space-separated version
        if '-' in term:
            patterns.append(term.replace('-', ' '))

        # Add compound variations
        for connector in self.compound_connectors.keys():
            patterns.append(f"{term}{connector}")

        # Add common prefix/suffix combinations
        for prefix in self.common_prefixes:
            patterns.append(f"{prefix}{term}")
            patterns.append(f"{prefix}-{term}")
            patterns.append(f"{prefix} {term}")

        for suffix in self.common_suffixes:
            patterns.append(f"{term}{suffix}")
            patterns.append(f"{term}-{suffix}")
            patterns.append(f"{term} {suffix}")

        return patterns
class PatternMatchingAnalyzer:
    def __init__(self, batch_size=50000):
        self.batch_size = batch_size
        self.danish_patterns = DanishPatternStructure()
        self.compiled_patterns = self._precompile_patterns()

    def _precompile_patterns(self):
        """Precompile all patterns once during initialization"""
        patterns = {}
        for category, type_dict in self.danish_patterns.market_categories.items():
            patterns[category] = {}
            for type_name, terms in type_dict.items():
                patterns[category][type_name] = self._compile_patterns_for_terms(terms)

        # Add patterns for other dictionaries if needed
        for scope, data in self.danish_patterns.international_markers.items():
            patterns[f'international_{scope}'] = {
                type_name: self._compile_patterns_for_terms(terms)
                for type_name, terms in data.items()
            }

        # Add hierarchy patterns
        for level, data in self.danish_patterns.elite_hierarchy.items():
            patterns[f'hierarchy_{level}'] = {
                type_name: self._compile_patterns_for_terms(terms)
                for type_name, terms in data.items()
            }

        return patterns

    def _compile_patterns_for_terms(self, terms):
        """Compile regex patterns for a list of terms"""
        return [re.compile(fr'\b{re.escape(term)}\b', re.IGNORECASE | re.UNICODE)
                for term in terms]

    def _find_matches_in_text(self, text: str) -> Dict[str, Dict[str, Set[str]]]:
        """Find all pattern matches in text"""
        results = defaultdict(lambda: defaultdict(set))

        for category, type_patterns in self.compiled_patterns.items():
            for type_name, patterns in type_patterns.items():
                for pattern in patterns:
                    matches = pattern.findall(text)
                    if matches:
                        results[category][type_name].update(matches)

        return results

    def process_batch(self, sentences: List[str], batch_num: int) -> Dict:
        """Process a batch of sentences"""
        batch_results = defaultdict(lambda: defaultdict(set))

        for sentence in tqdm(sentences, desc=f"Processing batch {batch_num}"):
            matches = self._find_matches_in_text(sentence)
            for category, type_dict in matches.items():
                for type_name, found_matches in type_dict.items():
                    batch_results[category][type_name].update(found_matches)

        return {
            'markers': {k: {sk: list(sv) for sk, sv in v.items()}
                      for k, v in batch_results.items()}
        }

    def save_batch_results(self, batch_results: Dict, batch_num: int, output_dir: str):
        """Save batch results with timestamp"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        os.makedirs(output_dir, exist_ok=True)

        batch_path = os.path.join(output_dir, f"batch_{batch_num}_{timestamp}.json")
        with open(batch_path, 'w', encoding='utf-8') as f:
            json.dump(batch_results, f, ensure_ascii=False, indent=2)

        print(f"Saved batch {batch_num} results")

def main():
    # Initialize analyzer
    analyzer = PatternMatchingAnalyzer(batch_size=50000)

    # Set up directories
    input_path = "/content/drive/MyDrive/NewsData/processed/train_data.csv" #skift mellem train og validation
    output_dir = "/content/drive/MyDrive/NewsData/processed/Elitenessbatchestrain101224" #ændr navn v. validation

    try:
        # Load data
        print(f"Loading data from: {input_path}")
        df = pd.read_csv(input_path, usecols=['sentence'])

        # Process first batch and validate
        print("\nProcessing first batch for validation...")
        first_batch_sentences = df['sentence'].iloc[:analyzer.batch_size].tolist()
        first_batch_results = analyzer.process_batch(first_batch_sentences, 0)

        # Validate first batch
        validation_results = validate_first_batch(first_batch_results)
        print_validation_results(validation_results)

        # Ask for user confirmation
        if validation_results['status'] == 'invalid':
            print("\nWarning: Validation found issues with the data structure.")

        user_input = input("\nDo you want to continue processing all batches? (yes/no): ")

        if user_input.lower() != 'yes':
            print("Stopping after first batch")
            return

        # Save first batch if continuing
        analyzer.save_batch_results(first_batch_results, 0, output_dir)

        # Process remaining batches
        total_sentences = len(df)
        total_batches = (total_sentences + analyzer.batch_size - 1) // analyzer.batch_size

        print(f"\nProcessing remaining {total_batches-1} batches...")

        for batch_num in range(1, total_batches):
            start_idx = batch_num * analyzer.batch_size
            end_idx = min(start_idx + analyzer.batch_size, total_sentences)

            batch_sentences = df['sentence'].iloc[start_idx:end_idx].tolist()

            try:
                batch_results = analyzer.process_batch(batch_sentences, batch_num)
                analyzer.save_batch_results(batch_results, batch_num, output_dir)
            except Exception as e:
                print(f"Error processing batch {batch_num}: {str(e)}")
                continue

        print("\nProcessing complete!")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()



Merge and save results.
Same script for train and validation set. Change input and output paths

In [None]:
#merging and saving results
import json
import os
from collections import defaultdict
from datetime import datetime
from typing import Dict, List, Set
import glob

class BatchMerger:
    def __init__(self, input_dir: str):
        self.input_dir = input_dir

    def get_batch_files(self) -> List[str]:
        """Get all batch result files, excluding summary files"""
        files = glob.glob(os.path.join(self.input_dir, "batch_*.json"))
        return [f for f in files if not f.endswith('_summary.json')]

    def merge_batches(self) -> Dict:
        """Merge all batch files into a single dictionary"""
        merged_results = defaultdict(lambda: defaultdict(set))
        processed_files = 0

        batch_files = self.get_batch_files()
        print(f"Found {len(batch_files)} batch files to merge")

        for file_path in sorted(batch_files):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    batch_data = json.load(f)

                # Merge markers from this batch
                if 'markers' in batch_data:
                    for category, type_dict in batch_data['markers'].items():
                        for type_name, matches in type_dict.items():
                            # Convert matches to set to remove duplicates
                            merged_results[category][type_name].update(matches)

                processed_files += 1
                print(f"Processed file {processed_files}/{len(batch_files)}: {os.path.basename(file_path)}")

            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
                continue

        # Convert sets to lists for JSON serialization
        final_results = {
            category: {
                type_name: sorted(list(matches))
                for type_name, matches in type_dict.items()
            }
            for category, type_dict in merged_results.items()
        }

        return final_results

    def generate_statistics(self, merged_results: Dict) -> Dict:
        """Generate statistics about the merged results"""
        stats = {
            'total_categories': len(merged_results),
            'categories': {},
            'total_matches': 0
        }

        for category, type_dict in merged_results.items():
            category_stats = {
                'total_matches': 0,
                'types': {}
            }

            for type_name, matches in type_dict.items():
                num_matches = len(matches)
                category_stats['types'][type_name] = num_matches
                category_stats['total_matches'] += num_matches
                stats['total_matches'] += num_matches

            stats['categories'][category] = category_stats

        return stats

    def save_merged_results(self, merged_results: Dict, stats: Dict):
        """Save merged results and statistics"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save merged results
        results_filename = f"merged_results_{timestamp}.json"
        results_path = os.path.join(self.input_dir, results_filename)

        with open(results_path, 'w', encoding='utf-8') as f:
            json.dump(merged_results, f, ensure_ascii=False, indent=2)

    # Save statistics
        stats_filename = f"merged_results_stats_{timestamp}.json"
        stats_path = os.path.join(self.input_dir, stats_filename)

        with open(stats_path, 'w', encoding='utf-8') as f:
            json.dump(stats, f, ensure_ascii=False, indent=2)

        print(f"\nResults saved to: {results_filename}")
        print(f"Statistics saved to: {stats_filename}")

    # Print summary statistics
        print("\nSummary Statistics:")
        print(f"Total main categories: {stats['total_main_categories']}")
        print(f"Total matches: {stats['total_matches']}")
        print("\nMatches by main category:")
        for main_category, main_cat_stats in stats['main_categories'].items():
            print(f"\n{main_category}:")
            print(f"  Total matches: {main_cat_stats['total_matches']}")
            for subcategory, subcat_stats in main_cat_stats['subcategories'].items():
                print(f"\n  {subcategory}:")
                print(f"    Total matches: {subcat_stats['total_matches']}")
                for type_name, count in subcat_stats['types'].items():
                    print(f"    - {type_name}: {count}")

def main():
    # Directory containing the batch files
    input_dir = "/content/drive/MyDrive/NewsData/processed/ElitenessbatchesValidation101224" #skift mellem train og validation

    try:
        # Initialize merger
        merger = BatchMerger(input_dir)

        # Merge batches
        print("Starting merge process...")
        merged_results = merger.merge_batches()

        # Generate statistics
        print("\nGenerating statistics...")
        stats = merger.generate_statistics(merged_results)

        # Save results and statistics
        print("\nSaving results...")
        merger.save_merged_results(merged_results, stats)

    except Exception as e:
        print(f"Error during merge process: {str(e)}")

if __name__ == "__main__":
    main()

Content check

In [None]:
#Check kategorierne og indholdet af dem

import json
from collections import defaultdict
from pprint import pprint

def analyze_category_structure(file_path):
    """
    Analyze the category structure of the merged results JSON file
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Analyze structure
        category_stats = defaultdict(lambda: defaultdict(int))
        category_examples = defaultdict(lambda: defaultdict(list))

        # Collect statistics and examples
        for main_category, subcats in data.items():
            for subcat, items in subcats.items():
                # Count items
                category_stats[main_category][subcat] = len(items)

                # Store first few examples
                category_examples[main_category][subcat] = items[:3]

        # Print analysis
        print("\nCategory Structure Analysis:")
        print("=" * 50)

        for main_cat in sorted(category_stats.keys()):
            print(f"\nMain Category: {main_cat}")
            print("-" * 30)

            for subcat, count in sorted(category_stats[main_cat].items()):
                print(f"\nSubcategory: {subcat}")
                print(f"Count: {count}")
                print("Examples:", ', '.join(category_examples[main_cat][subcat]))

        # Print summary statistics
        print("\nSummary Statistics:")
        print("=" * 50)
        print(f"Total main categories: {len(category_stats)}")
        total_items = sum(sum(subcats.values()) for subcats in category_stats.values())
        print(f"Total items across all categories: {total_items}")

        return category_stats, category_examples

    except Exception as e:
        print(f"Error analyzing file: {str(e)}")
        return None, None

# Run analysis
file_path = "/content/drive/MyDrive/NewsData/processed/Elitenessbatches091224/merged_results_20241210_110522.json"
stats, examples = analyze_category_structure(file_path)

First validation check. Comparison between train and validation set

In [7]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from typing import Dict, List, Set, Tuple
from collections import defaultdict

class PatternValidator:
    def __init__(self, predictions_path: str, ground_truth_path: str):
        self.predictions = self._load_json(predictions_path)
        self.ground_truth = self._load_json(ground_truth_path)
        self.main_categories = ['market_categories', 'elite_hierarchy', 'org_forms', 'international_markers']
        self.subcategories = [
            'energy_and_green_transition', 'welfare', 'maritime_and_shipping', 'agriculture_food',
            'union_labour', 'real_estate', 'finance', 'industry', 'tech', 'regulatory',
            'education', 'healthcare', 'politics', 'aviation', 'design', 'architecture',
            'hospitality', 'tourism', 'appliances'
        ]

    def _load_json(self, path: str) -> Dict:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def calculate_category_distribution(self) -> Dict:
        """Calculate distribution of matches across categories"""
        distribution = defaultdict(lambda: defaultdict(int))

        for dataset in [self.ground_truth, self.predictions]:
            for main_cat in self.main_categories:
                if main_cat in dataset:
                    for subcat in dataset[main_cat]:
                        if isinstance(dataset[main_cat][subcat], dict):
                            for type_name in ['titles', 'orgs', 'keywords']:
                                if type_name in dataset[main_cat][subcat]:
                                    distribution[main_cat][subcat] += len(dataset[main_cat][subcat][type_name])

        return distribution

    def calculate_metrics(self) -> Dict:
        """Calculate precision, recall, and F1 score for each category and type"""
        metrics = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

        for main_cat in self.main_categories:
            if main_cat not in self.ground_truth or main_cat not in self.predictions:
                continue

            for subcat in self.subcategories:
                if subcat not in self.ground_truth[main_cat] or subcat not in self.predictions[main_cat]:
                    continue

                for type_name in ['titles', 'orgs', 'keywords']:
                    if (type_name not in self.ground_truth[main_cat][subcat] or
                        type_name not in self.predictions[main_cat][subcat]):
                        continue

                    true_set = set(self.ground_truth[main_cat][subcat][type_name])
                    pred_set = set(self.predictions[main_cat][subcat][type_name])

                    tp = len(true_set & pred_set)
                    fp = len(pred_set - true_set)
                    fn = len(true_set - pred_set)

                    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
                    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
                    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

                    metrics[main_cat][subcat][type_name] = {
                        'precision': precision,
                        'recall': recall,
                        'f1': f1,
                        'true_positives': tp,
                        'false_positives': fp,
                        'false_negatives': fn,
                        'examples': {
                            'correct_matches': list(true_set & pred_set)[:5],
                            'false_positives': list(pred_set - true_set)[:5],
                            'missed_matches': list(true_set - pred_set)[:5]
                        }
                    }

        return metrics

    def get_error_analysis(self) -> Dict:
        """Analyze patterns in false positives and false negatives"""
        error_analysis = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))

        for main_cat in self.main_categories:
            if main_cat not in self.ground_truth or main_cat not in self.predictions:
                continue

            for subcat in self.subcategories:
                if subcat not in self.ground_truth[main_cat] or subcat not in self.predictions[main_cat]:
                    continue

                for type_name in ['titles', 'orgs', 'keywords']:
                    if (type_name not in self.ground_truth[main_cat][subcat] or
                        type_name not in self.predictions[main_cat][subcat]):
                        continue

                    true_set = set(self.ground_truth[main_cat][subcat][type_name])
                    pred_set = set(self.predictions[main_cat][subcat][type_name])

                    # Analyze false positives
                    for fp in (pred_set - true_set):
                        error_analysis[main_cat][subcat][type_name]['false_positives'].append({
                            'term': fp,
                            'similar_to': self._find_similar_terms(fp, true_set)
                        })

                    # Analyze false negatives
                    for fn in (true_set - pred_set):
                        error_analysis[main_cat][subcat][type_name]['false_negatives'].append({
                            'term': fn,
                            'similar_to': self._find_similar_terms(fn, pred_set)
                        })

        return error_analysis

    def _find_similar_terms(self, term: str, term_set: Set[str], threshold: float = 0.8) -> List[str]:
        """Find similar terms using string similarity"""
        from difflib import SequenceMatcher

        similar_terms = []
        for other_term in term_set:
            similarity = SequenceMatcher(None, term.lower(), other_term.lower()).ratio()
            if similarity >= threshold:
                similar_terms.append((other_term, similarity))

        return [term for term, _ in sorted(similar_terms, key=lambda x: x[1], reverse=True)[:3]]

    def save_validation_results(self, output_path: str):
        """Save validation results to JSON"""
        # Calculate distribution, metrics and error analysis
        distribution = self.calculate_category_distribution()
        metrics = self.calculate_metrics()
        error_analysis = self.get_error_analysis()

        # Calculate overall metrics for each main category and subcategory
        overall_metrics = defaultdict(lambda: defaultdict(dict))
        for main_cat in metrics:
            for subcat in metrics[main_cat]:
                metrics_list = [
                    metrics[main_cat][subcat][t]
                    for t in metrics[main_cat][subcat]
                ]

                overall_metrics[main_cat][subcat] = {
                    'precision': np.mean([m['precision'] for m in metrics_list]),
                    'recall': np.mean([m['recall'] for m in metrics_list]),
                    'f1': np.mean([m['f1'] for m in metrics_list])
                }

        # Prepare validation report
        validation_report = {
            'distribution': distribution,
            'metrics': metrics,
            'error_analysis': error_analysis,
            'overall_metrics': overall_metrics,
            'summary': {
                'overall_precision': np.mean([
                    m[t]['precision']
                    for m in metrics.values()
                    for s in m.values()
                    for t in s.keys()
                ]),
                'overall_recall': np.mean([
                    m[t]['recall']
                    for m in metrics.values()
                    for s in m.values()
                    for t in s.keys()
                ]),
                'overall_f1': np.mean([
                    m[t]['f1']
                    for m in metrics.values()
                    for s in m.values()
                    for t in s.keys()
                ])
            }
        }

        # Save to file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(validation_report, f, ensure_ascii=False, indent=2)

        # Print summary
        print("\nValidation Summary:")
        print(f"Overall Precision: {validation_report['summary']['overall_precision']:.3f}")
        print(f"Overall Recall: {validation_report['summary']['overall_recall']:.3f}")
        print(f"Overall F1 Score: {validation_report['summary']['overall_f1']:.3f}")

        # Print category-wise summary
        print("\nCategory-wise Summary:")
        for main_cat in overall_metrics:
            print(f"\n{main_cat}:")
            for subcat in overall_metrics[main_cat]:
                metrics = overall_metrics[main_cat][subcat]
                print(f"  {subcat}:")
                print(f"    Precision: {metrics['precision']:.3f}")
                print(f"    Recall: {metrics['recall']:.3f}")
                print(f"    F1: {metrics['f1']:.3f}")

def main():
    # Specific paths to your files
    validation_results_path = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesValidation/merged_results_20241216_181011.json"
    training_results_path = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain/merged_results_20241215_203840.json"

    # Path for saving validation analysis
    output_path = "/content/drive/MyDrive/NewsData/processed/validation_analysis_2024161224.json"

    print("Loading validation and training results...")
    print(f"Validation results path: {validation_results_path}")
    print(f"Training results path: {training_results_path}")

    try:
        # Initialize validator using training results as predictions and validation as ground truth
        validator = PatternValidator(
            predictions_path=training_results_path,
            ground_truth_path=validation_results_path
        )

        # Run validation and save results
        print("\nCalculating validation metrics...")
        validator.save_validation_results(output_path)

        print(f"\nValidation analysis saved to: {output_path}")

    except FileNotFoundError as e:
        print(f"Error: Could not find one of the input files: {e}")
    except Exception as e:
        print(f"Error during validation: {e}")

if __name__ == "__main__":
    main()

Loading validation and training results...
Validation results path: /content/drive/MyDrive/NewsData/processed/ElitenessBatchesValidation/merged_results_20241216_181011.json
Training results path: /content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain/merged_results_20241215_203840.json

Calculating validation metrics...
Error during validation: dictionary changed size during iteration


In [None]:
#Første check af sammenligning mellem train og validation ift. dictionaries genereret i train. Ikke overraskende ikke så stort overlap. En del markeret som falsk positive (men er det nok ikke)
import json
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from typing import Dict, List, Set, Tuple
from collections import defaultdict

class PatternValidator:
    def __init__(self, predictions_path: str, ground_truth_path: str):
        self.predictions = self._load_json(predictions_path)
        self.ground_truth = self._load_json(ground_truth_path)

    def _load_json(self, path: str) -> Dict:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def calculate_metrics(self) -> Dict:
        """Calculate precision, recall, and F1 score for each category"""
        metrics = defaultdict(dict)

        for category in self.ground_truth.keys():
            if category not in self.predictions:
                print(f"Warning: Category {category} not found in predictions")
                continue

            for type_name in ['titles', 'orgs', 'keywords']:
                if type_name not in self.ground_truth[category]:
                    continue

                # Get ground truth and predicted sets
                true_set = set(self.ground_truth[category][type_name])
                pred_set = set(self.predictions[category][type_name])

                # Calculate metrics
                tp = len(true_set & pred_set)  # True positives
                fp = len(pred_set - true_set)  # False positives
                fn = len(true_set - pred_set)  # False negatives

                # Calculate precision, recall, F1
                precision = tp / (tp + fp) if (tp + fp) > 0 else 0
                recall = tp / (tp + fn) if (tp + fn) > 0 else 0
                f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

                metrics[category][type_name] = {
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'true_positives': tp,
                    'false_positives': fp,
                    'false_negatives': fn,
                    'examples': {
                        'correct_matches': list(true_set & pred_set)[:5],
                        'false_positives': list(pred_set - true_set)[:5],
                        'missed_matches': list(true_set - pred_set)[:5]
                    }
                }

        return metrics

    def get_error_analysis(self) -> Dict:
        """Analyze patterns in false positives and false negatives"""
        error_analysis = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

        for category in self.ground_truth.keys():
            if category not in self.predictions:
                continue

            for type_name in ['titles', 'orgs', 'keywords']:
                if type_name not in self.ground_truth[category]:
                    continue

                true_set = set(self.ground_truth[category][type_name])
                pred_set = set(self.predictions[category][type_name])

                # Analyze false positives
                for fp in (pred_set - true_set):
                    error_analysis[category][type_name]['false_positives'].append({
                        'term': fp,
                        'similar_to': self._find_similar_terms(fp, true_set)
                    })

                # Analyze false negatives
                for fn in (true_set - pred_set):
                    error_analysis[category][type_name]['false_negatives'].append({
                        'term': fn,
                        'similar_to': self._find_similar_terms(fn, pred_set)
                    })

        return error_analysis

    def _find_similar_terms(self, term: str, term_set: Set[str], threshold: float = 0.8) -> List[str]:
        """Find similar terms using string similarity"""
        from difflib import SequenceMatcher

        similar_terms = []
        for other_term in term_set:
            similarity = SequenceMatcher(None, term.lower(), other_term.lower()).ratio()
            if similarity >= threshold:
                similar_terms.append((other_term, similarity))

        return [term for term, _ in sorted(similar_terms, key=lambda x: x[1], reverse=True)[:3]]

    def save_validation_results(self, output_path: str):
        """Save validation results to JSON"""
        # Calculate metrics and error analysis
        metrics = self.calculate_metrics()
        error_analysis = self.get_error_analysis()

        # Prepare validation report
        validation_report = {
            'metrics': metrics,
            'error_analysis': error_analysis,
            'summary': {
                'overall_precision': np.mean([
                    m[t]['precision']
                    for m in metrics.values()
                    for t in m.keys()
                ]),
                'overall_recall': np.mean([
                    m[t]['recall']
                    for m in metrics.values()
                    for t in m.keys()
                ]),
                'overall_f1': np.mean([
                    m[t]['f1']
                    for m in metrics.values()
                    for t in m.keys()
                ])
            }
        }

        # Save to file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(validation_report, f, ensure_ascii=False, indent=2)

        # Print summary
        print("\nValidation Summary:")
        print(f"Overall Precision: {validation_report['summary']['overall_precision']:.3f}")
        print(f"Overall Recall: {validation_report['summary']['overall_recall']:.3f}")
        print(f"Overall F1 Score: {validation_report['summary']['overall_f1']:.3f}")

def main():
    # Specific paths to your files
    validation_results_path = "/content/drive/MyDrive/NewsData/processed/ElitenessbatchesValidation101224/merged_resultsValidation_20241210_124802.json"
    training_results_path = "/content/drive/MyDrive/NewsData/processed/Elitenessbatches091224/merged_results_20241210_110522.json"

    # Path for saving validation analysis
    output_path = "/content/drive/MyDrive/NewsData/processed/validation_analysis_20241210.json"

    print("Loading validation and training results...")
    print(f"Validation results path: {validation_results_path}")
    print(f"Training results path: {training_results_path}")

    try:
        # Initialize validator using training results as predictions and validation as ground truth
        validator = PatternValidator(
            predictions_path=training_results_path,  # Training results are our predictions
            ground_truth_path=validation_results_path  # Validation results are our ground truth
        )

        # Run validation and save results
        print("\nCalculating validation metrics...")
        validator.save_validation_results(output_path)

        print(f"\nValidation analysis saved to: {output_path}")

    except FileNotFoundError as e:
        print(f"Error: Could not find one of the input files: {e}")
    except Exception as e:
        print(f"Error during validation: {e}")

if __name__ == "__main__":
    main()

Embeddings creation for all markers using BERT for Danish

In [None]:
#embedding etc. creates embeddings and similarity files.

import json
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from google.cloud import storage
from tqdm import tqdm
import os

# Initialize BERT model
tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
model = AutoModel.from_pretrained("Maltehb/danish-bert-botxo")

def load_categories():
    """Load eliteness categories from GCS"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")
    blob = bucket.blob("merged_results_20241210_110522.json")
    categories = json.loads(blob.download_as_string())
    print(f"Loaded categories from GCS")
    return categories

def get_word_embedding(word):
    """Get embedding for a single word/pointer"""
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        # Get the actual word embedding, not special tokens
        word_embedding = outputs.last_hidden_state[0][1].numpy()
    return word_embedding

def process_subcategory_batch(category_name, subcat, items, batch_id, sub_batch_size=1000):
    """Process a subcategory in smaller sub-batches"""
    sub_batch_embeddings = {}
    sub_batch_map = {}

    for i in range(0, len(items), sub_batch_size):
        sub_batch = items[i:i + sub_batch_size]
        print(f"\nProcessing {category_name}_{subcat} sub-batch {i//sub_batch_size}")

        for item in tqdm(sub_batch):
            try:
                embedding = get_word_embedding(item)
                sub_batch_embeddings[item] = embedding
                sub_batch_map[item] = f"{category_name}_{subcat}"
            except Exception as e:
                print(f"Error processing {item}: {str(e)}")

        # Save sub-batch
        if sub_batch_embeddings:
            save_batch_results(
                sub_batch_embeddings,
                sub_batch_map,
                f"{batch_id}_{subcat}_{i//sub_batch_size}",
                category_name
            )

    return sub_batch_embeddings, sub_batch_map

def process_category_batch(category_name, subcategories, batch_id):
    """Process each subcategory separately"""
    batch_embeddings = {}
    batch_map = {}

    print(f"\nProcessing category: {category_name} (Batch {batch_id})")

    for subcat, items in subcategories.items():
        if isinstance(items, list):
            print(f"\nProcessing subcategory: {subcat} ({len(items)} items)")
            sub_embeddings, sub_map = process_subcategory_batch(
                category_name, subcat, items, batch_id
            )
            batch_embeddings.update(sub_embeddings)
            batch_map.update(sub_map)

            # Clear memory
            torch.cuda.empty_cache() if torch.cuda.is_available() else None

    # Save complete category results
    if batch_embeddings:
        save_batch_results(batch_embeddings, batch_map, batch_id, category_name)

    return batch_embeddings, batch_map

def save_batch_results(batch_embeddings, batch_map, batch_id, category_name):
    """Save batch results with error handling"""
    try:
        # Create DataFrame for this batch
        words = list(batch_embeddings.keys())
        embeddings_matrix = np.stack([batch_embeddings[word] for word in words])
        similarity_matrix = np.dot(embeddings_matrix, embeddings_matrix.T)

        results = {
            'pointer': words,
            'category': [batch_map[word] for word in words],
            'embedding': [batch_embeddings[word] for word in words]
        }
        batch_df = pd.DataFrame(results)

        # Save locally with timestamp
        timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
        local_path_df = f'/content/batch_{batch_id}_{category_name}_{timestamp}_embeddings.pkl'
        local_path_matrix = f'/content/batch_{batch_id}_{category_name}_{timestamp}_similarity.npy'

        batch_df.to_pickle(local_path_df)
        np.save(local_path_matrix, similarity_matrix)

        # Save to GCS
        storage_client = storage.Client()
        bucket = storage_client.bucket("eliteness")

        for local_file, file_type in [
            (local_path_df, 'embeddings'),
            (local_path_matrix, 'similarity')
        ]:
            blob = bucket.blob(f'embeddings_analysis/batches/batch_{batch_id}_{category_name}_{timestamp}_{file_type}')
            blob.upload_from_filename(local_file)

        print(f"Saved batch {batch_id} ({category_name}) with {len(words)} pointers")

        # Clear local files
        os.remove(local_path_df)
        os.remove(local_path_matrix)

    except Exception as e:
        print(f"Error saving batch {batch_id}: {str(e)}")
        # Try to save locally only
        try:
            batch_df.to_pickle(f'/content/BACKUP_batch_{batch_id}_{category_name}.pkl')
            print(f"Saved backup to local storage")
        except:
            print("Failed to save backup")

def check_processed_batches():
    """Check which batches and subcategories have already been processed"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Track at subcategory level
    processed_items = defaultdict(dict)
    existing_batches = set()

    print("\nChecking existing files in bucket...")
    for blob in bucket.list_blobs(prefix='embeddings_analysis/batches/'):
        filename = blob.name.split('/')[-1]
        if not filename.startswith('batch_'):
            continue

        parts = filename.split('_')
        if len(parts) >= 4:
            batch_num = parts[1]
            category_type = parts[2]  # e.g., 'entities', 'public'
            subcategory = parts[3]    # e.g., 'organization'

            # Store the complete batch identifier
            batch_key = f"{batch_num}_{category_type}_{subcategory}"
            existing_batches.add(batch_key)

            processed_items[batch_key] = {
                'filename': filename,
                'timestamp': parts[-2] if len(parts) > 4 else None,
                'processed': True
            }

    if existing_batches:
        print("\nFound existing batches:")
        for batch in sorted(existing_batches):
            print(f"- {batch}")
    else:
        print("\nNo existing batches found")

    return processed_items

def process_subcategory_batch(category_name, subcat, items, batch_id, processed_items, sub_batch_size=1000):
    """Process a subcategory in smaller sub-batches with better tracking"""
    sub_batch_embeddings = {}
    sub_batch_map = {}

    total_sub_batches = (len(items) + sub_batch_size - 1) // sub_batch_size

    for i in range(0, len(items), sub_batch_size):
        sub_batch = items[i:i + sub_batch_size]
        sub_batch_id = i // sub_batch_size

        # Check if this specific sub-batch was processed
        key = f"{batch_id}_{category_name}_{subcat}_{sub_batch_id}"
        if key in processed_items:
            print(f"Skipping already processed sub-batch: {key}")
            continue

        print(f"\nProcessing {key} ({len(sub_batch)} items) - Sub-batch {sub_batch_id + 1}/{total_sub_batches}")

        current_batch_embeddings = {}
        current_batch_map = {}

        for item in tqdm(sub_batch):
            try:
                embedding = get_word_embedding(item)
                current_batch_embeddings[item] = embedding
                current_batch_map[item] = f"{category_name}_{subcat}"
            except Exception as e:
                print(f"Error processing {item}: {str(e)}")

        # Save current sub-batch
        if current_batch_embeddings:
            save_batch_results(
                current_batch_embeddings,
                current_batch_map,
                f"{batch_id}_{subcat}_{sub_batch_id}",
                category_name
            )

            # Update main dictionaries
            sub_batch_embeddings.update(current_batch_embeddings)
            sub_batch_map.update(current_batch_map)

            # Clear memory
            del current_batch_embeddings
            del current_batch_map
            torch.cuda.empty_cache() if torch.cuda.is_available() else None

    return sub_batch_embeddings, sub_batch_map

def process_all_categories(resume=True):
    """Process categories with improved tracking"""
    categories = load_categories()
    processed_items = check_processed_batches() if resume else {}

    # Initialize collection dictionaries
    all_embeddings = {}
    all_category_maps = {}

    # Check if we've completed all categories
    total_categories = len(categories.keys())
    max_batch_found = -1
    for key in processed_items.keys():
        try:
            batch_num = int(key.split('_')[0])
            max_batch_found = max(max_batch_found, batch_num)
        except:
            continue

    if max_batch_found >= total_categories - 1:  # -1 because batches start at 0
        print(f"\nAll categories have been processed (found {max_batch_found + 1} batches for {total_categories} categories)")
        print("Processing complete!")
        return all_embeddings, all_category_maps

    print(f"\nContinuing from batch {max_batch_found + 1} of {total_categories} total categories")

    # Continue with processing...
    for batch_id, (category_name, subcategories) in enumerate(categories.items(), start=max_batch_found + 1):
        if batch_id >= total_categories:
            break

        print(f"\nProcessing category: {category_name} (Batch {batch_id})")

        for subcat, items in subcategories.items():
            if isinstance(items, list) and items:
                key_prefix = f"{batch_id}_{category_name}_{subcat}"

                if key_prefix not in processed_items:
                    print(f"\nProcessing subcategory: {subcat} ({len(items)} items)")
                    sub_embeddings, sub_map = process_subcategory_batch(
                        category_name, subcat, items, batch_id, processed_items
                    )
                    if sub_embeddings and sub_map:
                        all_embeddings.update(sub_embeddings)
                        all_category_maps.update(sub_map)
                else:
                    print(f"\nSkipping already processed: {key_prefix}")

    print("\nProcessing complete!")
    return all_embeddings, all_category_maps

def load_batch_results(batch_id, category_name):
    """Load results for a specific batch"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Find the latest version of this batch's files
    batch_files = []
    for blob in bucket.list_blobs(prefix=f'embeddings_analysis/batches/batch_{batch_id}_{category_name}'):
        batch_files.append(blob.name)

    if not batch_files:
        raise Exception(f"No files found for batch {batch_id}")

    # Get latest version based on timestamp
    latest_embedding_file = max([f for f in batch_files if 'embeddings' in f])

    # Download and load
    blob = bucket.blob(latest_embedding_file)
    blob.download_to_filename('/content/temp_batch.pkl')
    batch_df = pd.read_pickle('/content/temp_batch.pkl')

    # Convert back to dictionary format
    embeddings = dict(zip(batch_df['pointer'], batch_df['embedding']))
    category_map = dict(zip(batch_df['pointer'], batch_df['category']))

    # Clean up
    os.remove('/content/temp_batch.pkl')

    print(f"Loaded {len(embeddings)} pointers from existing batch {batch_id}")
    return embeddings, category_map

def main(resume=True):
    print("Starting batch processing of categories...")
    if resume:
        print("Checking for existing progress...")

    all_embeddings, all_category_maps = process_all_categories(resume=resume)
    return all_embeddings, all_category_maps

if __name__ == "__main__":
    all_embeddings, all_category_maps = main(resume=True)


Embeddings cleaning and reorganizing

In [None]:
from google.cloud import storage
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import os

def clean_and_reorganize_embeddings():
    """Clean embedding files and save to new organized structure"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Create a directory for temporary files
    temp_dir = '/content/temp_clean'
    os.makedirs(temp_dir, exist_ok=True)

    # Track unique pointers and their embeddings
    unique_embeddings = {}
    file_mappings = defaultdict(list)

    print("Scanning for embedding files...")
    embedding_files = [
        blob.name for blob in bucket.list_blobs(prefix='embeddings_analysis/batches/')
        if '_embeddings' in blob.name and '_similarity' not in blob.name
    ]

    print(f"Found {len(embedding_files)} files to process")

    # Process files and collect unique embeddings
    for file_name in tqdm(embedding_files, desc="Processing files"):
        try:
            # Download and load file
            temp_file = os.path.join(temp_dir, 'temp_batch.pkl')
            blob = bucket.blob(file_name)
            blob.download_to_filename(temp_file)
            batch_df = pd.read_pickle(temp_file)
            os.remove(temp_file)  # Clean up immediately after use

            # Extract categories from filename
            parts = file_name.split('/')[-1].split('_')
            category_start = 2
            main_category = '_'.join([p for p in parts[category_start:]
                                   if not p.isdigit() and p not in ['embeddings', 'similarity']])

            sub_types = ['titles', 'orgs', 'keywords', 'entities', 'public', 'private']
            sub_category = next((p for p in parts if p in sub_types), 'unknown')

            # Process each row
            for _, row in batch_df.iterrows():
                pointer = row['pointer']
                if pointer not in unique_embeddings:
                    unique_embeddings[pointer] = {
                        'embedding': row['embedding'],
                        'category': row['category'],
                        'main_category': main_category,
                        'sub_category': sub_category
                    }
                    file_mappings[f"{main_category}_{sub_category}"].append(pointer)

        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")
            continue

    print(f"\nFound {len(unique_embeddings)} unique pointers")

    # Save cleaned embeddings by category
    print("\nSaving cleaned embeddings...")

    for category, pointers in file_mappings.items():
        try:
            # Create DataFrame for this category
            category_data = {
                'pointer': [],
                'embedding': [],
                'category': [],
                'main_category': [],
                'sub_category': []
            }

            for pointer in pointers:
                data = unique_embeddings[pointer]
                category_data['pointer'].append(pointer)
                category_data['embedding'].append(data['embedding'])
                category_data['category'].append(data['category'])
                category_data['main_category'].append(data['main_category'])
                category_data['sub_category'].append(data['sub_category'])

            df = pd.DataFrame(category_data)

            # Save locally first
            temp_file = os.path.join(temp_dir, f'{category}_clean.pkl')
            df.to_pickle(temp_file)

            # Upload to new folder in GCS
            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
            blob = bucket.blob(f'embeddings_clean/{category}_{timestamp}.pkl')
            blob.upload_from_filename(temp_file)

            print(f"Saved {len(df)} embeddings for {category}")

            # Clean up local file
            os.remove(temp_file)

        except Exception as e:
            print(f"Error saving {category}: {str(e)}")
            continue

    # Clean up temp directory - now should be empty
    import shutil
    shutil.rmtree(temp_dir)

    print("\nCleaning complete!")
    return len(unique_embeddings)

def verify_cleaned_embeddings():
    """Verify the cleaned embeddings"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    print("\nVerifying cleaned embeddings...")
    clean_files = [
        blob.name for blob in bucket.list_blobs(prefix='embeddings_clean/')
    ]

    total_embeddings = 0
    categories = set()

    for file_name in clean_files:
        try:
            blob = bucket.blob(file_name)
            blob.download_to_filename('/content/verify_temp.pkl')
            df = pd.read_pickle('/content/verify_temp.pkl')

            total_embeddings += len(df)
            categories.add(file_name.split('/')[-1].split('_')[0])

            os.remove('/content/verify_temp.pkl')

        except Exception as e:
            print(f"Error verifying {file_name}: {str(e)}")
            continue

    print(f"\nVerification Results:")
    print(f"Total clean files: {len(clean_files)}")
    print(f"Total embeddings: {total_embeddings}")
    print(f"Categories: {sorted(categories)}")

if __name__ == "__main__":
    print("Starting embedding cleanup process...")
    unique_count = clean_and_reorganize_embeddings()
    verify_cleaned_embeddings()

Classifier training

In [None]:
#Classifier trainer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from google.cloud import storage
from tqdm import tqdm
import pickle
from collections import defaultdict

def combine_batch_embeddings():
    """Load and combine batch embeddings with memory management and duplicate prevention"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Track processed files and unique embeddings
    processed_files = set()
    unique_pointers = set()
    all_embeddings = []
    total_count = 0

    print("Scanning for embedding files...")
    embedding_files = [
        blob.name for blob in bucket.list_blobs(prefix='embeddings_analysis/batches/')
        if '_embeddings' in blob.name and '_similarity' not in blob.name
    ]

    print(f"Found {len(embedding_files)} embedding files")
    batch_size = 10  # Process 10 files at a time

    for i in range(0, len(embedding_files), batch_size):
        batch_files = embedding_files[i:i + batch_size]
        batch_embeddings = []

        print(f"\nProcessing batch {i//batch_size + 1} of {(len(embedding_files) + batch_size - 1)//batch_size}")

        for file_name in batch_files:
            try:
                if file_name in processed_files:
                    continue

                blob = bucket.blob(file_name)
                blob.download_to_filename('/content/temp_batch.pkl')
                batch_df = pd.read_pickle('/content/temp_batch.pkl')

                # Check for duplicate pointers
                new_pointers = set(batch_df['pointer'])
                duplicate_count = len(new_pointers & unique_pointers)
                if duplicate_count > 0:
                    print(f"Found {duplicate_count} duplicates in {file_name}")
                    batch_df = batch_df[~batch_df['pointer'].isin(unique_pointers)]

                if len(batch_df) == 0:
                    continue

                # Update unique pointers
                unique_pointers.update(new_pointers)

                # Extract categories
                parts = file_name.split('/')[-1].split('_')
                category_start = 2
                main_category = '_'.join([p for p in parts[category_start:]
                                       if not p.isdigit() and p not in ['embeddings', 'similarity']])

                sub_types = ['titles', 'orgs', 'keywords', 'entities', 'public', 'private']
                sub_category = next((p for p in parts if p in sub_types), 'unknown')

                # Add category information
                batch_df['main_category'] = main_category
                batch_df['sub_category'] = sub_category

                batch_embeddings.append(batch_df)
                processed_files.add(file_name)

                print(f"Added {len(batch_df)} unique embeddings from {sub_category} in {main_category}")
                total_count += len(batch_df)

            except Exception as e:
                print(f"Error processing {file_name}: {str(e)}")
                continue

        # Combine batch and clear memory
        if batch_embeddings:
            combined_batch = pd.concat(batch_embeddings, ignore_index=True)
            all_embeddings.append(combined_batch)

        # Clear memory
        del batch_embeddings
        import gc
        gc.collect()

    # Final combination
    if not all_embeddings:
        raise ValueError("No embedding files were successfully loaded!")

    final_df = pd.concat(all_embeddings, ignore_index=True)
    print(f"\nFinal Statistics:")
    print(f"Total unique embeddings: {len(final_df)}")
    print(f"Main categories: {sorted(final_df['main_category'].unique())}")
    print(f"Sub-categories: {sorted(final_df['sub_category'].unique())}")

    return final_df

def train_hierarchical_classifier(data_df, batch_size=10000):
    """Train classifier with memory-efficient batching"""
    # Convert embeddings to numpy array in batches
    print("Converting embeddings to numpy array...")
    X_batches = []
    for i in range(0, len(data_df), batch_size):
        batch = data_df.iloc[i:i + batch_size]
        X_batch = np.stack(batch['embedding'].values)
        X_batches.append(X_batch)

    X = np.concatenate(X_batches)
    y_main = data_df['main_category'].values

    print(f"Prepared {len(X)} samples for training")

    # Continue with classifier training as before...
    # [rest of the training code remains the same]

def train_hierarchical_classifier(data_df, batch_size=10000, min_samples=2):
    """Train classifier with memory-efficient batching and minimum sample filtering"""
    print("\nPreparing training data...")

    # Filter categories with too few samples
    category_counts = data_df['main_category'].value_counts()
    valid_categories = category_counts[category_counts >= min_samples].index
    print(f"\nFound {len(valid_categories)} categories with {min_samples}+ samples")
    print("Removing categories:", set(data_df['main_category'].unique()) - set(valid_categories))

    filtered_df = data_df[data_df['main_category'].isin(valid_categories)].copy()

    # Convert embeddings to numpy array in batches
    print("\nConverting embeddings to numpy array...")
    X_batches = []
    for i in range(0, len(filtered_df), batch_size):
        batch = filtered_df.iloc[i:i + batch_size]
        X_batch = np.stack(batch['embedding'].values)
        X_batches.append(X_batch)

    X = np.concatenate(X_batches)
    y_main = filtered_df['main_category'].values

    print(f"\nPrepared {len(X)} samples for training")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_main, test_size=0.2, random_state=42, stratify=y_main
    )

    # Train main classifier
    print("\nTraining main category classifier...")
    main_classifier = LogisticRegression(
        multi_class='multinomial',
        max_iter=1000,
        class_weight='balanced'
    )
    main_classifier.fit(X_train, y_train)

    # Evaluate main classifier
    print("\nMain Category Classification:")
    print("Cross-validation scores:")
    cv_scores = cross_val_score(main_classifier, X, y_main, cv=5)
    print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    y_pred = main_classifier.predict(X_test)
    print("\nTest set performance:")
    print(classification_report(y_test, y_pred))

    # Train subcategory classifiers
    sub_classifiers = {}
    for main_cat in valid_categories:
        cat_mask = filtered_df['main_category'] == main_cat
        cat_data = filtered_df[cat_mask]

        # Check if subcategories have enough samples
        subcat_counts = cat_data['sub_category'].value_counts()
        valid_subcats = subcat_counts[subcat_counts >= min_samples].index

        if len(valid_subcats) > 1:  # Need at least 2 valid subcategories
            print(f"\nTraining {main_cat} subcategory classifier...")

            sub_X = np.stack(cat_data[cat_data['sub_category'].isin(valid_subcats)]['embedding'].values)
            sub_y = cat_data[cat_data['sub_category'].isin(valid_subcats)]['sub_category']

            try:
                sub_clf = LogisticRegression(
                    multi_class='multinomial',
                    max_iter=1000,
                    class_weight='balanced'
                )
                sub_clf.fit(sub_X, sub_y)
                sub_classifiers[main_cat] = sub_clf

                # Evaluate if enough samples
                if len(sub_y) >= 10:  # Minimum for 5-fold CV
                    cv_scores = cross_val_score(sub_clf, sub_X, sub_y, cv=min(5, len(sub_y)))
                    print(f"Mean CV score: {cv_scores.mean():.3f}")
            except Exception as e:
                print(f"Error training subcategory classifier for {main_cat}: {str(e)}")
                continue

    return {
        'main_classifier': main_classifier,
        'sub_classifiers': sub_classifiers,
        'valid_categories': valid_categories
    }

def main():
    # Load embeddings
    print("Loading and combining embeddings...")
    data_df = combine_batch_embeddings()

    # Train classifiers
    classifiers = train_hierarchical_classifier(data_df)

    # Save classifiers
    save_classifiers(classifiers)

    return classifiers, data_df

def save_classifiers(classifiers, prefix='eliteness'):
    """Save trained classifiers to GCS"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Save locally first
    with open(f'/content/{prefix}_classifiers.pkl', 'wb') as f:
        pickle.dump(classifiers, f)

    # Upload to GCS
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    blob = bucket.blob(f'classifiers/{prefix}_classifier_{timestamp}.pkl')
    blob.upload_from_filename(f'/content/{prefix}_classifiers.pkl')
    print(f"Saved classifiers to GCS")

def predict_eliteness(text, classifiers, tokenizer, model):
    """Predict eliteness categories for new text"""
    # Get embedding
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[0][1].numpy()

    # Predict main category
    main_category = classifiers['main_classifier'].predict([embedding])[0]

    # Predict subcategory if available
    sub_category = None
    if main_category in classifiers['sub_classifiers']:
        sub_category = classifiers['sub_classifiers'][main_category].predict([embedding])[0]

    return {
        'text': text,
        'main_category': main_category,
        'sub_category': sub_category
    }

def main():
    # Load and combine embeddings
    data_df = combine_batch_embeddings()

    # Train classifiers
    classifiers = train_hierarchical_classifier(data_df)

    # Save classifiers
    save_classifiers(classifiers)

    return classifiers, data_df

if __name__ == "__main__":
    classifiers, data_df = main()

Eliteness classification and sentiment score estimate on data collected with Google News API.
Change model version for comparison

In [None]:
#nyt data fra google api
from transformers import pipeline, AutoTokenizer, AutoModel
from collections import defaultdict
import spacy
import pandas as pd
import torch
import logging
from google.cloud import storage
import pickle
import numpy as np
from typing import Dict, Tuple, List

def load_classifier():
    """Load trained classifier from GCS"""
    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket("eliteness")

        blobs = list(bucket.list_blobs(prefix='classifiers/'))
        latest_classifier = max(blobs, key=lambda x: x.name)

        latest_classifier.download_to_filename('/content/classifier.pkl')
        with open('/content/classifier.pkl', 'rb') as f:
            classifiers = pickle.load(f)

        print(f"Loaded classifier from {latest_classifier.name}")
        return classifiers

    except Exception as e:
        logging.error(f"Error loading classifier: {str(e)}")
        raise

class NewsAnalyzer:
    def __init__(self):
        self.classifiers = load_classifier()
        self.tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
        self.model = AutoModel.from_pretrained("Maltehb/danish-bert-botxo")
        self.nlp = spacy.load('da_core_news_lg')
        self.sentiment_pipeline = pipeline("sentiment-analysis",
                                        model="DGurgurov/xlm-r_danish_sentiment")

    def clean_text(self, text: str) -> str:
        """Clean text by fixing encoding issues and removing truncation markers"""
        if not isinstance(text, str):
            return ""

        # Fix common encoding issues
        replacements = {
            'Ã¸': 'ø',
            'Ã¦': 'æ',
            'Ã¥': 'å',
            'Ã˜': 'Ø',
            'Ã†': 'Æ',
            'Ã…': 'Å'
        }

        for old, new in replacements.items():
            text = text.replace(old, new)

        # Remove truncation marker and everything after it
        if '[+' in text and 'chars]' in text:
            text = text.split('[+')[0].strip()

        return text

    def get_embedding(self, text: str) -> np.ndarray:
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
            embedding = outputs.last_hidden_state[0][1].numpy()
        return embedding

    def classify_text(self, text: str) -> Dict:
        """Classify a piece of text using the trained classifier"""
        try:
            if not text:
                return {'category': 'unknown', 'confidence': 0.0}

            embedding = self.get_embedding(text)
            main_classifier = self.classifiers['main_classifier']
            pred_class = main_classifier.predict([embedding])[0]
            confidence = np.max(main_classifier.predict_proba([embedding])[0])

            return {'category': pred_class, 'confidence': confidence}
        except Exception as e:
            logging.error(f"Error in text classification: {str(e)}")
            return {'category': 'unknown', 'confidence': 0.0}

    def analyze_sentiment(self, text: str) -> Dict:
        """Analyze sentiment of cleaned text"""
        try:
            if not text:
                return {'label': 'L', 'confidence': 0.0}

            cleaned_text = self.clean_text(text)
            result = self.sentiment_pipeline(cleaned_text[:512])[0]

            label_map = {
                'LABEL_1': 'P',
                'LABEL_0': 'N'
            }
            return {
                'label': label_map.get(result['label'], 'L'),
                'confidence': result['score']
            }
        except Exception as e:
            logging.error(f"Error in sentiment analysis: {str(e)}")
            return {'label': 'L', 'confidence': 0.0}

    def get_combined_sentiment(self, sentiments: List[Dict]) -> Dict:
        """Calculate combined sentiment from multiple analyses"""
        valid_sentiments = [s for s in sentiments if s['label'] != 'L']
        if not valid_sentiments:
            return {'label': 'L', 'confidence': 0.0}

        # Weight sentiments by confidence
        weighted_sum = sum(1 if s['label'] == 'P' else -1 * s['confidence']
                         for s in valid_sentiments)
        avg_confidence = sum(s['confidence'] for s in valid_sentiments) / len(valid_sentiments)

        return {
            'label': 'P' if weighted_sum > 0 else 'N',
            'confidence': avg_confidence
        }

    def analyze_article(self, title: str, description: str, content: str) -> Dict:
        """Analyze full article including title, description, and content"""
        # Clean texts
        clean_title = self.clean_text(title)
        clean_desc = self.clean_text(description)
        clean_content = self.clean_text(content)

        # Split content into sentences
        doc = self.nlp(clean_content)
        sentences = [sent.text for sent in doc.sents]

        # Classify each sentence
        category_counts = defaultdict(int)

        # Classify title
        title_class = self.classify_text(clean_title)
        category_counts[title_class['category']] += 1

        # Classify each sentence
        for sentence in sentences:
            classification = self.classify_text(sentence)
            category_counts[classification['category']] += 1

        # Get primary categories based on counts
        primary_category = max(category_counts.items(), key=lambda x: x[1])

        # Get sentiment for each part
        sentiments = {
            'title': self.analyze_sentiment(clean_title),
            'description': self.analyze_sentiment(clean_desc),
            'content': self.analyze_sentiment(clean_content)
        }

        # Calculate combined sentiment
        combined_sentiment = self.get_combined_sentiment(list(sentiments.values()))
        sentiments['combined'] = combined_sentiment

        return {
            'category_counts': dict(category_counts),
            'primary_category': primary_category[0],
            'category_frequency': primary_category[1],
            'sentiment': sentiments
        }

def analyze_news_dataset(file_path: str) -> Dict:
    analyzer = NewsAnalyzer()
    results = {}

    df = pd.read_csv(file_path)
    print(f"Analyzing {len(df)} articles...")

    for idx, row in df.iterrows():
        print(f"Processing article {idx + 1}/{len(df)}")
        results[idx] = {
            'source': row['source'],
            'title': row['title'],
            'publishedAt': row['publishedAt'],
            'analysis': analyzer.analyze_article(
                row['title'],
                row['description'],
                row['content']
            )
        }

    return results

def display_results(results: Dict):
    for article_id, data in results.items():
        print("\n" + "="*50)
        print(f"Source: {data['source']}")
        print(f"Title: {data['title']}")
        print(f"Date: {data['publishedAt']}")

        analysis = data['analysis']
        print(f"\nPrimary Category: {analysis['primary_category']} "
              f"(frequency: {analysis['category_frequency']})")

        print("\nCategory Distribution:")
        for category, count in sorted(analysis['category_counts'].items(),
                                    key=lambda x: x[1], reverse=True):
            print(f"{category}: {count}")

        print("\nSentiment Analysis:")
        sentiments = analysis['sentiment']
        for part, sent in sentiments.items():
            print(f"{part.capitalize()}: {sent['label']} "
                  f"(confidence: {sent['confidence']:.2f})")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    results = analyze_news_dataset('/content/drive/MyDrive/NewsMarketAnalysis/danish_news_multiple_topics.csv')
    display_results(results)

In [8]:
#inspection of document categories
import json

def inspect_categories(json_file_path):
    """
    Inspect the categories and their associated keys in the given JSON document.
    :param json_file_path: Path to the JSON file.
    """
    try:
        # Load the JSON file
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        print("Categories and their associated keys:\n")

        # Iterate through the categories
        for category, content in data.items():
            print(f"Category: {category}")
            if isinstance(content, dict):
                print(f"  Keys: {list(content.keys())}")
            else:
                print(f"  Content is not a dictionary (type: {type(content)}).")
            print()

    except Exception as e:
        print(f"Error occurred while processing the file: {e}")

def main():
    # Path to the input JSON file
    input_file = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesValidation/merged_results_20241216_181011.json"

    #validation_results_path = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesValidation/merged_results_20241216_181011.json"
    #training_results_path = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain/merged_results_20241215_203840.json"

    # Inspect categories
    inspect_categories(input_file)


if __name__ == "__main__":
    main()


Categories and their associated keys:

Category: market_categories
  Keys: ['energy_and_green_transition', 'welfare', 'maritime_and_shipping', 'agriculture_food', 'union_labour', 'real_estate', 'finance', 'industry', 'tech', 'regulatory', 'education', 'healthcare', 'politics', 'aviation', 'design', 'architecture', 'hospitality', 'tourism', 'appliances']

Category: elite_hierarchy
  Keys: []

Category: org_forms
  Keys: []

Category: international_markers
  Keys: []



In [6]:
import json

def extract_market_categories_details(json_file_path):
    """
    Extract details for each subcategory under 'market_categories' and display their keys and values.
    :param json_file_path: Path to the JSON file.
    """
    try:
        # Load the JSON file
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check if 'market_categories' exists
        if 'market_categories' not in data:
            print("'market_categories' not found in the JSON file.")
            return

        # Extract subcategories and their details
        market_categories = data['market_categories']

        print("Details for each category in 'market_categories':\n")
        for subcategory, content in market_categories.items():
            print(f"- {subcategory}:")
            if isinstance(content, dict):
                for key, values in content.items():
                    print(f"  - {key}: {values}")
            else:
                print(f"  - Content is not a dictionary (type: {type(content)}).")
            print()

    except Exception as e:
        print(f"Error occurred while processing the file: {e}")


def main():
    # Path to the input JSON file
    input_file = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain/merged_results_20241215_203840.json"

    # Extract details for 'market_categories'
    extract_market_categories_details(input_file)


if __name__ == "__main__":
    main()


Details for each category in 'market_categories':

- energy_and_green_transition:
  - titles: ['Bæredygtighedschef', 'Klimachef', 'Klimadirektør', 'Miljøchef', 'VINDMØLLER', 'Vindmølle', 'Vindmøller', 'bæredygtighedschef', 'energidirektør', 'klimachef', 'klimadirektør', 'koncernmiljøchef', 'kraft værk', 'miljøchef', 'over vindmølle', 'vindmølle', 'vindmølleanalytiker', 'vindmøller', 'vindmølles']
  - orgs: ['Energiselskab', 'Energiselskabs', 'Energistyrelse', 'Forsyning', 'Forsynings', 'KLIMA', 'KLIMAs', 'Klima', 'Klimachef', 'Klimadirektør', 'Klimarådgiver', 'MILJØ', 'MIljø', 'Miljø', 'Miljøchef', 'Miljødirektør', 'Miljøformand', 'Miljøkonsulent', 'Miljøs', 'Miljøstyrelse', 'Miljøstyrelses', 'VINDMØLLER', 'Vindmølle', 'Vindmøller', 'energiselskab', 'energiselskabs', 'energistyrelse', 'energistyrelses', 'forsyning', 'forsynings', 'først Miljø', 'gruppe miljø', 'klima', 'klimaanalytiker', 'klimaansvarlig', 'klimachef', 'klimadirektør', 'klimaformand', 'klimakonsulent', 'klimakoordinator