Connect to Colab, Google Cloud Storage

In [51]:
#colab code: mount to drive to import and export data
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
# install libaries and connect to Google Cloud account  to import and export data
!pip install google-cloud-storage transformers torch tqdm pandas
!pip install scikit-learn

# Authenticate with Google Cloud
from google.colab import auth
auth.authenticate_user()

# Set your project ID
!gcloud config set project tokyo-silicon-441818-f7  # Replace with your actual project ID

Updated property [core/project].


Install libraries

In [8]:
!python -m spacy download da_core_news_lg #Download the Danish language model for spacy

Collecting da-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.7.0/da_core_news_lg-3.7.0-py3-none-any.whl (567.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.1/567.1 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: da-core-news-lg
Successfully installed da-core-news-lg-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('da_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Load and prepare data, incl. train and test split



In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
from typing import List, Dict
import random
import re
random.seed(42)

class NewsDataPreparator:
    def __init__(self):
        self.file_paths = [
            "/content/drive/MyDrive/NewsData/dan_news_2021_1M (2)/dan_news_2021_1M/dan_news_2021_1M-sentences.txt",
            "/content/drive/MyDrive/NewsData/dan_news_2020_1M (1)/dan_news_2020_1M/dan_news_2020_1M-sentences.txt",
            "/content/drive/MyDrive/NewsData/dan_newscrawl_2019_1M (1)/dan_newscrawl_2019_1M/dan_newscrawl_2019_1M-sentences.txt",
            "/content/drive/MyDrive/NewsData/dan_newscrawl_2017_1M (1)/dan_newscrawl_2017_1M/dan_newscrawl_2017_1M-sentences.txt",
            "/content/drive/MyDrive/NewsData/dan_newscrawl_2016_1M (1)/dan_newscrawl_2016_1M/dan_newscrawl_2016_1M-sentences.txt",
            "/content/drive/MyDrive/NewsData/dan_newscrawl_2015_1M (1)/dan_newscrawl_2015_1M/dan_newscrawl_2015_1M-sentences.txt",
            "/content/drive/MyDrive/NewsData/dan_newscrawl_2011_1M (1)/dan_newscrawl_2011_1M-sentences.txt",
            "/content/drive/MyDrive/NewsData/dan_news_2008_300K (1)/dan_news_2008_300K-sentences.txt",
            "/content/drive/MyDrive/NewsData/dan_news_2007_1M (1)/dan_news_2007_1M-sentences.txt"
        ]

    def extract_year_from_path(self, file_path: str) -> str:
        """Extract year from file path using regex"""
        match = re.search(r'(?:news|newscrawl)_(\d{4})', file_path)
        if match:
            return match.group(1)
        return "unknown"

    def load_single_file(self, file_path: str) -> pd.DataFrame:
        """Load a single file and convert to DataFrame with proper line parsing"""
        try:
            year = self.extract_year_from_path(file_path)
            data = []

            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if line:
                        # Split on first tab only
                        parts = line.split('\t', 1)
                        if len(parts) == 2:
                            line_num = parts[0].strip()
                            sentence = parts[1].strip()

                            # Store the data
                            data.append({
                                'line_num': line_num,
                                'sentence': sentence,
                                'year': year
                            })

                    if len(data) % 100000 == 0:
                        print(f"Processed {len(data)} lines from {os.path.basename(file_path)}")

            if not data:
                print(f"Warning: No valid data extracted from {file_path}")
                return pd.DataFrame()

            df = pd.DataFrame(data)
            print(f"Successfully loaded {len(df)} sentences from {os.path.basename(file_path)}")

            return df

        except Exception as e:
            print(f"Error loading file {file_path}: {str(e)}")
            return pd.DataFrame()

    def load_all_files(self) -> pd.DataFrame:
        """Load all accessible files and combine into single DataFrame"""
        print("\nLoading files...")
        all_data = []

        accessible_files = [f for f in self.file_paths if os.path.exists(f)]
        if not accessible_files:
            raise FileNotFoundError("No input files are accessible")

        for file_path in tqdm(accessible_files, desc="Loading files"):
            df = self.load_single_file(file_path)
            if not df.empty:
                all_data.append(df)

        if not all_data:
            raise ValueError("No data was successfully loaded from any files")

        combined_df = pd.concat(all_data, ignore_index=True)

        # Sort by line number to maintain order
        combined_df['line_num'] = combined_df['line_num'].astype(int)
        combined_df = combined_df.sort_values('line_num').reset_index(drop=True)

        print(f"\nTotal loaded: {len(combined_df)} sentences")
        print("\nSentences per year:")
        print(combined_df['year'].value_counts().sort_index())

        print("\nSample data:")
        print(combined_df.head(3))

        return combined_df

    def create_train_val_split(self, df: pd.DataFrame, val_size: float = 0.2) -> Dict[str, pd.DataFrame]:
        """Split data into training and validation sets"""
        train_df, val_df = train_test_split(
            df,
            test_size=val_size,
            random_state=42,
            stratify=df['year']
        )

        print(f"\nTraining set: {len(train_df)} sentences")
        print(f"Validation set: {len(val_df)} sentences")

        return {
            'train': train_df,
            'validation': val_df
        }

    def save_processed_data(self, data_dict: Dict[str, pd.DataFrame], output_dir: str):
        """Save processed datasets"""
        os.makedirs(output_dir, exist_ok=True)

        for name, df in data_dict.items():
            output_path = os.path.join(output_dir, f'{name}_data.csv')

            # Save with proper encoding and quoting
            df.to_csv(output_path,
                     index=False,
                     encoding='utf-8',
                     quoting=1)  # QUOTE_ALL to ensure proper handling of embedded commas

            print(f"\nSaved {name} data to {output_path}")
            print(f"Sample from {name} data:")
            print(df.head(3))

            # Validate saved file
            loaded_df = pd.read_csv(output_path)
            print(f"Validation - Successfully loaded saved {name} file: {len(loaded_df)} rows")

def main():
    # Initialize data preparator
    preparator = NewsDataPreparator()

    try:
        # Load all files
        print("Step 1: Loading and parsing files...")
        all_data = preparator.load_all_files()

        # Create train/validation split
        print("\nStep 2: Creating train/validation split...")
        data_splits = preparator.create_train_val_split(all_data)

        # Save processed data
        print("\nStep 3: Saving processed data...")
        output_dir = "/content/drive/MyDrive/NewsData/processed"
        preparator.save_processed_data(data_splits, output_dir)

        print("\nData preprocessing completed successfully!")
        return data_splits

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        return None

if __name__ == "__main__":
    processed_data = main()

Find markers in data set based on regular expressions pattern matching

In [None]:
#forsøg på at forbedre preprocessing - starter ved batch det er nået til
import pandas as pd
import numpy as np
import json
from collections import defaultdict
from tqdm import tqdm
import os
from typing import Dict, List, Set, Tuple
from datetime import datetime
import re
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

class DanishPatternStructure:
    def __init__(self):
        self.elite_hierarchy = {
            'top_level': {
                'titles': [
                    'koncernchef', 'koncern chef', 'konsernchef',
                    'administrerende direktør', 'adm direktør', 'adm dir',
                    'administrerende dir', 'adm. direktør', 'topchef',
                    'top chef', 'bestyrelsesformand', 'bestyrelses formand',
                    'president', 'CEO', 'chief executive officer',
                    'grundlægger', 'grundlæger', 'stifter',
                    'ejer', 'direktionsformand', 'direktions formand',
                    'group ceo', 'koncerndirektør', 'koncern direktør'
                ],
                'indicators': [
                    'koncern', 'gruppe', 'holding', 'international',
                    'group', 'worldwide', 'global', 'nordic',
                    'skandinavisk', 'europæisk', 'executive'
                ]
            },
            'senior_level': {
                'titles': [
                    'direktør', 'områdechef', 'område chef',
                    'afdelingschef', 'afdelings chef', 'afdeling chef',
                    'regionsdirektør', 'regions direktør', 'region direktør',
                    'landechef', 'lande chef', 'land chef',
                    'divisionsdirektør', 'divisions direktør', 'division direktør',
                    'partner', 'senior manager', 'senior direktør'
                ],
                'indicators': [
                    'senior', 'chef', 'leder', 'manager',
                    'director', 'head', 'ansvarlig', 'lead'
                ]
            },
            'expert_level': {
                'titles': [
                    'chefanalytiker', 'chef analytiker', 'specialkonsulent',
                    'special konsulent', 'seniorøkonom', 'senior økonom',
                    'chefforsker', 'chef forsker', 'ekspert',
                    'specialist', 'seniorrådgiver', 'senior rådgiver',
                    'chief analyst', 'senior specialist', 'senior advisor'
                ],
                'indicators': [
                    'specialist', 'ekspert', 'forsker', 'analyst',
                    'researcher', 'advisor', 'consultant', 'expert',
                    'analytiker', 'rådgiver', 'konsulent'
                ]
            }
        }

        # Organizational forms
        self.org_forms = {
            'private': [
                'A/S', 'AS', 'ApS', 'APS', 'I/S', 'IS',
                'K/S', 'KS', 'P/S', 'PS', 'IVS',
                'Holding', 'Group', 'Gruppen', 'Koncern',
                'Koncernen', 'Danmark', 'Danish', 'International',
                'Global', 'Nordic', 'Skandinavisk'
            ],
            'public': [
                'Kommune', 'Kommunen', 'Region', 'Regionen',
                'Ministerium', 'Ministeriet', 'Styrelse',
                'Styrelsen', 'Direktorat', 'Direktoratet',
                'Institut', 'Instituttet', 'Center', 'Centret',
                'Forvaltning', 'Forvaltningen', 'Myndighed',
                'Myndigheden', 'Råd', 'Rådet'
            ],
            'associations': [
                'Forening', 'Foreningen', 'Forbund', 'Forbundet',
                'Organisation', 'Organisationen', 'Fond', 'Fonden',
                'Fagforening', 'Fagforeningen', 'Sammenslutning',
                'Sammenslutningen', 'Selskab', 'Selskabet',
                'Forening', 'NGO', 'Interesseorganisation'
            ]
        }

        # International markers
        self.international_markers = {
            'eu': {
                'titles': [
                    'EU-kommissær', 'EU kommissær', 'EUkommissær',
                    'MEP', 'europaparlamentariker', 'europa parlamentariker',
                    'EU-direktør', 'EU direktør', 'EUdirektør',
                    'EU-chef', 'EU chef', 'EUchef',
                    'europa chef', 'europæisk direktør', 'europa direktør'
                ],
                'indicators': [
                    'EU-Kommission', 'EU Kommission', 'EUKommission',
                    'Europaparlament', 'Europa Parlament', 'Europa-Parlament',
                    'EU-Domstol', 'EU Domstol', 'EUDomstol',
                    'EU-agentur', 'EU agentur', 'EUagentur',
                    'EU-kontor', 'EU kontor', 'EUkontor',
                    'europæisk institution', 'europa institution'
                ]
            },
            'nordic': {
                'titles': [
                    'nordisk direktør', 'nordisk chef', 'nordisk leder',
                    'skandinavienchef', 'skandinavien chef', 'skandinavisk direktør',
                    'nordisk koordinator', 'skandinavisk leder', 'nordic director',
                    'nordisk ansvarlig', 'skandinavisk ansvarlig'
                ],
                'indicators': [
                    'Nordisk Råd', 'Nordisk Raad', 'Nordisk Ministerråd',
                    'Nordisk Ministerraad', 'skandinavisk afdeling',
                    'skandinavisk kontor', 'nordisk afdeling', 'nordisk kontor',
                    'nordic office', 'skandinavisk division'
                ]
            }
        }
        self.market_categories = {
            'energy_and_green_transition': {
                'titles': [
                    'klimadirektør', 'klima direktør', 'klimachef', 'klima chef',
                    'bæredygtighedschef', 'bæredygtighed chef', 'miljøchef', 'miljø chef',
                    'energidirektør', 'energi direktør', 'vindmølle', 'kraft værk',
                    'energi chef', 'miljø direktør', 'bæredygtighed direktør'
                ],
                'orgs': [
                    'energiselskab', 'energi selskab', 'vindmølle', 'miljø',
                    'forsyning', 'energistyrelse', 'klima', 'miljøstyrelse'
                ],
                'keywords': [
                    'grøn', 'bæredygtig', 'vedvarende', 'energi', 'klima',
                    'miljø', 'vindkraft', 'power', 'omstilling'
                ]
            },
            'welfare': {
                'titles': [
                    'velfærdsdirektør', 'velfærd direktør', 'socialchef', 'social chef',
                    'ældrechef', 'ældre chef', 'børnechef', 'børne chef', 'ungechef',
                    'beskæftigelseschef', 'beskæftigelse chef', 'integrationschef'
                ],
                'orgs': [
                    'kommune', 'region', 'socialstyrelse', 'beskæftigelse',
                    'socialministerium', 'velfærd', 'ældrecenter', 'børnehus'
                ],
                'keywords': [
                    'velfærd', 'social', 'ældre', 'børn', 'unge',
                    'beskæftigelse', 'integration', 'kommune', 'omsorg'
                ]
            },
            'maritime_and_shipping': {
                'titles': [
                    'rederdirektør', 'reder', 'havnedirektør', 'havn chef',
                    'skibsreder', 'marinechef', 'marine chef', 'offshore chef',
                    'logistikchef', 'logistik direktør', 'fragtchef'
                ],
                'orgs': [
                    'rederi', 'havn', 'værft', 'søfart', 'maritime',
                    'offshore', 'shipping', 'container', 'fragt'
                ],
                'keywords': [
                    'søfart', 'shipping', 'maritim', 'offshore', 'havn',
                    'skib', 'container', 'logistik', 'fragt'
                ]
            },
            'agriculture_food': {
                'titles': [
                    'landbrugsdirektør', 'landbrug chef', 'fødevarechef', 'fødevare direktør',
                    'mejerichef', 'mejeri direktør', 'landbrugspræsident', 'fødevaredirektør',
                    'landmand', 'gårdejer'
                ],
                'orgs': [
                    'landbrug', 'fødevare', 'mejeri', 'slagteri',
                    'landbrugsorganisation', 'fødevarestyrelse', 'gård'
                ],
                'keywords': [
                    'landbrug', 'fødevare', 'mejeri', 'økologi',
                    'fødevaresikkerhed', 'eksport', 'gård', 'mark'
                ]
            },
            'union_labour': {
                'titles': [
                    'forbundsformand', 'forbund formand', 'fagforeningsformand',
                    'hovedkasserer', 'forhandlingsleder', 'forhandling chef',
                    'arbejdsmarkedschef', 'arbejdsmarked direktør'
                ],
                'orgs': [
                    'fagforening', 'forbund', 'hovedorganisation', 'akasse',
                    'arbejdsgiver', 'overenskomst', 'fagbevægelse'
                ],
                'keywords': [
                    'overenskomst', 'arbejdsmarked', 'fagbevægelse', 'medlem',
                    'forhandling', 'arbejdsret', 'faglig'
                ]
            },
            'real_estate': {
                'titles': [
                    'ejendomsmægler', 'developer', 'ejendomsadministrator',
                    'bygherrerådgiver', 'ejendom chef', 'bolig direktør'
                ],
                'orgs': [
                    'ejendomsselskab', 'developer', 'boligforening', 'administration',
                    'boligselskab', 'ejendomsadministration'
                ],
                'keywords': [
                    'ejendom', 'bolig', 'byggeri', 'udlejning', 'investering',
                    'developer', 'administration', 'bygherre'
                ]
            },
            'finance': {
                'titles': [
                    'bankdirektør', 'bank chef', 'finansdirektør', 'finans chef',
                    'investor', 'analytiker', 'økonom', 'fondforvalter',
                    'porteføljemanager', 'aktiestrateg', 'valutahandler'
                ],
                'orgs': [
                    'bank', 'børs', 'investering', 'kapitalfond', 'nationalbank',
                    'realkredit', 'pension', 'forsikring'
                ],
                'keywords': [
                    'finans', 'aktie', 'obligation', 'investering', 'rente',
                    'marked', 'valuta', 'børs', 'opkøb', 'fusion'
                ]
            },
            'industry': {
                'titles': [
                    'administrerende direktør', 'adm direktør', 'bestyrelsesformand',
                    'koncernchef', 'koncern direktør', 'fabriksdirektør',
                    'produktionschef', 'industri chef'
                ],
                'orgs': [
                    'virksomhed', 'koncern', 'industri', 'produktion',
                    'erhverv', 'fabrik', 'industri'
                ],
                'keywords': [
                    'produktion', 'industri', 'marked', 'fabrik', 'supply chain',
                    'erhverv', 'koncern', 'virksomhed'
                ]
            },
            'tech': {
                'titles': [
                    'udviklingschef', 'udvikling direktør', 'techekspert', 'tech chef',
                    'itdirektør', 'it chef', 'softwareudvikler', 'dataanalytiker',
                    'digital chef', 'innovation direktør'
                ],
                'orgs': [
                    'tech', 'startup', 'software', 'ai', 'teknologi',
                    'it', 'digital', 'data'
                ],
                'keywords': [
                    'teknologi', 'digital', 'innovation', 'data', 'kunstig intelligens',
                    'automatisering', 'software', 'it', 'tech'
                ]
            },
            'regulatory': {
                'titles': [
                    'minister', 'direktør', 'formand', 'tilsynschef',
                    'departementchef', 'styrelsesdirektør', 'afdelingschef'
                ],
                'orgs': [
                    'ministerium', 'styrelse', 'tilsyn', 'myndighed',
                    'domstol', 'departement', 'forvaltning'
                ],
                'keywords': [
                    'regulering', 'lovgivning', 'tilsyn', 'politik',
                    'beskatning', 'myndighed', 'forvaltning'
                ]
            },
            'education': {
                'titles': [
                    'rektor', 'professor', 'lektor', 'underviser',
                    'uddannelseschef', 'uddannelse direktør', 'skoleleder',
                    'dekan', 'institutleder'
                ],
                'orgs': [
                    'universitet', 'gymnasium', 'skole', 'uddannelse',
                    'institut', 'fakultet', 'akademi'
                ],
                'keywords': [
                    'uddannelse', 'forskning', 'læring', 'pædagogik',
                    'student', 'elev', 'undervisning'
                ]
            },
            'healthcare': {
                'titles': [
                    'læge', 'sygeplejerske', 'hospitaldirektør', 'hospital chef',
                    'specialist', 'sundhedsøkonom', 'forskningschef', 'overlæge',
                    'produktchef', 'regulatory affairs', 'medicinal direktør'
                ],
                'orgs': [
                    'hospital', 'klinik', 'sundhedsstyrelse', 'apotek',
                    'medicinal', 'biotek', 'sundhed', 'læge'
                ],
                'keywords': [
                    'sundhed', 'patient', 'medicin', 'behandling', 'sygdom',
                    'diabetes', 'biotek', 'klinisk', 'insulin'
                ]
            },
            'politics': {
                'titles': [
                    'politiker', 'minister', 'borgmester', 'rådmand',
                    'mfer', 'folketingsmedlem', 'regionsrådsformand',
                    'kommunalbestyrelsesmedlem'
                ],
                'orgs': [
                    'folketing', 'byråd', 'ministerium', 'parti',
                    'kommission', 'regering', 'kommunalbestyrelse'
                ],
                'keywords': [
                    'politik', 'valg', 'beslutning', 'samfund',
                    'demokrati', 'lovgivning', 'reform'
                ]
            },
            'aviation': {
                'titles': [
                    'pilot', 'flyveleder', 'luftfartsdirektør', 'luftfart chef',
                    'kabinechef', 'teknisk chef', 'lufthavnsdirektør'
                ],
                'orgs': [
                    'luftfart', 'lufthavn', 'flyproducent', 'flyselskab',
                    'aviation', 'airline'
                ],
                'keywords': [
                    'fly', 'luftfart', 'rejse', 'sikkerhed', 'transport',
                    'lufthavn', 'aviation', 'airline'
                ]
            },
            'design': {
                'titles': [
                    'designer', 'kreativ direktør', 'produktudvikler',
                    'modeekspert', 'designchef', 'art director'
                ],
                'orgs': [
                    'designstudie', 'modehus', 'designfirma',
                    'tegnestue', 'kreativ', 'mode'
                ],
                'keywords': [
                    'design', 'mode', 'produkt', 'æstetik',
                    'bruger', 'kreativ', 'kunst'
                ]
            },
            'architecture': {
                'titles': [
                    'arkitekt', 'bygningsdesigner', 'landskabsarkitekt',
                    'byplanlægger', 'partner', 'kreativ direktør'
                ],
                'orgs': [
                    'arkitektfirma', 'tegnestue', 'byplanlægning',
                    'arkitektur', 'design'
                ],
                'keywords': [
                    'arkitektur', 'byrum', 'byggeri', 'design',
                    'æstetik', 'byplanlægning', 'landskab'
                ]
            },
            'hospitality': {
                'titles': [
                    'hotelchef', 'hotel direktør', 'kok', 'restauratør',
                    'sommelier', 'restaurantchef', 'køkkenchef'
                ],
                'orgs': [
                    'hotel', 'restaurant', 'catering', 'gastronomi',
                    'hospitality'
                ],
                'keywords': [
                    'hotel', 'restaurant', 'mad', 'overnatning',
                    'oplevelse', 'service', 'gastronomi'
                ]
            },
            'tourism': {
                'titles': [
                    'turistchef', 'turist direktør', 'rejseleder',
                    'destinationschef', 'destination direktør'
                ],
                'orgs': [
                    'turistkontor', 'rejsebureau', 'destination',
                    'turisme', 'rejse'
                ],
                'keywords': [
                    'rejse', 'turisme', 'oplevelse', 'destination',
                    'attraktion', 'ferie', 'turist'
                ]
            },
            'appliances': {
                'titles': [
                    'produktchef', 'produkt direktør', 'ingeniør',
                    'udviklingschef', 'teknisk chef', 'salgschef'
                ],
                'orgs': [
                    'hvidevare', 'elektronik', 'distribution',
                    'producent', 'forhandler'
                ],
                'keywords': [
                    'hvidevare', 'elektronik', 'køkken', 'innovation',
                    'produkt', 'teknisk', 'udvikling'
                ]
            }
          }

        # Add compound connectors
        self.compound_connectors = {
            's': ['erhverv', 'forbund', 'regering', 'arbejd', 'uddannelse', 'udvikling'],
            'e': ['børn', 'folk', 'kommun', 'skol', 'virksomhed'],
            'r': ['lær', 'led', 'arbejd', 'direktør', 'chef']
        }

        # Add prefixes and suffixes
        self.common_prefixes = [
            'over', 'under', 'mellem', 'chef', 'top', 'vice', 'først', 'senior', 'junior',
            'special', 'hoved', 'general', 'central', 'koncern', 'gruppe', 'region'
        ]

        self.common_suffixes = [
            'chef', 'direktør', 'leder', 'ansvarlig', 'koordinator', 'konsulent',
            'specialist', 'analytiker', 'rådgiver', 'formand', 'præsident'
        ]

    def get_all_patterns(self, term: str) -> List[str]:
        """Generate all possible patterns for a term including variations"""
        patterns = [term]

        # Add space-separated version
        if '-' in term:
            patterns.append(term.replace('-', ' '))

        # Add compound variations
        for connector in self.compound_connectors.keys():
            patterns.append(f"{term}{connector}")

        # Add prefix/suffix combinations
        for prefix in self.common_prefixes:
            patterns.append(f"{prefix}{term}")
            patterns.append(f"{prefix}-{term}")
            patterns.append(f"{prefix} {term}")

        for suffix in self.common_suffixes:
            patterns.append(f"{term}{suffix}")
            patterns.append(f"{term}-{suffix}")
            patterns.append(f"{term} {suffix}")

        return patterns

def validate_first_batch(batch_results):
    """Validate the structure of the first batch results"""
    status = 'valid'
    messages = []

    if not isinstance(batch_results, dict):
        status = 'invalid'
        messages.append("Results should be a dictionary")

    if 'markers' not in batch_results:
        status = 'invalid'
        messages.append("Results should contain 'markers' key")

    return {
        'status': status,
        'messages': messages
    }

def print_validation_results(validation_results):
    """Print validation results"""
    if validation_results['status'] == 'valid':
        print("Validation passed successfully")
    else:
        print("Validation failed:")
        for message in validation_results['messages']:
            print(f"- {message}")

import pandas as pd
import numpy as np
import json
from collections import defaultdict
from tqdm import tqdm
import os
from typing import Dict, List, Set, Tuple
from datetime import datetime
import re
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

def process_chunk(chunk_data: Tuple[List[str], Dict]) -> Dict[str, Dict[str, List[str]]]:
    """Process a chunk of sentences - standalone function for multiprocessing"""
    sentences, compiled_patterns = chunk_data
    results = {}

    for sentence in sentences:
        for category, type_patterns in compiled_patterns.items():
            if category not in results:
                results[category] = {}
            for type_name, pattern in type_patterns.items():
                matches = pattern.findall(sentence)
                if matches:
                    if type_name not in results[category]:
                        results[category][type_name] = []
                    results[category][type_name].extend(matches)

    return results

def merge_results(results_list: List[Dict]) -> Dict:
    """Merge results from multiple chunks"""
    merged = {}

    for result in results_list:
        for category, type_dict in result.items():
            if category not in merged:
                merged[category] = {}
            for type_name, matches in type_dict.items():
                if type_name not in merged[category]:
                    merged[category][type_name] = []
                merged[category][type_name].extend(matches)

    # Convert lists to sets and back to remove duplicates
    for category in merged:
        for type_name in merged[category]:
            merged[category][type_name] = list(set(merged[category][type_name]))

    return merged

class OptimizedPatternMatchingAnalyzer:
    def __init__(self, batch_size=50000):
        self.batch_size = batch_size
        self.danish_patterns = DanishPatternStructure()
        self.compiled_patterns = self._precompile_patterns()

    def _precompile_patterns(self):
        """Precompile all patterns with optimized regex"""
        patterns = {}
        for category, type_dict in self.danish_patterns.market_categories.items():
            patterns[category] = {}
            for type_name, terms in type_dict.items():
                # Combine all terms into a single regex pattern with alternation
                expanded_terms = []
                for term in terms:
                    expanded_terms.extend(self.danish_patterns.get_all_patterns(term))
                # Use alternation (|) to create a single pattern for all terms
                combined_pattern = '|'.join(fr'\b{re.escape(term)}\b' for term in expanded_terms)
                patterns[category][type_name] = re.compile(combined_pattern, re.IGNORECASE | re.UNICODE)
        return patterns

    def _find_matches_in_text(self, text: str) -> Dict[str, Dict[str, List[str]]]:
        """Optimized pattern matching"""
        results = {}

        for category, type_patterns in self.compiled_patterns.items():
            if category not in results:
                results[category] = {}
            for type_name, pattern in type_patterns.items():
                matches = pattern.findall(text)
                if matches:
                    if type_name not in results[category]:
                        results[category][type_name] = []
                    results[category][type_name].extend(matches)

        return results

    def process_batch(self, sentences: List[str], batch_num: int) -> Dict:
        """Process a batch of sentences using parallel processing"""
        # Determine number of CPU cores to use
        num_cores = max(1, multiprocessing.cpu_count() - 1)

        # Split sentences into chunks for parallel processing
        chunk_size = max(1, len(sentences) // num_cores)
        chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

        # Prepare data for parallel processing
        chunk_data = [(chunk, self.compiled_patterns) for chunk in chunks]

        # Process chunks in parallel
        with ProcessPoolExecutor(max_workers=num_cores) as executor:
            chunk_results = list(executor.map(process_chunk, chunk_data))

        # Merge results from all chunks
        merged_results = merge_results(chunk_results)

        return {'markers': merged_results}

def get_last_processed_batch(output_dir):
    """
    Get the number of the last processed batch from existing files
    """
    try:
        # Get all batch files in the directory
        if not os.path.exists(output_dir):
            return -1

        files = [f for f in os.listdir(output_dir) if f.startswith('batch_') and f.endswith('.json')]
        if not files:
            return -1

        # Extract batch numbers from filenames
        batch_numbers = []
        for file in files:
            match = re.search(r'batch_(\d+)_', file)
            if match:
                batch_numbers.append(int(match.group(1)))

        return max(batch_numbers) if batch_numbers else -1
    except Exception as e:
        print(f"Error checking last processed batch: {str(e)}")
        return -1

def get_total_batches(input_path, batch_size):
    """
    Calculate total number of batches in the dataset
    """
    try:
        # Count total lines in CSV without loading entire file
        with open(input_path, 'r', encoding='utf-8') as f:
            total_lines = sum(1 for _ in f) - 1  # Subtract 1 for header
        return (total_lines // batch_size) + (1 if total_lines % batch_size else 0)
    except Exception as e:
        print(f"Error calculating total batches: {str(e)}")
        return 0

def process_dataset(analyzer, input_path, output_dir, dataset_name):
    """Optimized dataset processing with crash recovery"""
    try:
        print(f"\nProcessing {dataset_name} data from: {input_path}")

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Get last processed batch number
        last_processed_batch = get_last_processed_batch(output_dir)
        total_batches = get_total_batches(input_path, analyzer.batch_size)

        print(f"Found {last_processed_batch + 1} previously processed batches")
        print(f"Total batches to process: {total_batches}")

        if last_processed_batch >= total_batches - 1:
            print(f"All batches already processed for {dataset_name}")
            return

        # Read data in chunks to reduce memory usage
        chunk_iterator = pd.read_csv(input_path, chunksize=analyzer.batch_size)

        # Skip already processed batches
        for _ in range(last_processed_batch + 1):
            next(chunk_iterator, None)

        # Process remaining batches
        for batch_num, chunk in enumerate(chunk_iterator, start=last_processed_batch + 1):
            print(f"\nProcessing batch {batch_num} of {total_batches}...")

            batch_sentences = chunk['sentence'].tolist()

            try:
                batch_results = analyzer.process_batch(batch_sentences, batch_num)

                # Save results
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                batch_path = os.path.join(output_dir, f"batch_{batch_num}_{timestamp}.json")

                with open(batch_path, 'w', encoding='utf-8') as f:
                    json.dump(batch_results, f, ensure_ascii=False, indent=2)

                print(f"Saved batch {batch_num} results")

            except Exception as e:
                print(f"Error processing batch {batch_num}: {str(e)}")
                continue

        print(f"\nCompleted processing {dataset_name} data!")

    except Exception as e:
        print(f"Error processing {dataset_name} dataset: {str(e)}")

def main():
    try:
        # Initialize analyzer
        analyzer = OptimizedPatternMatchingAnalyzer(batch_size=50000)

        # Process datasets
        train_input = "/content/drive/MyDrive/NewsData/processed/train_data.csv"
        train_output = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain"
        process_dataset(analyzer, train_input, train_output, "Training")

        val_input = "/content/drive/MyDrive/NewsData/processed/validation_data.csv"
        val_output = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesValidation"
        process_dataset(analyzer, val_input, val_output, "Validation")

        print("\nAll processing completed successfully!")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()

Check batches for categories

In [8]:
import json
import os
import glob
from collections import defaultdict

def analyze_batch_files(input_dir: str):
    """Analyze batch files to check which categories are present"""
    print(f"Analyzing batch files in: {input_dir}")

    # Get all batch files
    batch_files = glob.glob(os.path.join(input_dir, "batch_*.json"))
    if not batch_files:
        print("No batch files found!")
        return

    print(f"Found {len(batch_files)} batch files")

    # Track categories and their structure
    categories = defaultdict(set)
    sample_content = {}

    # Analyze a few files to get structure
    for file_path in batch_files[:3]:  # Look at first 3 files
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

                if 'markers' in data:
                    for category, content in data['markers'].items():
                        # Track category names
                        categories[category].update(content.keys())
                        # Save sample content structure
                        if category not in sample_content:
                            sample_content[category] = content

        except Exception as e:
            print(f"Error reading {os.path.basename(file_path)}: {str(e)}")
            continue

    # Print findings
    print("\nFound Categories and their subcategories:")
    for category, subcategories in categories.items():
        print(f"\n{category}:")
        for subcategory in sorted(subcategories):
            print(f"  - {subcategory}")
            # Show a sample of content (first 3 items)
            if category in sample_content and subcategory in sample_content[category]:
                sample = sample_content[category][subcategory][:3] if sample_content[category][subcategory] else []
                if sample:
                    print(f"    Sample matches: {', '.join(sample)}")

    print("\nAnalysis complete!")

def main():
    # Directory containing the batch files
    train_dir = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain"
    val_dir = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesValidation"

    print("Analyzing Training Data:")
    analyze_batch_files(train_dir)

    print("\nAnalyzing Validation Data:")
    analyze_batch_files(val_dir)

if __name__ == "__main__":
    main()

Analyzing Training Data:
Analyzing batch files in: /content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain
Found 119 batch files

Found Categories and their subcategories:

energy_and_green_transition:
  - keywords
    Sample matches: Power, Powers, vedvarende
  - orgs
    Sample matches: klimachef, forsynings, vindmøller
  - titles
    Sample matches: klimachef, vindmøller, Vindmøller

welfare:
  - keywords
    Sample matches: Kommune, børn, børne
  - orgs
    Sample matches: Kommune, Ældrecenter, Region
  - titles
    Sample matches: socialchef

maritime_and_shipping:
  - keywords
    Sample matches: Havns, Fragt, containere
  - orgs
    Sample matches: Havns, Fragt, containere
  - titles
    Sample matches: skibsreder, reder

agriculture_food:
  - keywords
    Sample matches: eksport, fødevarer, Overmark
  - orgs
    Sample matches: landbrug, fødevarer, landbrugs
  - titles
    Sample matches: Landmands, Landmand, gårdejer

union_labour:
  - keywords
    Sample matches: Arbe

Merging of batches for train and validation data. Change paths accordingly

In [20]:
#merging and saving results
import json
import os
from collections import defaultdict
from datetime import datetime
from typing import Dict, List, Set
import glob

class BatchMerger:
    def __init__(self, input_dir: str):
        self.input_dir = input_dir

    def get_batch_files(self) -> List[str]:
        """Get all batch result files, excluding summary files"""
        files = glob.glob(os.path.join(self.input_dir, "batch_*.json"))
        return [f for f in files if not f.endswith('_summary.json')]

    def merge_batches(self) -> Dict:
        """Merge all batch files into a single dictionary"""
        merged_results = {
            'market_categories': defaultdict(lambda: defaultdict(set)),
            'elite_hierarchy': defaultdict(lambda: defaultdict(set)),
            'org_forms': defaultdict(lambda: defaultdict(set)),
            'international_markers': defaultdict(lambda: defaultdict(set))
        }
        processed_files = 0

        batch_files = self.get_batch_files()
        print(f"Found {len(batch_files)} batch files to merge")

        for file_path in sorted(batch_files):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    batch_data = json.load(f)

                # Merge markers from this batch
                if 'markers' in batch_data:
                    for category, type_dict in batch_data['markers'].items():
                        # Determine which top-level category this belongs to
                        if category in ['top_level', 'senior_level', 'expert_level']:
                            for type_name, matches in type_dict.items():
                                merged_results['elite_hierarchy'][category][type_name].update(matches)
                        elif category in ['private', 'public', 'associations']:
                            for type_name, matches in type_dict.items():
                                merged_results['org_forms'][category][type_name].update(matches)
                        elif category in ['eu', 'nordic']:
                            for type_name, matches in type_dict.items():
                                merged_results['international_markers'][category][type_name].update(matches)
                        else:  # market categories
                            for type_name, matches in type_dict.items():
                                merged_results['market_categories'][category][type_name].update(matches)

                processed_files += 1
                print(f"Processed file {processed_files}/{len(batch_files)}: {os.path.basename(file_path)}")

            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
                continue

        # Convert sets to lists for JSON serialization
        final_results = {}
        for main_category, categories in merged_results.items():
            final_results[main_category] = {
                category: {
                    type_name: sorted(list(matches))
                    for type_name, matches in type_dict.items()
                }
                for category, type_dict in categories.items()
            }

        return final_results

    def generate_statistics(self, merged_results: Dict) -> Dict:
        """Generate statistics about the merged results"""
        stats = {
            'total_main_categories': len(merged_results),
            'main_categories': {},
            'total_matches': 0
        }

        for main_category, categories in merged_results.items():
            main_category_stats = {
                'total_matches': 0,
                'subcategories': {}
            }

            for category, type_dict in categories.items():
                category_stats = {
                    'total_matches': 0,
                    'types': {}
                }

                for type_name, matches in type_dict.items():
                    num_matches = len(matches)
                    category_stats['types'][type_name] = num_matches
                    category_stats['total_matches'] += num_matches
                    main_category_stats['total_matches'] += num_matches
                    stats['total_matches'] += num_matches

                main_category_stats['subcategories'][category] = category_stats

            stats['main_categories'][main_category] = main_category_stats

        return stats

    # The save_merged_results method remains the same
    def save_merged_results(self, merged_results: Dict, stats: Dict):
        """Save merged results and statistics"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save merged results
        results_filename = f"merged_results_{timestamp}.json"
        results_path = os.path.join(self.input_dir, results_filename)

        with open(results_path, 'w', encoding='utf-8') as f:
            json.dump(merged_results, f, ensure_ascii=False, indent=2)

    # Save statistics
        stats_filename = f"merged_results_stats_{timestamp}.json"
        stats_path = os.path.join(self.input_dir, stats_filename)

        with open(stats_path, 'w', encoding='utf-8') as f:
            json.dump(stats, f, ensure_ascii=False, indent=2)

        print(f"\nResults saved to: {results_filename}")
        print(f"Statistics saved to: {stats_filename}")

    # Print summary statistics
        print("\nSummary Statistics:")
        print(f"Total main categories: {stats['total_main_categories']}")
        print(f"Total matches: {stats['total_matches']}")
        print("\nMatches by main category:")
        for main_category, main_cat_stats in stats['main_categories'].items():
            print(f"\n{main_category}:")
            print(f"  Total matches: {main_cat_stats['total_matches']}")
            for subcategory, subcat_stats in main_cat_stats['subcategories'].items():
                print(f"\n  {subcategory}:")
                print(f"    Total matches: {subcat_stats['total_matches']}")
                for type_name, count in subcat_stats['types'].items():
                    print(f"    - {type_name}: {count}")

def main():
    # Directory containing the batch files
    input_dir = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesValidation" #skift mellem train og validation

    try:
        # Initialize merger
        merger = BatchMerger(input_dir)

        # Merge batches
        print("Starting merge process...")
        merged_results = merger.merge_batches()

        # Generate statistics
        print("\nGenerating statistics...")
        stats = merger.generate_statistics(merged_results)

        # Save results and statistics
        print("\nSaving results...")
        merger.save_merged_results(merged_results, stats)

    except Exception as e:
        print(f"Error during merge process: {str(e)}")

if __name__ == "__main__":
    main()

Starting merge process...
Found 34 batch files to merge
Processed file 1/34: batch_0_20241216_011536.json
Processed file 2/34: batch_10_20241216_073407.json
Processed file 3/34: batch_11_20241216_075627.json
Processed file 4/34: batch_12_20241216_081856.json
Processed file 5/34: batch_13_20241216_084114.json
Processed file 6/34: batch_14_20241216_090336.json
Processed file 7/34: batch_15_20241216_092550.json
Processed file 8/34: batch_16_20241216_094809.json
Processed file 9/34: batch_17_20241216_101045.json
Processed file 10/34: batch_18_20241216_103302.json
Processed file 11/34: batch_19_20241216_105518.json
Processed file 12/34: batch_1_20241216_013742.json
Processed file 13/34: batch_20_20241216_111736.json
Processed file 14/34: batch_21_20241216_113952.json
Processed file 15/34: batch_22_20241216_120212.json
Processed file 16/34: batch_23_20241216_122434.json
Processed file 17/34: batch_24_20241216_124655.json
Processed file 18/34: batch_25_20241216_130910.json
Processed file 19/3

Merging the validation data

Comparison between train and validation set

In [30]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from typing import Dict, List, Set, Tuple
from collections import defaultdict

class PatternValidator:
    def __init__(self, predictions_path: str, ground_truth_path: str):
        self.predictions = self._load_json(predictions_path)
        self.ground_truth = self._load_json(ground_truth_path)

        # Define fixed categories upfront
        self.main_categories = ['market_categories', 'elite_hierarchy', 'org_forms', 'international_markers']
        self.market_subcategories = [
            'energy_and_green_transition', 'welfare', 'maritime_and_shipping', 'agriculture_food',
            'union_labour', 'real_estate', 'finance', 'industry', 'tech', 'regulatory',
            'education', 'healthcare', 'politics', 'aviation', 'design', 'architecture',
            'hospitality', 'tourism', 'appliances'
        ]
        self.type_names = ['titles', 'orgs', 'keywords']

    def _load_json(self, path: str) -> Dict:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _get_safe_set(self, data: Dict, main_cat: str, subcat: str, type_name: str) -> Set[str]:
        """Safely get a set of items from nested dictionary structure"""
        try:
            if main_cat == 'market_categories':
                return set(data.get(main_cat, {}).get(subcat, {}).get(type_name, []))
            return set()
        except Exception as e:
            print(f"Warning: Error getting set for {main_cat}/{subcat}/{type_name}: {e}")
            return set()

    def calculate_metrics(self) -> Dict:
        """Calculate precision, recall, and F1 score for each category and type"""
        metrics = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

        # Only process market_categories as others are empty
        main_cat = 'market_categories'

        # Use fixed list of subcategories
        for subcat in self.market_subcategories:
            for type_name in self.type_names:
                # Get sets safely
                true_set = self._get_safe_set(self.ground_truth, main_cat, subcat, type_name)
                pred_set = self._get_safe_set(self.predictions, main_cat, subcat, type_name)

                # Calculate metrics
                tp = len(true_set & pred_set)
                fp = len(pred_set - true_set)
                fn = len(true_set - pred_set)

                precision = tp / (tp + fp) if (tp + fp) > 0 else 0
                recall = tp / (tp + fn) if (tp + fn) > 0 else 0
                f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

                metrics[main_cat][subcat][type_name] = {
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'true_positives': tp,
                    'false_positives': fp,
                    'false_negatives': fn,
                    'total_predicted': len(pred_set),
                    'total_actual': len(true_set),
                    'examples': {
                        'correct_matches': list(true_set & pred_set)[:5],
                        'false_positives': list(pred_set - true_set)[:5],
                        'missed_matches': list(true_set - pred_set)[:5]
                    }
                }

        return metrics

    def get_error_analysis(self) -> Dict:
        """Analyze patterns in false positives and false negatives"""
        error_analysis = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))

        # Only process market_categories
        main_cat = 'market_categories'

        for subcat in self.market_subcategories:
            for type_name in self.type_names:
                true_set = self._get_safe_set(self.ground_truth, main_cat, subcat, type_name)
                pred_set = self._get_safe_set(self.predictions, main_cat, subcat, type_name)

                # Analyze false positives
                for fp in (pred_set - true_set):
                    error_analysis[main_cat][subcat][type_name]['false_positives'].append({
                        'term': fp,
                        'similar_to': self._find_similar_terms(fp, true_set)
                    })

                # Analyze false negatives
                for fn in (true_set - pred_set):
                    error_analysis[main_cat][subcat][type_name]['false_negatives'].append({
                        'term': fn,
                        'similar_to': self._find_similar_terms(fn, pred_set)
                    })

        return error_analysis

    def _find_similar_terms(self, term: str, term_set: Set[str], threshold: float = 0.8) -> List[str]:
        """Find similar terms using string similarity"""
        from difflib import SequenceMatcher

        similar_terms = []
        for other_term in term_set:
            similarity = SequenceMatcher(None, term.lower(), other_term.lower()).ratio()
            if similarity >= threshold:
                similar_terms.append((other_term, similarity))

        return [term for term, _ in sorted(similar_terms, key=lambda x: x[1], reverse=True)[:3]]

    def calculate_overall_metrics(self, metrics: Dict) -> Dict:
        """Calculate overall metrics across all categories"""
        main_cat = 'market_categories'
        all_precision = []
        all_recall = []
        all_f1 = []

        for subcat in metrics[main_cat]:
            for type_name in metrics[main_cat][subcat]:
                type_metrics = metrics[main_cat][subcat][type_name]
                all_precision.append(type_metrics['precision'])
                all_recall.append(type_metrics['recall'])
                all_f1.append(type_metrics['f1'])

        return {
            'overall_precision': np.mean(all_precision) if all_precision else 0,
            'overall_recall': np.mean(all_recall) if all_recall else 0,
            'overall_f1': np.mean(all_f1) if all_f1 else 0
        }

    def save_validation_results(self, output_path: str):
        """Save validation results to JSON"""
        try:
            # Calculate metrics and error analysis
            print("Calculating metrics...")
            metrics = self.calculate_metrics()

            print("Performing error analysis...")
            error_analysis = self.get_error_analysis()

            # Calculate overall metrics for market categories
            overall_metrics = defaultdict(lambda: defaultdict(dict))
            main_cat = 'market_categories'

            for subcat in self.market_subcategories:
                if subcat in metrics[main_cat]:
                    subcat_precision = []
                    subcat_recall = []
                    subcat_f1 = []

                    for type_name in self.type_names:
                        if type_name in metrics[main_cat][subcat]:
                            subcat_precision.append(metrics[main_cat][subcat][type_name]['precision'])
                            subcat_recall.append(metrics[main_cat][subcat][type_name]['recall'])
                            subcat_f1.append(metrics[main_cat][subcat][type_name]['f1'])

                    overall_metrics[main_cat][subcat] = {
                        'precision': np.mean(subcat_precision) if subcat_precision else 0,
                        'recall': np.mean(subcat_recall) if subcat_recall else 0,
                        'f1': np.mean(subcat_f1) if subcat_f1 else 0
                    }

            # Calculate summary metrics
            summary = self.calculate_overall_metrics(metrics)

            # Prepare validation report
            validation_report = {
                'metrics': metrics,
                'error_analysis': error_analysis,
                'overall_metrics': overall_metrics,
                'summary': summary
            }

            # Save to file
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(validation_report, f, ensure_ascii=False, indent=2)

            # Print summary
            print("\nValidation Summary:")
            print(f"Overall Precision: {summary['overall_precision']:.3f}")
            print(f"Overall Recall: {summary['overall_recall']:.3f}")
            print(f"Overall F1 Score: {summary['overall_f1']:.3f}")

            # Print category-wise summary
            print("\nCategory-wise Summary:")
            for subcat in overall_metrics[main_cat]:
                metrics = overall_metrics[main_cat][subcat]
                print(f"\n{subcat}:")
                print(f"  Precision: {metrics['precision']:.3f}")
                print(f"  Recall: {metrics['recall']:.3f}")
                print(f"  F1: {metrics['f1']:.3f}")

        except Exception as e:
            print(f"Error in save_validation_results: {str(e)}")
            import traceback
            print(traceback.format_exc())
            raise

def main():
    # Specific paths to your files
    validation_results_path = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesValidation/merged_results_20241216_181011.json"
    training_results_path = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain/merged_results_20241215_203840.json"
    output_path = "/content/drive/MyDrive/NewsData/processed/validation_analysis_2024161224.json"

    print("Loading validation and training results...")
    print(f"Validation results path: {validation_results_path}")
    print(f"Training results path: {training_results_path}")

    try:
        validator = PatternValidator(
            predictions_path=training_results_path,
            ground_truth_path=validation_results_path
        )

        print("\nCalculating validation metrics...")
        validator.save_validation_results(output_path)

        print(f"\nValidation analysis saved to: {output_path}")

    except FileNotFoundError as e:
        print(f"Error: Could not find one of the input files: {e}")
    except Exception as e:
        print(f"Error during validation: {e}")
        import traceback
        print(traceback.format_exc())

if __name__ == "__main__":
    main()

Loading validation and training results...
Validation results path: /content/drive/MyDrive/NewsData/processed/ElitenessBatchesValidation/merged_results_20241216_181011.json
Training results path: /content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain/merged_results_20241215_203840.json

Calculating validation metrics...
Calculating metrics...
Performing error analysis...

Validation Summary:
Overall Precision: 0.652
Overall Recall: 0.908
Overall F1 Score: 0.756

Category-wise Summary:

energy_and_green_transition:
  Precision: 0.622
  Recall: 0.953
  F1: 0.752

welfare:
  Precision: 0.685
  Recall: 0.927
  F1: 0.787

maritime_and_shipping:
  Precision: 0.693
  Recall: 0.936
  F1: 0.794

agriculture_food:
  Precision: 0.639
  Recall: 0.846
  F1: 0.724

union_labour:
  Precision: 0.763
  Recall: 0.868
  F1: 0.812

real_estate:
  Precision: 0.749
  Recall: 0.965
  F1: 0.843

finance:
  Precision: 0.650
  Recall: 0.887
  F1: 0.750

industry:
  Precision: 0.628
  Recall: 0.915
  F1

Find embeddings for domains/market categories for 120 batches, each 50.000 sentences

In [16]:
#embedding for every category and subcategory
import json
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from google.cloud import storage
from tqdm import tqdm
import os
from collections import defaultdict

# Initialize BERT model
tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
model = AutoModel.from_pretrained("Maltehb/danish-bert-botxo")

def load_categories(file_path):
    """Load eliteness categories from local file"""
    with open(file_path, 'r', encoding='utf-8') as f:
        categories = json.load(f)
    print(f"Loaded categories from {file_path}")
    return categories

def get_word_embedding(word):
    """Get embedding for a single word/pointer"""
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        word_embedding = outputs.last_hidden_state[0][1].numpy()
    return word_embedding

def save_batch_results(batch_embeddings, batch_map, category_path):
    """Save batch results with error handling"""
    try:
        # Create DataFrame for this batch
        words = list(batch_embeddings.keys())
        embeddings_matrix = np.stack([batch_embeddings[word] for word in words])
        similarity_matrix = np.dot(embeddings_matrix, embeddings_matrix.T)

        results = {
            'pointer': words,
            'category': [batch_map[word] for word in words],
            'embedding': [batch_embeddings[word] for word in words]
        }
        batch_df = pd.DataFrame(results)

        # Save locally with timestamp (temporary)
        timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
        local_path_df = f'/content/batch_{category_path.replace("/", "_")}_{timestamp}_embeddings.pkl'
        local_path_matrix = f'/content/batch_{category_path.replace("/", "_")}_{timestamp}_similarity.npy'

        batch_df.to_pickle(local_path_df)
        np.save(local_path_matrix, similarity_matrix)

        # Save to GCS
        storage_client = storage.Client()
        bucket = storage_client.bucket("eliteness")

        for local_file, file_type in [
            (local_path_df, 'embeddings'),
            (local_path_matrix, 'similarity')
        ]:
            gcs_path = f'embeddings_analysis/market_categories/{category_path}/{timestamp}_{file_type}'
            blob = bucket.blob(gcs_path)
            blob.upload_from_filename(local_file)

        print(f"Saved {category_path} with {len(words)} pointers")

        # Clean up local files
        os.remove(local_path_df)
        os.remove(local_path_matrix)

    except Exception as e:
        print(f"Error saving {category_path}: {str(e)}")
        try:
            backup_path = f'/content/BACKUP_batch_{category_path.replace("/", "_")}.pkl'
            batch_df.to_pickle(backup_path)
            print(f"Saved backup to {backup_path}")
        except:
            print("Failed to save backup")

def check_processed_categories():
    """Check which categories have already been processed"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    processed = defaultdict(set)
    print("\nChecking existing files in bucket...")

    prefix = 'embeddings_analysis/market_categories/'
    for blob in bucket.list_blobs(prefix=prefix):
        path_parts = blob.name.split('/')
        if len(path_parts) >= 5:  # ensure we have enough path parts
            category = path_parts[2]  # e.g., energy_and_green_transition
            subcategory = path_parts[3]  # e.g., titles
            processed[category].add(subcategory)

    if processed:
        print("\nFound processed categories:")
        for category, subcats in processed.items():
            print(f"- {category}:")
            for subcat in sorted(subcats):
                print(f"  - {subcat}")
    else:
        print("\nNo processed categories found")

    return processed

def process_subcategory(category_name, subcat, items):
    """Process a subcategory"""
    batch_embeddings = {}
    batch_map = {}

    category_path = f"{category_name}/{subcat}"
    print(f"\nProcessing {category_path} ({len(items)} items)")

    for item in tqdm(items):
        try:
            embedding = get_word_embedding(item)
            batch_embeddings[item] = embedding
            batch_map[item] = category_path
        except Exception as e:
            print(f"Error processing {item}: {str(e)}")

    if batch_embeddings:
        save_batch_results(batch_embeddings, batch_map, category_path)

    return batch_embeddings, batch_map

def process_all_categories(categories_file, resume=True):
    """Process all categories"""
    categories = load_categories(categories_file)
    processed = check_processed_categories() if resume else defaultdict(set)

    all_embeddings = {}
    all_category_maps = {}

    # Process only market_categories
    if 'market_categories' in categories:
        for category_name, subcategories in categories['market_categories'].items():
            print(f"\nProcessing category: {category_name}")

            for subcat_name, items in subcategories.items():
                if isinstance(items, list) and items:
                    if subcat_name not in processed.get(category_name, set()):
                        print(f"\nProcessing subcategory: {subcat_name} ({len(items)} items)")
                        sub_embeddings, sub_map = process_subcategory(
                            category_name, subcat_name, items
                        )
                        if sub_embeddings and sub_map:
                            all_embeddings.update(sub_embeddings)
                            all_category_maps.update(sub_map)
                    else:
                        print(f"\nSkipping already processed: {category_name}/{subcat_name}")

                    # Clear memory
                    torch.cuda.empty_cache() if torch.cuda.is_available() else None

    print("\nProcessing complete!")
    return all_embeddings, all_category_maps

def main():
    # Define paths
    categories_file = "/content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain/merged_results_20241215_203840.json"

    print("Starting batch processing of categories...")
    print("Checking for existing progress...")

    all_embeddings, all_category_maps = process_all_categories(categories_file, resume=True)
    return all_embeddings, all_category_maps

if __name__ == "__main__":
    all_embeddings, all_category_maps = main()

Starting batch processing of categories...
Checking for existing progress...
Loaded categories from /content/drive/MyDrive/NewsData/processed/ElitenessBatchesTrain/merged_results_20241215_203840.json

Checking existing files in bucket...

Found processed categories:
- agriculture_food:
  - keywords
  - orgs
  - titles
- appliances:
  - keywords
  - orgs
  - titles
- architecture:
  - keywords
  - orgs
  - titles
- aviation:
  - keywords
  - orgs
  - titles
- design:
  - keywords
  - orgs
  - titles
- education:
  - keywords
  - orgs
  - titles
- energy_and_green_transition:
  - keywords
  - orgs
  - titles
- finance:
  - keywords
  - orgs
  - titles
- healthcare:
  - keywords
  - orgs
  - titles
- hospitality:
  - keywords
  - orgs
  - titles
- industry:
  - keywords
  - orgs
  - titles
- maritime_and_shipping:
  - keywords
  - orgs
  - titles
- politics:
  - keywords
  - orgs
  - titles
- real_estate:
  - keywords
  - orgs
  - titles
- regulatory:
  - keywords
  - orgs
  - titles
- te

Embeddings for hiearchy, international, forms markers separately (2021 data set)

In [17]:
#embeddings, the rest
import json
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from google.cloud import storage
from tqdm import tqdm
import os
from collections import defaultdict

# Initialize BERT model
tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
model = AutoModel.from_pretrained("Maltehb/danish-bert-botxo")

def load_categories(file_path):
    """Load eliteness categories from local file"""
    with open(file_path, 'r', encoding='utf-8') as f:
        categories = json.load(f)
    print(f"Loaded categories from {file_path}")
    return categories

def get_word_embedding(word):
    """Get embedding for a single word/pointer"""
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        word_embedding = outputs.last_hidden_state[0][1].numpy()
    return word_embedding

def save_batch_results(batch_embeddings, batch_map, category_path):
    """Save batch results with error handling"""
    try:
        # Create DataFrame for this batch
        words = list(batch_embeddings.keys())
        embeddings_matrix = np.stack([batch_embeddings[word] for word in words])
        similarity_matrix = np.dot(embeddings_matrix, embeddings_matrix.T)

        results = {
            'pointer': words,
            'category': [batch_map[word] for word in words],
            'embedding': [batch_embeddings[word] for word in words]
        }
        batch_df = pd.DataFrame(results)

        # Save locally with timestamp (temporary)
        timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
        local_path_df = f'/content/batch_{category_path.replace("/", "_")}_{timestamp}_embeddings.pkl'
        local_path_matrix = f'/content/batch_{category_path.replace("/", "_")}_{timestamp}_similarity.npy'

        batch_df.to_pickle(local_path_df)
        np.save(local_path_matrix, similarity_matrix)

        # Save to GCS
        storage_client = storage.Client()
        bucket = storage_client.bucket("eliteness")

        for local_file, file_type in [
            (local_path_df, 'embeddings'),
            (local_path_matrix, 'similarity')
        ]:
            gcs_path = f'embeddings_analysis/{category_path}/{timestamp}_{file_type}'
            #gcs_path = f'embeddings_analysis/market_categories/{category_path}/{timestamp}_{file_type}'
            blob = bucket.blob(gcs_path)
            blob.upload_from_filename(local_file)

        print(f"Saved {category_path} with {len(words)} pointers")

        # Clean up local files
        os.remove(local_path_df)
        os.remove(local_path_matrix)

    except Exception as e:
        print(f"Error saving {category_path}: {str(e)}")
        try:
            backup_path = f'/content/BACKUP_batch_{category_path.replace("/", "_")}.pkl'
            batch_df.to_pickle(backup_path)
            print(f"Saved backup to {backup_path}")
        except:
            print("Failed to save backup")

def check_processed_categories():
    """Check which categories have already been processed"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    processed = defaultdict(set)
    print("\nChecking existing files in bucket...")

    #prefix = 'embeddings_analysis/market_categories/'
    prefix = 'embeddings_analysis/'
    for blob in bucket.list_blobs(prefix=prefix):
        path_parts = blob.name.split('/')
        if len(path_parts) >= 5:  # ensure we have enough path parts
            category = path_parts[2]  # e.g., energy_and_green_transition
            subcategory = path_parts[3]  # e.g., titles
            processed[category].add(subcategory)

    if processed:
        print("\nFound processed categories:")
        for category, subcats in processed.items():
            print(f"- {category}:")
            for subcat in sorted(subcats):
                print(f"  - {subcat}")
    else:
        print("\nNo processed categories found")

    return processed

def process_subcategory(category_name, subcat, items):
    """Process a subcategory"""
    batch_embeddings = {}
    batch_map = {}

    category_path = f"{category_name}/{subcat}"
    print(f"\nProcessing {category_path} ({len(items)} items)")

    for item in tqdm(items):
        try:
            embedding = get_word_embedding(item)
            batch_embeddings[item] = embedding
            batch_map[item] = category_path
        except Exception as e:
            print(f"Error processing {item}: {str(e)}")

    if batch_embeddings:
        save_batch_results(batch_embeddings, batch_map, category_path)

    return batch_embeddings, batch_map

def process_additional_categories(categories_file, resume=True):
    """Process international, hierarchy, and organization categories"""
    categories = load_categories(categories_file)
    processed = check_processed_categories() if resume else defaultdict(set)

    all_embeddings = {}
    all_category_maps = {}

    # Process 'organization' categories
    if 'organization' in categories:
        org_categories = categories['organization']
        for subcat_name, items in org_categories.items():
            # Skip 'entities' subcategory
            if subcat_name != 'entities' and isinstance(items, list):
                gcs_path = f'organization/{subcat_name}'
                if subcat_name not in processed.get('organization', set()):
                    print(f"\nProcessing organization subcategory: {subcat_name} ({len(items)} items)")
                    sub_embeddings, sub_map = process_subcategory(
                        'organization', subcat_name, items
                    )
                    if sub_embeddings and sub_map:
                        all_embeddings.update(sub_embeddings)
                        all_category_maps.update(sub_map)
                else:
                    print(f"\nSkipping already processed: organization/{subcat_name}")

                torch.cuda.empty_cache() if torch.cuda.is_available() else None

    # Category mappings for other categories
    category_mappings = {
        'international_eu': 'international/eu',
        'international_nordic': 'international/nordic',
        'international_regional_danish': 'international/regional_danish',
        'hierarchy_expert_level': 'hierarchy/expert_level',
        'hierarchy_senior_level': 'hierarchy/senior_level',
        'hierarchy_top_level': 'hierarchy/top_level'
    }

    # Process other categories
    for category, gcs_path in category_mappings.items():
        if category in categories:
            print(f"\nProcessing category: {category} (Path: {gcs_path})")

            subcats_to_process = categories[category]
            for subcat_name, items in subcats_to_process.items():
                if isinstance(items, list) and items:
                    if subcat_name not in processed.get(gcs_path, set()):
                        print(f"\nProcessing subcategory: {subcat_name} ({len(items)} items)")
                        sub_embeddings, sub_map = process_subcategory(
                            gcs_path, subcat_name, items
                        )
                        if sub_embeddings and sub_map:
                            all_embeddings.update(sub_embeddings)
                            all_category_maps.update(sub_map)
                    else:
                        print(f"\nSkipping already processed: {gcs_path}/{subcat_name}")

                    torch.cuda.empty_cache() if torch.cuda.is_available() else None

    print("\nProcessing complete!")
    return all_embeddings, all_category_maps

def main():
    # Define paths
    categories_file = "/content/drive/MyDrive/NewsData/processed/Elitenessbatches091224/merged_results_20241210_110522.json"

    print("Starting batch processing of categories...")
    print("Checking for existing progress...")

    all_embeddings, all_category_maps = process_additional_categories(categories_file, resume=True)
    #all_embeddings, all_category_maps = process_all_categories(categories_file, resume=True)
    return all_embeddings, all_category_maps

if __name__ == "__main__":
    all_embeddings, all_category_maps = main()

Starting batch processing of categories...
Checking for existing progress...
Loaded categories from /content/drive/MyDrive/NewsData/processed/Elitenessbatches091224/merged_results_20241210_110522.json

Checking existing files in bucket...

Found processed categories:
- expert_level:
  - indicators
  - titles
- senior_level:
  - indicators
  - titles
- top_level:
  - indicators
  - titles
- eu:
  - orgs
  - titles
- nordic:
  - orgs
  - titles
- regional_danish:
  - orgs
  - titles
- agriculture_food:
  - keywords
  - orgs
  - titles
- appliances:
  - keywords
  - orgs
  - titles
- architecture:
  - keywords
  - orgs
  - titles
- aviation:
  - keywords
  - orgs
  - titles
- design:
  - keywords
  - orgs
  - titles
- education:
  - keywords
  - orgs
  - titles
- energy_and_green_transition:
  - keywords
  - orgs
  - titles
- finance:
  - keywords
  - orgs
  - titles
- healthcare:
  - keywords
  - orgs
  - titles
- hospitality:
  - keywords
  - orgs
  - titles
- industry:
  - keywords
  -

100%|██████████| 3278/3278 [04:25<00:00, 12.34it/s]


Saved organization/public with 3278 pointers

Processing organization subcategory: private (1600 items)

Processing organization/private (1600 items)


100%|██████████| 1600/1600 [02:06<00:00, 12.62it/s]


Saved organization/private with 1600 pointers

Processing organization subcategory: associations (2939 items)

Processing organization/associations (2939 items)


100%|██████████| 2939/2939 [03:56<00:00, 12.43it/s]


Saved organization/associations with 2939 pointers

Processing category: international_eu (Path: international/eu)

Processing subcategory: titles (3 items)

Processing international/eu/titles (3 items)


100%|██████████| 3/3 [00:00<00:00, 12.38it/s]


Saved international/eu/titles with 3 pointers

Processing subcategory: orgs (2 items)

Processing international/eu/orgs (2 items)


100%|██████████| 2/2 [00:00<00:00, 13.64it/s]


Saved international/eu/orgs with 2 pointers

Processing category: international_nordic (Path: international/nordic)

Processing subcategory: titles (3 items)

Processing international/nordic/titles (3 items)


100%|██████████| 3/3 [00:00<00:00, 13.37it/s]


Saved international/nordic/titles with 3 pointers

Processing subcategory: orgs (4 items)

Processing international/nordic/orgs (4 items)


100%|██████████| 4/4 [00:00<00:00, 13.44it/s]


Saved international/nordic/orgs with 4 pointers

Processing category: international_regional_danish (Path: international/regional_danish)

Processing subcategory: orgs (15 items)

Processing international/regional_danish/orgs (15 items)


100%|██████████| 15/15 [00:01<00:00, 11.44it/s]


Saved international/regional_danish/orgs with 15 pointers

Processing subcategory: titles (11 items)

Processing international/regional_danish/titles (11 items)


100%|██████████| 11/11 [00:01<00:00, 10.66it/s]


Saved international/regional_danish/titles with 11 pointers

Processing category: hierarchy_expert_level (Path: hierarchy/expert_level)

Processing subcategory: titles (10 items)

Processing hierarchy/expert_level/titles (10 items)


100%|██████████| 10/10 [00:00<00:00, 13.49it/s]


Saved hierarchy/expert_level/titles with 10 pointers

Processing subcategory: indicators (14 items)

Processing hierarchy/expert_level/indicators (14 items)


100%|██████████| 14/14 [00:01<00:00, 13.35it/s]


Saved hierarchy/expert_level/indicators with 14 pointers

Processing category: hierarchy_senior_level (Path: hierarchy/senior_level)

Processing subcategory: indicators (16 items)

Processing hierarchy/senior_level/indicators (16 items)


100%|██████████| 16/16 [00:01<00:00, 13.64it/s]


Saved hierarchy/senior_level/indicators with 16 pointers

Processing subcategory: titles (12 items)

Processing hierarchy/senior_level/titles (12 items)


100%|██████████| 12/12 [00:00<00:00, 13.93it/s]


Saved hierarchy/senior_level/titles with 12 pointers

Processing category: hierarchy_top_level (Path: hierarchy/top_level)

Processing subcategory: indicators (21 items)

Processing hierarchy/top_level/indicators (21 items)


100%|██████████| 21/21 [00:01<00:00, 13.62it/s]


Saved hierarchy/top_level/indicators with 21 pointers

Processing subcategory: titles (23 items)

Processing hierarchy/top_level/titles (23 items)


100%|██████████| 23/23 [00:01<00:00, 13.57it/s]


Saved hierarchy/top_level/titles with 23 pointers

Processing complete!


Classifier trainer: eu, expert_level etc. as categories, not subcategories. keywords, titles, orgs as values, not subcategories.

In [30]:
#Classifier trainer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from google.cloud import storage
from tqdm import tqdm
import pickle
from collections import defaultdict

def combine_batch_embeddings():
    """Load and combine batch embeddings with memory management and duplicate prevention"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Track processed files and unique embeddings
    processed_files = set()
    unique_pointers = set()
    all_embeddings = []
    total_count = 0

    print("Scanning for embedding files...")
    # Only look for files in the new structure
    valid_prefixes = [
        'embeddings_analysis/market_categories/',
        'embeddings_analysis/hierarchy/',
        'embeddings_analysis/international/',
        'embeddings_analysis/organization/'
    ]

    embedding_files = []
    for prefix in valid_prefixes:
        files = [
            blob.name for blob in bucket.list_blobs(prefix=prefix)
            if '_embeddings' in blob.name and '_similarity' not in blob.name
        ]
        embedding_files.extend(files)

    print(f"Found {len(embedding_files)} embedding files")
    batch_size = 10  # Process 10 files at a time

    for i in range(0, len(embedding_files), batch_size):
        batch_files = embedding_files[i:i + batch_size]
        batch_embeddings = []

        print(f"\nProcessing batch {i//batch_size + 1} of {(len(embedding_files) + batch_size - 1)//batch_size}")

        for file_name in batch_files:
            try:
                if file_name in processed_files:
                    continue

                blob = bucket.blob(file_name)
                blob.download_to_filename('/content/temp_batch.pkl')
                batch_df = pd.read_pickle('/content/temp_batch.pkl')

                # Check for duplicate pointers
                new_pointers = set(batch_df['pointer'])
                duplicate_count = len(new_pointers & unique_pointers)
                if duplicate_count > 0:
                    print(f"Found {duplicate_count} duplicates in {file_name}")
                    batch_df = batch_df[~batch_df['pointer'].isin(unique_pointers)]

                if len(batch_df) == 0:
                    continue

                # Extract categories from file path
                path_parts = file_name.split('/')
                main_category = path_parts[2]  # market_categories, hierarchy, international, organization
                sub_category = path_parts[3]   # specific subcategory

                # Handle sub_type for market_categories
                if main_category == 'market_categories' and len(path_parts) > 4:
                    sub_type = path_parts[4].split('_')[0]  # Get the type (titles, orgs, keywords)
                else:
                    sub_type = 'default'

                # Add category information
                batch_df['main_category'] = main_category
                batch_df['sub_category'] = sub_category
                batch_df['sub_type'] = sub_type

                batch_embeddings.append(batch_df)
                processed_files.add(file_name)

                print(f"Added {len(batch_df)} unique embeddings from {main_category}/{sub_category}")
                total_count += len(batch_df)

            except Exception as e:
                print(f"Error processing {file_name}: {str(e)}")
                continue

        # Combine batch and clear memory
        if batch_embeddings:
            combined_batch = pd.concat(batch_embeddings, ignore_index=True)
            all_embeddings.append(combined_batch)

        # Clear memory
        del batch_embeddings
        import gc
        gc.collect()

    # Final combination
    if not all_embeddings:
        raise ValueError("No embedding files were successfully loaded!")

    final_df = pd.concat(all_embeddings, ignore_index=True)
    print(f"\nFinal Statistics:")
    print(f"Total unique embeddings: {len(final_df)}")
    print("\nMain categories:", sorted(final_df['main_category'].unique()))
    print("\nSub-categories by main category:")
    for main_cat in sorted(final_df['main_category'].unique()):
        subcats = sorted(final_df[final_df['main_category'] == main_cat]['sub_category'].unique())
        print(f"\n{main_cat}:")
        for subcat in subcats:
            count = len(final_df[(final_df['main_category'] == main_cat) &
                               (final_df['sub_category'] == subcat)])
            print(f"  - {subcat}: {count} embeddings")

    return final_df

def train_hierarchical_classifier(data_df, batch_size=10000):
    """Train classifier with memory-efficient batching"""
    # Convert embeddings to numpy array in batches
    print("Converting embeddings to numpy array...")
    X_batches = []
    for i in range(0, len(data_df), batch_size):
        batch = data_df.iloc[i:i + batch_size]
        X_batch = np.stack(batch['embedding'].values)
        X_batches.append(X_batch)

    X = np.concatenate(X_batches)
    y_main = data_df['main_category'].values

    print(f"Prepared {len(X)} samples for training")

    # Continue with classifier training as before...
    # [rest of the training code remains the same]

def train_hierarchical_classifier(data_df, batch_size=10000, min_samples=2):
    """Train classifier with memory-efficient batching and minimum sample filtering"""
    print("\nPreparing training data...")

    # Filter categories with too few samples
    category_counts = data_df['main_category'].value_counts()
    valid_categories = category_counts[category_counts >= min_samples].index
    print(f"\nFound {len(valid_categories)} categories with {min_samples}+ samples")
    print("Removing categories:", set(data_df['main_category'].unique()) - set(valid_categories))

    filtered_df = data_df[data_df['main_category'].isin(valid_categories)].copy()

    # Convert embeddings to numpy array in batches
    print("\nConverting embeddings to numpy array...")
    X_batches = []
    for i in range(0, len(filtered_df), batch_size):
        batch = filtered_df.iloc[i:i + batch_size]
        X_batch = np.stack(batch['embedding'].values)
        X_batches.append(X_batch)

    X = np.concatenate(X_batches)
    y_main = filtered_df['main_category'].values

    print(f"\nPrepared {len(X)} samples for training")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_main, test_size=0.2, random_state=42, stratify=y_main
    )

    # Train main classifier
    print("\nTraining main category classifier...")
    main_classifier = LogisticRegression(
        multi_class='multinomial',
        max_iter=1000,
        class_weight='balanced'
    )
    main_classifier.fit(X_train, y_train)

    # Evaluate main classifier
    print("\nMain Category Classification:")
    print("Cross-validation scores:")
    cv_scores = cross_val_score(main_classifier, X, y_main, cv=5)
    print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    y_pred = main_classifier.predict(X_test)
    print("\nTest set performance:")
    print(classification_report(y_test, y_pred))

    # Train subcategory classifiers
    sub_classifiers = {}
    for main_cat in valid_categories:
        cat_mask = filtered_df['main_category'] == main_cat
        cat_data = filtered_df[cat_mask]

        # Check if subcategories have enough samples
        subcat_counts = cat_data['sub_category'].value_counts()
        valid_subcats = subcat_counts[subcat_counts >= min_samples].index

        if len(valid_subcats) > 1:  # Need at least 2 valid subcategories
            print(f"\nTraining {main_cat} subcategory classifier...")

            sub_X = np.stack(cat_data[cat_data['sub_category'].isin(valid_subcats)]['embedding'].values)
            sub_y = cat_data[cat_data['sub_category'].isin(valid_subcats)]['sub_category']

            try:
                sub_clf = LogisticRegression(
                    multi_class='multinomial',
                    max_iter=1000,
                    class_weight='balanced'
                )
                sub_clf.fit(sub_X, sub_y)
                sub_classifiers[main_cat] = sub_clf

                # Evaluate if enough samples
                if len(sub_y) >= 10:  # Minimum for 5-fold CV
                    cv_scores = cross_val_score(sub_clf, sub_X, sub_y, cv=min(5, len(sub_y)))
                    print(f"Mean CV score: {cv_scores.mean():.3f}")
            except Exception as e:
                print(f"Error training subcategory classifier for {main_cat}: {str(e)}")
                continue

    return {
        'main_classifier': main_classifier,
        'sub_classifiers': sub_classifiers,
        'valid_categories': valid_categories
    }

def main():
    # Load embeddings
    print("Loading and combining embeddings...")
    data_df = combine_batch_embeddings()

    # Train classifiers
    classifiers = train_hierarchical_classifier(data_df)

    # Save classifiers
    save_classifiers(classifiers)

    return classifiers, data_df

def save_classifiers(classifiers, prefix='eliteness'):
    """Save trained classifiers to GCS"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Save locally first
    with open(f'/content/{prefix}_classifiers.pkl', 'wb') as f:
        pickle.dump(classifiers, f)

    # Upload to GCS
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    blob = bucket.blob(f'classifiers/{prefix}_classifier_{timestamp}.pkl')
    blob.upload_from_filename(f'/content/{prefix}_classifiers.pkl')
    print(f"Saved classifiers to GCS")

def predict_eliteness(text, classifiers, tokenizer, model):
    """Predict eliteness categories for new text"""
    # Get embedding
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[0][1].numpy()

    # Predict main category
    main_category = classifiers['main_classifier'].predict([embedding])[0]

    # Predict subcategory if available
    sub_category = None
    if main_category in classifiers['sub_classifiers']:
        sub_category = classifiers['sub_classifiers'][main_category].predict([embedding])[0]

    return {
        'text': text,
        'main_category': main_category,
        'sub_category': sub_category
    }

def main():
    # Load and combine embeddings
    data_df = combine_batch_embeddings()

    # Train classifiers
    classifiers = train_hierarchical_classifier(data_df)

    # Save classifiers
    save_classifiers(classifiers)

    return classifiers, data_df

if __name__ == "__main__":
    classifiers, data_df = main()

Scanning for embedding files...
Found 108 embedding files

Processing batch 1 of 11
Added 80 unique embeddings from agriculture_food/keywords
Added 52 unique embeddings from agriculture_food/orgs
Added 14 unique embeddings from agriculture_food/titles
Added 71 unique embeddings from appliances/keywords
Added 37 unique embeddings from appliances/orgs
Added 27 unique embeddings from appliances/titles
Added 54 unique embeddings from architecture/keywords
Added 40 unique embeddings from architecture/orgs
Added 39 unique embeddings from architecture/titles
Added 78 unique embeddings from aviation/keywords

Processing batch 2 of 11
Added 25 unique embeddings from aviation/orgs
Added 21 unique embeddings from aviation/titles
Added 90 unique embeddings from design/keywords
Added 28 unique embeddings from design/orgs
Added 28 unique embeddings from design/titles
Added 66 unique embeddings from education/keywords
Added 101 unique embeddings from education/orgs
Added 62 unique embeddings from edu




Main Category Classification:
Cross-validation scores:




Mean CV score: 0.898 (+/- 0.027)

Test set performance:
                             precision    recall  f1-score   support

           agriculture_food       0.87      0.93      0.90        29
                 appliances       0.74      0.52      0.61        27
               architecture       0.68      0.70      0.69        27
               associations       0.95      0.96      0.96       588
                   aviation       0.95      0.76      0.84        25
                     design       0.65      0.69      0.67        29
                  education       0.96      0.93      0.95        46
energy_and_green_transition       0.89      0.81      0.85        42
                         eu       0.80      1.00      0.89         4
               expert_level       0.95      1.00      0.97        19
                    finance       0.83      0.81      0.82        48
                 healthcare       0.98      0.98      0.98        44
                hospitality       0.89      0.



Mean CV score: 0.842

Training education subcategory classifier...




Mean CV score: 0.843

Training tech subcategory classifier...




Mean CV score: 0.520

Training healthcare subcategory classifier...




Mean CV score: 0.710

Training welfare subcategory classifier...




Mean CV score: 0.700

Training energy_and_green_transition subcategory classifier...




Mean CV score: 0.635

Training politics subcategory classifier...




Mean CV score: 0.957

Training regulatory subcategory classifier...




Mean CV score: 0.793

Training industry subcategory classifier...




Mean CV score: 0.578

Training top_level subcategory classifier...
Mean CV score: 1.000

Training hospitality subcategory classifier...




Mean CV score: 0.698

Training agriculture_food subcategory classifier...




Mean CV score: 0.548

Training design subcategory classifier...




Mean CV score: 0.780

Training maritime_and_shipping subcategory classifier...




Mean CV score: 0.611

Training appliances subcategory classifier...




Mean CV score: 0.830

Training architecture subcategory classifier...




Mean CV score: 0.699

Training real_estate subcategory classifier...




Mean CV score: 0.700

Training aviation subcategory classifier...




Mean CV score: 0.750

Training tourism subcategory classifier...




Mean CV score: 0.661

Training senior_level subcategory classifier...
Mean CV score: 1.000

Training regional_danish subcategory classifier...




Mean CV score: 1.000

Training union_labour subcategory classifier...




Mean CV score: 0.772

Training expert_level subcategory classifier...
Mean CV score: 0.875

Training nordic subcategory classifier...




Mean CV score: 1.000

Training eu subcategory classifier...
Mean CV score: 1.000




Saved classifiers to GCS


Model testing on data set from Google News API

In [34]:
#nyt data fra google api
from transformers import pipeline, AutoTokenizer, AutoModel
from collections import defaultdict
import spacy
import pandas as pd
import torch
import logging
from google.cloud import storage
import pickle
import numpy as np
from typing import Dict, Tuple, List


def load_classifier():
    """Load trained classifier from GCS"""
    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket("eliteness")

        blobs = list(bucket.list_blobs(prefix='classifiers/'))
        latest_classifier = max(blobs, key=lambda x: x.name)

        latest_classifier.download_to_filename('/content/classifier_20241215_214712.pkl')
        with open('/content/classifier_20241215_214712.pkl', 'rb') as f:
            classifiers = pickle.load(f)

        print(f"Loaded classifier from {latest_classifier.name}")
        return classifiers

    except Exception as e:
        logging.error(f"Error loading classifier: {str(e)}")
        raise

class NewsAnalyzer:
    def __init__(self):
        self.classifiers = load_classifier()
        self.tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
        self.model = AutoModel.from_pretrained("Maltehb/danish-bert-botxo")
        self.nlp = spacy.load('da_core_news_lg')
        self.sentiment_pipeline = pipeline("sentiment-analysis",
                                        model="DGurgurov/xlm-r_danish_sentiment")

    def clean_text(self, text: str) -> str:
        """Clean text by fixing encoding issues and removing truncation markers"""
        if not isinstance(text, str):
            return ""

        # Fix common encoding issues
        replacements = {
            'Ã¸': 'ø',
            'Ã¦': 'æ',
            'Ã¥': 'å',
            'Ã˜': 'Ø',
            'Ã†': 'Æ',
            'Ã…': 'Å'
        }

        for old, new in replacements.items():
            text = text.replace(old, new)

        # Remove truncation marker and everything after it
        if '[+' in text and 'chars]' in text:
            text = text.split('[+')[0].strip()

        return text

    def get_embedding(self, text: str) -> np.ndarray:
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
            embedding = outputs.last_hidden_state[0][1].numpy()
        return embedding

    def classify_text(self, text: str) -> Dict:
        """Classify a piece of text using the trained classifier"""
        try:
            if not text:
                return {'category': 'unknown', 'confidence': 0.0}

            embedding = self.get_embedding(text)
            main_classifier = self.classifiers['main_classifier']
            pred_class = main_classifier.predict([embedding])[0]
            confidence = np.max(main_classifier.predict_proba([embedding])[0])

            return {'category': pred_class, 'confidence': confidence}
        except Exception as e:
            logging.error(f"Error in text classification: {str(e)}")
            return {'category': 'unknown', 'confidence': 0.0}

    def analyze_sentiment(self, text: str) -> Dict:
        """Analyze sentiment of cleaned text"""
        try:
            if not text:
                return {'label': 'L', 'confidence': 0.0}

            cleaned_text = self.clean_text(text)
            result = self.sentiment_pipeline(cleaned_text[:512])[0]

            label_map = {
                'LABEL_1': 'P',
                'LABEL_0': 'N'
            }
            return {
                'label': label_map.get(result['label'], 'L'),
                'confidence': result['score']
            }
        except Exception as e:
            logging.error(f"Error in sentiment analysis: {str(e)}")
            return {'label': 'L', 'confidence': 0.0}

    def get_combined_sentiment(self, sentiments: List[Dict]) -> Dict:
        """Calculate combined sentiment from multiple analyses"""
        valid_sentiments = [s for s in sentiments if s['label'] != 'L']
        if not valid_sentiments:
            return {'label': 'L', 'confidence': 0.0}

        # Weight sentiments by confidence
        weighted_sum = sum(1 if s['label'] == 'P' else -1 * s['confidence']
                         for s in valid_sentiments)
        avg_confidence = sum(s['confidence'] for s in valid_sentiments) / len(valid_sentiments)

        return {
            'label': 'P' if weighted_sum > 0 else 'N',
            'confidence': avg_confidence
        }

    def analyze_article(self, title: str, description: str, content: str) -> Dict:
        """Analyze full article including title, description, and content"""
        # Clean texts
        clean_title = self.clean_text(title)
        clean_desc = self.clean_text(description)
        clean_content = self.clean_text(content)

        # Split content into sentences
        doc = self.nlp(clean_content)
        sentences = [sent.text for sent in doc.sents]

        # Classify each sentence
        category_counts = defaultdict(int)

        # Classify title
        title_class = self.classify_text(clean_title)
        category_counts[title_class['category']] += 1

        # Classify each sentence
        for sentence in sentences:
            classification = self.classify_text(sentence)
            category_counts[classification['category']] += 1

        # Get primary categories based on counts
        primary_category = max(category_counts.items(), key=lambda x: x[1])

        # Get sentiment for each part
        sentiments = {
            'title': self.analyze_sentiment(clean_title),
            'description': self.analyze_sentiment(clean_desc),
            'content': self.analyze_sentiment(clean_content)
        }

        # Calculate combined sentiment
        combined_sentiment = self.get_combined_sentiment(list(sentiments.values()))
        sentiments['combined'] = combined_sentiment

        return {
            'category_counts': dict(category_counts),
            'primary_category': primary_category[0],
            'category_frequency': primary_category[1],
            'sentiment': sentiments
        }

def analyze_news_dataset(file_path: str) -> Dict:
    analyzer = NewsAnalyzer()
    results = {}

    df = pd.read_csv(file_path)
    print(f"Analyzing {len(df)} articles...")

    for idx, row in df.iterrows():
        print(f"Processing article {idx + 1}/{len(df)}")
        results[idx] = {
            'source': row['source'],
            'title': row['title'],
            'publishedAt': row['publishedAt'],
            'analysis': analyzer.analyze_article(
                row['title'],
                row['description'],
                row['content']
            )
        }

    return results

def display_results(results: Dict):
    for article_id, data in results.items():
        print("\n" + "="*50)
        print(f"Source: {data['source']}")
        print(f"Title: {data['title']}")
        print(f"Date: {data['publishedAt']}")

        analysis = data['analysis']
        print(f"\nPrimary Category: {analysis['primary_category']} "
              f"(frequency: {analysis['category_frequency']})")

        print("\nCategory Distribution:")
        for category, count in sorted(analysis['category_counts'].items(),
                                    key=lambda x: x[1], reverse=True):
            print(f"{category}: {count}")

        print("\nSentiment Analysis:")
        sentiments = analysis['sentiment']
        for part, sent in sentiments.items():
            print(f"{part.capitalize()}: {sent['label']} "
                  f"(confidence: {sent['confidence']:.2f})")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    results = analyze_news_dataset('/content/drive/MyDrive/NewsMarketAnalysis/danish_news_multiple_topics.csv')
    display_results(results)

Loaded classifier from classifiers/eliteness_classifier_20241215_214712.pkl


config.json:   0%|          | 0.00/781 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Primary Category: private (frequency: 2)

Category Distribution:
private: 2
public: 1
associations: 1

Sentiment Analysis:
Title: N (confidence: 0.99)
Description: N (confidence: 0.99)
Content: P (confidence: 0.98)
Combined: N (confidence: 0.98)

Source: {'id': None, 'name': 'Politiken.dk'}
Title: Danske medier: Politisk handling er nødvendig. Sådan kan vi sammen tæmme techgiganterne
Date: 2024-12-12T10:07:01Z

Primary Category: private (frequency: 2)

Category Distribution:
private: 2
public: 1
associations: 1

Sentiment Analysis:
Title: N (confidence: 0.99)
Description: N (confidence: 0.99)
Content: P (confidence: 0.98)
Combined: N (confidence: 0.98)

Source: {'id': None, 'name': 'Politiken.dk'}
Title: Danske Medier: Politisk handling er nødvendig. Sådan kan vi sammen tæmme techgigantern
Date: 2024-12-12T10:07:01Z

Primary Category: private (frequency: 2)

Category Distribution:
private: 2
public: 1
associations: 1

Sen

Weigting the classifier

In [18]:
#Classifier trainer with weights and market boosting
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from google.cloud import storage
import pickle
from collections import defaultdict

def combine_batch_embeddings():
    """Load and combine batch embeddings with memory management and duplicate prevention."""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Track processed files and unique embeddings
    processed_files = set()
    unique_pointers = set()
    all_embeddings = []
    total_count = 0

    print("Scanning for embedding files...")
    valid_prefixes = [
        'embeddings_analysis/market_categories/',
        'embeddings_analysis/hierarchy/',
        'embeddings_analysis/international/',
        'embeddings_analysis/organization/'
    ]

    embedding_files = []
    for prefix in valid_prefixes:
        files = [
            blob.name for blob in bucket.list_blobs(prefix=prefix)
            if '_embeddings' in blob.name and '_similarity' not in blob.name
        ]
        embedding_files.extend(files)

    print(f"Found {len(embedding_files)} embedding files")
    batch_size = 10  # Process 10 files at a time

    for i in range(0, len(embedding_files), batch_size):
        batch_files = embedding_files[i:i + batch_size]
        batch_embeddings = []

        print(f"\nProcessing batch {i // batch_size + 1} of {(len(embedding_files) + batch_size - 1) // batch_size}")
        for file_name in batch_files:
            try:
                if file_name in processed_files:
                    continue

                blob = bucket.blob(file_name)
                blob.download_to_filename('/content/temp_batch.pkl')
                batch_df = pd.read_pickle('/content/temp_batch.pkl')

                # Remove duplicates
                new_pointers = set(batch_df['pointer'])
                batch_df = batch_df[~batch_df['pointer'].isin(unique_pointers)]
                unique_pointers.update(new_pointers)

                # Extract category information
                path_parts = file_name.split('/')
                main_category = path_parts[2]
                sub_category = path_parts[3]

                batch_df['main_category'] = main_category
                batch_df['sub_category'] = sub_category

                batch_embeddings.append(batch_df)
                processed_files.add(file_name)

                print(f"Added {len(batch_df)} embeddings from {main_category}/{sub_category}")
                total_count += len(batch_df)

            except Exception as e:
                print(f"Error processing {file_name}: {str(e)}")
                continue

        # Combine batch and clear memory
        if batch_embeddings:
            combined_batch = pd.concat(batch_embeddings, ignore_index=True)
            all_embeddings.append(combined_batch)

    if not all_embeddings:
        raise ValueError("No embedding files were successfully loaded!")

    final_df = pd.concat(all_embeddings, ignore_index=True)
    print(f"\nFinal combined embeddings: {len(final_df)}")
    return final_df

def train_hierarchical_classifier(data_df, batch_size=10000, min_samples=2):
    """Train classifier with priority for market categories"""
    print("\nPreparing training data...")

    # Define market/domain categories
    market_categories = [
        'energy_and_green_transition', 'welfare', 'maritime_and_shipping',
        'agriculture_food', 'union_labour', 'real_estate', 'finance', 'industry',
        'tech', 'regulatory', 'education', 'healthcare', 'politics', 'aviation',
        'design', 'architecture', 'hospitality', 'tourism', 'appliances'
    ]

    # Filter categories with too few samples
    category_counts = data_df['main_category'].value_counts()
    valid_categories = category_counts[category_counts >= min_samples].index
    print(f"\nFound {len(valid_categories)} categories with {min_samples}+ samples")

    filtered_df = data_df[data_df['main_category'].isin(valid_categories)].copy()

    # Assign class weights (boost weights for market categories)
    class_weights = {category: 1.0 for category in valid_categories}
    for category in market_categories:
        if category in class_weights:
            class_weights[category] = 2.0  # Double the weight for market categories

    # Convert embeddings to numpy array
    print("\nConverting embeddings to numpy array...")
    X_batches = []
    for i in range(0, len(filtered_df), batch_size):
        batch = filtered_df.iloc[i:i + batch_size]
        X_batch = np.stack(batch['embedding'].values)
        X_batches.append(X_batch)

    X = np.concatenate(X_batches)
    y_main = filtered_df['main_category'].values

    print(f"\nPrepared {len(X)} samples for training")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_main, test_size=0.2, random_state=42, stratify=y_main
    )

    # Train main classifier
    print("\nTraining main category classifier with adjusted weights...")
    main_classifier = LogisticRegression(
        multi_class='multinomial',
        max_iter=1000,
        class_weight=class_weights
    )
    main_classifier.fit(X_train, y_train)

    # Evaluate main classifier
    print("\nMain Category Classification:")
    print("Cross-validation scores:")
    cv_scores = cross_val_score(main_classifier, X, y_main, cv=5)
    print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    y_pred = main_classifier.predict(X_test)
    print("\nTest set performance:")
    print(classification_report(y_test, y_pred))

    return {'main_classifier': main_classifier, 'valid_categories': valid_categories}

def predict_eliteness(text, classifiers, tokenizer, model, threshold_adjustment=0.2):
    """Predict eliteness categories for new text with market bias"""
    def boost_market_categories(probs, classes):
        """Lower threshold for market categories"""
        boosted_probs = probs.copy()
        for i, cls in enumerate(classes):
            if cls in market_categories:
                boosted_probs[i] += threshold_adjustment
        return boosted_probs / boosted_probs.sum()  # Normalize

    # Define market categories
    market_categories = [
        'energy_and_green_transition', 'welfare', 'maritime_and_shipping',
        'agriculture_food', 'union_labour', 'real_estate', 'finance', 'industry',
        'tech', 'regulatory', 'education', 'healthcare', 'politics', 'aviation',
        'design', 'architecture', 'hospitality', 'tourism', 'appliances'
    ]

    # Define market keywords for rule-based adjustments
    market_keywords = {
        'energy_and_green_transition': ['solar', 'vind', 'genanvendelig', 'grøn energi', 'sustainability', 'BÆREDYGTIG', 'Bæredygtig', 'Bæredygtige', 'ENERGI', 'Energi', 'Energichef', 'Energie', 'Energiformand', 'Energirådgiver', 'Energis', 'Energispecialist', 'GRØN', 'GRøn', 'Grøn', 'Grøne', 'Grøns', 'KLIMA', 'KLIMAs', 'Klima', 'Klimachef', 'Klimadirektør', 'Klimarådgiver', 'MILJØ', 'MIljø', 'Miljø', 'Miljøchef', 'Miljødirektør', 'Miljøformand', 'Miljøkonsulent', 'Miljøs', 'OMSTILLING', 'Omstilling', 'Omstillings', 'Overenergi', 'POWER', 'Power', 'Powers', 'VINDKRAFT', 'Vedvarende', 'Vindkraft', 'bæredygtig', 'bæredygtige', 'bæredygtigs', 'energi', 'energianalytiker', 'energiansvarlig', 'energichef', 'energidirektør', 'energie', 'energikonsulent', 'energikoordinator', 'energileder', 'energirådgiver', 'energis', 'energispecialist', 'først Miljø', 'først grøn', 'gruppe miljø', 'grøn', 'grøne', 'klima', 'klimaanalytiker', 'klimaansvarlig', 'klimachef', 'klimadirektør', 'klimaformand', 'klimakonsulent', 'klimakoordinator', 'klimaleder', 'klimarådgiver', 'klimas', 'mellem Klima', 'mellem Miljø', 'mellem bæredygtig', 'mellem energi', 'mellem grøn', 'mellem klima', 'mellem miljø', 'mellem vindkraft', 'mellemgrøn', 'miljø', 'miljøansvarlig', 'miljøchef', 'miljødirektør', 'miljøe', 'miljøformand', 'miljøkonsulent', 'miljøkoordinator', 'miljøleder', 'miljørådgiver', 'miljøs', 'omstilling', 'omstillings', 'over Miljø', 'over energi', 'over grøn', 'over klima', 'over miljø', 'over vedvarende', 'power', 'powers', 'specialmiljø', 'topmiljø', 'under Energi', 'under Grøn', 'under Klima', 'under Miljø', 'under Omstilling', 'under bæredygtig', 'under energi', 'under klima', 'under miljø', 'under omstilling', 'under vedvarende', 'underenergi', 'vedvarende', 'vindkraft'],
        'finance': ['investment', 'loan', 'bank', 'capital', 'stock', 'finance', 'AKTIE', 'AKTIER', 'Aktie', 'Aktieanalytiker', 'Aktiechef', 'Aktier', 'Akties', 'Børs', 'Børschef', 'Børsdirektør', 'Børse', 'FINANS', 'FUSION', 'Finans', 'Finansanalytiker', 'Finanschef', 'Finansdirektør', 'Fusion', 'Fusions', 'Investering', 'Investerings', 'MARKED', 'Marked', 'Markeds', 'Obligation', 'Obligatione', 'Obligations', 'Opkøb', 'RENTE', 'Rente', 'Renter', 'Rentes', 'Toprente', 'VALUTA', 'Valuta', 'aktie', 'aktieanalytiker', 'aktiechef', 'aktier', 'aktierådgiver', 'akties', 'aktiespecialist', 'børs', 'børsanalytiker', 'børschef', 'central marked', 'finans', 'finansanalytiker', 'finansansvarlig', 'finanschef', 'finansdirektør', 'finansrådgiver', 'fusion', 'fusione', 'fusions', 'først rente', 'gruppe aktie', 'hovedaktie', 'hovedmarked', 'investering', 'investerings', 'koncernfinans', 'marked', 'markede', 'markedkoordinator', 'markedleder', 'markeds', 'mellem Finans', 'mellem finans', 'mellem investering', 'mellem marked', 'mellem rente', 'obligation', 'obligatione', 'obligations', 'opkøb', 'opkøbe', 'opkøbs', 'over Finans', 'over aktie', 'over investering', 'over marked', 'over rente', 'overinvestering', 'overrente', 'rente', 'renteanalytiker', 'renter', 'rentes', 'under Finans', 'under finans', 'under opkøb', 'underinvestering', 'valuta', 'valutaanalytiker', 'valutachef', 'valutas'],
        'agriculture_food': ['farming', 'crops', 'food production', 'harvest', 'agriculture', 'EKSPORT', 'Eksport', 'FØDEVARER', 'Fødevare', 'Fødevarechef', 'Fødevaredirektør', 'Fødevarer', 'Fødevares', 'Fødevaresikkerhed', 'Gård', 'Gårde', 'Gårds', 'Hovedgård', 'LANDBRUG', 'Landbrug', 'Landbrugs', 'MARK', 'Mark', 'Marke', 'Markformand', 'Marks', 'Mejeri', 'Mejerichef', 'Mejerie', 'Mejeris', 'Mellemgård', 'Overgård', 'Overmark', 'Topmark', 'chef Mark', 'eksport', 'eksportchef', 'eksportdirektør', 'eksportrådgiver', 'eksports', 'fødevare', 'fødevarechef', 'fødevarekonsulent', 'fødevarer', 'fødevares', 'fødevaresikkerhed', 'først Mark', 'general Mark', 'gård', 'gårde', 'gårds', 'hovedgård', 'landbrug', 'landbruge', 'landbrugs', 'mark', 'marke', 'markformand', 'marks', 'mejeri', 'mejerichef', 'mejeridirektør', 'mejeris', 'mellem Landbrug', 'mellem Mark', 'mellem eksport', 'mellem fødevare', 'mellem landbrug', 'mellem mark', 'mellem økologi', 'over Mark', 'over eksport', 'over fødevare', 'over fødevaresikkerhed', 'over landbrug', 'over mark', 'over økologi', 'speciallandbrug', 'under Landbrug', 'ØKOLOGI', 'Økologi', 'Økologichef', 'økologi', 'økologichef', 'økologikonsulent'],
        'tech': ['technology', 'software', 'hardware', 'innovation', 'Automatisering', 'DATA', 'DIGITAL', 'DIGITALE', 'DIgital', 'Data', 'Datas', 'Digital', 'Digitalanalytiker', 'Digitalchef', 'Digitale', 'Digitals', 'General Data', 'IT', 'ITE', 'ITR', 'ITS', 'ITs', 'Innovation', 'Innovations', 'It', 'Itr', 'Its', 'Koncern IT', 'Kunstig intelligens', 'Over It', 'Senior IT', 'Senior it', 'Senior-IT', 'Senior-It', 'SeniorDigital', 'Software', 'Softwaredirektør', 'Softwares', 'Special-software', 'TECH', 'TECHs', 'TEKNOLOGI', 'Tech', 'Techs', 'Teknologi', 'Teknologichef', 'Teknologis', 'Under IT', 'automatisering', 'automatiserings', 'central it', 'chef software', 'data', 'dataanalytiker', 'dataansvarlig', 'datachef', 'datakonsulent', 'datas', 'dataspecialist', 'digital', 'digitalchef', 'digitale', 'digitalrådgiver', 'digitals', 'først data', 'gruppe IT', 'gruppe it', 'innovation', 'innovations', 'it', 'its', 'koncern it', 'koncern-it', 'kunstig intelligens', 'mellem IT', 'mellem Tech', 'mellem innovation', 'mellem it', 'mellem software', 'mellem tech', 'mellem teknologi', 'over IT', 'over It', 'over Software', 'over data', 'over digital', 'over it', 'over kunstig intelligens', 'over tech', 'over teknologi', 'senior-IT', 'senior-it', 'software', 'softwarechef', 'softwares', 'special software', 'specialteknologi', 'tech', 'techs', 'teknologi', 'teknologianalytiker', 'teknologichef', 'teknologidirektør', 'teknologikonsulent', 'teknologikoordinator', 'teknologirådgiver', 'teknologis', 'teknologispecialist', 'under IT', 'under it'],
        'welfare': ['Beskæftigelse', 'Beskæftigelses', 'BØRN', 'BØRNE', 'Børn', 'Børne', 'Børns', 'INTEGRATION', 'Integration', 'Integrations', 'KOMMUNE', 'KOMMUNER', 'KOmmune', 'Kommune', 'Kommunekoordinator', 'Kommuner', 'Kommunes', 'OMSORG', 'Omsorg', 'Omsorgs', 'Over kommune', 'Regionkommune', 'SOCIAL', 'SOCIALE', 'Social', 'Socialchef', 'Socialdirektør', 'Sociale', 'Socialformand', 'Socialkonsulent', 'Socialrådgiver', 'Socials', 'Specialbørn', 'UNGE', 'UNGER', 'UNge', 'Unge', 'Ungechef', 'Ungedirektør', 'Ungeformand', 'Ungekoordinator', 'Unger', 'Ungerådgiver', 'Unges', 'VELFÆRD', 'Velfærd', 'Velfærds', 'beskæftigelse', 'beskæftigelser', 'beskæftigelses', 'børn', 'børne', 'børns', 'først børn', 'først velfærd', 'gruppe børn', 'gruppe social', 'gruppe unge', 'gruppe ældre', 'hovedbeskæftigelse', 'integration', 'integratione', 'integrations', 'kOMMUNE', 'kommune', 'kommunechef', 'kommunedirektør', 'kommunekoordinator', 'kommuner', 'kommunes', 'mellem Børn', 'mellem Kommune', 'mellem beskæftigelse', 'mellem børn', 'mellem integration', 'mellem kommune', 'mellem omsorg', 'mellem social', 'mellem unge', 'mellem velfærd', 'mellem Ældre', 'mellem ældre', 'mellembørn', 'omsorg', 'omsorgs', 'over Børn', 'over Social', 'over beskæftigelse', 'over børn', 'over integration', 'over social', 'over unge', 'over velfærd', 'over ældre', 'seniorkommune', 'social', 'socialansvarlig', 'socialchef', 'socialdirektør', 'sociale', 'socialkonsulent', 'socialkoordinator', 'socialleder', 'socialrådgiver', 'specialbørn', 'under Børn', 'under Social', 'under beskæftigelse', 'under børn', 'under integration', 'under social', 'under unge', 'under Ældre', 'under ældre', 'underbeskæftigelse', 'unge', 'ungechef', 'ungedirektør', 'ungeformand', 'ungekonsulent', 'ungekoordinator', 'ungeleder', 'unger', 'ungerådgiver', 'unges', 'velfærd', 'velfærds', 'ÆLDRE', 'ÆLdre', 'Ældre', 'Ældrechef', 'Ældredirektør', 'Ældreformand', 'Ældrekonsulent', 'Ældres', 'ældre', 'ældrechef', 'ældredirektør', 'ældrekonsulent', 'ældrer', 'ældrerådgiver', 'ældres'],
        'maritime_and_shipping': ['Container', 'Containere', 'Containers', 'FRAGT', 'Fragt', 'HAVN', 'Havn', 'Havne', 'Havns', 'Logistik', 'Logistikchef', 'Logistikdirektør', 'Logistiks', 'MARITIM', 'Maritim', 'Maritime', 'OFFSHORE', 'Offshore', 'Offshores', 'SKIB', 'SKIBE', 'Shipping', 'Shippinganalytiker', 'Shippings', 'Skib', 'Skibe', 'Skibs', 'Søfart', 'Søfarts', 'container', 'containere', 'containers', 'fragt', 'fragte', 'havn', 'havne', 'havns', 'hovedhavn', 'hovedskib', 'logistik', 'logistikchef', 'logistikdirektør', 'logistikkoordinator', 'logistikleder', 'maritim', 'maritimchef', 'maritime', 'mellem havn', 'mellem skib', 'offshore', 'over havn', 'over offshore', 'shipping', 'shippinganalytiker', 'skib', 'skibe', 'skibs', 'specialcontainer', 'specialskib', 'søfart', 'søfarts', 'under fragt'],
        'union_labour': ['ARBEJDSMARKED', 'Arbejdsmarked', 'Arbejdsmarkeds', 'Arbejdsret', 'Fagbevægelse', 'Fagbevægelser', 'Fagbevægelses', 'Faglig', 'Faglige', 'Forhandling', 'Først medlem', 'Medlem', 'Medlems', 'Overenskomst', 'arbejdsmarked', 'arbejdsmarkede', 'arbejdsmarkeds', 'arbejdsret', 'arbejdsrets', 'central overenskomst', 'fagbevægelse', 'fagbevægelser', 'fagbevægelses', 'faglig', 'faglige', 'fagligkonsulent', 'forhandling', 'forhandlings', 'først faglig', 'først medlem', 'gruppe medlem', 'gruppemedlem', 'hovedforhandling', 'hovedoverenskomst', 'juniormedlem', 'medlem', 'medlems', 'mellem fagbevægelse', 'mellem faglig', 'mellem overenskomst', 'overenskomst', 'overenskomstchef', 'seniorarbejdsmarked', 'seniormedlem', 'specialfaglig', 'topfaglig', 'topmedlem', 'under faglig', 'under forhandling', 'under overenskomst'],
        'real_estate': ['Administration', 'Administrations', 'BOLIG', 'BYGGERI', 'Bolig', 'Boligchef', 'Boligdirektør', 'Boligs', 'Byggeri', 'Byggeris', 'Bygherre', 'Bygherrer', 'Bygherrerådgiver', 'Centraladministration', 'Developer', 'Developers', 'Ejendom', 'Ejendoms', 'General Administration', 'Gruppeadministration', 'Investering', 'Investerings', 'UDLEJNING', 'Udlejning', 'Udlejnings', 'Under administration', 'administration', 'administrations', 'bolig', 'boligchef', 'bolige', 'boligformand', 'boligkonsulent', 'boligkoordinator', 'boligrådgiver', 'boligs', 'byggeri', 'byggerie', 'byggeris', 'bygherre', 'bygherreansvarlig', 'bygherrer', 'bygherrerådgiver', 'bygherres', 'central administration', 'centraladministration', 'developer', 'developere', 'developers', 'ejendom', 'ejendoms', 'gruppeadministration', 'hovedadministration', 'investering', 'investerings', 'mellem administration', 'mellem bolig', 'mellem bygherre', 'mellem investering', 'mellem udlejning', 'over administration', 'over bolig', 'over byggeri', 'over investering', 'over udlejning', 'overadministration', 'overinvestering', 'senior bolig', 'seniorbolig', 'udlejning', 'udlejnings', 'under Administration', 'under administration', 'under byggeri', 'underinvestering'],
        'industry':  ['ERHVERV', 'Erhverv', 'Erhvervs', 'FABRIKS', 'Fabrik', 'Fabriks', 'INDUSTRI', 'Industri', 'Industrie', 'Industris', 'Koncern', 'Koncernchef', 'Koncerndirektør', 'MARKED', 'Marked', 'Markeds', 'Overproduktion', 'PRODUKTION', 'Produktion', 'Produktions', 'Senior Erhverv', 'SeniorErhverv', 'Seniorerhverv', 'Specialproduktion', 'Supply Chain', 'Supply chain', 'VIRKSOMHED', 'VIrksomhed', 'Virksomhed', 'Virksomheds', 'central marked', 'central produktion', 'erhverv', 'erhverve', 'erhvervleder', 'erhvervs', 'fabrik', 'fabriks', 'først virksomhed', 'hovederhverv', 'hovedfabrik', 'hovedmarked', 'hovedproduktion', 'hovedvirksomhed', 'industri', 'industriansvarlig', 'industrie', 'industris', 'koncern', 'koncernansvarlig', 'koncernchef', 'koncerndirektør', 'koncerne', 'koncerns', 'marked', 'markede', 'markedkoordinator', 'markedleder', 'markeds', 'mellem erhverv', 'mellem fabrik', 'mellem industri', 'mellem marked', 'mellem produktion', 'mellem virksomhed', 'over erhverv', 'over industri', 'over marked', 'over produktion', 'overproduktion', 'produktion', 'produktions', 'specialfabrik', 'specialproduktion', 'specialvirksomhed', 'supply chain', 'under Koncern', 'under produktion', 'underproduktion', 'virksomhed', 'virksomhede', 'virksomheds'],
        'regulatory': ['Beskatning', 'Centralforvaltning', 'Forvaltning', 'Forvaltnings', 'Først-politik', 'LOVGIVNING', 'Lovgivning', 'Myndighed', 'Myndigheds', 'POLITIK', 'PolitiK', 'Politik', 'Regulering', 'Seniorpolitik', 'Tilsyn', 'Tilsyns', 'beskatning', 'beskatnings', 'central lovgivning', 'central myndighed', 'centralforvaltning', 'forvaltning', 'forvaltninge', 'forvaltnings', 'først-politik', 'lovgivning', 'lovgivnings', 'mellem beskatning', 'mellem forvaltning', 'mellem lovgivning', 'mellem politik', 'mellem tilsyn', 'myndighed', 'myndighedr', 'myndigheds', 'over lovgivning', 'over politik', 'overbeskatning', 'overmyndighed', 'overregulering', 'politik', 'politike', 'politiks', 'regulering', 'reguleringr', 'regulerings', 'seniorforvaltning', 'seniorpolitik', 'specialregulering', 'tilsyn', 'tilsynchef', 'tilsyne', 'tilsyns', 'toppolitik', 'under Politik', 'under forvaltning', 'under regulering', 'under tilsyn', 'underregulering'],
        'education': ['ELEV', 'Elev', 'Elevformand', 'Forskning', 'Forsknings', 'Forskningspecialist', 'Læring', 'Lærings', 'Pædagogik', 'STUDENT', 'Specialpædagogik', 'Specialuddannelse', 'Specialundervisning', 'Student', 'Students', 'Topstudent', 'UDDANNELSE', 'UNDERVISNING', 'Uddannelse', 'Uddannelser', 'Uddannelses', 'Under uddannelse', 'Undervisning', 'Undervisnings', 'elev', 'elevansvarlig', 'eleve', 'elevformand', 'elevs', 'forskning', 'forskningleder', 'forsknings', 'gruppe undervisning', 'gruppeundervisning', 'hoveduddannelse', 'læring', 'lærings', 'mellem elev', 'mellem forskning', 'mellem læring', 'mellem pædagogik', 'mellem uddannelse', 'mellem undervisning', 'mellemuddannelse', 'over forskning', 'over uddannelse', 'over undervisning', 'overuddannelse', 'pædagogik', 'special-pædagogik', 'special-undervisning', 'specialelev', 'specialpædagogik', 'specialuddannelse', 'specialundervisning', 'student', 'students', 'topforskning', 'uddannelse', 'uddannelser', 'uddannelses', 'under forskning', 'under uddannelse', 'undervisning', 'undervisninge', 'undervisnings'],
        'healthcare': ['BEHANDLING', 'BIOTEK', 'Behandling', 'Behandlings', 'Biotek', 'DIABETES', 'Diabetes', 'Diabetesformand', 'Først behandling', 'Insulin', 'Insulins', 'Klinisk', 'Kliniske', 'MEDICIN', 'MEDICINE', 'Medicin', 'Medicine', 'Medicins', 'Patient', 'Patients', 'SUNDHED', 'SYGDOM', 'Sundhed', 'Sundheds', 'Sygdom', 'Sygdome', 'behandling', 'behandlings', 'biotek', 'biotekanalytiker', 'biotekdirektør', 'bioteks', 'diabetes', 'først behandling', 'først medicin', 'først sundhed', 'gruppebehandling', 'insulin', 'klinisk', 'kliniske', 'medicin', 'medicine', 'medicins', 'mellem Sundhed', 'mellem behandling', 'mellem biotek', 'mellem medicin', 'mellem patient', 'mellem sundhed', 'mellem sygdom', 'mellembehandling', 'over behandling', 'over medicin', 'over sundhed', 'over sygdom', 'over-behandling', 'overbehandling', 'patient', 'patientansvarlig', 'patiente', 'patients', 'special behandling', 'specialbehandling', 'sundhed', 'sundheddirektør', 'sundheds', 'sundhedspecialist', 'sygdom', 'sygdome', 'sygdoms', 'under Sundhed', 'under behandling', 'under sygdom', 'underbehandling'],
        'politics': ['Beslutning', 'Beslutnings', 'DEMOKRATI', 'Demokrati', 'Demokratis', 'Demokratispecialist', 'Først-politik', 'LOVGIVNING', 'Lovgivning', 'POLITIK', 'PolitiK', 'Politik', 'REFORM', 'Reform', 'Reforms', 'Samfund', 'Samfunde', 'Samfunds', 'Seniorpolitik', 'Under valg', 'VALG', 'Valg', 'Valganalytiker', 'Valgansvarlig', 'Valgformand', 'Valgkonsulent', 'Valgs', 'beslutning', 'beslutninge', 'beslutnings', 'central beslutning', 'central lovgivning', 'demokrati', 'demokratie', 'demokratileder', 'demokratirådgiver', 'demokratis', 'først valg', 'først-politik', 'førstvalg', 'general beslutning', 'gruppebeslutning', 'lovgivning', 'lovgivnings', 'mellem demokrati', 'mellem lovgivning', 'mellem politik', 'mellem samfund', 'mellem valg', 'over beslutning', 'over demokrati', 'over lovgivning', 'over politik', 'over valg', 'politik', 'politike', 'politiks', 'reform', 'reformpræsident', 'reforms', 'samfund', 'samfunde', 'samfunds', 'seniorpolitik', 'toppolitik', 'under Politik', 'under reform', 'under valg', 'valg', 'valganalytiker', 'valgansvarlig', 'valgchef', 'valge', 'valgformand', 'valgkonsulent', 'valgrådgiver', 'valgs'],
        'aviation': ['Airline', 'Airlines', 'Aviation', 'FLY', 'Fly', 'Flye', 'Flyr', 'Flys', 'Koncern Sikkerhed', 'LUFTHAVN', 'Luftfart', 'Lufthavn', 'Lufthavne', 'Lufthavns', 'Over transport', 'REJSE', 'REJSER', 'Rejse', 'Rejseleder', 'Rejser', 'Rejses', 'SIKKERHED', 'Sikkerhed', 'Sikkerheds', 'Specialtransport', 'TRANSPORT', 'Transport', 'Transporte', 'Transports', 'Under rejse', 'airline', 'airlines', 'aviation', 'fly', 'flye', 'flyleder', 'flyr', 'flys', 'først fly', 'først rejse', 'grupperejse', 'hovedfly', 'hovedlufthavn', 'hovedtransport', 'luftfart', 'luftfarts', 'lufthavn', 'lufthavne', 'lufthavns', 'mellem fly', 'mellem luftfart', 'mellem lufthavn', 'mellem sikkerhed', 'over lufthavn', 'over rejse', 'over sikkerhed', 'over transport', 'rejse', 'rejsedirektør', 'rejsekonsulent', 'rejseleder', 'rejser', 'rejses', 'sikkerhed', 'sikkerhedchef', 'sikkerhede', 'sikkerheds', 'specialfly', 'transport', 'transportansvarlig', 'transportchef', 'transportdirektør', 'transporte', 'transportkoordinator', 'transportrådgiver', 'under Transport', 'under rejse', 'under transport'],
        'design': ['BRUGER', 'Bruger', 'Brugere', 'DESIGN', 'Design', 'Designchef', 'Designdirektør', 'Designe', 'Designs', 'Først bruger', 'Junior Design', 'KUNST', 'Kreativ', 'Kreative', 'Kunst', 'Kunstkonsulent', 'Kunsts', 'MODE', 'MODER', 'Mode', 'Moder', 'Modes', 'Produkt', 'Produktchef', 'Produktrådgiver', 'Produktspecialist', 'Special design', 'Under Kunst', 'bruger', 'brugere', 'brugers', 'design', 'designDirektør', 'designchef', 'designdirektør', 'designe', 'designs', 'først bruger', 'først design', 'gruppe bruger', 'hovedprodukt', 'koncern bruger', 'kreativ', 'kreative', 'kunst', 'kunstansvarlig', 'kunstchef', 'kunstdirektør', 'kunste', 'kunstformand', 'kunstkonsulent', 'kunstleder', 'kunstrådgiver', 'kunsts', 'mellem Design', 'mellem bruger', 'mellem design', 'mellem kunst', 'mellem mode', 'mellem produkt', 'mellem æstetik', 'mode', 'modedirektør', 'modekonsulent', 'moder', 'modes', 'over bruger', 'over design', 'over kunst', 'over mode', 'over æstetik', 'over-produkt', 'overbruger', 'produkt', 'produktansvarlig', 'produktchef', 'produktdirektør', 'produktkoordinator', 'produkts', 'produktspecialist', 'special design', 'specialprodukt', 'top bruger', 'top-design', 'topdesign', 'topprodukt', 'under Kunst', 'under bruger', 'under design', 'Æstetik', 'æstetik', 'æstetiks'],
        'architecture': ['ARKITEKTUR', 'Arkitektur', 'BYGGERI', 'Byggeri', 'Byggeris', 'Byplanlægning', 'Byrum', 'Byrums', 'DESIGN', 'Design', 'Designchef', 'Designdirektør', 'Designe', 'Designs', 'Junior Design', 'Landskab', 'Landskabs', 'Special design', 'arkitektur', 'arkitekturs', 'byggeri', 'byggerie', 'byggeris', 'byplanlægning', 'byplanlægnings', 'byrum', 'byrums', 'design', 'designDirektør', 'designchef', 'designdirektør', 'designe', 'designs', 'først design', 'landskab', 'landskabe', 'landskabs', 'mellem Design', 'mellem arkitektur', 'mellem design', 'mellem landskab', 'mellem æstetik', 'over arkitektur', 'over byggeri', 'over byplanlægning', 'over design', 'over æstetik', 'special design', 'top-design', 'topdesign', 'under byggeri', 'under design', 'Æstetik', 'æstetik', 'æstetiks'],
        'hospitality': ['Central Hotel', 'Chef-Mad', 'Gastronomi', 'HOTEL', 'HOTELS', 'Hotel', 'Hoteldirektør', 'Hotels', 'Koncernservice', 'MAD', 'MADE', 'MADS', 'MADs', 'MAds', 'Mad', 'Made', 'Mads', 'OPLEVELSE', 'Oplevelse', 'Oplevelser', 'Oplevelses', 'Over Hotel', 'Overnatning', 'RESTAURANT', 'Restaurant', 'Restaurantchef', 'Restaurante', 'Restaurants', 'SERVICE', 'Senior Service', 'Senior-service', 'Service', 'Servicechef', 'Serviceleder', 'Servicer', 'Servicerådgiver', 'Services', 'først mad', 'gastronomi', 'gastronomis', 'general service', 'hotel', 'hotelchef', 'hoteldirektør', 'hotels', 'hovedrestaurant', 'koncern service', 'koncernservice', 'mad', 'made', 'madr', 'mads', 'madspecialist', 'mellem Hotel', 'mellem Restaurant', 'mellem gastronomi', 'mellem hotel', 'mellem mad', 'mellem restaurant', 'mellem service', 'mellemmad', 'oplevelse', 'oplevelser', 'oplevelses', 'over Hotel', 'over gastronomi', 'over hotel', 'over mad', 'over oplevelse', 'over restaurant', 'over service', 'overnatning', 'overnatnings', 'restaurant', 'restaurantchef', 'restaurante', 'restaurantleder', 'restaurants', 'seniorservice', 'service', 'servicechef', 'servicedirektør', 'servicekoordinator', 'serviceleder', 'servicer', 'servicerådgiver', 'services', 'servicespecialist', 'specialrestaurant', 'top-restaurant', 'topgastronomi', 'topoplevelse', 'toprestaurant', 'topservice', 'under Hotel', 'under overnatning', 'under service'],
        'tourism': ['Attraktion', 'Destination', 'Destinations', 'FERIE', 'Ferie', 'Ferier', 'OPLEVELSE', 'Oplevelse', 'Oplevelser', 'Oplevelses', 'Overturisme', 'REJSE', 'REJSER', 'Rejse', 'Rejseleder', 'Rejser', 'Rejses', 'TURISME', 'TURIST', 'Turisme', 'Turismechef', 'Turismekonsulent', 'Turismes', 'Turist', 'Turistchef', 'Turistdirektør', 'Turistkonsulent', 'Turists', 'Under ferie', 'Under rejse', 'attraktion', 'destination', 'destinations', 'ferie', 'feriekonsulent', 'ferier', 'ferierådgiver', 'først ferie', 'først rejse', 'grupperejse', 'hovedattraktion', 'hovedferie', 'mellem Destination', 'mellem ferie', 'mellem turisme', 'oplevelse', 'oplevelser', 'oplevelses', 'over ferie', 'over oplevelse', 'over rejse', 'rejse', 'rejsedirektør', 'rejsekonsulent', 'rejseleder', 'rejser', 'rejses', 'topattraktion', 'topoplevelse', 'turisme', 'turismechef', 'turismedirektør', 'turismekonsulent', 'turismes', 'turist', 'turistchef', 'turistdirektør', 'turiste', 'turistformand', 'turistkonsulent', 'turistkoordinator', 'turists', 'under ferie', 'under rejse'],
        'appliances': ['Elektronik', 'Hvidevare', 'Hvidevarer', 'Innovation', 'Innovations', 'Koncernudvikling', 'KØKKEN', 'Køkken', 'Køkkenchef', 'Køkkenleder', 'Køkkens', 'Produkt', 'Produktchef', 'Produktrådgiver', 'Produktspecialist', 'TEKNISKE', 'Teknisk', 'Tekniske', 'UDVIKLING', 'Udvikling', 'Udviklings', 'Under udvikling', 'central køkken', 'centralkøkken', 'elektronik', 'hovedprodukt', 'hovedudvikling', 'hvidevare', 'hvidevarer', 'innovation', 'innovations', 'koncernudvikling', 'køkken', 'køkkenansvarlig', 'køkkenchef', 'køkkene', 'køkkenleder', 'køkkens', 'køkkenspecialist', 'mellem Teknisk', 'mellem elektronik', 'mellem innovation', 'mellem køkken', 'mellem produkt', 'mellem teknisk', 'mellem udvikling', 'over elektronik', 'over køkken', 'over udvikling', 'over-produkt', 'produkt', 'produktansvarlig', 'produktchef', 'produktdirektør', 'produktkoordinator', 'produkts', 'produktspecialist', 'specialprodukt', 'specialudvikling', 'teknisk', 'tekniskchef', 'tekniske', 'topprodukt', 'udvikling', 'udviklingchef', 'udviklinge', 'udviklings', 'under køkken', 'under teknisk', 'under udvikling', 'underudvikling']
    }

    def adjust_prediction_with_keywords(text, predicted_category, confidence, market_keywords, boost_factor=0.1):
        """Adjust prediction confidence based on presence of market keywords."""
        for category, keywords in market_keywords.items():
            if any(keyword.lower() in text.lower() for keyword in keywords):
                if category != predicted_category:
                    confidence *= (1 - boost_factor)  # Slightly reduce original category confidence
                return category, confidence + boost_factor  # Boost confidence for market category
        return predicted_category, confidence

    # Get embedding
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[0][1].numpy()



    main_category = classes[np.argmax(boosted_probs)]
    confidence = max(boosted_probs)

    # Apply rule-based adjustments
    adjusted_category, adjusted_confidence = adjust_prediction_with_keywords(
        text, main_category, confidence, market_keywords
    )

    return {
        'text': text,
        'main_category': adjusted_category,
        'confidence': adjusted_confidence
    }

def main():
    # Load embeddings
    print("Loading and combining embeddings...")
    data_df = combine_batch_embeddings()

    # Train classifiers
    classifiers = train_hierarchical_classifier(data_df)

    # Save classifiers
    save_classifiers(classifiers)

    return classifiers, data_df

def save_classifiers(classifiers, prefix='eliteness'):
    """Save trained classifiers to GCS"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Save locally first
    with open(f'/content/{prefix}_classifiers.pkl', 'wb') as f:
        pickle.dump(classifiers, f)

    # Upload to GCS
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    blob = bucket.blob(f'classifiers/{prefix}_classifier_{timestamp}.pkl')
    blob.upload_from_filename(f'/content/{prefix}_classifiers.pkl')
    print(f"Saved classifiers to GCS")

if __name__ == "__main__":
    classifiers, data_df = main()



Loading and combining embeddings...
Scanning for embedding files...
Found 123 embedding files

Processing batch 1 of 13
Added 80 embeddings from agriculture_food/keywords
Added 10 embeddings from agriculture_food/orgs
Added 9 embeddings from agriculture_food/titles
Added 71 embeddings from appliances/keywords
Added 29 embeddings from appliances/orgs
Added 25 embeddings from appliances/titles
Added 54 embeddings from architecture/keywords
Added 9 embeddings from architecture/orgs
Added 39 embeddings from architecture/titles
Added 78 embeddings from aviation/keywords

Processing batch 2 of 13
Added 5 embeddings from aviation/orgs
Added 17 embeddings from aviation/titles
Added 48 embeddings from design/keywords
Added 6 embeddings from design/orgs
Added 24 embeddings from design/titles
Added 66 embeddings from education/keywords
Added 85 embeddings from education/orgs
Added 58 embeddings from education/titles
Added 116 embeddings from energy_and_green_transition/keywords
Added 26 embedding




Main Category Classification:
Cross-validation scores:




Mean CV score: 0.909 (+/- 0.040)

Test set performance:
                             precision    recall  f1-score   support

           agriculture_food       0.94      0.85      0.89        20
                 appliances       0.72      0.72      0.72        25
               architecture       0.90      0.90      0.90        20
               associations       0.95      0.96      0.95       584
                   aviation       0.84      0.80      0.82        20
                     design       0.83      0.94      0.88        16
                  education       0.98      0.95      0.96        42
energy_and_green_transition       0.92      0.83      0.87        29
                         eu       0.00      0.00      0.00         1
               expert_level       1.00      0.67      0.80         3
                    finance       0.93      0.93      0.93        44
                 healthcare       0.89      0.97      0.93        34
                hospitality       0.96      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved classifiers to GCS


In [31]:
#nyt data fra google api
from transformers import pipeline, AutoTokenizer, AutoModel
from collections import defaultdict
import spacy
import pandas as pd
import torch
import logging
from google.cloud import storage
import pickle
import numpy as np
from typing import Dict, Tuple, List


def load_classifier():
    """Load trained classifier from GCS"""
    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket("eliteness")

        blobs = list(bucket.list_blobs(prefix='classifiers/'))
        latest_classifier = max(blobs, key=lambda x: x.name)

        latest_classifier.download_to_filename('/content/classifier_20241216_132412.pkl')
        with open('/content/classifier_20241216_132412.pkl', 'rb') as f:
            classifiers = pickle.load(f)

        print(f"Loaded classifier from {latest_classifier.name}")
        return classifiers

    except Exception as e:
        logging.error(f"Error loading classifier: {str(e)}")
        raise

class NewsAnalyzer:
    def __init__(self):
        self.classifiers = load_classifier()
        self.tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
        self.model = AutoModel.from_pretrained("Maltehb/danish-bert-botxo")
        self.nlp = spacy.load('da_core_news_lg')
        self.sentiment_pipeline = pipeline("sentiment-analysis",
                                        model="DGurgurov/xlm-r_danish_sentiment")

    def clean_text(self, text: str) -> str:
        """Clean text by fixing encoding issues and removing truncation markers"""
        if not isinstance(text, str):
            return ""

        # Fix common encoding issues
        replacements = {
            'Ã¸': 'ø',
            'Ã¦': 'æ',
            'Ã¥': 'å',
            'Ã˜': 'Ø',
            'Ã†': 'Æ',
            'Ã…': 'Å'
        }

        for old, new in replacements.items():
            text = text.replace(old, new)

        # Remove truncation marker and everything after it
        if '[+' in text and 'chars]' in text:
            text = text.split('[+')[0].strip()

        return text

    def get_embedding(self, text: str) -> np.ndarray:
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
            embedding = outputs.last_hidden_state[0][1].numpy()
        return embedding

    def classify_text(self, text: str) -> Dict:
        """Classify a piece of text using the trained classifier"""
        try:
            if not text:
                return {'category': 'unknown', 'confidence': 0.0}

            embedding = self.get_embedding(text)
            main_classifier = self.classifiers['main_classifier']
            pred_class = main_classifier.predict([embedding])[0]
            confidence = np.max(main_classifier.predict_proba([embedding])[0])

            return {'category': pred_class, 'confidence': confidence}
        except Exception as e:
            logging.error(f"Error in text classification: {str(e)}")
            return {'category': 'unknown', 'confidence': 0.0}

    def analyze_sentiment(self, text: str) -> Dict:
        """Analyze sentiment of cleaned text"""
        try:
            if not text:
                return {'label': 'L', 'confidence': 0.0}

            cleaned_text = self.clean_text(text)
            result = self.sentiment_pipeline(cleaned_text[:512])[0]

            label_map = {
                'LABEL_1': 'P',
                'LABEL_0': 'N'
            }
            return {
                'label': label_map.get(result['label'], 'L'),
                'confidence': result['score']
            }
        except Exception as e:
            logging.error(f"Error in sentiment analysis: {str(e)}")
            return {'label': 'L', 'confidence': 0.0}

    def get_combined_sentiment(self, sentiments: List[Dict]) -> Dict:
        """Calculate combined sentiment from multiple analyses"""
        valid_sentiments = [s for s in sentiments if s['label'] != 'L']
        if not valid_sentiments:
            return {'label': 'L', 'confidence': 0.0}

        # Weight sentiments by confidence
        weighted_sum = sum(1 if s['label'] == 'P' else -1 * s['confidence']
                         for s in valid_sentiments)
        avg_confidence = sum(s['confidence'] for s in valid_sentiments) / len(valid_sentiments)

        return {
            'label': 'P' if weighted_sum > 0 else 'N',
            'confidence': avg_confidence
        }

    def analyze_article(self, title: str, description: str, content: str) -> Dict:
        """Analyze full article including title, description, and content"""
        # Clean texts
        clean_title = self.clean_text(title)
        clean_desc = self.clean_text(description)
        clean_content = self.clean_text(content)

        # Split content into sentences
        doc = self.nlp(clean_content)
        sentences = [sent.text for sent in doc.sents]

        # Classify each sentence
        category_counts = defaultdict(int)

        # Classify title
        title_class = self.classify_text(clean_title)
        category_counts[title_class['category']] += 1

        # Classify each sentence
        for sentence in sentences:
            classification = self.classify_text(sentence)
            category_counts[classification['category']] += 1

        # Get primary categories based on counts
        primary_category = max(category_counts.items(), key=lambda x: x[1])

        # Get sentiment for each part
        sentiments = {
            'title': self.analyze_sentiment(clean_title),
            'description': self.analyze_sentiment(clean_desc),
            'content': self.analyze_sentiment(clean_content)
        }

        # Calculate combined sentiment
        combined_sentiment = self.get_combined_sentiment(list(sentiments.values()))
        sentiments['combined'] = combined_sentiment

        return {
            'category_counts': dict(category_counts),
            'primary_category': primary_category[0],
            'category_frequency': primary_category[1],
            'sentiment': sentiments
        }

def analyze_news_dataset(file_path: str) -> Dict:
    analyzer = NewsAnalyzer()
    results = {}

    df = pd.read_csv(file_path)
    print(f"Analyzing {len(df)} articles...")

    for idx, row in df.iterrows():
        print(f"Processing article {idx + 1}/{len(df)}")
        results[idx] = {
            'source': row['source'],
            'title': row['title'],
            'publishedAt': row['publishedAt'],
            'analysis': analyzer.analyze_article(
                row['title'],
                row['description'],
                row['content']
            )
        }

    return results

def display_results(results: Dict):
    for article_id, data in results.items():
        print("\n" + "="*50)
        print(f"Source: {data['source']}")
        print(f"Title: {data['title']}")
        print(f"Date: {data['publishedAt']}")

        analysis = data['analysis']
        print(f"\nPrimary Category: {analysis['primary_category']} "
              f"(frequency: {analysis['category_frequency']})")

        print("\nCategory Distribution:")
        for category, count in sorted(analysis['category_counts'].items(),
                                    key=lambda x: x[1], reverse=True):
            print(f"{category}: {count}")

        print("\nSentiment Analysis:")
        sentiments = analysis['sentiment']
        for part, sent in sentiments.items():
            print(f"{part.capitalize()}: {sent['label']} "
                  f"(confidence: {sent['confidence']:.2f})")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    results = analyze_news_dataset('/content/drive/MyDrive/NewsMarketAnalysis/danish_news_multiple_topics.csv')
    display_results(results)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
private: 1

Sentiment Analysis:
Title: N (confidence: 1.00)
Description: N (confidence: 1.00)
Content: P (confidence: 0.56)
Combined: N (confidence: 0.85)

Source: {'id': None, 'name': 'Www.dr.dk'}
Title: Seks nordiske organisationer går sammen for at 'beskytte børnene mod techgiganternes overgreb'
Date: 2024-12-12T13:57:00Z

Primary Category: associations (frequency: 4)

Category Distribution:
associations: 4
public: 1

Sentiment Analysis:
Title: N (confidence: 0.89)
Description: P (confidence: 0.97)
Content: P (confidence: 0.93)
Combined: P (confidence: 0.93)

Source: {'id': None, 'name': 'Politiken.dk'}
Title: Danske medier: Politisk handling er nødvendig. Sådan kan vi sammen tæmme techgiganterne
Date: 2024-12-12T10:07:01Z

Primary Category: public (frequency: 3)

Category Distribution:
public: 3
private: 1

Sentiment Analysis:
Title: N (confidence: 0.99)
Description: N (confidence: 0.99)
Content: P (confidence: 0.98)


2nd attempt at providing weights on the classifier

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from google.cloud import storage
import pickle
from collections import defaultdict
import torch
import sys

# Add debug printing function
def debug_print(message):
    """Print debug message with timestamp"""
    print(f"[DEBUG] {message}", flush=True)

def main():
    try:
        # Load embeddings with debug output
        debug_print("Starting embeddings loading process...")
        data_df = combine_batch_embeddings()
        debug_print(f"Successfully loaded embeddings. Shape: {data_df.shape}")

        # Train classifiers with debug output
        debug_print("Starting classifier training...")
        classifiers = train_hierarchical_classifier(data_df)
        debug_print("Classifier training completed")

        # Save classifiers with debug output
        debug_print("Starting classifier saving process...")
        save_classifiers(classifiers)
        debug_print("Classifier saving completed")

        return classifiers, data_df

    except Exception as e:
        debug_print(f"Error in main function: {str(e)}")
        debug_print(f"Error type: {type(e)}")
        import traceback
        debug_print(f"Traceback: {traceback.format_exc()}")
        raise

if __name__ == "__main__":
    debug_print("Starting main program execution...")
    try:
        classifiers, data_df = main()
        debug_print("Program completed successfully")
    except Exception as e:
        debug_print(f"Program failed with error: {str(e)}")
        sys.exit(1)
#Classifier trainer with weights and market boosting version 2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from google.cloud import storage
import pickle
from collections import defaultdict
import torch # import torch here

def combine_batch_embeddings():
    """Load and combine batch embeddings with memory management and duplicate prevention."""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Track processed files and unique embeddings
    processed_files = set()
    unique_pointers = set()
    all_embeddings = []
    total_count = 0

    print("Scanning for embedding files...")
    valid_prefixes = [
        'embeddings_analysis/market_categories/',
        'embeddings_analysis/hierarchy/',
        'embeddings_analysis/international/',
        'embeddings_analysis/organization/'
    ]

    embedding_files = []
    for prefix in valid_prefixes:
        files = [
            blob.name for blob in bucket.list_blobs(prefix=prefix)
            if '_embeddings' in blob.name and '_similarity' not in blob.name
        ]
        embedding_files.extend(files)

    print(f"Found {len(embedding_files)} embedding files")
    batch_size = 10  # Process 10 files at a time

    for i in range(0, len(embedding_files), batch_size):
        batch_files = embedding_files[i:i + batch_size]
        batch_embeddings = []

        print(f"\nProcessing batch {i // batch_size + 1} of {(len(embedding_files) + batch_size - 1) // batch_size}")
        for file_name in batch_files:
            try:
                if file_name in processed_files:
                    continue

                blob = bucket.blob(file_name)
                blob.download_to_filename('/content/temp_batch.pkl')
                batch_df = pd.read_pickle('/content/temp_batch.pkl')

                # Remove duplicates
                new_pointers = set(batch_df['pointer'])
                batch_df = batch_df[~batch_df['pointer'].isin(unique_pointers)]
                unique_pointers.update(new_pointers)

                # Extract category information
                path_parts = file_name.split('/')
                main_category = path_parts[2]
                sub_category = path_parts[3]

                batch_df['main_category'] = main_category
                batch_df['sub_category'] = sub_category

                batch_embeddings.append(batch_df)
                processed_files.add(file_name)

                print(f"Added {len(batch_df)} embeddings from {main_category}/{sub_category}")
                total_count += len(batch_df)

            except Exception as e:
                print(f"Error processing {file_name}: {str(e)}")
                continue

        # Combine batch and clear memory
        if batch_embeddings:
            combined_batch = pd.concat(batch_embeddings, ignore_index=True)
            all_embeddings.append(combined_batch)

    if not all_embeddings:
        raise ValueError("No embedding files were successfully loaded!")

    final_df = pd.concat(all_embeddings, ignore_index=True)
    print(f"\nFinal combined embeddings: {len(final_df)}")
    return final_df

def predict_with_market_boost(probs, classes, market_categories, boost=0.1):
    """Boost probabilities for market categories"""
    market_boost = np.zeros_like(probs)
    for i, category in enumerate(classes):
        if category in market_categories:
            market_boost[:, i] = boost

    # Normalize probabilities
    boosted_probs = probs + market_boost
    return boosted_probs / boosted_probs.sum(axis=1, keepdims=True)

def train_hierarchical_classifier(data_df, batch_size=10000, min_samples=2):
    """Train classifier with enhanced focus on market categories"""
    print("\nPreparing training data...")

    # Define market/domain categories with importance levels
    market_categories = {
        # Core market categories (highest weight)
        'energy_and_green_transition': 3.0,
        'finance': 3.0,
        'tech': 3.0,
        'industry': 3.0,

        # Secondary market categories (medium-high weight)
        'maritime_and_shipping': 2.5,
        'agriculture_food': 2.5,
        'real_estate': 2.5,

        # Supporting market categories (medium weight)
        'hospitality': 2.0,
        'tourism': 2.0,
        'aviation': 2.0,
        'appliances': 2.0,
        'architecture': 2.0,
        'design': 2.0,
        'education': 2.0,
        'healthcare': 2.0,
        'welfare': 2.0,
        'union_labour': 2.0,
        'regulatory': 2.0,
        'politics': 2.0
    }

    # Filter categories with too few samples
    category_counts = data_df['main_category'].value_counts()
    valid_categories = category_counts[category_counts >= min_samples].index
    print(f"\nFound {len(valid_categories)} categories with {min_samples}+ samples")

    filtered_df = data_df[data_df['main_category'].isin(valid_categories)].copy()

    # Enhanced class weighting strategy with market focus
    class_weights = {}
    total_samples = len(filtered_df)

    for category in valid_categories:
        category_samples = len(filtered_df[filtered_df['main_category'] == category])
        base_weight = total_samples / (len(valid_categories) * category_samples)

        if category in market_categories:
            # Apply market category boost
            class_weights[category] = base_weight * market_categories[category]
        else:
            # Reduce weight for non-market categories
            class_weights[category] = base_weight * 0.5

    # Normalize weights to prevent extreme values
    max_weight = max(class_weights.values())
    class_weights = {k: min(v/max_weight * 3.0, 3.0) for k, v in class_weights.items()}

    # Convert embeddings to numpy array
    print("\nConverting embeddings to numpy array...")
    X_batches = []
    for i in range(0, len(filtered_df), batch_size):
        batch = filtered_df.iloc[i:i + batch_size]
        X_batch = np.stack(batch['embedding'].values)
        X_batches.append(X_batch)

    X = np.concatenate(X_batches)
    y_main = filtered_df['main_category'].values

    # Split data with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_main, test_size=0.2, random_state=42, stratify=y_main
    )

    # Train main classifier with enhanced parameters
    print("\nTraining market-focused classifier...")
    main_classifier = LogisticRegression(
        multi_class='multinomial',
        max_iter=1000,
        class_weight=class_weights,
        C=0.8  # Slightly stronger regularization
    )
    main_classifier.fit(X_train, y_train)

    # Test predictions with market boost
    y_pred_proba = main_classifier.predict_proba(X_test)
    y_pred_proba_boosted = predict_with_market_boost(
        y_pred_proba,
        main_classifier.classes_,
        market_categories
    )
    y_pred = main_classifier.classes_[np.argmax(y_pred_proba_boosted, axis=1)]

    # Calculate category-specific metrics
    market_cats = [cat for cat in main_classifier.classes_ if cat in market_categories]
    market_mask = np.isin(y_test, market_cats)

    print("\nOverall Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    print("\nMarket Categories Performance:")
    if any(market_mask):
        print(classification_report(
            y_test[market_mask],
            y_pred[market_mask],
            labels=market_cats,
            zero_division=0
        ))
    else:
        print("No market categories in test set")

    return {
        'main_classifier': main_classifier,
        'valid_categories': valid_categories,
        'market_categories': market_categories,
        'class_weights': class_weights
    }

def predict_eliteness(embedding, classifiers):
    """Predict eliteness categories with market focus"""
    if isinstance(embedding, list):
        embedding = np.array(embedding)
    if len(embedding.shape) == 1:
        embedding = embedding.reshape(1, -1)

    # Get base predictions
    base_probs = classifiers['main_classifier'].predict_proba(embedding)

    # Apply market boost
    boosted_probs = predict_with_market_boost(
        base_probs,
        classifiers['main_classifier'].classes_,
        classifiers['market_categories']
    )

    predicted_idx = np.argmax(boosted_probs, axis=1)
    confidence = boosted_probs[0, predicted_idx[0]]
    predicted_category = classifiers['main_classifier'].classes_[predicted_idx[0]]

    return {
        'main_category': predicted_category,
        'confidence': float(confidence),
        'is_market': predicted_category in classifiers['market_categories']
    }

def main():
    # Load embeddings
    print("Loading and combining embeddings...")
    data_df = combine_batch_embeddings()

    # Train classifiers
    classifiers = train_hierarchical_classifier(data_df)

    # Save classifiers
    save_classifiers(classifiers)

    return classifiers, data_df

def save_classifiers(classifiers, prefix='eliteness'):
    """Save trained classifiers to GCS"""
    storage_client = storage.Client()
    bucket = storage_client.bucket("eliteness")

    # Save locally first
    with open(f'/content/{prefix}_classifiers.pkl', 'wb') as f:
        pickle.dump(classifiers, f)

    # Upload to GCS
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    blob = bucket.blob(f'classifiers/{prefix}_classifier_{timestamp}.pkl')
    blob.upload_from_filename(f'/content/{prefix}_classifiers.pkl')
    print(f"Saved classifiers to GCS")

[DEBUG] Starting main program execution...
[DEBUG] Starting embeddings loading process...
Scanning for embedding files...
Found 123 embedding files

Processing batch 1 of 13
Added 80 embeddings from agriculture_food/keywords
Added 10 embeddings from agriculture_food/orgs
Added 9 embeddings from agriculture_food/titles
Added 71 embeddings from appliances/keywords
Added 29 embeddings from appliances/orgs
Added 25 embeddings from appliances/titles
Added 54 embeddings from architecture/keywords
Added 9 embeddings from architecture/orgs
Added 39 embeddings from architecture/titles
Added 78 embeddings from aviation/keywords

Processing batch 2 of 13
Added 5 embeddings from aviation/orgs
Added 17 embeddings from aviation/titles
Added 48 embeddings from design/keywords
Added 6 embeddings from design/orgs
Added 24 embeddings from design/titles
Added 66 embeddings from education/keywords
Added 85 embeddings from education/orgs
Added 58 embeddings from education/titles
Added 116 embeddings from e




Overall Classification Report:
                             precision    recall  f1-score   support

           agriculture_food       0.68      0.95      0.79        20
                 appliances       0.66      0.76      0.70        25
               architecture       0.91      1.00      0.95        20
               associations       0.95      0.89      0.92       584
                   aviation       0.68      0.85      0.76        20
                     design       0.88      0.94      0.91        16
                  education       0.87      0.93      0.90        42
energy_and_green_transition       0.67      0.83      0.74        29
                         eu       0.00      0.00      0.00         1
               expert_level       1.00      0.67      0.80         3
                    finance       0.75      1.00      0.85        44
                 healthcare       0.74      1.00      0.85        34
                hospitality       0.89      0.96      0.93        26
 

Testing improved version of classifier on data

In [47]:
#nyt data fra google api
from transformers import pipeline, AutoTokenizer, AutoModel
from collections import defaultdict
import spacy
import pandas as pd
import torch
import logging
from google.cloud import storage
import pickle
import numpy as np
from typing import Dict, Tuple, List


def load_classifier():
    """Load trained classifier from GCS"""
    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket("eliteness")

        blobs = list(bucket.list_blobs(prefix='classifiers/'))
        latest_classifier = max(blobs, key=lambda x: x.name)

        latest_classifier.download_to_filename('/content/classifier_20241216_210643.pkl')
        with open('/content/classifier_20241216_210643.pkl', 'rb') as f:
            classifiers = pickle.load(f)

        print(f"Loaded classifier from {latest_classifier.name}")
        return classifiers

    except Exception as e:
        logging.error(f"Error loading classifier: {str(e)}")
        raise

class NewsAnalyzer:
    def __init__(self):
        self.classifiers = load_classifier()
        self.tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
        self.model = AutoModel.from_pretrained("Maltehb/danish-bert-botxo")
        self.nlp = spacy.load('da_core_news_lg')
        self.sentiment_pipeline = pipeline("sentiment-analysis",
                                        model="DGurgurov/xlm-r_danish_sentiment")

    def clean_text(self, text: str) -> str:
        """Clean text by fixing encoding issues and removing truncation markers"""
        if not isinstance(text, str):
            return ""

        # Fix common encoding issues
        replacements = {
            'Ã¸': 'ø',
            'Ã¦': 'æ',
            'Ã¥': 'å',
            'Ã˜': 'Ø',
            'Ã†': 'Æ',
            'Ã…': 'Å'
        }

        for old, new in replacements.items():
            text = text.replace(old, new)

        # Remove truncation marker and everything after it
        if '[+' in text and 'chars]' in text:
            text = text.split('[+')[0].strip()

        return text

    def get_embedding(self, text: str) -> np.ndarray:
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
            embedding = outputs.last_hidden_state[0][1].numpy()
        return embedding

    def classify_text(self, text: str) -> Dict:
        """Classify a piece of text using the trained classifier"""
        try:
            if not text:
                return {'category': 'unknown', 'confidence': 0.0}

            embedding = self.get_embedding(text)
            main_classifier = self.classifiers['main_classifier']
            pred_class = main_classifier.predict([embedding])[0]
            confidence = np.max(main_classifier.predict_proba([embedding])[0])

            return {'category': pred_class, 'confidence': confidence}
        except Exception as e:
            logging.error(f"Error in text classification: {str(e)}")
            return {'category': 'unknown', 'confidence': 0.0}

    def analyze_sentiment(self, text: str) -> Dict:
        """Analyze sentiment of cleaned text"""
        try:
            if not text:
                return {'label': 'L', 'confidence': 0.0}

            cleaned_text = self.clean_text(text)
            result = self.sentiment_pipeline(cleaned_text[:512])[0]

            label_map = {
                'LABEL_1': 'P',
                'LABEL_0': 'N'
            }
            return {
                'label': label_map.get(result['label'], 'L'),
                'confidence': result['score']
            }
        except Exception as e:
            logging.error(f"Error in sentiment analysis: {str(e)}")
            return {'label': 'L', 'confidence': 0.0}

    def get_combined_sentiment(self, sentiments: List[Dict]) -> Dict:
        """Calculate combined sentiment from multiple analyses"""
        valid_sentiments = [s for s in sentiments if s['label'] != 'L']
        if not valid_sentiments:
            return {'label': 'L', 'confidence': 0.0}

        # Weight sentiments by confidence
        weighted_sum = sum(1 if s['label'] == 'P' else -1 * s['confidence']
                         for s in valid_sentiments)
        avg_confidence = sum(s['confidence'] for s in valid_sentiments) / len(valid_sentiments)

        return {
            'label': 'P' if weighted_sum > 0 else 'N',
            'confidence': avg_confidence
        }

    def analyze_article(self, title: str, description: str, content: str) -> Dict:
        """Analyze full article including title, description, and content"""
        # Clean texts
        clean_title = self.clean_text(title)
        clean_desc = self.clean_text(description)
        clean_content = self.clean_text(content)

        # Split content into sentences
        doc = self.nlp(clean_content)
        sentences = [sent.text for sent in doc.sents]

        # Classify each sentence
        category_counts = defaultdict(int)

        # Classify title
        title_class = self.classify_text(clean_title)
        category_counts[title_class['category']] += 1

        # Classify each sentence
        for sentence in sentences:
            classification = self.classify_text(sentence)
            category_counts[classification['category']] += 1

        # Get primary categories based on counts
        primary_category = max(category_counts.items(), key=lambda x: x[1])

        # Get sentiment for each part
        sentiments = {
            'title': self.analyze_sentiment(clean_title),
            'description': self.analyze_sentiment(clean_desc),
            'content': self.analyze_sentiment(clean_content)
        }

        # Calculate combined sentiment
        combined_sentiment = self.get_combined_sentiment(list(sentiments.values()))
        sentiments['combined'] = combined_sentiment

        return {
            'category_counts': dict(category_counts),
            'primary_category': primary_category[0],
            'category_frequency': primary_category[1],
            'sentiment': sentiments
        }

def analyze_news_dataset(file_path: str) -> Dict:
    analyzer = NewsAnalyzer()
    results = {}

    df = pd.read_csv(file_path)
    print(f"Analyzing {len(df)} articles...")

    for idx, row in df.iterrows():
        print(f"Processing article {idx + 1}/{len(df)}")
        results[idx] = {
            'source': row['source'],
            'title': row['title'],
            'publishedAt': row['publishedAt'],
            'analysis': analyzer.analyze_article(
                row['title'],
                row['description'],
                row['content']
            )
        }

    return results

def display_results(results: Dict):
    for article_id, data in results.items():
        print("\n" + "="*50)
        print(f"Source: {data['source']}")
        print(f"Title: {data['title']}")
        print(f"Date: {data['publishedAt']}")

        analysis = data['analysis']
        print(f"\nPrimary Category: {analysis['primary_category']} "
              f"(frequency: {analysis['category_frequency']})")

        print("\nCategory Distribution:")
        for category, count in sorted(analysis['category_counts'].items(),
                                    key=lambda x: x[1], reverse=True):
            print(f"{category}: {count}")

        print("\nSentiment Analysis:")
        sentiments = analysis['sentiment']
        for part, sent in sentiments.items():
            print(f"{part.capitalize()}: {sent['label']} "
                  f"(confidence: {sent['confidence']:.2f})")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    results = analyze_news_dataset('/content/drive/MyDrive/NewsMarketAnalysis/danish_news_multiple_topics.csv')
    display_results(results)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Category Distribution:
public: 2
private: 1
associations: 1

Sentiment Analysis:
Title: P (confidence: 0.99)
Description: N (confidence: 0.99)
Content: N (confidence: 0.99)
Combined: N (confidence: 0.99)

Source: {'id': None, 'name': 'Politiken.dk'}
Title: Forskels­behandlingen af syge kvinder er enorm: Nu lanceres tre nye ideer
Date: 2024-12-12T07:58:54Z

Primary Category: associations (frequency: 2)

Category Distribution:
associations: 2
private: 1
public: 1

Sentiment Analysis:
Title: P (confidence: 0.58)
Description: N (confidence: 0.99)
Content: N (confidence: 0.99)
Combined: N (confidence: 0.86)

Source: {'id': None, 'name': 'GlobeNewswire'}
Title: CGTN: Nye tider: En ny højde i Macaos økonomiske diversificering
Date: 2024-12-12T05:58:00Z

Primary Category: finance (frequency: 2)

Category Distribution:
finance: 2
private: 1

Sentiment Analysis:
Title: P (confidence: 0.90)
Description: P (confidence: 0.94)
Content:

GPT zeroshot classification vs. eliteness classifier

In [63]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import numpy as np
from google.cloud import storage
import pickle
import logging

def load_and_prepare_data(csv_path):
    """
    Load the CSV file and prepare the data for comparison with debug information
    """
    # Read the CSV with explicit encoding and showing debug info
    print("\nAttempting to load CSV file...")
    try:
        # Try reading with utf-8 encoding
        df = pd.read_csv(csv_path, encoding='utf-8')
    except UnicodeDecodeError:
        # If utf-8 fails, try with latin-1
        print("UTF-8 encoding failed, trying latin-1...")
        df = pd.read_csv(csv_path, encoding='latin-1')

    print("\nDataFrame Info:")
    print(df.info())

    print("\nFirst few rows:")
    print(df.head())

    print("\nColumn names:")
    print(df.columns.tolist())

    # Clean the source column if it contains dictionary-like strings
    if 'source' in df.columns:
        print("\nCleaning source column...")
        df['source'] = df['source'].apply(lambda x: str(x).replace("{'id': None, 'name': '", "").replace("'}", ""))

    # Get category mappings
    category_mappings = get_category_mappings()

    # Check for categories in the data
    print("\nChecking for categories in data:")
    for category in category_mappings.keys():
        if category in df.columns:
            print(f"Found {category} - Sample values: {df[category].value_counts().head()}")
        else:
            print(f"Missing category: {category}")

    # Initialize eliteness columns with None
    for category in category_mappings.keys():
        df[f'{category}_eliteness'] = None

    return df, category_mappings

# ... [rest of the code remains the same] ...

def main(csv_path):
    """
    Main function to run the comparison analysis
    """
    # Load classifiers dictionary
    classifiers_dict = load_classifier()
    print("\nLoaded classifiers:", list(classifiers_dict.keys()))

    # Load and prepare data
    df, category_mappings = load_and_prepare_data(csv_path)

    # Apply classifier predictions
    print("\nApplying classifier predictions...")
    df = apply_classifier_predictions(df, classifiers_dict, category_mappings)

    # Check predictions
    print("\nChecking predictions:")
    for category in category_mappings.keys():
        col_name = f'{category}_eliteness'
        if col_name in df.columns:
            print(f"\n{col_name} predictions:")
            print(df[col_name].value_counts().head())

    # Perform comparison
    results = compare_classifications(df, category_mappings)

    # Print results
    print_results(results)

    return results, df

# Example usage
if __name__ == "__main__":
    csv_path = "/content/drive/MyDrive/NewsData/GPT_zeroshotNewsAnalyzer/Combined_Classified_Danish_News.csv"
    results, df = main(csv_path)

Loaded classifier from classifiers/eliteness_classifier_20241216_210643.pkl

Loaded classifiers: ['main_classifier', 'valid_categories', 'market_categories', 'class_weights']

Attempting to load CSV file...

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   source                615 non-null    object
 1   author                197 non-null    object
 2   title                 615 non-null    object
 3   description           608 non-null    object
 4   url                   615 non-null    object
 5   publishedAt           615 non-null    object
 6   content               615 non-null    object
 7   Markets/Domains       446 non-null    object
 8   Organizational Forms  102 non-null    object
 9   Elite Hierarchy       85 non-null     object
 10  International         23 non-null     object
dtypes: object(11)
me

In [66]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import numpy as np
from google.cloud import storage
import pickle
import logging
import torch
from transformers import AutoTokenizer, AutoModel
import spacy

def load_classifier():
    """Load trained classifier from GCS"""
    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket("eliteness")

        blobs = list(bucket.list_blobs(prefix='classifiers/'))
        latest_classifier = max(blobs, key=lambda x: x.name)

        latest_classifier.download_to_filename('/content/classifier_20241216_210643.pkl')
        with open('/content/classifier_20241216_210643.pkl', 'rb') as f:
            classifiers = pickle.load(f)

        print(f"Loaded classifier from {latest_classifier.name}")
        return classifiers

    except Exception as e:
        logging.error(f"Error loading classifier: {str(e)}")
        raise

def clean_text(text: str) -> str:
    """Clean text by fixing encoding issues and removing truncation markers"""
    if not isinstance(text, str):
        return ""

    # Fix common encoding issues
    replacements = {
        'Ã¸': 'ø',
        'Ã¦': 'æ',
        'Ã¥': 'å',
        'Ã˜': 'Ø',
        'Ã†': 'Æ',
        'Ã…': 'Å'
    }

    for old, new in replacements.items():
        text = text.replace(old, new)

    # Remove truncation marker and everything after it
    if '[+' in text and 'chars]' in text:
        text = text.split('[+')[0].strip()

    return text

def get_embeddings(texts: list, tokenizer, model) -> np.ndarray:
    """Get BERT embeddings for a list of texts"""
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state[0][1].numpy()
        embeddings.append(embedding)
    return np.array(embeddings)

def load_and_prepare_data(csv_path):
    """
    Load the CSV file and prepare the data for comparison
    """
    # Read the CSV
    df = pd.read_csv(csv_path)

    # Clean the source column
    df['source'] = df['source'].apply(lambda x: str(x).replace("{'id': None, 'name': '", "").replace("'}", ""))

    # Clean the content
    df['cleaned_content'] = df['content'].apply(clean_text)

    # Split multiple categories into lists
    for col in ['Markets/Domains', 'Organizational Forms', 'Elite Hierarchy', 'International']:
        if col in df.columns:
            df[col] = df[col].fillna('')
            df[col] = df[col].apply(lambda x: [cat.strip() for cat in str(x).split(',')] if pd.notna(x) and x != '' else [])

    print("\nSample of prepared data:")
    print(df[['title', 'Markets/Domains', 'Organizational Forms', 'Elite Hierarchy', 'International']].head())

    return df

def apply_classifier_predictions(df, classifiers_dict):
    """
    Apply the main classifier to the content
    """
    # Initialize BERT model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
    model = AutoModel.from_pretrained("Maltehb/danish-bert-botxo")

    # Get the main classifier and valid categories
    main_classifier = classifiers_dict['main_classifier']
    valid_categories = classifiers_dict['valid_categories']
    market_categories = classifiers_dict['market_categories']

    print("\nValid categories:", valid_categories)
    print("\nMarket categories:", market_categories)

    try:
        # Get embeddings for all texts
        print("\nGenerating BERT embeddings...")
        embeddings = get_embeddings(df['cleaned_content'].tolist(), tokenizer, model)
        print("Embeddings shape:", embeddings.shape)

        # Make predictions
        predictions = main_classifier.predict(embeddings)
        probabilities = main_classifier.predict_proba(embeddings)
        print("\nPrediction shape:", predictions.shape)
        print("Sample predictions:", predictions[:5])

        # Initialize eliteness columns
        df['Markets/Domains_eliteness'] = None
        df['Organizational Forms_eliteness'] = None
        df['Elite Hierarchy_eliteness'] = None
        df['International_eliteness'] = None

        # Map each prediction to the appropriate category column
        for i, pred in enumerate(predictions):
            if pred in market_categories:
                df.loc[i, 'Markets/Domains_eliteness'] = pred
            elif pred in ['associations', 'private', 'public']:
                df.loc[i, 'Organizational Forms_eliteness'] = pred
            elif pred in ['expert_level', 'senior_level', 'top_level']:
                df.loc[i, 'Elite Hierarchy_eliteness'] = pred
            elif pred in ['eu', 'nordic', 'regional_danish']:
                df.loc[i, 'International_eliteness'] = pred

        # Print some statistics about the predictions
        print("\nPrediction distribution:")
        for col in ['Markets/Domains_eliteness', 'Organizational Forms_eliteness',
                   'Elite Hierarchy_eliteness', 'International_eliteness']:
            print(f"\n{col}:")
            print(df[col].value_counts().head())

    except Exception as e:
        print(f"Error during prediction: {str(e)}")
        raise

    return df

def compare_classifications(df):
    """
    Compare classifications between the eliteness classifier and ChatGPT
    """
    categories = ['Markets/Domains', 'Organizational Forms', 'Elite Hierarchy', 'International']
    comparison_results = {}

    for category in categories:
        print(f"\nComparing {category}...")

        # Get predictions and actual values
        pred_col = f'{category}_eliteness'

        # Convert list predictions back to comma-separated strings for comparison
        gpt_values = df[category].apply(lambda x: ','.join(sorted(x)) if isinstance(x, list) else '')
        pred_values = df[pred_col].fillna('')

        # Calculate metrics
        valid_mask = (gpt_values != '') & (pred_values != '')
        valid_gpt = gpt_values[valid_mask]
        valid_pred = pred_values[valid_mask]

        if len(valid_gpt) > 0:
            agreement = (valid_gpt == valid_pred).mean() * 100
            try:
                kappa = cohen_kappa_score(valid_gpt, valid_pred)
            except:
                kappa = np.nan

            comparison_results[category] = {
                'total_samples': len(df),
                'valid_comparisons': len(valid_gpt),
                'agreement': agreement,
                'kappa': kappa,
                'sample_comparisons': pd.DataFrame({
                    'Title': df.loc[valid_mask, 'title'].head(),
                    'GPT': valid_gpt.head(),
                    'Classifier': valid_pred.head()
                })
            }

            print(f"\nSample comparisons for {category}:")
            print(comparison_results[category]['sample_comparisons'])

    return comparison_results

def print_results(comparison_results):
    """
    Print the comparison results in a formatted way
    """
    print("\nClassification Comparison Results:")
    print("=" * 50)

    for category, results in comparison_results.items():
        print(f"\nCategory: {category}")
        print("-" * 30)
        print(f"Total samples: {results['total_samples']}")
        print(f"Valid comparisons: {results['valid_comparisons']}")
        print(f"Agreement: {results['agreement']:.2f}%")
        print(f"Cohen's Kappa: {results['kappa']:.3f}")
        print("\nSample Comparisons:")
        print(results['sample_comparisons'])

def main(csv_path):
    """
    Main function to run the comparison analysis
    """
    # Load classifier
    classifiers_dict = load_classifier()
    print("\nClassifier components:", list(classifiers_dict.keys()))

    # Load and prepare data
    df = load_and_prepare_data(csv_path)

    # Apply classifier predictions
    df = apply_classifier_predictions(df, classifiers_dict)

    # Perform comparison
    results = compare_classifications(df)

    # Print results
    print_results(results)

    return results, df

# Example usage
if __name__ == "__main__":
    csv_path = "/content/drive/MyDrive/NewsData/GPT_zeroshotNewsAnalyzer/Combined_Classified_Danish_News.csv"
    results, df = main(csv_path)

Loaded classifier from classifiers/eliteness_classifier_20241216_210643.pkl

Classifier components: ['main_classifier', 'valid_categories', 'market_categories', 'class_weights']

Sample of prepared data:
                                               title  \
0  Dansk Ride Forbund skal på jagt efter ny direktør   
1  Nørgaard: Danmarks nye kampskrift mod tech-gig...   
2  Fordobling i antal idrætsbørnehaver: 'Børn er ...   
3  Rusland sender igen missiler og droner mod Ukr...   
4  Ukraine melder igen om stort angreb på energif...   

                 Markets/Domains Organizational Forms Elite Hierarchy  \
0                             []                   []  [senior_level]   
1               [tech, politics]             [public]              []   
2           [welfare, education]                   []  [expert_level]   
3  [energy_and_green_transition]                   []              []   
4  [energy_and_green_transition]                   []              []   

  International  
0 

In [72]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import numpy as np
from google.cloud import storage
import pickle
import logging
import torch
from transformers import AutoTokenizer, AutoModel
import spacy

def load_classifier():
    """Load trained classifier from GCS"""
    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket("eliteness")

        blobs = list(bucket.list_blobs(prefix='classifiers/'))
        latest_classifier = max(blobs, key=lambda x: x.name)

        latest_classifier.download_to_filename('/content/classifier_20241216_210643.pkl')
        with open('/content/classifier_20241216_210643.pkl', 'rb') as f:
            classifiers = pickle.load(f)

        print(f"Loaded classifier from {latest_classifier.name}")
        return classifiers

    except Exception as e:
        logging.error(f"Error loading classifier: {str(e)}")
        raise

def clean_text(text: str) -> str:
    """Clean text by fixing encoding issues and removing truncation markers"""
    if not isinstance(text, str):
        return ""

    # Fix common encoding issues
    replacements = {
        'Ã¸': 'ø',
        'Ã¦': 'æ',
        'Ã¥': 'å',
        'Ã˜': 'Ø',
        'Ã†': 'Æ',
        'Ã…': 'Å'
    }

    for old, new in replacements.items():
        text = text.replace(old, new)

    # Remove truncation marker and everything after it
    if '[+' in text and 'chars]' in text:
        text = text.split('[+')[0].strip()

    return text

def get_embeddings(texts: list, tokenizer, model) -> np.ndarray:
    """Get BERT embeddings for a list of texts"""
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state[0][1].numpy()
        embeddings.append(embedding)
    return np.array(embeddings)

def load_and_prepare_data(csv_path):
    """
    Load the CSV file and prepare the data for comparison
    """
    # Read the CSV
    df = pd.read_csv(csv_path)

    # Clean the source column
    df['source'] = df['source'].apply(lambda x: str(x).replace("{'id': None, 'name': '", "").replace("'}", ""))

    # Clean the content
    df['cleaned_content'] = df['content'].apply(clean_text)

    # Split multiple categories into lists
    for col in ['Markets/Domains', 'Organizational Forms', 'Elite Hierarchy', 'International']:
        if col in df.columns:
            df[col] = df[col].fillna('')
            df[col] = df[col].apply(lambda x: [cat.strip() for cat in str(x).split(',')] if pd.notna(x) and x != '' else [])

    print("\nSample of prepared data:")
    print(df[['title', 'Markets/Domains', 'Organizational Forms', 'Elite Hierarchy', 'International']].head())

    return df

def apply_classifier_predictions(df, classifiers_dict):
    """
    Apply the main classifier to the content
    """
    # Initialize BERT model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
    model = AutoModel.from_pretrained("Maltehb/danish-bert-botxo")

    # Get the main classifier and valid categories
    main_classifier = classifiers_dict['main_classifier']
    valid_categories = classifiers_dict['valid_categories']
    market_categories = classifiers_dict['market_categories']

    print("\nValid categories:", valid_categories)
    print("\nMarket categories:", market_categories)

    try:
        # Get embeddings for all texts
        print("\nGenerating BERT embeddings...")
        embeddings = get_embeddings(df['cleaned_content'].tolist(), tokenizer, model)
        print("Embeddings shape:", embeddings.shape)

        # Make predictions
        predictions = main_classifier.predict(embeddings)
        probabilities = main_classifier.predict_proba(embeddings)
        print("\nPrediction shape:", predictions.shape)
        print("Sample predictions:", predictions[:5])

        # Initialize eliteness columns
        df['Markets/Domains_eliteness'] = None
        df['Organizational Forms_eliteness'] = None
        df['Elite Hierarchy_eliteness'] = None
        df['International_eliteness'] = None

        # Map each prediction to the appropriate category column
        for i, pred in enumerate(predictions):
            if pred in market_categories:
                df.loc[i, 'Markets/Domains_eliteness'] = pred
            elif pred in ['associations', 'private', 'public']:
                df.loc[i, 'Organizational Forms_eliteness'] = pred
            elif pred in ['expert_level', 'senior_level', 'top_level']:
                df.loc[i, 'Elite Hierarchy_eliteness'] = pred
            elif pred in ['eu', 'nordic', 'regional_danish']:
                df.loc[i, 'International_eliteness'] = pred

        # Print some statistics about the predictions
        print("\nPrediction distribution:")
        for col in ['Markets/Domains_eliteness', 'Organizational Forms_eliteness',
                   'Elite Hierarchy_eliteness', 'International_eliteness']:
            print(f"\n{col}:")
            print(df[col].value_counts().head())

    except Exception as e:
        print(f"Error during prediction: {str(e)}")
        raise

    return df

def compare_classifications(df):
    """
    Compare classifications between the eliteness classifier and ChatGPT
    """
    categories = ['Markets/Domains', 'Organizational Forms', 'Elite Hierarchy', 'International']
    comparison_results = {}

    for category in categories:
        print(f"\nComparing {category}...")

        # Get predictions and actual values
        pred_col = f'{category}_eliteness'

        # Convert list predictions back to comma-separated strings for comparison
        gpt_values = df[category].apply(lambda x: ','.join(sorted(x)) if isinstance(x, list) else '')
        pred_values = df[pred_col].fillna('')

        # Calculate metrics
        valid_mask = (gpt_values != '') & (pred_values != '')
        valid_gpt = gpt_values[valid_mask]
        valid_pred = pred_values[valid_mask]

        if len(valid_gpt) > 0:
            agreement = (valid_gpt == valid_pred).mean() * 100
            try:
                kappa = cohen_kappa_score(valid_gpt, valid_pred)
            except:
                kappa = np.nan

            comparison_results[category] = {
                'total_samples': len(df),
                'valid_comparisons': len(valid_gpt),
                'agreement': agreement,
                'kappa': kappa,
                'sample_comparisons': pd.DataFrame({
                    'Title': df.loc[valid_mask, 'title'].head(),
                    'GPT': valid_gpt.head(),
                    'Classifier': valid_pred.head()
                })
            }

            print(f"\nSample comparisons for {category}:")
            print(comparison_results[category]['sample_comparisons'])

    return comparison_results

def print_results(comparison_results):
    """
    Print the comparison results in a formatted way
    """
    print("\nClassification Comparison Results:")
    print("=" * 50)

    for category, results in comparison_results.items():
        print(f"\nCategory: {category}")
        print("-" * 30)
        print(f"Total samples: {results['total_samples']}")
        print(f"Valid comparisons: {results['valid_comparisons']}")
        print(f"Agreement: {results['agreement']:.2f}%")
        print(f"Cohen's Kappa: {results['kappa']:.3f}")
        print("\nSample Comparisons:")
        print(results['sample_comparisons'])

def analyze_classification_distribution(df):
    """
    Analyze the distribution of classifications between GPT and the classifier
    """
    categories = ['Markets/Domains', 'Organizational Forms', 'Elite Hierarchy', 'International']
    distribution_stats = {}

    for category in categories:
        # Get GPT and classifier values
        gpt_col = category
        classifier_col = f'{category}_eliteness'

        # Convert list predictions back to strings for comparison
        gpt_values = df[gpt_col].apply(lambda x: ','.join(x) if isinstance(x, list) and len(x) > 0 else '')
        classifier_values = df[classifier_col].fillna('')

        # Count different scenarios
        total = len(df)
        gpt_classified = (gpt_values != '').sum()
        classifier_classified = (classifier_values != '').sum()
        both_classified = ((gpt_values != '') & (classifier_values != '')).sum()
        neither_classified = ((gpt_values == '') & (classifier_values == '')).sum()
        only_gpt = ((gpt_values != '') & (classifier_values == '')).sum()
        only_classifier = ((gpt_values == '') & (classifier_values != '')).sum()

        # Calculate percentages
        distribution_stats[category] = {
            'total_articles': total,
            'articles_with_gpt_classification': {
                'count': gpt_classified,
                'percentage': (gpt_classified/total)*100
            },
            'articles_with_classifier_classification': {
                'count': classifier_classified,
                'percentage': (classifier_classified/total)*100
            },
            'articles_with_both_classifications': {
                'count': both_classified,
                'percentage': (both_classified/total)*100
            },
            'articles_with_neither_classification': {
                'count': neither_classified,
                'percentage': (neither_classified/total)*100
            },
            'articles_with_only_gpt': {
                'count': only_gpt,
                'percentage': (only_gpt/total)*100
            },
            'articles_with_only_classifier': {
                'count': only_classifier,
                'percentage': (only_classifier/total)*100
            }
        }

        # For articles where both systems made classifications
        if both_classified > 0:
            matching_predictions = (gpt_values == classifier_values)[
                (gpt_values != '') & (classifier_values != '')
            ].sum()

            distribution_stats[category]['matching_classifications'] = {
                'count': matching_predictions,
                'percentage_of_both': (matching_predictions/both_classified)*100
            }

        # Get sample of articles with missing classifications
        distribution_stats[category]['sample_missing_both'] = df[
            (gpt_values == '') & (classifier_values == '')
        ]['title'].head().tolist()

        distribution_stats[category]['sample_missing_gpt'] = df[
            (gpt_values == '') & (classifier_values != '')
        ]['title'].head().tolist()

        distribution_stats[category]['sample_missing_classifier'] = df[
            (gpt_values != '') & (classifier_values == '')
        ]['title'].head().tolist()

    return distribution_stats

def print_distribution_analysis(stats):
    """
    Print the distribution analysis in a readable format
    """
    print("\nClassification Distribution Analysis")
    print("=" * 50)

    for category, data in stats.items():
        print(f"\nCategory: {category}")
        print("-" * 30)
        print(f"Total articles: {data['total_articles']}")

        print("\nClassification coverage:")
        print(f"GPT classifications: {data['articles_with_gpt_classification']['count']} "
              f"({data['articles_with_gpt_classification']['percentage']:.1f}%)")
        print(f"Classifier classifications: {data['articles_with_classifier_classification']['count']} "
              f"({data['articles_with_classifier_classification']['percentage']:.1f}%)")
        print(f"Both systems: {data['articles_with_both_classifications']['count']} "
              f"({data['articles_with_both_classifications']['percentage']:.1f}%)")
        print(f"Neither system: {data['articles_with_neither_classification']['count']} "
              f"({data['articles_with_neither_classification']['percentage']:.1f}%)")
        print(f"Only GPT: {data['articles_with_only_gpt']['count']} "
              f"({data['articles_with_only_gpt']['percentage']:.1f}%)")
        print(f"Only Classifier: {data['articles_with_only_classifier']['count']} "
              f"({data['articles_with_only_classifier']['percentage']:.1f}%)")

        if 'matching_classifications' in data:
            print(f"\nMatching classifications: {data['matching_classifications']['count']} "
                  f"({data['matching_classifications']['percentage_of_both']:.1f}% of articles with both)")

        print("\nSample articles missing both classifications:")
        for title in data['sample_missing_both'][:3]:
            print(f"- {title}")

        print("\nSample articles missing GPT classification:")
        for title in data['sample_missing_gpt'][:3]:
            print(f"- {title}")

        print("\nSample articles missing classifier classification:")
        for title in data['sample_missing_classifier'][:3]:
            print(f"- {title}")



def main(csv_path):
    """
    Main function to run the comparison analysis
    """
    # Load classifier
    classifiers_dict = load_classifier()
    print("\nClassifier components:", list(classifiers_dict.keys()))

    # Load and prepare data
    df = load_and_prepare_data(csv_path)

    # Apply classifier predictions
    df = apply_classifier_predictions(df, classifiers_dict)

    # Perform comparison
    results = compare_classifications(df)

    # Print results
    print_results(results)

    print("\nAnalyzing classification distribution...")
    distribution_stats = analyze_classification_distribution(df)
    print_distribution_analysis(distribution_stats)

    #return results, df
    return results, df, distribution_stats
# Example usage
if __name__ == "__main__":
    csv_path = "/content/drive/MyDrive/NewsData/GPT_zeroshotNewsAnalyzer/Combined_Classified_Danish_News.csv"
    results, df, distribution_stats = main(csv_path)

Loaded classifier from classifiers/eliteness_classifier_20241216_210643.pkl

Classifier components: ['main_classifier', 'valid_categories', 'market_categories', 'class_weights']

Sample of prepared data:
                                               title  \
0  Dansk Ride Forbund skal på jagt efter ny direktør   
1  Nørgaard: Danmarks nye kampskrift mod tech-gig...   
2  Fordobling i antal idrætsbørnehaver: 'Børn er ...   
3  Rusland sender igen missiler og droner mod Ukr...   
4  Ukraine melder igen om stort angreb på energif...   

                 Markets/Domains Organizational Forms Elite Hierarchy  \
0                             []                   []  [senior_level]   
1               [tech, politics]             [public]              []   
2           [welfare, education]                   []  [expert_level]   
3  [energy_and_green_transition]                   []              []   
4  [energy_and_green_transition]                   []              []   

  International  
0 