For some analyses not related to Doublettenerkennung


## Setup

In [2]:
import pandas as pd
from helper_functions.file_io_functions import load_processed_data
import probablepeople as pp
import pickle


In [2]:
# Assuming the data has been processed and pickled with other Doublettenanalyse notebooks

data_dfs_sonstiges = load_processed_data(file_path="data/calculated/personen_organisationen_dfs_processed_with_sonstiges_personen.pickle")

df_personen = data_dfs_sonstiges["personen"]

Reminder: Name column should always be FIRSTNAME + LASTNAME, not the other way around

In [5]:
df_personen["Name_original"]

0         Karim Fassatoui
1            Simon Kramis
2          Patrick Leplat
3           Joerg Schwarz
4           Patrick Storz
               ...       
38853     David Deslarzes
38854         Ivan Schmid
38855      Mustafa Tuskan
38856       Mathieu Raoux
38857    Eric Freienstein
Name: Name_original, Length: 38858, dtype: object

Probablepeople library:  produces a lot of false positives. Seems confused when names have more than two words


In [13]:
def check_name_order(name):
    try:
        parsed_name = pp.parse(name)
        print(f"Parsed name: {parsed_name}")  # Debug print
        
        # Check if any part is tagged as GivenName or FirstInitial
        has_given_name = any(tag.startswith('GivenName') or tag == 'FirstInitial' for _, tag in parsed_name)
        
        # Check if any part is tagged as Surname
        has_surname = any(tag == 'Surname' for _, tag in parsed_name)
        
        # If we have both, check their order
        if has_given_name and has_surname:
            given_name_index = next((i for i, (_, tag) in enumerate(parsed_name) if tag.startswith('GivenName') or tag == 'FirstInitial'), -1)
            surname_index = next((i for i, (_, tag) in enumerate(parsed_name) if tag == 'Surname'), -1)
            
            if given_name_index != -1 and surname_index != -1:
                return surname_index < given_name_index  # Incorrect if surname comes before given name
        
        # If we don't have both or can't determine order, assume it's correct
        return False
    except pp.RepeatedLabelError:
        print(f"Warning: Couldn't parse name '{name}'")
        return False


# Apply the function to the first few rows for testing
# for name in df_personen['Name_original'].head(10):
#     result = check_name_order(name)
#     print(f"Name: {name}, Incorrect: {result}")
    
    
df_personen['Name_incorrect'] = df_personen['Name_original'].apply(check_name_order)

# print(df_personen[['Name_original', 'Name_incorrect']].head(10))
print("\nTotal incorrect names:", df_personen['Name_incorrect'].sum())

Parsed name: [('Karim', 'CorporationName'), ('Fassatoui', 'CorporationName')]
Parsed name: [('Simon', 'GivenName'), ('Kramis', 'Surname')]
Parsed name: [('Patrick', 'GivenName'), ('Leplat', 'Surname')]
Parsed name: [('Joerg', 'CorporationName'), ('Schwarz', 'CorporationName')]
Parsed name: [('Patrick', 'GivenName'), ('Storz', 'Surname')]
Parsed name: [('johann', 'GivenName'), ('Frain', 'Surname')]
Parsed name: [('Pedro', 'GivenName'), ('Dias', 'Surname')]
Parsed name: [('Paolo', 'GivenName'), ('Ferrara', 'Surname')]
Parsed name: [('Tobias', 'GivenName'), ('Bichsel', 'Surname')]
Parsed name: [('Valentin', 'GivenName'), ('Wepfer', 'Surname')]
Parsed name: [('Florian', 'GivenName'), ('Vollmer', 'Surname')]
Parsed name: [('Loris', 'GivenName'), ('Bernardini', 'Surname')]
Parsed name: [('Lorenzo', 'GivenName'), ('De', 'Surname'), ('Pietri', 'Surname')]
Parsed name: [('Bruno', 'GivenName'), ('Dietrich', 'Surname')]
Parsed name: [('Fabian', 'GivenName'), ('Schafflützel', 'Surname')]
Parsed na

In [14]:
columns_to_keep = ["ReferenceID", "Name_original", "Objekt_link"]
df_personen[df_personen["Name_incorrect"] == True][columns_to_keep].to_excel("data/calculated/personen_with_incorrect_name_order.xlsx", engine="openpyxl", index=False)

## Spacy

In [3]:
import spacy

# Load the larger multilingual model
nlp = spacy.load("xx_sent_ud_sm")

def check_name_order(name):
    doc = nlp(name)
    tokens = [token for token in doc]
    
    if len(tokens) < 2:
        print(f"Warning: Not enough name parts for '{name}'")
        return False
    
    # Check if the first token is more likely to be a last name
    first_token = tokens[0]
    last_token = tokens[-1]
    
    factors = 0
    
    # 1. Capitalization (all caps more likely for last names)
    if first_token.text.isupper():
        factors += 1
    
    # 2. Length (last names tend to be longer)
    if len(first_token.text) > len(last_token.text):
        factors += 1
    
    # 3. Part-of-speech tag (proper nouns are more likely to be last names)
    if first_token.pos_ == "PROPN" and last_token.pos_ != "PROPN":
        factors += 1
    
    # 4. Check for common first name prefixes
    common_first_name_prefixes = ['van', 'von', 'de', 'du', 'di', 'da']
    if any(name.lower().startswith(prefix) for prefix in common_first_name_prefixes):
        factors -= 1
    
    # 5. Check for common last name suffixes
    common_last_name_suffixes = ['jr', 'sr', 'ii', 'iii', 'iv']
    if any(name.lower().endswith(suffix) for suffix in common_last_name_suffixes):
        factors -= 1
    
    # Consider the name incorrect if more factors suggest the first token is a last name
    return factors >= 2

# First, install the new model:
# !python -m spacy download xx_sent_ud_sm

# Apply the function to the first few rows for testing
for name in df_personen['Name_original'].head(10):
    result = check_name_order(name)
    print(f"Name: {name}, Incorrect: {result}")

# Apply to the whole dataset
df_personen['Name_incorrect'] = df_personen['Name_original'].apply(check_name_order)

print("\nTotal incorrect names:", df_personen['Name_incorrect'].sum())

In [None]:
columns_to_keep = ["ReferenceID", "Name_original", "Objekt_link"]
df_personen[df_personen["Name_incorrect"] == True][columns_to_keep].to_excel("data/calculated/personen_with_incorrect_name_order.xlsx", engine="openpyxl", index=False)

In [8]:
df_personen[df_personen["Name_incorrect"] == True]

Unnamed: 0,ReferenceID,Objekt,Name,Typ,Aktiv,UID_CHID,Zeile2,Zeile2Laenge,Zeile3,Zeile3Laenge,...,Produkt_RefID,Geschaeftspartner,Verknuepfungsart_list,VerknuepftesObjekt_list,VerknuepftesObjektID_list,Geschaeftspartner_list,UID_CHID_check,score,score_details,Name_incorrect
371,07086432-D904-46A1-87B0-F38D3EC51630,PETER BERGKVIST,bergkvist peter,Person,True,,,0,,0,...,[],[],[Mitarbeiter],[Telavox],[65162450-6094-431D-B5C9-E37194D5FE16],[],0,50,Verknuepfungsart 50,True
427,081F5A0C-7419-4F55-A2C3-1B0B0CB2D395,WEISS STEFAN,stefan weiss,Person,True,5485940,,0,,0,...,[],[],[Administrator],[FixTrade GmbH],[0BDB4C3C-A52F-4B4F-B689-4B7E00C5B95E],[],2,390,"Geschaeftsobjekte 60, UID 100, Verknuepfungsar...",True
1668,1D40C0FE-E31B-4EBE-9562-1F6BE72A2CB6,NIBA TERENCE,terence niba,Person,True,,,0,,0,...,[],[],[Mitarbeiter],[Crédit Populaire SA],[1FB55C6D-6EB9-425C-A269-4AF9C5868050],[],0,50,Verknuepfungsart 50,True
1768,1F1627A0-2143-4F70-A2BA-D87B96C8DE12,CARLIER BENEDICTE,benedicte carlier,Person,True,,,0,,0,...,[],[],[],[],[],[],0,30,Geschaeftsobjekte 30,True
2902,3272CFC1-1D6F-4B05-834B-67A53E11C733,IMMER FRANCK,franck immer,Person,True,,,0,,0,...,[],[],[Mitarbeiter],[DIALARME SA],[A1C90A5C-DE9C-476F-A9F4-4704B6951821],[],0,50,Verknuepfungsart 50,True
3014,343FAD0C-3DC8-4D7D-A093-9F788D9F8F04,KREBS GENEVIEVE,genevieve krebs,Person,True,,,0,,0,...,[],[],[Mitarbeiter],[Krebs Geneviève],[26D6D9B9-1B03-40B1-8A21-4262B205D21D],[],0,50,Verknuepfungsart 50,True
3549,3E3C4FF2-7683-477B-948C-14CA02712EE0,Gionata PEDRETTI,pedretti gionata,Person,True,,,0,,0,...,[],[],[Mitarbeiter],[Ordine dei Veterinari Ticinesi],[31C30350-490B-4E9F-BA80-03CBD59C3885],[],0,60,"Verknuepfungsart 50, ObjektZeiger 10",True
3824,4307B927-13F0-488C-BFA9-DDBA419A97EA,BADEL NADINE,nadine badel,Person,True,,,0,,0,...,[],[],[Mitarbeiter],[Coris - N.Badel],[90BF6EB4-6CF5-4348-8FA0-FFBD02B1DEF7],[],0,50,Verknuepfungsart 50,True
4583,4FE692E3-7085-4CAC-A60F-7CA4C628F632,BUGNON CHRISTOPHE,christophe bugnon,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[Administrator],[cogefin SA],[5246DB59-4584-42E8-8F44-C013F522565A],[],1,180,"UID 50, Verknuepfungsart 100, ObjektZeiger 10,...",True
4782,5312d6ec-f4ec-4374-8af7-3df3e7a243ed,SONI RAJKUMAR JAIRAM,rajkumar jairam soni,Person,True,11563752,,0,,0,...,[],[],[],[],[],[],2,210,"UID 100, Versandart 100, Email 10",True


In [6]:
df_personen.head()

Unnamed: 0,ReferenceID,Objekt,Name,Typ,Aktiv,UID_CHID,Zeile2,Zeile2Laenge,Zeile3,Zeile3Laenge,...,Produkt_RefID,Geschaeftspartner,Verknuepfungsart_list,VerknuepftesObjekt_list,VerknuepftesObjektID_list,Geschaeftspartner_list,UID_CHID_check,score,score_details,Name_incorrect
0,0000D9C7-6B29-4B1F-9FA0-3685B47682D3,Fassatoui Karim,karim fassatoui,Person,True,,,0,,0,...,[],[],[Sonstiges],[Fassatoui Karim],[966E1EA1-F258-4F75-A030-350CFAEEBC15],[],0,0,,False
1,0003c120-f044-4df6-84ef-330fb82d911e,Kramis Simon,simon kramis,Person,True,,,0,,0,...,[],[],[],[],[],[],0,120,"Servicerole_string 100, Email 20",False
2,0004ABC1-956E-4EFD-A6B1-EE142BE3EF09,Leplat Patrick,patrick leplat,Person,True,,,0,,0,...,[],[],[Mitarbeiter],[RCI Finance SA],[D604A8C4-A867-44F3-BE02-FE9E8FEEBC6C],[],0,50,Verknuepfungsart 50,False
3,00066A4F-0A0C-48BC-B0BF-30CBC471C1BE,Schwarz Joerg,joerg schwarz,Person,True,,,0,,0,...,[],[],[Mitarbeiter],[Girgin Switzerland AG],[AD4E87D0-A044-4AD8-8AC1-26C94A362812],[],0,50,Verknuepfungsart 50,False
4,0008BCBB-7FCE-4A26-9407-4880934BA14B,Storz Patrick,patrick storz,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[Administrator],[Great Stuff GmbH],[6436C7FD-2BCD-4692-B6DF-0908DA51D4D0],[],1,170,"UID 50, Verknuepfungsart 100, Email 20",False


## Nameparser

In [6]:
import nameparser

def check_name_order(df):
    def parse_name(name):
        parsed_name = nameparser.HumanName(name)
        if parsed_name.first and parsed_name.last:
            if name.startswith(parsed_name.first):
                return False  # Correct order
            else:
                return True  # Incorrect order
        else:
            return True  # Unable to determine

    df['Name_incorrect'] = df['Name_original'].apply(parse_name)
    return df


df_personen = check_name_order(df_personen)

In [7]:
print(df_personen[['Name_original', 'Name_incorrect']].head())

     Name_original  Name_incorrect
0  Karim Fassatoui           False
1     Simon Kramis           False
2   Patrick Leplat           False
3    Joerg Schwarz           False
4    Patrick Storz           False


## Transformers


In [4]:
# Import the necessary module
from transformers import pipeline

# Download and load the pre-trained model and tokenizer
# Note: This may take some time to download if it's the first time
nlp = pipeline("ner", model="xlm-roberta-base", tokenizer="xlm-roberta-base")

def check_name_order(df):
    def parse_name(name):
        inputs = nlp(name)
        first_names = []
        last_names = []
        for input in inputs:
            if input is not None and "entities" in input:
                for entity in input["entities"]:
                    if entity is not None and "label" in entity and entity["label"] == "PER":
                        if entity["start_position"] < len(name) / 2:
                            first_names.append(entity["word"])
                        else:
                            last_names.append(entity["word"])
        if first_names and last_names:
            return False  # Correct order
        return True  # Incorrect order or unable to determine

    df['Name_incorrect'] = df['Name_original'].apply(parse_name)
    return df

# Apply the function to df_personen
df_personen = check_name_order(df_personen)

# Print the first few rows to verify
print(df_personen[['Name_original', 'Name_incorrect']].head())

# Print the total number of incorrect names
print(f"\nTotal incorrect names: {df_personen['Name_incorrect'].sum()}")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


     Name_original  Name_incorrect
0  Karim Fassatoui            True
1     Simon Kramis            True
2   Patrick Leplat            True
3    Joerg Schwarz            True
4    Patrick Storz            True

Total incorrect names: 38858


## GPT - API

In [3]:
import pandas as pd
from openai import OpenAI

api_key = 'INSERT OPENAI API KEY HERE'
client = OpenAI(api_key=api_key)

In [4]:
# maybe improved version
import time
import json

def check_name_order_and_company_batch(names):
    messages = [
    {"role": "system", "content": "You are a helpful assistant. Respond only with the requested format, without any additional text."},
    {"role": "user", "content": f"""For each name in this list {names}, provide three answers separated by commas:
1) The original name as provided, without any changes.
2) Is the name in correct order (first name followed by last name)? Answer 'Yes' or 'No'. Ignore capitalization when making this determination. Consider the name to be in the correct order if the first part appears to be a first name, even if there are two first names or abbreviations.
3) Is it a company name or person name? Answer 'Company' or 'Person'.
Separate each name's results with a semicolon. Do not include any other text in your response."""}
]
    
    max_retries = 3
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=messages,
                model="gpt-4o-2024-08-06",
                temperature=0.7,
            )
            response_text = chat_completion.choices[0].message.content.strip()
            
            # Split the response and filter out any empty results
            results = [result.split(',') for result in response_text.split(';') if result.strip()]
            
            # Ensure each result has exactly three parts
            results = [result for result in results if len(result) == 3]
            
            return results
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"Error occurred: {e}. Retrying in 5 seconds...")
                time.sleep(5)
            else:
                print(f"Max retries reached. Skipping this batch.")
                return []


def process_names(df, batch_size=100, checkpoint_file='checkpoint.json'):
    results = []
    start_index = 0
    total_processed = 0

    # Load checkpoint if exists
    try:
        with open(checkpoint_file, 'r') as f:
            checkpoint = json.load(f)
        start_index = checkpoint['last_processed_index'] + 1
        results = checkpoint['results']
        total_processed = len(results)
        print(f"Resuming from index {start_index}")
    except FileNotFoundError:
        print("Starting from the beginning")

    for i in range(start_index, len(df), batch_size):
        if total_processed >= len(df):
            print("All names have been processed.")
            break

        end_index = min(i + batch_size, len(df))
        batch_names = df['Name_original'][i:end_index].tolist()
        batch_results = check_name_order_and_company_batch(batch_names)
        
        # Only add new results
        new_results = batch_results[:len(df) - total_processed]
        results.extend(new_results)
        total_processed += len(new_results)

        # Save checkpoint
        checkpoint = {
            'last_processed_index': end_index - 1,
            'results': results
        }
        with open(checkpoint_file, 'w') as f:
            json.dump(checkpoint, f)

        print(f"Processed up to index {end_index - 1}")
        time.sleep(1)  # Add a 1-second delay between batches

    return results

# Process the names
results = process_names(df_personen)


# just to be safe, store raw results
with open('output/results_gpt4o_full2.pickle', 'wb') as f:
    pickle.dump(results, f)

# Ensure results match DataFrame length
results = results[:len(df_personen)]


Starting from the beginning
Processed up to index 99
Processed up to index 199
Processed up to index 299
Processed up to index 399
Processed up to index 499
Processed up to index 599
Processed up to index 699
Processed up to index 799
Processed up to index 899
Processed up to index 999
Processed up to index 1099
Processed up to index 1199
Processed up to index 1299
Processed up to index 1399
Processed up to index 1499
Processed up to index 1599
Processed up to index 1699
Processed up to index 1799
Processed up to index 1899
Processed up to index 1999
Processed up to index 2099
Processed up to index 2199
Processed up to index 2299
Processed up to index 2399
Processed up to index 2499
Processed up to index 2599
Processed up to index 2699
Processed up to index 2799
Processed up to index 2899
Processed up to index 2999
Processed up to index 3099
Processed up to index 3199
Processed up to index 3299
Processed up to index 3399
Processed up to index 3499
Processed up to index 3599
Processed u

In [5]:
# Merge results to df_personen

# Create new columns with default values
df_personen['Name_incorrect'] = False
df_personen['Company_name'] = False

# Create a dictionary for faster lookup
name_dict = {row['Name_original'].strip(): i for i, row in df_personen.iterrows()}

# Loop through results
for result in results:
    name, order, entity_type = result
    name = name.strip()
    if name in name_dict:
        index = name_dict[name]
        df_personen.at[index, 'Name_incorrect'] = order.strip().lower() == 'no'
        df_personen.at[index, 'Company_name'] = entity_type.strip().lower() == 'company'

# After the loop, you may want to fill any remaining None values
# df_personen['Name_correct'].fillna(False, inplace=True)
# df_personen['Company_name'].fillna(False, inplace=True)

# Print some statistics to verify the results
print(f"Total names processed: {len(results)}")
print(f"Names found in DataFrame: {df_personen['Name_incorrect'].notnull().sum()}")
print(f"Incorrect names: {(df_personen['Name_incorrect']).sum()}")
print(f"Company names: {df_personen['Company_name'].sum()}")

Total names processed: 38861
Names found in DataFrame: 38864
Incorrect names: 563
Company names: 149


In [6]:
df_personen_incorrect = df_personen[df_personen["Name_incorrect"] == True]
df_personen_sind_organisationen = df_personen[df_personen["Company_name"] == True]

In [7]:
cols_to_keep = ["ReferenceID", "Name_original", "Objekt_link", "address_full", "Versandart", "EMailAdresse", "VerknuepftesObjekt", "Verknuepfungsart", "VerknuepftesObjektID", "Produkt_rolle", "Produkt_RefID", "Geschaeftspartner", "Servicerole_string"]
# df_personen_incorrect[cols_to_keep].to_excel("output/Personen_VorNachnamen_vertauscht.xlsx", engine="openpyxl", index=False)



with pd.ExcelWriter('output/Personen_Name_checks4.xlsx', engine='openpyxl') as writer:
    df_personen_incorrect[cols_to_keep].to_excel(writer, sheet_name='Namen_vertauscht', index=False)
    df_personen_sind_organisationen[cols_to_keep].to_excel(writer, sheet_name='Organisationsnamen', index=False)

In [32]:
results

[['Karim Fassatoui', ' Yes', ' Person'],
 [' Simon Kramis', ' Yes', ' Person'],
 [' Patrick Leplat', ' Yes', ' Person'],
 [' Joerg Schwarz', ' Yes', ' Person'],
 [' Patrick Storz', ' Yes', ' Person'],
 [' johann Frain', ' No', ' Person'],
 [' Pedro Dias', ' Yes', ' Person'],
 [' Paolo Ferrara', ' Yes', ' Person'],
 [' Tobias Bichsel', ' Yes', ' Person'],
 [' Valentin Wepfer', ' Yes', ' Person'],
 [' Florian Vollmer', ' Yes', ' Person'],
 [' Loris Bernardini', ' Yes', ' Person'],
 [' Lorenzo De Pietri', ' Yes', ' Person'],
 [' Bruno Dietrich', ' Yes', ' Person'],
 [' Fabian Schafflützel', ' Yes', ' Person'],
 [' Roger Geser', ' Yes', ' Person'],
 [' KENNETH COSTANTINO', ' No', ' Person'],
 [' Mirco Beerli', ' Yes', ' Person'],
 [' Luca Tagli', ' Yes', ' Person'],
 [' Jürg Rietmann', ' Yes', ' Person'],
 [' Corinne Pfister', ' Yes', ' Person'],
 [' Markus Keller', ' Yes', ' Person'],
 [' Marco Furrer', ' Yes', ' Person'],
 [' Andreas Grüner', ' Yes', ' Person'],
 [' Sabine Schuitemaker',

In [37]:
results

for item in results:
    if item[1].strip().lower() == 'no':
        print(item)

[' johann Frain', ' No', ' Person']
[' KENNETH COSTANTINO', ' No', ' Person']
[' bernd wagner', ' No', ' Person']
[' Santana Fonseca Mathieu Marineide', ' No', ' Person']
[' arlo gonser', ' No', ' Person']
[' Raphael lutz', ' No', ' Person']
[' Patrick AUCHLIN', ' No', ' Person']
[' Pauline Elisabeth Bétrisey Nanchen', ' No', ' Person']
[' dummy doublette', ' No', ' Person']
[' . Buchhaltung', ' No', ' Company']
[' sté rapidofoos sa', ' No', ' Company']
[' manuela Schweighofer', ' No', ' Person']
[' Stockier', ' No', ' Person']
[' keine Person', ' No', ' Person']
[' marie-françoise trottet', ' No', ' Person']
[' pia bieri', ' No', ' Person']
[' Giger', ' No', ' Person']
[' maria-dolores lopez-germano', ' No', ' Person']
[' damien briet', ' No', ' Person']
[' Dixa Friends', ' No', ' Company']
[' . Arc Jurassien Déchets', ' No', ' Company']
[' juan klammer', ' No', ' Person']
[' keine Person', ' No', ' Person']
[' Hosting Hosting', ' No', ' Company']
[' Mike sebaut', ' No', ' Person']
['

In [24]:
df_results[df_results["Name_incorrect"] == True]

Unnamed: 0,Name,Name_order,Entity_type,Name_incorrect,company_name
5,johann Frain,No,Person,True,False
16,KENNETH COSTANTINO,No,Person,True,False
27,bernd wagner,No,Person,True,False
41,Santana Fonseca Mathieu Marineide,No,Person,True,False
63,arlo gonser,No,Person,True,False
...,...,...,...,...,...
38678,- -,No,Person,True,False
38692,- -,No,Person,True,False
38700,jonas buehler,No,Person,True,False
38740,anna munz,No,Person,True,False


In [20]:
# After processing the results and updating the DataFrame
# Save results to pickle
with open('output/Personen_Namen_check.pickle', 'wb') as f:
    pickle.dump(df_personen, f)

# You can also save just the results if you prefer
# with open('output/results.pickle', 'wb') as f:
#     pickle.dump(results, f)

print("Results saved to pickle file.")

# To load the pickle file later:
# with open('output/Personen_Namen_check.pickle', 'rb') as f:
#     loaded_df = pickle.load(f)

Results saved to pickle file.


In [6]:
with open('output/Personen_Namen_check.pickle', 'rb') as f:
    gpt_output = pickle.load(f)

In [7]:
gpt_output

Unnamed: 0,ReferenceID,Objekt,Name,Typ,Aktiv,UID_CHID,Zeile2,Zeile2Laenge,Zeile3,Zeile3Laenge,...,Produkt_rolle,Produkt_RefID,Geschaeftspartner,Verknuepfungsart_list,VerknuepftesObjekt_list,VerknuepftesObjektID_list,Geschaeftspartner_list,UID_CHID_check,score,score_details
0,0000D9C7-6B29-4B1F-9FA0-3685B47682D3,Fassatoui Karim,karim fassatoui,Person,True,,,0,,0,...,[],[],[],[Sonstiges],[Fassatoui Karim],[966E1EA1-F258-4F75-A030-350CFAEEBC15],[],0,0,
1,0003c120-f044-4df6-84ef-330fb82d911e,Kramis Simon,simon kramis,Person,True,,,0,,0,...,[],[],[],[],[],[],[],0,120,"Servicerole_string 100, Email 20"
2,0004ABC1-956E-4EFD-A6B1-EE142BE3EF09,Leplat Patrick,patrick leplat,Person,True,,,0,,0,...,[],[],[],[Mitarbeiter],[RCI Finance SA],[D604A8C4-A867-44F3-BE02-FE9E8FEEBC6C],[],0,50,Verknuepfungsart 50
3,00066A4F-0A0C-48BC-B0BF-30CBC471C1BE,Schwarz Joerg,joerg schwarz,Person,True,,,0,,0,...,[],[],[],[Mitarbeiter],[Girgin Switzerland AG],[AD4E87D0-A044-4AD8-8AC1-26C94A362812],[],0,50,Verknuepfungsart 50
4,0008BCBB-7FCE-4A26-9407-4880934BA14B,Storz Patrick,patrick storz,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[],[Administrator],[Great Stuff GmbH],[6436C7FD-2BCD-4692-B6DF-0908DA51D4D0],[],1,170,"UID 50, Verknuepfungsart 100, Email 20"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38853,ff390915-f6e8-43d6-a007-f61ce66932e0,Deslarzes David,david deslarzes,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[],[],[],[],[],1,80,"UID 50, ObjektZeiger 10, Email 20"
38854,ff82fc37-603c-4b1e-9262-192229d175b2,Schmid Ivan,ivan schmid,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[],[Administrator],[Payrexx AG],[1b81f7e2-6e24-4b65-88f4-15c4cbf580b1],[],1,170,"UID 50, Verknuepfungsart 100, Email 20"
38855,ff85f15d-c67b-4407-9808-a183ca1d87b5,Tuskan Mustafa,mustafa tuskan,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[],[Administrator],[Modissa AG],[46a10715-613b-449b-acba-b01c47fde57f],[],1,170,"UID 50, Verknuepfungsart 100, Email 20"
38856,ff9c0b4e-979f-47bc-9386-a5f27db33af9,Raoux Mathieu,mathieu raoux,Person,True,11100069,,0,,0,...,[],[],[],"[Administrator, Administrator, Administrator, ...","[GVVN - Groupe de vol à voile Neuchâtel, Group...","[31910cb0-818d-42c4-9a82-51009d22072e, CUSTOME...",[],2,810,"Geschaeftsobjekte 180, UID 100, Verknuepfungsa..."


# Singular Personen 
Personen die weder eine Verbindung zu anderen Organisationen, noch irgendwelche Produkte haben

In [3]:
import os
import datetime

# pickle_file_path = "../GraphViewerApp/data/calculated/personen_organisationen_dfs_processed.pickle"
pickle_file_path = "data/calculated/personen_organisationen_dfs_processed.pickle"

# Display the last modified time of the pickle file
last_modified_time = os.path.getmtime(pickle_file_path)
print("Last modified time:", datetime.datetime.fromtimestamp(last_modified_time))

with open(pickle_file_path, "rb") as file:
    dfs = pickle.load(file)
df_personen = dfs["personen"]
df_organisationen = dfs["organisationen"]

Last modified time: 2024-11-15 10:26:20.841578


In [4]:
df_singular_personen = df_personen[
    (df_personen["Produkt_rolle"].apply(lambda x: len(x) == 0)) &
    (df_personen["Verknuepfungsart_list"].apply(lambda x: x == [''])) &
    (df_personen["Geschaeftspartner_list"].apply(lambda x: len(x) == 0)) &
    (df_personen["Servicerole_string"] == "") &
    (df_personen["AnzahlGeschaeftsobjekte"] == 0)
]

df_singular_personen


Unnamed: 0,ReferenceID,Objekt,Name,Typ,Aktiv,UID_CHID,Zeile2,Zeile2Laenge,Zeile3,Zeile3Laenge,...,Produkt_rolle,Produkt_RefID,Geschaeftspartner,Verknuepfungsart_list,VerknuepftesObjekt_list,VerknuepftesObjektID_list,Geschaeftspartner_list,UID_CHID_check,score,score_details
24,0058455e-307a-4021-ac82-57276e017754,Boskovic Rada,rada boskovic,Person,True,9806205,,0,,0,...,[],[],[],[],[],[],[],2,220,"UID 100, Versandart 100, Email 20"
48,00A8F7A9-B8E0-4814-8CF8-22E47F1C5B70,Da Tos Gabriele,gabriele da tos,Person,True,7501556,,0,,0,...,[],[],[],[],[],[],[],2,220,"UID 100, Versandart 100, Email 20"
81,016A0287-9984-4D80-98C1-A768AE05E804,Schmalz Eric,eric schmalz,Person,True,8157583,,0,,0,...,[],[],[],[],[],[],[],2,210,"UID 100, Versandart 100, Email 10"
94,01ABA361-D9D3-49C2-A334-C16E3F22A75C,Egger Joëlle,joëlle egger,Person,True,7875343,,0,,0,...,[],[],[],[],[],[],[],2,220,"UID 100, Versandart 100, Email 20"
164,02f23790-bac9-41f3-9a1d-a0c17b357be7,Baños Francisco,francisco baños,Person,True,10047295,,0,,0,...,[],[],[],[],[],[],[],2,210,"UID 100, Versandart 100, Email 10"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38154,e86958b1-8ec5-4d7b-83e7-4dde36b00677,Beltrami Maurizio,maurizio beltrami,Person,True,8846613,,0,,0,...,[],[],[],[],[],[],[],2,210,"UID 100, Versandart 100, Email 10"
38241,f4cae53e-c592-4278-a5b0-429c6d42fde9,Auger Jean-François,jean-françois auger,Person,True,9380927,,0,,0,...,[],[],[],[],[],[],[],2,210,"UID 100, Versandart 100, Email 10"
38260,f8541ba2-fd50-4dd0-9e6e-3deb4417e681,Berthold Nicolas,nicolas berthold,Person,True,9203129,,0,,0,...,[],[],[],[],[],[],[],2,210,"UID 100, Versandart 100, Email 10"
38292,fc01aff1-5f74-40b9-b941-5988c195fc58,Nannini Elisa,elisa nannini,Person,True,9378881,,0,,0,...,[],[],[],[],[],[],[],2,210,"UID 100, Versandart 100, Email 10"


In [5]:
cols_to_keep=["ReferenceID", "Name_original", "Objekt_link", "UID_CHID", "address_full", "EMailAdresse", "Telefonnummer"]
df_singular_personen[cols_to_keep].to_excel('output/Personen_ohne_Verbindungen_ohne_Produkte.xlsx', index=False)

In [32]:
df_personen[df_personen["Produkt_rolle"].apply(lambda x: len(x) == 0)]

Unnamed: 0,ReferenceID,Objekt,Name,Typ,Aktiv,UID_CHID,Zeile2,Zeile2Laenge,Zeile3,Zeile3Laenge,...,Produkt_rolle,Produkt_RefID,Geschaeftspartner,Verknuepfungsart_list,VerknuepftesObjekt_list,VerknuepftesObjektID_list,Geschaeftspartner_list,UID_CHID_check,score,score_details
0,0003c120-f044-4df6-84ef-330fb82d911e,Kramis Simon,simon kramis,Person,True,,,0,,0,...,[],[],[],[],[],[],[],0,120,"Servicerole_string 100, Email 20"
1,0004ABC1-956E-4EFD-A6B1-EE142BE3EF09,Leplat Patrick,patrick leplat,Person,True,,,0,,0,...,[],[],[],[Mitarbeiter],[RCI Finance SA],[D604A8C4-A867-44F3-BE02-FE9E8FEEBC6C],[],0,50,Verknuepfungsart 50
2,00066A4F-0A0C-48BC-B0BF-30CBC471C1BE,Schwarz Joerg,joerg schwarz,Person,True,,,0,,0,...,[],[],[],[Mitarbeiter],[Girgin Switzerland AG],[AD4E87D0-A044-4AD8-8AC1-26C94A362812],[],0,50,Verknuepfungsart 50
3,0008BCBB-7FCE-4A26-9407-4880934BA14B,Storz Patrick,patrick storz,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[],[Administrator],[Great Stuff GmbH],[6436C7FD-2BCD-4692-B6DF-0908DA51D4D0],[],1,170,"UID 50, Verknuepfungsart 100, Email 20"
4,0009AC82-AFBE-4A44-8C92-D2DB0EE4E736,Frain johann,johann frain,Person,True,,,0,,0,...,[],[],[],[Mitarbeiter],[Frain Johann],[A1422499-91B5-4D4C-9AA2-D66F3486D728],[],0,60,"Verknuepfungsart 50, ObjektZeiger 10"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38121,ff390915-f6e8-43d6-a007-f61ce66932e0,Deslarzes David,david deslarzes,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[],[],[],[],[],1,80,"UID 50, ObjektZeiger 10, Email 20"
38122,ff82fc37-603c-4b1e-9262-192229d175b2,Schmid Ivan,ivan schmid,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[],[Administrator],[Payrexx AG],[1b81f7e2-6e24-4b65-88f4-15c4cbf580b1],[],1,170,"UID 50, Verknuepfungsart 100, Email 20"
38123,ff85f15d-c67b-4407-9808-a183ca1d87b5,Tuskan Mustafa,mustafa tuskan,Person,True,NotRegisteredCHId,,0,,0,...,[],[],[],[Administrator],[Modissa AG],[46a10715-613b-449b-acba-b01c47fde57f],[],1,170,"UID 50, Verknuepfungsart 100, Email 20"
38124,ff9c0b4e-979f-47bc-9386-a5f27db33af9,Raoux Mathieu,mathieu raoux,Person,True,11100069,,0,,0,...,[],[],[],"[Administrator, Administrator, Administrator, ...","[Groupe de vol à voile, GVVN - Groupe de vol à...","[CUSTOMER#619241, 31910cb0-818d-42c4-9a82-5100...",[],2,810,"Geschaeftsobjekte 180, UID 100, Verknuepfungsa..."
