# Imports

In [1]:
import warnings
import pandas as pd
import numpy as np
from helper_functions.file_io_functions import detect_raw_files, load_processed_data, create_excel_file_from_dict
from helper_functions.cleanup_functions import raw_cleanup
from helper_functions.edges_clusters import find_name_adresse_doubletten
from helper_functions.filter_muster_organisationen import general_exclusion_criteria_personen, find_portal_vs_physisch_doublette, find_email_doubletten
from helper_functions.filter_muster_personen import filter_personen_connected_to_same_organisation, split_groups_mitarbeiter_admnistrator
from helper_functions.analyses_formatting import final_touch_batch, final_touch

# Load IPython's autoreload extension; mode 2 reloads imported modules when they
# have changed, so edits to the helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Silence pandas' chained-assignment warning and every FutureWarning.
pd.set_option("mode.chained_assignment", None)
warnings.simplefilter("ignore", FutureWarning)

# Global numpy print formatting: two decimals, no scientific notation.
np.set_printoptions(suppress=True, precision=2)

## Loading Data & Basic Cleanup

In [14]:
# Check that all required Expertensuche files exist.
# detect_raw_files returns the detected files plus a message; a non-empty message is printed.
raw_files, error_message = detect_raw_files(
    "../GraphViewerApp/data/"
)
if error_message:
    print(error_message)

In [15]:
# Run once: processing for the personen_ and organisationen_ analyses.
# Afterwards the processed data can be loaded from the cell below instead.

# Standard cleanup (takes >5 min).
df_organisationen, df_personen = raw_cleanup(
    raw_files,
    skip_hyperlink_step=True,
)

# Second pass with remove_personen_Sonstiges=False; only the Personen frame is kept.
_, df_personen_inkl_sonstiges = raw_cleanup(
    raw_files,
    remove_personen_Sonstiges=False,
    skip_hyperlink_step=True,
)

Reading excel files and extracting hyperlinks (takes several minutes)...
Basic cleanup Organisationen & Personen...
Aggregating additional Expertensuchen...
Storing dataframes as pickle...
Reading excel files and extracting hyperlinks (takes several minutes)...
Basic cleanup Organisationen & Personen...
Aggregating additional Expertensuchen...
Storing dataframes as pickle...


### Optionally, skip above processing and load cleaned data

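Uncomment the lines below for any dataframes you need: later cells use `df_organisationen` and `df_personen_inkl_sonstiges`, which this cell only defines once their lines are uncommented.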

In [3]:
# Load the previously processed dataframes (load_processed_data called with its default arguments).
data_dfs = load_processed_data() #standard
# Presumably the variant that still contains Personen with Verknuepfungsart "Sonstiges" (going by the file name) - TODO confirm.
# data_dfs_sonstiges = load_processed_data(file_path="data/calculated/personen_organisationen_dfs_processed_with_sonstiges_personen.pickle")

df_personen = data_dfs["personen"]
df_organisationsrollen = data_dfs["organisationsrollen"] # ! It is called organisationsrollen historically, but contains Produkt infos for Personen as well!
# NOTE(review): later cells pass df_organisationen to filter_personen_connected_to_same_organisation;
# uncomment the next line if the raw cleanup cell was skipped.
# df_organisationen = data_dfs["organisationen"]

# NOTE(review): the email doubletten cell reads df_personen_inkl_sonstiges; uncomment the next line
# together with the data_dfs_sonstiges line if the raw cleanup cell was skipped.
# df_personen_inkl_sonstiges = data_dfs_sonstiges["personen"]

# Find and filter Doubletten

In [17]:
# All potential doubletten (same name + address).
# Matching on the full first name would be:
# personen_doubletten = find_name_adresse_doubletten(df_personen, organisationen=False)

# Variant in use: abbreviated first names are used for matching.
personen_doubletten = find_name_adresse_doubletten(
    df_personen,
    organisationen=False,
    abbreviated_first_name=True,
)


In [18]:
# ----- General exclusion criteria -----
# Every flag below is forwarded unchanged to general_exclusion_criteria_personen.

no_Produkte = no_Geschaeftspartner = no_Servicerole = False

# only_mitarbeiter=True would mean: no "Administrator" roles.
only_physisch = only_mitarbeiter = False

personen_doubletten_filtered = general_exclusion_criteria_personen(
    personen_doubletten,
    no_Produkte=no_Produkte,
    no_Geschaeftspartner=no_Geschaeftspartner,
    no_Servicerole=no_Servicerole,
    only_physisch=only_physisch,
    only_mitarbeiter=only_mitarbeiter,
)

## Muster: Doublette Physisch

Alle Doubletten haben Versandart Physisch und sind mit derselben Organisation verknüpft.

Keine Geschäftspartner. Keine Servicerollen. Aber können Produkte haben (Personenrolle und ProduktID werden dann angezeigt).

Separates sheet für folgende Fälle: 2+ Mitarbeiter, 1 Administrator und 1+ Mitarbeiter, oder 2+ Administrator (plus Mitarbeiter).

In [19]:
# Physisch only, no Geschaeftspartner, no Servicerole; Produkte remain allowed.
personen_doubletten_filtered = general_exclusion_criteria_personen(
    personen_doubletten,
    no_Produkte=False,
    no_Geschaeftspartner=True,
    no_Servicerole=True,
    only_physisch=True,
    only_mitarbeiter=False,
)

# Starting point for most analyses: keep only groups whose members are all connected to the same organisation.
doubletten_same_org = filter_personen_connected_to_same_organisation(
    personen_doubletten_filtered,
    df_organisationen,
)

# Split the groups by their Mitarbeiter/Administrator composition.
doublette_physisch_dict = split_groups_mitarbeiter_admnistrator(doubletten_same_org)

# Columns passed to final_touch_batch for the export.
cols_to_keep = [
    "ReferenceID",
    "Name_original",
    "Objekt_link",
    "address_full",
    "Versandart",
    "EMailAdresse",
    "VerknuepftesObjekt",
    "Verknuepfungsart",
    "VerknuepftesObjektID",
    "Produkt_rolle",
    "Produkt_RefID",
    "cluster_id",
    "score_details",
    "score",
    "master",
]
doublette_physisch_dict_formatted = final_touch_batch(
    doublette_physisch_dict,
    cols_to_keep=cols_to_keep,
    alphanumeric=True,
)

create_excel_file_from_dict(
    doublette_physisch_dict_formatted,
    output_file="output/Personendoubletten_physisch.xlsx",
)

## Muster: Doublette Portal

Identisch zu Doublette Physisch, aber mindestens eine der Doubletten hat Versandart Portal.

In [20]:
# Same criteria as the Physisch pattern, except that Versandart is not restricted to Physisch.
personen_doubletten_filtered = general_exclusion_criteria_personen(
    personen_doubletten,
    no_Produkte=False,
    no_Geschaeftspartner=True,
    no_Servicerole=True,
    only_physisch=False,
    only_mitarbeiter=False,
)
doubletten_same_org = filter_personen_connected_to_same_organisation(
    personen_doubletten_filtered,
    df_organisationen,
)

# Keep only clusters in which at least one member has Versandart Portal.
doubletten_same_org = doubletten_same_org.groupby('cluster_id').filter(
    lambda cluster: cluster['Versandart'].eq('Portal').any()
)

# Split the groups by their Mitarbeiter/Administrator composition.
doublette_portal_dict = split_groups_mitarbeiter_admnistrator(doubletten_same_org)

# Columns passed to final_touch_batch for the export.
cols_to_keep = [
    "ReferenceID",
    "Name_original",
    "Objekt_link",
    "address_full",
    "Versandart",
    "EMailAdresse",
    "VerknuepftesObjekt",
    "Verknuepfungsart",
    "VerknuepftesObjektID",
    "Produkt_rolle",
    "Produkt_RefID",
    "cluster_id",
    "score_details",
    "score",
    "master",
]
doublette_portal_dict_formatted = final_touch_batch(
    doublette_portal_dict,
    cols_to_keep=cols_to_keep,
    alphanumeric=True,
)

create_excel_file_from_dict(
    doublette_portal_dict_formatted,
    output_file="output/Personendoubletten_portal.xlsx",
)

## Doubletten Physisch und Portal

Alle Doubletten mit gleichem Namen, gleicher Adresse und gleicher Email (zwei Varianten: exakter Match, oder leere Email erlaubt), unabhängig von Verknüpfungen zu Organisationen.
Mindestens eine Doublette hat Versandart Portal und mindestens eine hat Versandart Physisch.

Können Servicerollen und Produkte haben und bei Geschäftspartnern vorkommen.

In [21]:
# Two variants of the same search: strict_email=True and strict_email=False.
personen_physisch_vs_portal_only_nonempty_email = find_portal_vs_physisch_doublette(
    df_personen,
    strict_email=True,
)
personen_physisch_vs_portal_empty_email = find_portal_vs_physisch_doublette(
    df_personen,
    strict_email=False,
)

# Columns passed to final_touch for the export.
cols_to_keep = [
    "ReferenceID",
    "Name_original",
    "Objekt_link",
    "address_full",
    "Versandart",
    "EMailAdresse",
    "VerknuepftesObjekt",
    "Verknuepfungsart",
    "VerknuepftesObjektID",
    "Produkt_rolle",
    "Produkt_RefID",
    "Geschaeftspartner",
    "Servicerole_string",
    "cluster_id",
    "score_details",
    "score",
    "master",
]

personen_physisch_vs_portal_only_nonempty_email = final_touch(
    personen_physisch_vs_portal_only_nonempty_email, cols_to_keep
)
personen_physisch_vs_portal_empty_email = final_touch(
    personen_physisch_vs_portal_empty_email, cols_to_keep
)

# One workbook, one sheet per variant.
with pd.ExcelWriter('output/Personen_Portal_Vs_Physisch.xlsx', engine='openpyxl') as writer:
    for sheet_name, frame in (
        ('nonempty_email_only', personen_physisch_vs_portal_only_nonempty_email),
        ('empty_email_allowed', personen_physisch_vs_portal_empty_email),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Doubletten Portal - Email

Alle Doubletten mit derselben Email (unabhängig von Name und Adresse), mindestens eine Doublette hat Versandart Portal.

Können Servicerollen und Produkte haben und bei Geschäftspartnern vorkommen.

Erweiterungen:

- Die UVEK-Liste enthält auch Personen mit Verknüpfungsart Sonstiges. Diese sind in separaten Files enthalten.

- Physisch email doubletten


In [22]:
# Email doubletten on the standard Personen frame (portal=True) and on the frame
# that still contains Personen with Verknuepfungsart Sonstiges (portal=True and portal=False).
personen_email_portal = find_email_doubletten(df_personen, portal=True)
# personen_email_physisch = find_email_doubletten(df_personen, portal=False)
personen_email_portal_sonstiges = find_email_doubletten(df_personen_inkl_sonstiges, portal=True)
# Spelled "physisch" consistently, so the raw and the formatted result share one name
# (like the portal variants) instead of leaving a misspelled twin variable behind.
personen_email_physisch_sonstiges = find_email_doubletten(df_personen_inkl_sonstiges, portal=False)

# Columns passed to final_touch for the export.
cols_to_keep = ["ReferenceID", "Name_original", "Objekt_link", "address_full", "Versandart", "EMailAdresse", "VerknuepftesObjekt", "Verknuepfungsart", "VerknuepftesObjektID", "Produkt_rolle", "Produkt_RefID", "Geschaeftspartner", "Servicerole_string", "cluster_id", "score_details", "score", "master"]
personen_email_portal = final_touch(personen_email_portal, cols_to_keep)
personen_email_physisch_sonstiges = final_touch(personen_email_physisch_sonstiges, cols_to_keep)
personen_email_portal_sonstiges = final_touch(personen_email_portal_sonstiges, cols_to_keep)

# One Excel file per result.
personen_email_portal.to_excel("output/Personen_Email_Portal.xlsx", index=False)
personen_email_physisch_sonstiges.to_excel("output/Personen_Email_Physisch_inkl_VerknuepfungsartSonstiges.xlsx", index=False)
personen_email_portal_sonstiges.to_excel("output/Personen_Email_Portal_inkl_VerknuepfungsartSonstiges.xlsx", index=False)

# Playground

Abbreviated first names: Statistics about how much it would change clusters

In [23]:
# Compare clustering with full first names against clustering with abbreviated first names.
# The full-name result gets its own variable: the notebook-wide `personen_doubletten`
# is the input of the Muster cells and must not be overwritten by this playground cell,
# otherwise re-running those cells afterwards would silently use different clusters.
personen_doubletten_exact = find_name_adresse_doubletten(df_personen, organisationen=False)
personen_doubletten_abbrev = find_name_adresse_doubletten(df_personen, organisationen=False, abbreviated_first_name=True)

# Consider any doubletten with same name and address, irrespective of whether they have Produkte, Geschäftspartner, etc.
df1 = general_exclusion_criteria_personen(personen_doubletten_exact, no_Produkte=False, no_Geschaeftspartner=False, no_Servicerole=False, only_physisch=False, only_mitarbeiter=False)
df2 = general_exclusion_criteria_personen(personen_doubletten_abbrev, no_Produkte=False, no_Geschaeftspartner=False, no_Servicerole=False, only_physisch=False, only_mitarbeiter=False)

print(f'{len(df2) - len(df1)} additional Doubletten with same name and address (NOT necessarily to same organisation)')

# df1a / df2a: the same two results restricted to groups connected to the same organisation.
# These names are read by the following cell.
df1a = filter_personen_connected_to_same_organisation(df1, df_organisationen)
df2a = filter_personen_connected_to_same_organisation(df2, df_organisationen)

print(f'{len(df2a) - len(df1a)} additional Doubletten with same name and address AND same organisation')

34 additional Doubletten with same name and address (NOT necessarily to same organisation)
29 additional Doubletten with same name and address AND same organisation


In [24]:
# Distinct values of the 'Name' column without (df1a) and with (df2a) first-name abbreviation.
unique_original = set(df1a['Name'].unique())
unique_abbreviated = set(df2a['Name'].unique())

# Names that occur only in the abbreviated result.
new_unique_names = unique_abbreviated.difference(unique_original)

# Report how many there are and list them.
print("New unique names created by abbreviation:")
print(f"{len(new_unique_names)}: {new_unique_names}")

New unique names created by abbreviation:
29: {'annalise gross', 'erich geissmann', 'sydney weill', 'tom battaglia', 'raphael boullet', 'melvin belli', 'steve christe', 'rémi sebag', 'a. blaser', 'mickäel prince', 'rene schiefer', 'nicole fischli', 'mickaël prince', 'giusi raffa', 'melwin belli', 'maurico ernst', 'remi sebag', 'sidney weill', 'giuseppe raffa', 'walther knecht', 'anneliese gross', 'stève christe', 'e. geissmann', 'walter knecht', 'raphae boullet', 'nicole-fabienne fischli', 'rené schiefer', 'thomas battaglia', 'mauricio ernst'}


In [42]:
# Quick check to see if two Excel workbooks are the same.

# Paths are relative, matching the 'output/...' path this notebook writes to.
# NOTE(review): assumes the working directory is the repository root - confirm.
reference_path = 'output/Personen/Personen_Portal_Vs_Physisch.xlsx'
current_path = 'output/Personen_Portal_Vs_Physisch.xlsx'

# sheet_name=None loads every sheet as {sheet name: DataFrame}. The default reads only
# the first sheet, so differences in any further sheet would go unnoticed.
reference_sheets = pd.read_excel(reference_path, sheet_name=None)
current_sheets = pd.read_excel(current_path, sheet_name=None)

# Identical means: same set of sheet names and equal content in every sheet.
files_identical = reference_sheets.keys() == current_sheets.keys() and all(
    reference_sheets[name].equals(current_sheets[name]) for name in reference_sheets
)

if files_identical:
    print("The files are identical.")
else:
    print("The files are different.")

The files are identical.
