In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import codecs, json
import unicodedata
# pip install Unidecode  <OR> conda install Unidecode
import unidecode

In [3]:
# Load every exported table. All files share the same layout: ';' as the
# separator and the column names on the first row.
csv_options = {'sep': ';', 'header': 0}
domarar = pd.read_csv('csv/blak-domarar.csv', **csv_options)
einstaklingar = pd.read_csv('csv/blak-einstaklingar.csv', **csv_options)
forsvarsmenn = pd.read_csv('csv/blak-forsvarsmenn.csv', **csv_options)
lid = pd.read_csv('csv/blak-lid.csv', **csv_options)
lidimoti = pd.read_csv('csv/blak-lidimoti.csv', **csv_options)
lidsmenn = pd.read_csv('csv/blak-lidsmenn.csv', **csv_options)
lidsstjorar = pd.read_csv('csv/blak-lidsstjorar.csv', **csv_options)
thjalfarar = pd.read_csv('csv/blak-thjalfarar.csv', **csv_options)
mot = pd.read_csv('csv/blak-mot.csv', **csv_options)

# Virtual teams are not wanted. They are recognised by a filled-in SyndarlidID
# rather than by the SyndarLid flag, because that flag is not trusted to have
# been entered consistently as 0/1. Once the rows are filtered out, both
# columns are dropped.
has_no_syndarlid_id = lid['SyndarlidID'].isna()
lid = lid.loc[has_no_syndarlid_id].drop(columns=['SyndarLid', 'SyndarlidID'])

# Rows of einstaklingar that share name + birthday + gender with another row.
same_name_birthday_gender = einstaklingar.duplicated(subset=['Nafn', 'Fdagur', 'Kyn'], keep=False)
duplicated_einstaklingar = einstaklingar.loc[same_name_birthday_gender]
# Rows of einstaklingar that share birthday + gender with another row
# (the name is allowed to differ).
same_birthday_gender = einstaklingar.duplicated(subset=['Fdagur', 'Kyn'], keep=False)
duplicated_fdagur_kyn_einstaklingar = einstaklingar.loc[same_birthday_gender]


In [4]:
# Index every row that shares birthday + gender with another row as
#   normalised first name -> birth date -> [row value arrays]
# so that entries with the same first name and birthday end up side by side.
duplicate_dict = defaultdict(dict)
for index, row in duplicated_fdagur_kyn_einstaklingar.iterrows():
    # Name key: first whitespace-separated part of the full name, lowercased,
    # with Icelandic letters transliterated to plain English letters.
    name_key = unidecode.unidecode(row['Nafn'].split()[0].lower())
    # Birthday key: the part before the first whitespace, i.e. the date
    # without the time-of-day portion.
    birth_date = row['Fdagur'].split()[0]

    # The defaultdict creates the per-name dict on first access and setdefault
    # creates the per-birthday list on first use; the row is then appended.
    duplicate_dict[name_key].setdefault(birth_date, []).append(row.values)
        

In [27]:
# Keep only the (first name, birthday) groups of duplicate_dict that contain
# more than one row; a group with a single row is not a duplicate.
# Result layout is the same as duplicate_dict:
#   first name -> birthday -> [row value arrays]
dict_removed_single_entries = defaultdict(dict)
for key, value in duplicate_dict.items():
    for dict_key, dict_value in value.items():
        if len(dict_value) > 1:
            # Assign per birthday key. Replacing the whole per-name dict here
            # would discard earlier duplicated birthdays of the same first
            # name and keep only the last one found.
            dict_removed_single_entries[key][dict_key] = dict_value

In [30]:
#dict_removed_single_entries

In [31]:
# Checking if two names are the same person:
# collect the team-membership rows (lidsmenn) of every person found in the
# duplicated-birthday set, keyed by EinstID, so the suspected duplicates can be
# compared by the teams they played for.
#
# The ids are materialised into a set first because `x in series` tests the
# Series *index labels*, not its values; testing against the column directly
# would match row positions/labels instead of EinstID values.
duplicated_einst_ids = set(duplicated_fdagur_kyn_einstaklingar["EinstID"])

dict_duplicate_compare_team_members = defaultdict(dict)
for index, row in lidsmenn.iterrows():
    ids = row["EinstID"]

    if ids in duplicated_einst_ids:
        # now we only view ids that exist for duplicated people;
        # setdefault starts the list the first time an id is seen
        dict_duplicate_compare_team_members.setdefault(ids, []).append(row.values)


In [8]:
#dict_duplicate_compare_team_members

In [48]:
# Print every remaining (first name, birthday) group followed by its rows,
# one row per line, with a separator line after each group.
for first_name, rows_by_birthday in dict_removed_single_entries.items():
    for birthday, person_rows in rows_by_birthday.items():
        print("KEY: " + first_name + " BIRTHDAY: " + birthday)
        for person_row in person_rows:
            print(person_row)
        print("--------------")

KEY: adalsteinn BIRTHDAY: 1981-03-03
[1964 'Aðalsteinn Eymundsson' '1981-03-03 00:00:00.000' 'kk ' nan nan nan
 nan nan '8212554' nan nan '2006-01-09 20:31:04.420' nan]
[1437 'Aðalsteinn Eymundsson' '1981-03-03 00:00:00.000' 'kk ' 'HK'
 'adalste@hi.is' nan nan nan '6182554' nan nan '2003-10-11 19:12:00.827'
 0.0]
--------------
KEY: aldis BIRTHDAY: 1996-06-22
[2949 'aldís anna höskuldsdóttir' '1996-06-22 00:00:00.000' 'kvk' 'K.A.'
 'aldis_anna@hotmail.com' nan nan nan '8681980' '4611143' nan
 '2010-02-25 13:52:40.360' 169.0]
[2950 'aldís anna höskuldsdóttir' '1996-06-22 00:00:00.000' 'kvk' 'K.A.'
 'aldis_anna@hotmail.com' nan nan nan '8681980' '4611143' nan
 '2010-02-25 13:52:42.780' 169.0]
--------------
KEY: alexander BIRTHDAY: 1990-12-05
[1578 'Alexander Stefánsson' '1990-12-05 00:00:00.000' 'kk ' 'HK'
 'alexanderstef@gmail.com' nan nan nan '8464158' '4626693' nan
 '2004-08-20 23:05:27.997' nan]
[2498 'Alexander Stefánsson' '1990-12-05 00:00:00.000' 'kk ' 'HK' nan nan
 nan nan '846 

In [None]:
#dict(duplicated_fdagur_kyn_einstaklingar)

# all duplicates
#duplicate_dict

# all duplicates for lúðvík
#duplicate_dict["ludvik"]

#duplicate_dict["ludvik"]['1969-03-31'][0]
#duplicate_dict["ludvik"]['1969-03-31'][1]

#duplicate_dict
#dict_removed_single_entries

In [21]:
# Plain-dict copy of the top level of duplicate_dict (drops the defaultdict
# behaviour); the per-name inner dicts are shared with duplicate_dict, not copied.
reverted_back_to_dict = {
    first_name: rows_by_birthday
    for first_name, rows_by_birthday in duplicate_dict.items()
}
#reverted_back_to_dict

In [23]:
# Target path for a serialized copy of the duplicate map. Only the commented-out
# experiments below reference it; no live code in the visible notebook uses it.
file_path = "json/einstaklingar_map.txt" ## your path variable
# --- Abandoned serialization experiments, kept for reference only. ---
# NOTE(review): the values stored in duplicate_dict are numpy arrays (row.values),
# which is presumably why the plain json.dump/json.dumps attempts were given up
# on - confirm before reviving any of them.
#duplicate_dict_json = json.dump(duplicate_dict, codecs.open(file_path, 'w', encoding='utf-8'), separators=(';', ':'), sort_keys=True, indent=4) ### this saves the array in .json format
#json_obj = json.dumps(duplicate_dict, indent = 4)
# NOTE(review): NumpyEncoder is not defined anywhere in the visible notebook.
#dumped = json.dumps(duplicate_dict, cls=NumpyEncoder)
#dumped
#pd.DataFrame(reverted_back_to_dict).to_csv(file_path, encoding='utf-8-sig')
#duplicate_dict_json = json.dump(reverted_back_to_dict, codecs.open(file_path, 'w', encoding='utf-8-sig'))

# NOTE(review): if re-enabled, the next line would rebind the name `json` to a
# string and shadow the imported json module for the rest of the session.
#json = json.dumps(reverted_back_to_dict)
#f = open(file_path,"w")
#f.write(str(reverted_back_to_dict))
#f.close()

=====================================================================================
=

In [187]:
#FINAL STEP (run after everything is done):

# Duplicated people are put into their own csv to be browsed later.
duplicated_einstaklingar.to_csv("csv/new/duplicated-einstaklingar.csv", encoding='utf-8-sig')
# NOTE(review): this file has a .json extension but DataFrame.to_csv writes CSV
# text into it. Left unchanged because the consumers of the file are not visible
# here - confirm the intended format before renaming or switching to real JSON.
pd.DataFrame(duplicate_dict).to_csv("json/duplicate-map.json", encoding='utf-8-sig')


# Save every table as a new csv inside csv/new, named csv/new/blak-<table>.csv.
# Building the path from the table name keeps all file names consistent
# (forsvarsmenn was previously written with a doubled ".csv.csv" extension).
tables_to_save = [
    ("domarar", domarar),
    ("einstaklingar", einstaklingar),
    ("forsvarsmenn", forsvarsmenn),
    ("lid", lid),
    ("lidimoti", lidimoti),
    ("lidsmenn", lidsmenn),
    ("lidsstjorar", lidsstjorar),
    ("mot", mot),
    ("thjalfarar", thjalfarar),
]
for table_name, table in tables_to_save:
    # utf-8-sig writes a BOM at the start of the file.
    table.to_csv("csv/new/blak-{}.csv".format(table_name), encoding='utf-8-sig')