# Create a name dataset
This dataset is created by normalising and merging two datasets:
[US Baby Names](https://www.kaggle.com/datasets/kaggle/us-baby-names) and [French Baby Names](https://www.kaggle.com/datasets/haezer/french-baby-names)

In [2]:
import pandas as pd
import unicodedata 



In [4]:
# Load the French baby-names CSV into a DataFrame.
df_fr = pd.read_csv("french/national_names.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df_fr

Unnamed: 0,year,name,sex,count
0,1900,Abeline,F,3
1,1900,Abelle,F,3
2,1900,Ada,F,4
3,1900,Adelaide,F,194
4,1900,Adèle,F,661
...,...,...,...,...
601218,2018,Zoumana,M,8
601219,2018,Zyad,M,71
601220,2018,Zyan,M,17
601221,2018,Zyane,M,8


In [5]:
# Number of distinct values in the French `name` column.
len(df_fr["name"].unique())

31708

In [6]:
# Load the US baby-names CSV into a DataFrame.
df_us = pd.read_csv("us/NationalNames.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df_us

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
...,...,...,...,...,...
1825428,1825429,Zykeem,2014,M,5
1825429,1825430,Zymeer,2014,M,5
1825430,1825431,Zymiere,2014,M,5
1825431,1825432,Zyran,2014,M,5


In [7]:
# Number of distinct values in the US `Name` column.
len(df_us["Name"].unique())

93889

In [8]:
# to merge the names from both datasets
# first normalise the french names
# then select only the unique names from each dataset
# finally merge them, again selecting only the unique names
# save the resulting dataset as "names.txt"

In [9]:
# normalise french names
def strip_accents(string):
    """Return ``string`` with its combining marks (accents) removed.

    The text is first decomposed with Unicode NFD normalisation, so an
    accented letter becomes a base letter followed by combining marks;
    the combining marks are then dropped and the rest is re-joined.
    Characters without a canonical decomposition (for example ``æ``) are
    returned unchanged.
    """
    decomposed = unicodedata.normalize("NFD", string)
    # unicodedata.combining() returns 0 for non-combining characters.
    base_chars = [char for char in decomposed if unicodedata.combining(char) == 0]
    return "".join(base_chars)

# Distinct values of the French `name` column, in order of first appearance.
french_unique_names = df_fr["name"].unique()

In [17]:
# Lower-case each unique French name, then strip its accents.
french_names_normalised = [
    strip_accents(lowered)
    for lowered in (raw_name.lower() for raw_name in french_unique_names)
]

In [34]:
# Inventory of every character that occurs in the normalised French names.
{char for name in french_names_normalised for char in name}

{"'",
 '-',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'æ'}

In [37]:
# Keep only the names made up entirely of the letters a-z.
# Checking against the allowed range (rather than listing the offending
# characters found in one particular run: '-', "'" and 'æ') means any other
# unexpected character in the source data is filtered out as well.
french_names_filtered = [
    name
    for name in french_names_normalised
    if all('a' <= char <= 'z' for char in name)
]
print(f'Total: {len(french_names_normalised)} -> AFter filtering: {len(french_names_filtered)}')

Total: 31708 -> AFter filtering: 29269


In [44]:
# Lower-case every distinct value of the US `Name` column.
us_names = [us_name.lower() for us_name in df_us["Name"].unique()]

In [46]:
# Inventory of every character that occurs in the lower-cased US names.
{char for name in us_names for char in name}

{'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [47]:
# Report how many names each source contributes before merging.
print(
    f'french names size:{len(french_names_filtered)}',
    f'us names size:{len(us_names)}',
    sep='\n',
)

french names size:29269
us names size:93889


In [49]:
# Merge both name lists into one set so duplicates collapse.
full_list_names = set(french_names_filtered).union(us_names)
print(f'Total unique names: {len(full_list_names)}')

Total unique names: 107422


In [50]:
# Save the merged names to 'names.txt', one name per line.
# The names are written in sorted order: iterating a set of strings directly
# yields an order that can differ between interpreter runs, which would make
# the output file non-reproducible.
# The encoding is given explicitly so the file does not depend on the
# platform's default encoding.
with open(r'./names.txt', 'w', encoding='utf-8') as fp:
    for item in sorted(full_list_names):
        fp.write("%s\n" % item)