In [1]:
import pandas as pd
import numpy as np

In [2]:
colnames = [
    "geonameid",
    "name",
    "asciiname",
    "alternatenames",
    "latitude",
    "longitude",
    "feature class",
    "feature code",
    "country code",
    "cc2",
    "admin1 code",
    "admin2 code",
    "admin3 code",
    "admin4 code",
    "population",
    "elevation",
    "dem",
    "timezone",
    "modification date"
]

In [3]:
dtypes = {
    "geonameid": np.int32,
    "name": str,
    "asciiname": str,
    "alternatenames": str,
    "latitude": np.float32,
    "longitude": np.float32,
    "feature class": str,
    "feature code": str,
    "country code": str,
    "cc2": str,
    "admin1 code": str,
    "admin2 code": str,
    "admin3 code": str,
    "admin4 code": str,
    "population": np.uint64,
    "elevation": np.float32,
    "dem": str,
    "timezone": str,
    "modification date": str    
}

In [18]:
geonames = pd.read_csv(
    "../../geonames/2021/cities500.txt",
    sep="\t",
    names=colnames,
    dtype=dtypes
)

In [19]:
geonames.shape

(196707, 19)

In [20]:
geonames.head()

Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,3038999,Soldeu,Soldeu,,42.576881,1.66769,P,PPL,AD,,2,,,,602,,1832,Europe/Andorra,2017-11-06
1,3039154,El Tarter,El Tarter,"Ehl Tarter,Эл Тартер",42.579521,1.65362,P,PPL,AD,,2,,,,1052,,1721,Europe/Andorra,2012-11-03
2,3039163,Sant Julià de Lòria,Sant Julia de Loria,"San Julia,San Julià,Sant Julia de Loria,Sant J...",42.463718,1.49129,P,PPLA,AD,,6,,,,8022,,921,Europe/Andorra,2013-11-23
3,3039604,Pas de la Casa,Pas de la Casa,"Pas de la Kasa,Пас де ла Каса",42.54277,1.73361,P,PPL,AD,,3,,,,2363,2050.0,2106,Europe/Andorra,2008-06-09
4,3039678,Ordino,Ordino,"Ordino,ao er di nuo,orudino jiao qu,Ордино,オルデ...",42.556229,1.53319,P,PPLA,AD,,5,,,,3066,,1296,Europe/Andorra,2018-10-26


In [21]:
def to_lower(row, column):
    if not type(row[column]) == str:
        return row[column]
    
    return row[column].lower()

In [22]:
geonames["timezone"] = geonames.apply(to_lower, axis=1, column="timezone")
geonames["name"] = geonames.apply(to_lower, axis=1, column="name")
geonames["asciiname"] = geonames.apply(to_lower, axis=1, column="asciiname")
geonames["alternatenames"] = geonames.apply(to_lower, axis=1, column="alternatenames")

In [23]:
geonames.shape

(196707, 19)

In [24]:
countries_sp = pd.read_csv('../../countries_csv/lista_paises_espanol.csv',
                           sep="\t",
                           names=["Code", "Name", "-"]
                          )
del countries_sp["-"]

In [25]:
countries_en = pd.read_csv('../../countries_csv/lista_paises_ingles.csv',
                           sep="\t",
                           names=["Code", "Name", "-"]
                          )
del countries_en["-"]

In [26]:
countries = pd.concat([countries_sp, countries_en])

In [27]:
countries["Name"] = countries.apply(to_lower, axis=1, column="Name")

In [28]:
countries.drop_duplicates(inplace=True)

In [29]:
countries.head(2)

Unnamed: 0,Code,Name
0,AF,afganistán
1,AL,albania


In [37]:
countries = countries.drop_duplicates(subset="Code")

In [38]:
countries["Code"].nunique()

237

In [39]:
geonames = pd.merge(
    left=geonames,
    right=countries,
    how='inner',
    left_on='country code',
    right_on='Code'
)

In [40]:
geonames.head(2)

Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,...,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date,Code,Name
0,3038999,soldeu,soldeu,,42.576881,1.66769,P,PPL,AD,,...,,,,602,,1832,europe/andorra,2017-11-06,AD,andorra
1,3039154,el tarter,el tarter,"ehl tarter,эл тартер",42.579521,1.65362,P,PPL,AD,,...,,,,1052,,1721,europe/andorra,2012-11-03,AD,andorra


In [41]:
geonames.shape

(195610, 21)

In [42]:
geonames_splitted_names = (
    geonames.set_index(geonames.columns.drop('alternatenames',1).tolist())
    .alternatenames.str.split(',', expand=True)
    .stack()
    .reset_index()
    .rename(columns={0:'alternatenames'})
    .loc[:, ["alternatenames", "latitude", "longitude", "country code", "population", "Name"]]
)

In [43]:
geonames_splitted_names = pd.merge(
    left=geonames_splitted_names,
    right=geonames.loc[:, ["asciiname", "latitude", "longitude", "country code", "population", "Name"]].rename(columns={"asciiname": "alternatenames"}),
    how='outer'
)

geonames_splitted_names = pd.merge(
    left=geonames_splitted_names,
    right=geonames.loc[:, ["name", "latitude", "longitude", "country code", "population", "Name"]].rename(columns={"name": "alternatenames"}),
    how='outer'
)

In [44]:
geonames_splitted_names["alternatenames"] = geonames_splitted_names.apply(to_lower, axis=1, column="alternatenames")

In [45]:
geonames_splitted_names.head()

Unnamed: 0,alternatenames,latitude,longitude,country code,population,Name
0,ehl tarter,42.579521,1.65362,AD,1052,andorra
1,эл тартер,42.579521,1.65362,AD,1052,andorra
2,san julia,42.463718,1.49129,AD,8022,andorra
3,san julià,42.463718,1.49129,AD,8022,andorra
4,sant julia de loria,42.463718,1.49129,AD,8022,andorra


In [46]:
geonames_splitted_names.dropna(subset=["country code"], inplace=True)

geonames_splitted_names.drop_duplicates(
    subset=["alternatenames", "Name"],
    inplace=True
)

geonames_splitted_names.shape

(801420, 6)

In [47]:
geonames_splitted_names.head(2)

Unnamed: 0,alternatenames,latitude,longitude,country code,population,Name
0,ehl tarter,42.579521,1.65362,AD,1052,andorra
1,эл тартер,42.579521,1.65362,AD,1052,andorra


In [48]:
geonames_splitted_names.rename(columns={'Name': 'country', 'alternatenames': 'city_name'}, inplace=True)

In [49]:
geonames_splitted_names.head(2)

Unnamed: 0,city_name,latitude,longitude,country code,population,country
0,ehl tarter,42.579521,1.65362,AD,1052,andorra
1,эл тартер,42.579521,1.65362,AD,1052,andorra


In [50]:
geonames_splitted_names.shape

(801420, 6)

In [53]:
geonames.to_csv("../../../../../unT/ffunes/geonames.csv")
geonames_splitted_names.to_csv('../../../../../unT/ffunes/geonames_parsed.csv')