# Formatting Geo Names
* This worksheet reads the text file we got from `geonames.org`, converts the data in the format we need and saves it to csv for future use. 
* This data is downloaded from https://download.geonames.org/export/dump/


In [25]:
import pandas as pd
import numpy as np
import matplotlib as plt

from pathlib import Path

In [26]:
data_dir = Path("..", "data")
column_headers = ["geonameid",
                  "name",
                  "asciiname",
                  "alternatenames",
                  "latitude",
                  "longitude",
                  "feature class",
                  "feature code",
                  "country code",
                  "cc2",
                  "admin1 code",
                  "admin2 code",
                  "admin3 code",
                  "admin4 code",
                  "population",
                  "elevation",
                  "dem",
                  "timezone",
                  "modification date",
                  ]
tabbed_data = pd.read_csv(Path(data_dir, "IN.txt"), sep="\t", names=column_headers)
tabbed_data.head()

  tabbed_data = pd.read_csv(Path(data_dir, "IN.txt"), sep="\t", names=column_headers)


Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,`1114940,Rāvi River,Ravi River,"Ravi,Ravi River,Rāvi,Rāvi River",30.62123,71.82683,H,STM,IN,"IN,PK",0.0,,,,0,,133,Asia/Kolkata,2023-11-07
1,1114942,Punjab Plains,Punjab Plains,Punjab Plains,30.0,75.0,T,PLN,IN,,0.0,,,,0,,206,Asia/Kolkata,2012-01-16
2,1114957,Jhelum River,Jhelum River,"Jhelum,Jhelum River,River Hydaspes,Veth River,...",31.16853,72.15066,H,STM,IN,PK,0.0,,,,0,,147,Asia/Kolkata,2020-11-11
3,1114958,Hindustan,Hindustan,"Hindustan,Hindustán",28.0,76.0,L,RGN,IN,"IN,PK",0.0,,,,0,,344,Asia/Kolkata,2006-06-23
4,1114965,Basantar River,Basantar River,"Basantar,Basantar Nala,Basantar Nāla,Basantar ...",32.47452,75.01449,H,STM,IN,,0.0,,,,0,,300,Asia/Kolkata,2021-09-07


* We just need `asciiname`, `latitude`, `longitude`, we can drop rest of the colums

In [27]:
tabbed_data.drop(columns=["geonameid",
                          "alternatenames",
                          "feature class",
                          "feature code",
                          "country code",
                          "cc2",
                          "admin1 code",
                          "admin2 code",
                          "admin3 code",
                          "admin4 code",
                          "population",
                          "elevation",
                          "dem",
                          "timezone",
                          "modification date",
                          ], inplace=True)

In [28]:
tabbed_data.head()

Unnamed: 0,name,asciiname,latitude,longitude
0,Rāvi River,Ravi River,30.62123,71.82683
1,Punjab Plains,Punjab Plains,30.0,75.0
2,Jhelum River,Jhelum River,31.16853,72.15066
3,Hindustan,Hindustan,28.0,76.0
4,Basantar River,Basantar River,32.47452,75.01449


In [29]:
## lets rename asciiname to name
tabbed_data.rename(columns={"asciiname":"ascii_name", "latitude":"lat","longitude":"long"}, inplace=True)

In [30]:
tabbed_data.head()

Unnamed: 0,name,ascii_name,lat,long
0,Rāvi River,Ravi River,30.62123,71.82683
1,Punjab Plains,Punjab Plains,30.0,75.0
2,Jhelum River,Jhelum River,31.16853,72.15066
3,Hindustan,Hindustan,28.0,76.0
4,Basantar River,Basantar River,32.47452,75.01449


In [31]:
## lets check for empty values
tabbed_data.isnull().sum()

name          1
ascii_name    1
lat           0
long          0
dtype: int64

In [33]:
tabbed_data[tabbed_data["name"].isnull()]

Unnamed: 0,name,ascii_name,lat,long
506582,,,31.52325,74.98279


In [35]:
## a manual reverse latlong lookup tells me this is `Bundala` so we'll just update that. 
tabbed_data.loc[tabbed_data["name"].isnull(),"name"] = "Bundala"
tabbed_data.loc[tabbed_data["ascii_name"].isnull(),"ascii_name"] = "Bundala"

In [36]:
tabbed_data.isnull().sum()

name          0
ascii_name    0
lat           0
long          0
dtype: int64

In [37]:
## save it as csv
tabbed_data.to_csv(Path(data_dir, "detailed_in.csv"), index=False)