# Notebook for extracting latitude and longitude from address

Datasets used in this notebook for extracting longitude and latitude:

Rehabilitation Centers: https://www.samhsa.gov/data/report/2023-national-directory-of-drug-and-alcohol-use-treatment-facilities

Mental Health Treatment Centers: https://www.samhsa.gov/data/report/2023-national-directory-of-mental-health-treatment-facilities

## Installing GeoPy library

In [None]:
!pip install geopy

## Importing other required libraries

In [2]:
import os
from os import path
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

## Importing the dataset containing all the addresses for different rehab centers

In [4]:
# Download the 2023 substance-use facility directory workbook and save it
# locally as rehab_dir.xlsx (-q: no progress output, -O: output file name).
!wget -q -O rehab_dir.xlsx https://github.com/gauravshetty98/Gaurav-GIS-Repo/raw/main/national-directory-su-facilities-2023.xlsx

In [5]:
# Read the first worksheet of the downloaded workbook into a dataframe.
rehab_dir = pd.read_excel('rehab_dir.xlsx', sheet_name=0)

# Preview the first rows, then the (rows, columns) shape, one per line.
print(rehab_dir.head(), rehab_dir.shape, sep="\n")

                                 Name1                                Name2  \
0       Shelby County Treatment Center                                  NaN   
1  Lighthouse of Tallapoosa County Inc  Substance Abuse Rehab Program/Resid   
2            South Central Alabama MHC                    Montezuma Complex   
3        Anniston Fellowship House Inc                                  NaN   
4               State Line Medical LLC                                  NaN   

                Street1 Street2            City State    Zip         Phone  \
0  750 Highway 31 South     NaN       Alabaster    AL  35007  205-216-0200   
1    36 Franklin Street     NaN  Alexander City    AL  35010  256-234-4894   
2     205 Academy Drive     NaN       Andalusia    AL  36420  334-428-5050   
3  106 East 22nd Street     NaN        Anniston    AL  36201  256-236-7229   
4     26928 Main Street     NaN         Ardmore    AL  35739  256-374-6537   

        Intake1 Intake2 Intake1a Intake2a  \
0          

## Initializing GeoPy Nominatim

In [4]:
# Geocoder client used by every lookup in this notebook.
loc = Nominatim(user_agent="Geopy Library")

# Smoke-test the geocoder with one known place name before the bulk runs.
# The query text was mis-encoded ("Ä°zmir"); the intended name is "İzmir".
getLoc = loc.geocode("İzmir")

# geocode() returns None when the service has no match, so guard before
# reading the coordinates.
if getLoc is None:
    print("No result returned for the test query")
else:
    print("Latitude = ", getLoc.latitude, "\n")
    print("Longitude = ", getLoc.longitude)

Latitude =  38.4237433 

Longitude =  27.1428019


## Running a loop through the dataframe to extract latitude and longitude for each address (geocoded by ZIP code)

In [10]:
# Look up a latitude/longitude for every row of rehab_dir from its ZIP code.
# Produces lat_list and long_list, one entry per dataframe row, in row order.
lat_list = []
long_list = []

# The public Nominatim service allows at most one request per second, so the
# limiter is set to that pace and every lookup below goes through `geocode`
# (the rate-limited wrapper) instead of calling loc.geocode directly.
# swallow_exceptions=True makes the wrapper return None once its retries for a
# failing request are exhausted, rather than raising.
geocode = RateLimiter(loc.geocode, min_delay_seconds=1, swallow_exceptions=True)

# Many facilities share a ZIP code; cache each answer so a ZIP is requested
# only once. Values are (latitude, longitude), or None when nothing was found.
zip_coords = {}

for i in range(rehab_dir.shape[0]):
    # Column 6 is Zip. Values shorter than five characters have presumably
    # lost their leading zeros when the sheet was read; pad them back.
    zip_code = str(rehab_dir.iloc[i, 6]).zfill(5)

    if zip_code not in zip_coords:
        getLoc = geocode(zip_code + " , united states")
        zip_coords[zip_code] = (
            None if getLoc is None else (getLoc.latitude, getLoc.longitude)
        )

    coords = zip_coords[zip_code]
    if coords is None:
        # Keep both lists aligned with the dataframe rows; 0 marks "no result".
        lat_list.append(0)
        long_list.append(0)
        print("not found: ", i)
    else:
        lat_list.append(coords[0])
        long_list.append(coords[1])

    # Progress marker every 1000 rows.
    if i % 1000 == 0:
        print(i)

print(len(lat_list))
print(len(long_list))

0
1000
2000
3000
4000
5000
not found:  5155
6000
7000
not found:  7092
8000
9000
10000
11000
12000
12744
12744


In [11]:
# Attach the geocoded coordinates to the dataframe as two new columns;
# both lists hold one value per row, in row order.
for column, values in (('lat', lat_list), ('long', long_list)):
    rehab_dir[column] = values

In [10]:
# State codes kept in the final datasets: 49 entries (48 states plus DC);
# AK and HI are not in the list.
state_list = [
    'AL', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE',
    'DC', 'FL', 'GA', 'ID', 'IL', 'IN', 'IA',
    'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI',
    'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH',
    'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK',
    'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
    'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
]


## Removing all the rehab centers for states which we are not considering

In [25]:
# Collect the row positions whose State value (column 5) is not one of the
# states in state_list; these rows are dropped in the next step.
count_list = [
    row_number
    for row_number, state in enumerate(rehab_dir.iloc[:, 5])
    if state not in state_list
]
print(count_list)

[128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309, 3310, 3311, 3312, 3313, 3314, 3315, 3316, 3317, 3318, 3319, 3320, 3321, 3322, 3323, 3324, 3325, 3326, 3327, 3328, 3329, 3330, 3331, 3332, 3333, 3334, 3335, 3336, 3337, 3338, 3339, 3340, 3341, 3342, 3343, 3344, 3345, 3346, 3347, 3348, 3349, 3350, 3351, 3352, 3353, 3354, 3355, 3356, 3357, 3358, 3359, 3360, 3361, 3362, 3363, 3364, 3365, 3366, 3367, 3368, 3369, 3370, 3371, 3372, 3373, 3374, 3375, 3376, 3377, 3378, 3379, 3380, 3381, 3382, 3383, 9306, 10575, 10576, 10577, 10578, 10579, 10580, 10581, 10582, 10583, 10584, 10585, 10586, 10587, 10588, 10589, 10590, 10591, 10592, 10593, 10594, 10595,

In [27]:
# Drop the rows for excluded states, then renumber the rows from 0.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an extra column.
rehab_dir1 = rehab_dir.drop(index=count_list).reset_index()

## Removing some outliers present in the rehab list

In [115]:
# Row labels to remove as outliers; they are used against rehab_dir1's index
# (i.e. the numbering after the state filter and reset).
outlier_list = [
    598, 611, 3757, 3761, 4076, 4757,
    5002, 6939, 7813, 9128, 9763,
]

In [81]:
# Remove the outlier rows (labels refer to rehab_dir1's index), then renumber.
# As before, reset_index() keeps the previous row labels as an extra column.
rehab_dir2 = rehab_dir1.drop(index=outlier_list).reset_index()

## Storing the final dataframe in CSV format

In [None]:
# Write the filtered rehab-center dataframe (with lat/long columns) to a CSV
# file in the current working directory. The dataframe index is written too,
# since index=False is not passed.
rehab_dir2.to_csv('rehab_dir_lat_long5.csv')

## Repeating the process for mental health treatment centers

In [1]:
# Download the 2023 mental-health facility directory workbook and save it
# locally as mh_dir.xlsx (-q: no progress output, -O: output file name).
!wget -q -O mh_dir.xlsx https://github.com/gauravshetty98/Gaurav-GIS-Repo/raw/main/national-directory-mh-facilities-2023.xlsx

In [3]:
# Read the first worksheet of the downloaded workbook into a dataframe.
mh_dir = pd.read_excel('mh_dir.xlsx', sheet_name=0)

# Preview the first rows, then the (rows, columns) shape, one per line.
print(mh_dir.head(), mh_dir.shape, sep="\n")

                        Name1                                  Name2  \
0  SpectraCare Health Systems                    Henry County Clinic   
1  SpectraCare Health Systems             Henry County Day Treatment   
2   South Central Alabama MHC  Covington County Mental Health Center   
3   South Central Alabama MHC                      Montezuma Complex   
4           RMC Health System                Regional Medical Center   

                     Street1 Street2       City State    Zip         Phone  \
0            219 Dothan Road     NaN  Abbeville    AL  36310  800-951-4357   
1  1242 US Highway 431 South     NaN  Abbeville    AL  36310  800-951-4357   
2      19815 Bay Branch Road     NaN  Andalusia    AL  36420  334-222-2523   
3          205 Academy Drive     NaN  Andalusia    AL  36420  334-428-5050   
4       400 East 10th Street     NaN   Anniston    AL  36207  256-235-5121   

        Intake1       Intake2 Intake1a Intake2a  \
0           NaN           NaN      NaN      NaN

## Converting all the addresses into latitude and longitude

In [5]:
# Look up a latitude/longitude for every row of mh_dir from its ZIP code.
# Produces lat_list and long_list, one entry per dataframe row, in row order.
lat_list = []
long_list = []

# The public Nominatim service allows at most one request per second, so the
# limiter is set to that pace and every lookup below goes through `geocode`
# (the rate-limited wrapper) instead of calling loc.geocode directly.
# swallow_exceptions=True makes the wrapper return None once its retries for a
# failing request are exhausted, rather than raising.
geocode = RateLimiter(loc.geocode, min_delay_seconds=1, swallow_exceptions=True)

# Many facilities share a ZIP code; cache each answer so a ZIP is requested
# only once. Values are (latitude, longitude), or None when nothing was found.
zip_coords = {}

for i in range(mh_dir.shape[0]):
    # Column 6 is Zip. Values shorter than five characters have presumably
    # lost their leading zeros when the sheet was read; pad them back.
    zip_code = str(mh_dir.iloc[i, 6]).zfill(5)

    if zip_code not in zip_coords:
        getLoc = geocode(zip_code + " , united states")
        zip_coords[zip_code] = (
            None if getLoc is None else (getLoc.latitude, getLoc.longitude)
        )

    coords = zip_coords[zip_code]
    if coords is None:
        # Keep both lists aligned with the dataframe rows; 0 marks "no result".
        lat_list.append(0)
        long_list.append(0)
        print("not found: ", i)
    else:
        lat_list.append(coords[0])
        long_list.append(coords[1])

    # Progress marker every 1000 rows.
    if i % 1000 == 0:
        print(i)

print(len(lat_list))
print(len(long_list))

0
not found:  116
1000
2000
3000
not found:  3160
4000
5000
6000
7000
8000
8707
8707


In [7]:
# Attach the geocoded coordinates to the dataframe as two new columns;
# both lists hold one value per row, in row order.
for column, values in (('lat', lat_list), ('long', long_list)):
    mh_dir[column] = values

## Removing states not used in final project

In [11]:
# Collect the row positions whose State value (column 5) is not one of the
# states in state_list; these rows are dropped in the next step.
count_list = [
    row_number
    for row_number, state in enumerate(mh_dir.iloc[:, 5])
    if state not in state_list
]
print(count_list)

[101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 2177, 2178, 2179, 2180, 2181, 2182, 2183, 2184, 2185, 2186, 2187, 2188, 2189, 2190, 2191, 2192, 2193, 2194, 5857, 6976, 6977, 6978, 6979, 6980, 6981, 6982, 6983, 6984, 6985, 6986, 6987, 6988, 6989, 6990, 6991, 6992, 6993, 6994, 6995, 6996, 6997, 6998, 6999, 7000, 7001, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7009, 7010, 7011, 7012, 7642, 7643, 7644, 7645]


In [12]:
# Drop the rows for excluded states, then renumber the rows from 0.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an extra column.
mh_dir1 = mh_dir.drop(index=count_list).reset_index()

## Removing some outliers from the list

In [29]:
# Row labels to remove as outliers; they are used against mh_dir1's index
# (i.e. the numbering after the state filter and reset).
outlier = [
    18, 1562, 2139, 2140, 2356,
    2358, 3064, 4302, 4700, 8179,
]

In [30]:
# Remove the outlier rows (labels refer to mh_dir1's index), then renumber.
# As before, reset_index() keeps the previous row labels as an extra column.
mh_dir2 = mh_dir1.drop(index=outlier).reset_index()

## Converting the final dataframe to CSV

In [31]:
# Write the filtered mental-health-center dataframe (with lat/long columns) to
# a CSV file in the current working directory. The dataframe index is written
# too, since index=False is not passed.
mh_dir2.to_csv('mental_health_centers2.csv')