Copyright (C) 2020-2024 - Raytheon BBN Technologies Corp.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
either express or implied. See the License for the specific
language governing permissions and limitations under the License.

Distribution Statement "A" (Approved for Public Release,
Distribution Unlimited).

This material is based upon work supported by the Defense
Advanced Research Projects Agency (DARPA) under Contract No.
HR001119C0102.  The opinions, findings, and conclusions stated
herein are those of the authors and do not necessarily reflect
those of DARPA.


# GetIPGeoLocation

This notebook looks up geolocations for the provided IPs and, when an existing geolocations database is configured, merges the new results into it.

The previous database file is overwritten with the merged database.


In [None]:
import sys
import os
import pandas as pd
import datetime
import ipinfo
import ipinfo.exceptions
import re
import json
import urllib3
import requests
import sys

###  User Defined Variables

In [None]:
# Debug switch: 1 prints intermediate outputs, 0 (recommended) keeps the run quiet.
debug = 0

# Path (including file name) of the config.json file that holds the ipinfo.io keys.
config_filename = "config.json"

# Path of the text file listing the IPs whose geolocation should be looked up.
geo_locations_ip_file = "outputs/IPsForGeoLocation.txt"

# (Optional) 1 when there is an existing geolocations database to merge into, 0 otherwise.
existing_geolocations_db = 0

# (Optional) Path of the existing geolocations database file.
# Must be set when existing_geolocations_db is 1.
geo_location_db_file = "db/full_geolocations_db.csv"

### Initialization of Other Variables

In [None]:
# Column layout of the table that collects the lookup results.
# NOTE: the variable name (including its spelling) is kept unchanged in case
# other cells refer to it.
columnns = [
    'Source_Address', 'City', 'Region', 'Country', 'Longitude', 'Latitude',
    'AS_Number', 'AS_Name', 'Timestamp',
]
source_geo_locations = pd.DataFrame(columns=columnns)

if debug == 1:
    # A bare expression nested inside an `if` is not echoed by the notebook,
    # so the preview has to be printed explicitly to be visible.
    print(source_geo_locations.head())

In [None]:
# Load the list of IPs to geolocate from geo_locations_ip_file.
try:
    # `with` closes the file even when reading fails part-way.
    with open(geo_locations_ip_file, "r") as ip_file:
        data = ip_file.read()
except (OSError, UnicodeError):
    sys.exit(f'Problem with reading geolocations IP file ({geo_locations_ip_file}). Check if the path is correct')

# split() without arguments splits on any run of whitespace and never yields
# empty strings, so blank lines, repeated spaces, tabs, "\r\n" line endings
# and a trailing newline cannot produce empty or malformed addresses.
source_addresses = data.split()

if debug == 1:
    print(len(source_addresses))


In [None]:
print ('Extracting IP Info Keys')

# Read the 'ipinfo_keys' entry from the JSON config file.
try:
    # `with` closes the config file instead of leaving the handle open.
    with open(config_filename, 'r') as config_file:
        config = json.load(config_file)
    ipinfo_keys = config['ipinfo_keys']
except (OSError, ValueError, KeyError, TypeError) as err:
    # OSError: file missing/unreadable; ValueError: invalid JSON (JSONDecodeError);
    # KeyError/TypeError: the JSON does not provide an 'ipinfo_keys' entry.
    sys.exit(
        f'Problem with reading config file ({config_filename}). '
        f'Check if the path is correct and that the file is valid JSON '
        f'with an "ipinfo_keys" entry ({err!r})'
    )

# Kept outside the try so a printing problem is not reported as a config problem.
if debug == 1:
    print(ipinfo_keys)

In [None]:
def get_handler(ip_counts):
    """Distribute the pending lookups across the configured IPinfo keys.

    Walks the module-level ``ipinfo_keys`` mapping (key -> number of lookups
    that key may still perform, as implied by the arithmetic below) and gives
    each key as many of the ``ip_counts`` lookups as it can take, until every
    lookup is assigned or the keys run out.

    ``ipinfo_keys`` is updated in place: assigned lookups are subtracted and
    keys with nothing left are removed. A warning is printed when no keys
    remain after the allocation.

    Args:
        ip_counts: Number of IP addresses that still need a lookup.

    Returns:
        dict mapping a handler created by ``ipinfo.getHandler(key)`` to the
        number of addresses to send through it. Every count is positive; the
        dict is empty when there are no keys or nothing to look up.
    """
    handlers = {}

    # Nothing to allocate: avoid creating a handler with a zero-sized batch.
    if not ipinfo_keys or ip_counts <= 0:
        return handlers

    exhausted_keys = []

    # Only values of existing keys are reassigned inside the loop, which is
    # safe while iterating; removals are deferred until after the loop.
    for key, available in ipinfo_keys.items():
        if available <= 0:
            # A zero/negative allowance must never become a batch size: the
            # caller uses the count as a slice bound, and a negative bound
            # would select the wrong addresses.
            exhausted_keys.append(key)
            continue

        num_to_use = min(ip_counts, available)
        handlers[ipinfo.getHandler(key)] = num_to_use

        ipinfo_keys[key] = available - num_to_use
        if ipinfo_keys[key] <= 0:
            exhausted_keys.append(key)

        ip_counts -= num_to_use
        if ip_counts <= 0:
            break

    for key in exhausted_keys:
        ipinfo_keys.pop(key)

    if not ipinfo_keys:
        print('\033[91m' + "No IPinfo keys left" + '\033[0m')
        print('\033[93m' + "Stopping geolocation lookup" + '\033[0m')

    return handlers

In [None]:
def save_geolocation(geolocation):
    """Append one lookup result as a new row of ``source_geo_locations``.

    Entries whose 'bogon' field is truthy are reported and skipped. For every
    other entry the record is printed, then each non-empty field is copied
    into its column of a new row, the 'org' string is split into AS number
    and AS name, and the row is stamped with today's date (MM/DD/YYYY).

    Args:
        geolocation: Mapping with the lookup details for a single IP; read
            through ``.get`` for 'bogon', 'ip', 'city', 'region',
            'country_name', 'longitude', 'latitude' and 'org'.

    Returns:
        None. The module-level ``source_geo_locations`` table is modified.
    """
    if geolocation.get('bogon', False):
        print(f"No Information available for IP: {geolocation['ip']}")
        return

    print(geolocation)

    # The new row goes right after the rows already in the table.
    row = len(source_geo_locations)

    # (field in the lookup result, column in the table), in write order.
    field_to_column = (
        ('ip', 'Source_Address'),
        ('city', 'City'),
        ('region', 'Region'),
        ('country_name', 'Country'),
        ('longitude', 'Longitude'),
        ('latitude', 'Latitude'),
    )
    for field, column in field_to_column:
        value = geolocation.get(field)
        if value:
            source_geo_locations.loc[row, column] = value

    org = geolocation.get('org')
    if org:
        # First space-separated token minus its two leading characters is the
        # AS number; everything after that token is the AS name.
        org_parts = org.split(' ')
        source_geo_locations.loc[row, 'AS_Number'] = org_parts[0][2:]
        source_geo_locations.loc[row, 'AS_Name'] = ' '.join(org_parts[1:])

    today = datetime.datetime.now().strftime('%m/%d/%Y')
    source_geo_locations.loc[row, 'Timestamp'] = today


In [None]:
# Ask get_handler how many addresses each IPinfo handler should process,
# then send the addresses through the handlers batch by batch.
handlers = get_handler(len(source_addresses))
for handler, count in handlers.items():
    batch = source_addresses[:count]
    print("Extracting IP Geolocations")

    # Only the network call is guarded, so errors raised while saving results
    # are not mistaken for timeouts.
    try:
        geolocations = handler.getBatchDetails(batch)
    except (ipinfo.exceptions.TimeoutExceededError,
            requests.exceptions.ReadTimeout, requests.exceptions.Timeout):
        # The batch is left at the front of source_addresses on failure
        # (same as before); only the message is now complete and terminated.
        sys.stderr.write(f"IPinfo time exceeded for a batch of {len(batch)} IP(s).\n")
        continue

    # The batch is removed from the pending list only after a successful lookup.
    source_addresses = source_addresses[count:]
    for geolocation in geolocations.values():
        save_geolocation(geolocation)

In [None]:
# Debug only: show the rows collected by the lookups above.
if debug == 1:
    display(source_geo_locations)

In [None]:
# Keep only the last row recorded for each address, then rename the address
# column from 'Source_Address' to 'IP'.
source_geo_locations = (
    source_geo_locations
    .drop_duplicates(subset=['Source_Address'], keep='last')
    .rename(columns={'Source_Address': 'IP'})
)

if debug == 1:
    display(source_geo_locations)

### Merge with Previous GeoLocations Database

In [None]:
# Load the existing geolocations database when one is configured.
if existing_geolocations_db == 1:
    print("Merging with Previous Geolocations data")

    try:
        database_geo_locations = pd.read_csv(geo_location_db_file)
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError covers pandas'
        # ParserError and EmptyDataError as well as decoding errors.
        sys.exit(f'Problem with reading geolocations database file ({geo_location_db_file}). Check if the path is correct')

    # Kept outside the try so a display problem is not reported as a file problem.
    if debug == 1:
        display(database_geo_locations)
    

In [None]:
if existing_geolocations_db == 1:
    # New lookups are appended after the stored rows; when there are no new
    # rows the stored table is used as-is.
    full_db = (
        pd.concat([database_geo_locations, source_geo_locations], ignore_index = True)
        if len(source_geo_locations) > 0
        else database_geo_locations
    )

    if debug == 1:
        display(full_db)

In [None]:
if existing_geolocations_db == 1:
    # When an IP appears more than once, keep only its last row. New lookups
    # are concatenated after the stored rows, so the new row is the one kept.
    full_db = full_db.drop_duplicates(subset=['IP'], keep='last')

    if debug == 1:
        display(full_db)

In [None]:
# Overwrite the database file with the merged table.
# NOTE(review): nothing in this cell saves source_geo_locations when
# existing_geolocations_db is 0 - confirm that this is intended.
if existing_geolocations_db == 1:
    print("Printing new Database to file")
    try:
        full_db.to_csv(geo_location_db_file, index=False)
    except OSError as err:
        # Still best-effort (report, do not abort), but only file-system
        # errors are handled and the underlying reason is shown.
        print(f'Problem writing file ({geo_location_db_file}). Check if the path is correct or if the file is open elsewhere. ({err})')