In [1]:
pip install pycountry

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


In [2]:
!pip install spacy



In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import spacy

In [5]:

# Replace this URL with the URL you want to scrape
url = "https://www.ci.richmond.ca.us/1404/Major-Projects"

# Send a request to the URL
response = requests.get(url)

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
# Find all parents with class 'fr-view'
parents = soup.find_all(class_="fr-view")


In [7]:
projects = []

# Iterate through each parent and find links (a tags)
for parent in parents:
    for a_tag in parent.find_all('a', href=True):
        project_url = a_tag['href']

        try:
            # Visit the project link
            project_response = requests.get(project_url)
            project_response.raise_for_status()  # Check if the request was successful

            project_soup = BeautifulSoup(project_response.content, 'html.parser')

            # Extract project title (usually in h1 or h2 tag, but depends on site structure)
            project_title = project_soup.find('h1') or project_soup.find('h2')
            if project_title:
                project_title = project_title.text.strip()

            # Extract project details (assuming details are in <p> tags)
            project_details = project_soup.get_text(separator="\n")
            project_details = "\n".join([line.strip() for line in project_details.splitlines() if line.strip()])

            # Append the project data to the list with the URL
            projects.append({
                "title": project_title,
                "details": project_details,
                "url": project_url  # Add the URL to the dictionary
            })

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {project_url}: {e}")
            continue  # Skip this URL and continue with the next one


Error fetching /2835/Central-Avenue-Housing: Invalid URL '/2835/Central-Avenue-Housing': No scheme supplied. Perhaps you meant https:///2835/Central-Avenue-Housing?
Error fetching http://www.ccta.net/_resources/detail/36/2: 404 Client Error: Not Found for url: http://ccta.net/_resources/detail/36/2
Error fetching /2965/RLC-FBC: Invalid URL '/2965/RLC-FBC': No scheme supplied. Perhaps you meant https:///2965/RLC-FBC?
Error fetching /: Invalid URL '/': No scheme supplied. Perhaps you meant https:///?


In [8]:

# Save the project data into a CSV file
with open('scraped.csv', mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["title", "details","url"])
    writer.writeheader()
    writer.writerows(projects)

print("Data has been saved to csv")

Data has been saved to csv


### Extracting entities using NER and getting final output

In [9]:
df = pd.read_csv('scraped.csv')
df

Unnamed: 0,title,details,url
0,Via Verdi Slope Stabilization Project,Via Verdi Slope Stabilization Project | Richmo...,http://www.ci.richmond.ca.us/viaverdiproject
1,Travel Safe Richmond,"Travel Safe Richmond | Richmond, CA - Official...",https://www.ci.richmond.ca.us/4486/Travel-Safe...
2,Atlas Road Industrial Building Project,Atlas Road Industrial Building Project | Richm...,http://www.ci.richmond.ca.us/3001/Atlas-Road-I...
3,Richmond Greenway Gap Closure Project,Richmond Greenway Gap Closure Project | Richmo...,http://www.ci.richmond.ca.us/index.aspx?NID=2443
4,Mathieu Court Alley Play Street,"Mathieu Court Alley Play Street | Richmond, CA...",http://www.ci.richmond.ca.us/2595/Mathieu-Cour...
5,The Point Richmond Residential Project,The Point Richmond Residential Project | Richm...,http://www.ci.richmond.ca.us/3157/The-Point-Ri...
6,Richmond Wet Weather Storage Project,Richmond Wet Weather Storage Project | Richmon...,http://www.ci.richmond.ca.us/2775/Wet-Weather-...
7,Richmond Country Club Residential Project,Richmond Country Club Residential Project | Ri...,http://www.ci.richmond.ca.us/3977/Richmond-Cou...
8,Shea Homes,"Shea Homes | Richmond, CA - Official Website\n...",http://www.ci.richmond.ca.us/index.aspx?NID=2779
9,Nevin Homes Residential Project,"Nevin Homes Residential Project | Richmond, CA...",http://www.ci.richmond.ca.us/index.aspx?NID=2928


In [10]:
df['details']

Unnamed: 0,details
0,Via Verdi Slope Stabilization Project | Richmo...
1,"Travel Safe Richmond | Richmond, CA - Official..."
2,Atlas Road Industrial Building Project | Richm...
3,Richmond Greenway Gap Closure Project | Richmo...
4,"Mathieu Court Alley Play Street | Richmond, CA..."
5,The Point Richmond Residential Project | Richm...
6,Richmond Wet Weather Storage Project | Richmon...
7,Richmond Country Club Residential Project | Ri...
8,"Shea Homes | Richmond, CA - Official Website\n..."
9,"Nevin Homes Residential Project | Richmond, CA..."


In [11]:
df['details'][0]


'Via Verdi Slope Stabilization Project | Richmond, CA - Official Website\nSkip to Main Content\nSearch\nCity Government\nDepartments\nBusiness\nCommunity\nHome\nDepartments\nCommunity Development\nPlanning Division\nVia Verdi Slope Stabilization Project\nA\nA\nVia Verdi Slope Stabilization Project\nProject Contact:\nLina Velasco\nPlanning & Building Services Director\nLina_Velasco@ci.richmond.ca.us\n(510) 620-6841\nProject Description\nThe proposed Via Verdi Slope Stabilization Project (project) has been designed by the City of Richmond (City) to reconstruct a segment of the Via Verdi roadway that was damaged by a landslide in 2017; the road was closed at that time and an emergency roadway continues to provide access for the Sobrante Glen neighborhood. Reconstruction of the roadway requires installation of a culvert within San Pablo Creek, backfilled with engineered fill, to buttress the landslide and provide a stabilized footing for the roadway embankment. An offsite mitigation area i

In [12]:
# Load a pre-trained NLP model from SpaCy
nlp = spacy.load("en_core_web_sm")

In [13]:

# Function to extract entities
def extract_entities(text):
    if isinstance(text, str):  # Check if the input is a string
        doc = nlp(text)
        return [(ent.text, ent.label_) for ent in doc.ents]
    else:
        return []  # Return an empty list if the input is not a string


In [14]:

# Apply the function to each row in the 'details' column
df['entities'] = df['details'].apply(extract_entities)

In [18]:
# Function to map entities to the required attributes
from geopy.geocoders import Nominatim
from geopy.geocoders import Nominatim
import pycountry
import requests


def map_entities(row):
    entity_dict = {ent[1]: ent[0] for ent in row['entities']}

    country_name = entity_dict.get('GPE', 'na')

    # Country Name
    def find_country(city_name):
      geolocator = Nominatim(user_agent="my_geocoding_app")
      location = geolocator.geocode(city_name)


      if location:
          address = location.raw['display_name'].split(', ')
          return address[-1]
      else:
          return 'Country not found'

    city = country_name
    country_name = find_country(city)

    # Country code
    def find_country_code(city_name):
        geolocator = Nominatim(user_agent="my_geocoding_app")
        location = geolocator.geocode(city_name)

        if location:
            try:
                country = pycountry.countries.lookup(country_name)
                return country.alpha_2
            except LookupError:
                return 'NA'
        else:
            return 'NA'

    city = country_name
    country_code = find_country_code(city)



    #longitude and latitude
    def find_coordinates(city_name, api_key):
        # Geoapify Geocoding API endpoint
        url = "https://api.geoapify.com/v1/geocode/search"

        # Parameters for the API request
        params = {
            'text': city_name,
            'apiKey': api_key
        }

        # Make the request to the Geoapify API
        response = requests.get(url, params=params)
        data = response.json()

        # Check if the response contains results
        if data['results']:
            # Extract the first result's latitude and longitude
            result = data['results'][0]
            latitude = result['lat']
            longitude = result['lon']
            return latitude, longitude
        else:
            return 'NA', 'NA'

    # Example usage
    api_key = '35837307e0124668a37d69755f6d88e4'
    city_name = country_name
    latitude, longitude = find_coordinates(city_name, api_key)



    region_name = entity_dict.get('LOC', 'na')


    # Create a dictionary for the CSV row
    row_dict = {
        'original_id': row['original_id'] if 'original_id' in row else 'na',
        'aug_id': row['aug_id'] if 'aug_id' in row else 'na',
        'country_name': country_name,
        'country_code': country_code,
        'region_name': region_name,
        'region_code': 'na',
        'latitude': latitude,
        'longitude': longitude,
        'url': row['url'] if 'url' in row else 'na',
        'title': row['title'] if 'title' in row else 'na',
        'description': row['details'] if 'details' in row else 'na',
        'status': row['status'] if 'status' in row else 'na',
        'timestamp': row['timestamp'] if 'timestamp' in row else 'na',
        'timestamp_label': 'na',
        'budget': 'na',
        'budget_label': 'na',
        'currency': 'na',
        'sector': 'na',
        'subsector': 'na',
        'document_urls': 'na',
        'org':row['org'] if 'org' in row else 'na',
        'date':row['date'] if 'date' in row else 'na',

    }
    return row_dict



In [19]:
# Apply the mapping function to each row in the DataFrame
csv_data = df.apply(map_entities, axis=1)

# Create a new DataFrame from the mapped data
csv_df = pd.DataFrame(list(csv_data))



KeyError: 'results'

In [17]:
# Save the DataFrame to a CSV file
csv_df.to_csv('final.csv', index=False)

print("CSV file 'final.csv' has been created.")


CSV file 'final.csv' has been created.


In [22]:
def find_coordinates(city_name, api_key):
    # Geoapify Geocoding API endpoint
    url = "https://api.geoapify.com/v1/geocode/search"
    # Parameters for the API request
    params = {
        'text': city_name,
        'apiKey': api_key
    }
    # Make the request to the Geoapify API
    response = requests.get(url, params=params)
    data = response.json()
    # Check if the response contains results
    if data['results']:
        # Extract the first result's latitude and longitude
        result = data['results'][0]
        latitude = result['lat']
        longitude = result['lon']
        return latitude, longitude
    else:
        return 'NA', 'NA'
# Example usage
api_key = '35837307e0124668a37d69755f6d88e4'
city_name = "Gwalior"
latitude, longitude = find_coordinates(city_name, api_key)

KeyError: 'results'

In [23]:
import requests

def find_coordinates(city_name, api_key):
    url = f"https://api.example.com/geocode?city={city_name}&key={api_key}"
    response = requests.get(url)
    data = response.json()
    print(data)  # Print the entire response for inspection

    # Check if 'results' key exists and handle accordingly
    if 'results' in data:
        if data['results']:
            result = data['results'][0]
            latitude = result['geometry']['location']['lat']
            longitude = result['geometry']['location']['lng']
            return latitude, longitude
        else:
            print("No results found.")
            return None, None
    else:
        print("Key 'results' not found in response.")
        return None, None

# Example usage
api_key = '35837307e0124668a37d69755f6d88e4'
city_name = "Gwalior"
latitude, longitude = find_coordinates(city_name, api_key)


ConnectionError: HTTPSConnectionPool(host='api.example.com', port=443): Max retries exceeded with url: /geocode?city=Gwalior&key=35837307e0124668a37d69755f6d88e4 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7a2c915296f0>: Failed to resolve 'api.example.com' ([Errno -2] Name or service not known)"))