In [None]:
# pip install pycountry

In [None]:
# !pip install spacy

In [None]:
# Download the small English spaCy pipeline that spacy.load("en_core_web_sm") needs later in this notebook.
# NOTE(review): geopy is imported further down, but the install cells above only mention pycountry and spacy — confirm the environment provides it.
!python -m spacy download en_core_web_sm

In [3]:
import csv
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup


In [None]:
# Ask for the listing page to scrape; surrounding whitespace from a pasted
# URL would otherwise end up in the request.
url = input("Enter the URL: ").strip()

# Send a request to the URL. The timeout keeps an unresponsive server from
# hanging the notebook indefinitely (requests has no timeout by default).
response = requests.get(url, timeout=30)
response.raise_for_status()  # Stop here instead of parsing an HTTP error page

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Collect every element carrying the CSS class 'fr-view'; the links to the
# individual project pages are looked up inside these containers.
parents = soup.find_all(attrs={"class": "fr-view"})


In [None]:
projects = []

# Seconds to wait for a single project page before giving up on it
REQUEST_TIMEOUT = 30

# Iterate through each parent and find links (a tags)
for parent in parents:
    for a_tag in parent.find_all('a', href=True):
        href = a_tag['href'].strip()

        # In-page anchors ("#top") would resolve back to the listing page itself
        if not href or href.startswith('#'):
            continue

        try:
            # Resolve relative links ("/projects/1") against the page they came
            # from; requests cannot fetch a URL that has no scheme/host.
            project_url = urljoin(response.url, href)
            scheme = urlparse(project_url).scheme
        except ValueError as e:
            print(f"Skipping malformed link {href!r}: {e}")
            continue

        # mailto:, tel:, javascript: ... are not pages that can be downloaded
        if scheme not in ('http', 'https'):
            continue

        try:
            # Visit the project link
            project_response = requests.get(project_url, timeout=REQUEST_TIMEOUT)
            project_response.raise_for_status()  # Check if the request was successful

            project_soup = BeautifulSoup(project_response.content, 'html.parser')

            # Extract project title (first h1, otherwise first h2); stays None
            # when the page has neither heading.
            title_tag = project_soup.find('h1') or project_soup.find('h2')
            project_title = title_tag.text.strip() if title_tag else None

            # Extract project details: all text on the page (not only <p> tags),
            # one stripped, non-empty line per text fragment.
            page_text = project_soup.get_text(separator="\n")
            project_details = "\n".join(
                line.strip() for line in page_text.splitlines() if line.strip()
            )

            # Append the project data to the list with the URL
            projects.append({
                "title": project_title,
                "details": project_details,
                "url": project_url  # Absolute URL the data was fetched from
            })

        except requests.exceptions.RequestException as e:
            # Skip this URL and continue with the next one
            print(f"Error fetching {project_url}: {e}")


In [None]:

# Save the project data into a CSV file.
# The encoding is explicit because scraped text routinely contains characters
# that a platform default codec (e.g. cp1252 on Windows) cannot encode, and
# pd.read_csv decodes the file as utf-8 by default.
with open('scraped.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["title", "details", "url"])
    writer.writeheader()
    writer.writerows(projects)

print("Data has been saved to csv")

### Extracting entities using NER and getting final output

In [None]:
# Reload the scraped pages from disk so the NER stage can be run on its own.
df = pd.read_csv('scraped.csv')
# Preview only the first rows: 'details' holds whole page texts, so dumping
# the full frame buries the rest of the notebook.
df.head()

In [None]:
# Show the page-text column that the entity extraction below works on.
df.loc[:, 'details']

In [None]:
# Inspect the text of the row labelled 0 as a sample of what was scraped.
df.loc[0, 'details']


In [None]:
# Load spaCy's pre-trained "en_core_web_sm" pipeline; extract_entities below
# reads the entities it recognises from `doc.ents`.
nlp = spacy.load(name="en_core_web_sm")

In [None]:

# Entity extraction helper
def extract_entities(text):
    """Return the named entities found in ``text`` as ``(text, label)`` tuples.

    Anything that is not a ``str`` yields an empty list without running the
    pipeline. Uses the module-level ``nlp`` pipeline.
    """
    if not isinstance(text, str):
        return []

    parsed = nlp(text)
    found = []
    for entity in parsed.ents:
        found.append((entity.text, entity.label_))
    return found


In [None]:

# Run entity extraction over every scraped page text and keep the result
# (a list of (text, label) tuples per row) in a new 'entities' column.
df['entities'] = df.loc[:, 'details'].apply(extract_entities)

In [None]:
# Function to map entities to the required attributes
from geopy.geocoders import Nominatim
from geopy.geocoders import Nominatim
import pycountry
import requests


# Placeholder written to every attribute that cannot be determined.
# Lower-case on purpose: 'NA' is the ISO 3166-1 alpha-2 code of Namibia, so it
# cannot double as a "missing" marker for country_code.
_MISSING = 'na'

# place name -> (country_name, country_code); avoids querying Nominatim again
# for a place that shows up in several rows.
_COUNTRY_CACHE = {}


def _lookup_country(place):
    """Resolve a place name to ``(country_name, country_code)`` via Nominatim.

    The country name is the last component of the geocoder's display name,
    requested in English so that pycountry can match it. Returns
    ``('Country not found', 'na')`` when the place cannot be geocoded, and a
    code of ``'na'`` when pycountry does not recognise the country name.
    """
    from geopy.exc import GeopyError  # local import: only this helper needs it

    if place in _COUNTRY_CACHE:
        return _COUNTRY_CACHE[place]

    geolocator = Nominatim(user_agent="my_geocoding_app")
    try:
        location = geolocator.geocode(place, language='en', timeout=10)
    except GeopyError as e:
        # A geocoder failure should cost one row, not abort the whole
        # DataFrame.apply. Not cached, so a later row can retry the place.
        print(f"Geocoding failed for {place!r}: {e}")
        return 'Country not found', _MISSING

    if location is None:
        result = ('Country not found', _MISSING)
    else:
        country_name = location.raw['display_name'].split(', ')[-1]
        try:
            country_code = pycountry.countries.lookup(country_name).alpha_2
        except LookupError:
            country_code = _MISSING
        result = (country_name, country_code)

    _COUNTRY_CACHE[place] = result
    return result


def map_entities(row):
    """Build one output record (a dict) for a scraped project row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must contain 'entities', an iterable of ``(text, label)`` pairs.
        'url', 'title', 'details', 'original_id', 'aug_id', 'status',
        'timestamp', 'org' and 'date' are copied through when present.

    Returns
    -------
    dict
        The output columns in their final order. Attributes that cannot be
        determined hold 'na'; 'country_name' holds 'Country not found' when a
        GPE entity exists but could not be geocoded.

    Notes
    -----
    When a label occurs several times, the last entity with that label wins.
    No geocoding request is made when the row has no GPE entity.
    """
    entity_dict = {ent[1]: ent[0] for ent in row['entities']}

    place = entity_dict.get('GPE')
    if place is None:
        # Nothing to geocode; looking up the placeholder would invent a country.
        country_name, country_code = _MISSING, _MISSING
    else:
        country_name, country_code = _lookup_country(place)

    region_name = entity_dict.get('LOC', _MISSING)

    def passthrough(key):
        # Copy a source column through unchanged when the row has it.
        return row[key] if key in row else _MISSING

    # Create a dictionary for the CSV row (key order defines the column order)
    return {
        'original_id': passthrough('original_id'),
        'aug_id': passthrough('aug_id'),
        'country_name': country_name,
        'country_code': country_code,
        'region_name': region_name,
        'region_code': _MISSING,
        'latitude': _MISSING,
        'longitude': _MISSING,
        'url': passthrough('url'),
        'title': passthrough('title'),
        'description': passthrough('details'),
        'status': passthrough('status'),
        'timestamp': passthrough('timestamp'),
        'timestamp_label': _MISSING,
        'budget': _MISSING,
        'budget_label': _MISSING,
        'currency': _MISSING,
        'sector': _MISSING,
        'subsector': _MISSING,
        'document_urls': _MISSING,
        'org': passthrough('org'),
        'date': passthrough('date'),
    }



In [None]:
# Turn every row into its output record; with axis="columns" the function
# receives one row at a time.
csv_data = df.apply(map_entities, axis="columns")

# Assemble the per-row records into the final table
csv_df = pd.DataFrame([*csv_data])



In [None]:
# Write the mapped records to disk, leaving out the DataFrame index column
csv_df.to_csv(path_or_buf='final.csv', index=False)

print("CSV file 'final.csv' has been created.")


In [None]:
# Read the result back as a sanity check. keep_default_na=False shows the
# cells exactly as written: by default pandas parses strings such as 'NA'
# (also the ISO country code of Namibia) as NaN.
output = pd.read_csv("final.csv", keep_default_na=False)
output.head(5)