### Install Python Libraries

In [10]:
! pip install langchain
! pip install langchain-core
! pip install langchain-community
! pip install google-generativeai
! pip install gephistreamer



### Create a list of web URLs for data extraction

In [12]:
# Wikipedia pages of top tech leaders; each URL is scraped for relationship data
url_list = [
    "https://en.wikipedia.org/wiki/Elon_Musk",
    "https://en.wikipedia.org/wiki/Mark_Zuckerberg",
    "https://en.wikipedia.org/wiki/Bill_Gates",
    "https://en.wikipedia.org/wiki/Jeff_Bezos",
    "https://en.wikipedia.org/wiki/Steve_Jobs",
    "https://en.wikipedia.org/wiki/Sam_Altman",
    "https://en.wikipedia.org/wiki/Larry_Ellison",
    "https://en.wikipedia.org/wiki/Larry_Page",
    "https://en.wikipedia.org/wiki/Sundar_Pichai",
    "https://en.wikipedia.org/wiki/Satya_Nadella",
]


### Define function to clean data

In [9]:
# define a function to clean the extracted web URL data
import re #for regular expression 
 
def clean_text(text):
    """Reduce scraped page text to space-separated ASCII alphanumeric words.

    HTML tags and URLs are replaced by a space, every character that is not an
    ASCII letter, digit or whitespace is dropped, and all runs of whitespace
    (including newlines, tabs and non-breaking spaces) collapse to one space.

    Args:
        text: Raw text extracted from a web page. ``None`` or an empty string
            yields an empty result.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    if not text:
        return ''
    # Remove HTML tags; substitute a space so "a<br>b" does not become "ab"
    text = re.sub(r'<[^>]*?>', ' ', text)
    # Remove URLs (again leaving a space so the neighbouring words stay apart)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text)
    # Remove special characters but keep every kind of whitespace: deleting
    # newlines/tabs here would glue the last word of one line to the first
    # word of the next
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse all whitespace runs to a single space and trim both ends
    return ' '.join(text.split())


### Use the LangChain framework to extract data

In [21]:
# extract the data from the URLs
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
 
def extract_data_from_URL(url, chunk_size=3000, chunk_overlap=100):
    """Download one web page, clean its text and split it into chunks.

    Args:
        url: Address of the page to load with ``WebBaseLoader``.
        chunk_size: Maximum number of characters per chunk (default 3000).
        chunk_overlap: Number of characters shared by consecutive chunks
            (default 100).

    Returns:
        A list of ``Document`` chunks produced by
        ``RecursiveCharacterTextSplitter``; an empty list when the loader
        returned no document. The number of chunks is printed as a progress
        indicator.
    """
    loader=WebBaseLoader([url])
    loaded_docs=loader.load()
    if not loaded_docs:
        # Nothing was loaded for this URL: report zero chunks instead of
        # failing with IndexError on an empty list
        print(0)
        return []
    # A single URL is requested, so the last (only) document holds the page text
    data=clean_text(loaded_docs[-1].page_content)
    documents=[Document(page_content=data)]
    splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    smaller_doc=splitter.split_documents(documents)
    print(len(smaller_doc))
    return smaller_doc


USER_AGENT environment variable not set, consider setting it to identify your requests.


### Use the Gemini API to generate a sample response

In [17]:
import google.generativeai as genai 
import os 
 
# System instruction for the smoke test: ask the model for bare JSON output
system_prompt='''
Answer the question in JSON format and nothing else,Do not use code block formatting.
'''
# Read the API key from the environment instead of hardcoding it in the notebook.
# SECURITY: a key that has ever been committed or shared inside a notebook must be
# treated as leaked and revoked in the Google AI Studio console.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")

# test connection for gemini
genai.configure(api_key=api_key)
client = genai.GenerativeModel(model_name="gemini-2.5-flash", system_instruction=system_prompt)
response = client.generate_content("Write a story about how texas can become a tech hub in the future.")
print(response.text)


{
  "title": "The Silicon Sagebrush: Texas's Tech Horizon",
  "introduction": "In the year 2045, the scorching Texas sun still beat down, but its heat now fueled server farms and solar arrays, not just oil rigs. The scent of ozone mingled with the mesquite, a testament to a transformation few had dared to imagine just two decades prior. Texas, once defined by cattle, oil, and vast open spaces, had become a global titan of technology, its sprawling metropolises and revitalized small towns humming with innovation.",
  "chapters": [
    {
      "title": "The Roots of Innovation",
      "content": "It wasn't an overnight revolution. The seeds were always there. Texas had long been a crucible of big thinking—from NASA's lunar missions to the sprawling energy sector that demanded constant ingenuity. Its universities, like UT Austin, Texas A&M, and Rice, consistently produced top-tier engineering talent. Companies, seeing the lower cost of living and business-friendly environment, had already

# Create a system prompt so the LLM understands the context of the task

In [18]:
# System prompt used as the `system_instruction` of the Gemini model during extraction.
# It tells the model to read one chunk of page text and answer only with a JSON array
# of objects, each holding "node_1", "node_2" and "edge" (a sentence describing how
# the two nodes are related). The example output inside the prompt shows that shape.
 
system=""" You are a network graph maker tasked with analyzing the relationships involving top leaders in the world. Your job is to process the provided context chunk 
and extract an ontology of terms that represent key entrepreneurs, their associated entities, and all kinds of relationships present in the context.
 
**Guidelines for Extraction:**
 
1. **Identify Key Entrepreneurs and Related Terms**:
   - Extract key entrepreneurs and related concepts such as:
     - Companies, organizations, or industries they are associated with.
     - Collaborators, partners, rivals, or competitors.
     - Key innovations, achievements, or milestones.
     - Locations, events, or time periods relevant to their actions.
 
2. **Identify Relationships**:
   - Extract all types of relationships between entrepreneurs and other entities (or between entities themselves).
   - Relationships can include:
     - Professional roles or associations.
     - Business partnerships, collaborations, or rivalries.
     - Innovations or contributions to industries.
     - Personal connections or influences.
     - Historical events or shared milestones.
 
3. **Define Relationships**:
   - Clearly specify the nature of each relationship in simple and concise terms.
   - Relationships should convey meaningful connections relevant to the context.
 
**Response Format**:
- Provide your output **strictly as a list of JSON objects**. No additional text, descriptions,tags or comments are allowed.
- Each object should include the following fields:
  - `"node_1"`: The first entity in the relationship (can be a person, organization, or concept).
  - `"node_2"`: The second entity in the relationship.
  - `"edge"`: A concise sentence describing the relationship between `node_1` and `node_2`.
 
**Example Output**:
[
   {
       "node_1": "Elon Musk",
       "node_2": "SpaceX",
       "edge": "Elon Musk founded SpaceX to revolutionize space exploration."
   },
   {
       "node_1": "Steve Jobs",
       "node_2": "Apple Inc.",
       "edge": "Steve Jobs co-founded Apple Inc., a leading tech company."
   },
   {
       "node_1": "Mark Zuckerberg",
       "node_2": "Sheryl Sandberg",
       "edge": "Sheryl Sandberg worked closely with Mark Zuckerberg as COO of Facebook."
   },
   {
       "node_1": "Jeff Bezos",
       "node_2": "Blue Origin",
       "edge": "Jeff Bezos founded Blue Origin to focus on space exploration."
   }
]
 
**Important Note**:
- Always respond exclusively in JSON format. Any deviation from the JSON structure or inclusion of additional text will not be accepted.
- Do not use code block formatting like ` ``` `.
- Output must be a valid JSON array of objects without any surrounding text.
 
Please provide the context containing information about entrepreneurs and their relationships for analysis.

"""


# Use LLMs to extract data

In [22]:
from datetime import datetime
from itertools import cycle
 
# Raw JSON strings returned by the model, one entry per successfully processed chunk
results=[]
# Models to rotate through when the current one reports a quota / rate-limit error
models = [
    'gemini-2.0-flash',
    'gemini-2.0-flash-lite',
    'gemini-1.5-pro',
    'gemini-1.5-flash',
    'gemini-1.5-flash-8b',
]
# Upper bound on the number of chunks sent to the LLM for each page
MAX_CHUNKS_PER_URL = 30
model_cycle = cycle(models)  # Create an infinite cycle of models
model_name = next(model_cycle)  # Start with the first model
start_time=datetime.now()
for url in url_list:
    smaller_doc=extract_data_from_URL(url)
    for doc in smaller_doc[:MAX_CHUNKS_PER_URL]:
        # Give every model at most one attempt per chunk: a rate-limited chunk is
        # retried on the next model instead of being silently dropped, and the
        # bound keeps the loop finite when all models are out of quota.
        for _attempt in range(len(models)):
            try:
                client = genai.GenerativeModel(model_name=model_name, system_instruction=system,
                                               generation_config={"response_mime_type":'application/json'})
                chat_completion = client.generate_content(doc.page_content)
                results.append(chat_completion.candidates[0].content.parts[0].text)
                break  # chunk handled, move on to the next one

            except Exception as e:
                print(e.args)
                # str(e) is safe for every exception type; e.args may be empty or
                # hold a non-string, which would raise inside this handler
                errordata=str(e).lower()

                if 'exceeded' in errordata or 'exhausted' in errordata:
                    print('Rate limit exceeded for model:', model_name)
                    model_name = next(model_cycle)  # Switch to the next model
                    print('Switching to model:', model_name)
                else:
                    # Not a quota problem (e.g. empty/blocked response): retrying
                    # with another model is not attempted, skip this chunk
                    break

end_time=datetime.now()
print(f'collected {len(results)} LLM responses')
print(f'extracted information in {end_time-start_time}')


59
33
49
('You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 15\nPlease retry in 32.605889668s.',)
Rate limit exceeded for model: gemini-2.0-flash
Switching to model: gemini-2.0-flash-lite
44
47
15
22
27
12
11
extracted information in 0:14:21.668382


# Save the results to a JSON file

In [23]:
import json 
# Keys every relationship record must carry to be usable when building the graph
REQUIRED_KEYS = ('node_1', 'node_2', 'edge')

# Flatten all LLM responses into one list of validated relationship records
combined_nodes_and_edges=[]
for res in results:
    try:
        parsed = json.loads(res) #convert the string result from LLM to JSON
    except (json.JSONDecodeError, TypeError) as e:
        print('buggy JSON object', e)
        continue

    # The model sometimes answers with a single object instead of an array
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        # extend() on a string/number would add characters or raise; report instead
        print('buggy JSON object', 'unexpected top-level type', type(parsed).__name__)
        continue

    for item in parsed:
        if isinstance(item, dict) and all(key in item for key in REQUIRED_KEYS):
            combined_nodes_and_edges.append(item)
        else:
            print('buggy JSON object', 'missing node_1/node_2/edge', item)

# Explicit encoding so the file is written the same way on every platform
with open('Nodes_and_edges.json','w',encoding='utf-8') as file:
    json.dump(combined_nodes_and_edges,file,indent=1)


# Connect to Gephi to stream the JSON data

In [24]:
from gephistreamer import graph
from gephistreamer import streamer
# Open a websocket connection to the Gephi streaming server (localhost:8080,
# workspace "workspace1") and wrap it in a Streamer used to push nodes and edges
gephi_endpoint = streamer.GephiWS(hostname="localhost", port=8080, workspace="workspace1")
stream = streamer.Streamer(gephi_endpoint)


# Load node and edge data from JSON into Gephi

In [25]:
# Read the saved relationship records back from the JSON file on disk
with open('Nodes_and_edges.json','r') as json_file:
    results = json.loads(json_file.read())


In [26]:
# Push every relationship record to Gephi as two nodes joined by one edge;
# a record that fails is reported and the loop continues with the next one
for record in results:
    try:
        source_node = graph.Node(record['node_1'], custom_property=1)
        target_node = graph.Node(record['node_2'], custom_property=2)
        stream.add_node(source_node, target_node)
        # the relationship sentence travels with the edge as a custom property
        relation_edge = graph.Edge(source_node, target_node, custom_property=record['edge'])
        stream.add_edge(relation_edge)
    except Exception as err:
        print('buggy JSON object', err, record)
