In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from pandas import json_normalize
import json
import pdfplumber

1) Reading HTML Tables
    Use Pandas to read an HTML table from a Wikipedia page and display the first 5 rows.
    Convert the extracted table into a CSV file.

In [2]:
response = requests.get("https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)")
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', {'class': 'wikitable'})
# Extract the table headers
headers = []
for th in table.find_all('th'):
    headers.append(th.text.strip())
# Extract the table rows
rows = []
for tr in table.find_all('tr'):
    cells = tr.find_all('td')
    row = [cell.text.strip() for cell in cells]
    if row:
        rows.append(row)
# Create a DataFrame from the extracted data
df = pd.DataFrame(rows)
print(df.head())
#Convert the extracted table into a CSV file.
df.to_csv('gdp_data.csv', index=False)


               0            1          2            3          4            5  \
0          World  113,795,678       2025  105,435,540       2023  100,834,796   
1  United States   30,507,217       2025   27,360,935       2023   25,744,100   
2          China   19,231,705  [n 1]2025   17,794,782  [n 3]2023   17,963,170   
3        Germany    4,744,804       2025    4,456,081       2023    4,076,923   
4          India    4,187,017       2025    3,549,919       2023    3,465,541   

           6  
0       2022  
1       2022  
2  [n 1]2022  
3       2022  
4       2022  


2) Loading and Normalizing JSON Data
    Read JSON data from a URL using read_json().
    Normalize nested fields using json_normalize() and store the data in a Pandas Data Frame.

In [4]:
df = pd.read_json(r"data1.json")
# Display the first 5 rows of the DataFrame
print(df.head())

df = pd.read_json(r"data1.json")
# Normalize the JSON data
df_normalized = json_normalize(df)
# Display the first 5 rows of the normalized DataFrame
print("Normalized DataFrame:")
print(df_normalized.head())


      name  age            city
0      Ram   28       Avadhpuri
1    Shyam   22       Rajesthan
2  Krishna   21         Gujarat
3  Dhruval   23         Gujarat
4     Ravi   25  Madhya Pradesh
Normalized DataFrame:
Empty DataFrame
Columns: []
Index: [0, 1, 2]


3) Extracting Data from a PDF
    Extract text from a PDF file using Python.
    Convert The extracted text into a Pandas DataFrame.

In [None]:
# Path to the PDF file
pdf_path = r"google_doc_downloaded.pdf"
# Open the PDF file
with pdfplumber.open(pdf_path) as pdf:
    # Iterate through all the pages
    for page_number, page in enumerate(pdf.pages, start=1):
        # Extract text from the page
        text = page.extract_text()
        print(f"Page {page_number}:\n{text}\n")

#convert above result to dataframe
text = []
for page_number, page in enumerate(pdf.pages, start=1):
    # Extract text from the page
    text.append(page.extract_text())
# Convert the list of text to a DataFrame
df = pd.DataFrame(text, columns=['Text'])

# Display the first 5 rows of the DataFrame
print(df.head())
# Convert the DataFrame to a CSV file
df.to_csv('extracted_text.csv', index=False)

Page 1:
Drone Surveillance & Object Detection
● Implement computer vision models to analyze drone footage and detect enemy
movements.
● Use anomaly detection to identify unusual activities in sensitive areas.
1. Data Collection & Preprocessing
● Drone Footage Acquisition: Collect or use publicly available drone videos/images from
military exercises, surveillance, or open datasets.
● Data Annotation: Use tools like LabelImg or Roboflow to label objects (vehicles,
humans, weapons, suspicious movements).
● Preprocessing: Convert videos to frames, resize images, normalize pixel values, and
augment data to improve model generalization.
● Location, Distance, Object_name, Date, Time,
2. Object Detection Model
You need a deep learning model capable of detecting objects in real-time. Common
approaches:
● YOLO (You Only Look Once) – Fast and efficient for real-time detection.
● Faster R-CNN – High accuracy but slower than YOLO.
● SSD (Single Shot Multibox Detector) – A balance between YOLO and F

4) Handling Missing Values in an Excel File
    Load an Excel file with missing values.
    Fill missing values using mean, median, and mode.
    Drop rows where more than 50% of data is missing.

In [None]:
df = pd.read_csv(r"Ipl_score.csv")
df['Runs in IPL'] = df['Runs in IPL'].fillna(df['Runs in IPL'].mean())
df['Match Wicket'] = df['Match Wicket'].fillna(df['Match Wicket'].median())
df['Date'] = df['Date'].fillna(df['Date'].mode()[0])
# Display the DataFrame after filling missing values
print(df.head())
#Drop rows where more than 50% of the data is missing
df = df.dropna(thresh=len(df.columns) * 0.5)
print("DataFrame after dropping rows with more than 50% missing values:")
print(df.head())

   Unnamed: 0 User Name  Runs in IPL  Match Wicket  Time Spent        Date  \
0           0     Virat           85             3  75 minutes  11-02-2025   
1           1     Rohit           90             2  70 minutes  11-02-2025   
2           2   Jasprit           25             0  65 minutes  11-02-2025   
3           3  KL Rahul           70             2  60 minutes  11-02-2025   
4           4   Rishabh           75             3  72 minutes  11-02-2025   

  Team     Role  
0  RCB  Batsman  
1   MI  Batsman  
2   MI   Bowler  
3  LSG  Batsman  
4   DC  Batsman  
DataFrame after dropping rows with more than 50% missing values:
   Unnamed: 0 User Name  Runs in IPL  Match Wicket  Time Spent        Date  \
0           0     Virat           85             3  75 minutes  11-02-2025   
1           1     Rohit           90             2  70 minutes  11-02-2025   
2           2   Jasprit           25             0  65 minutes  11-02-2025   
3           3  KL Rahul           70          

5) Removing Duplicates and Handling Unexpected Values
    Load a CSV file containing duplicate records.
    Remove duplicate rows based on multiple column.
    Find and replace unexpected values in a categorical column.

In [None]:
df = pd.read_csv(r"Ipl_score.csv")
#Remove duplicate rows based on multiple column.
df = df.drop_duplicates(subset=['User Name', 'Runs in IPL'])
# Display the DataFrame after removing duplicates
print("DataFrame after removing duplicates:")
print(df.head())
#Find and replace unexpected values in a categorical column.
df['User Name'] = df['User Name'].replace({'MS Dhoni': 'Mahendra Singh Dhoni'})
# Display the DataFrame after replacing unexpected values
print("DataFrame after replacing unexpected values:")
print(df.head())

DataFrame after removing duplicates:
   Unnamed: 0 User Name  Runs in IPL  Match Wicket  Time Spent        Date  \
0           0     Virat           85             3  75 minutes  11-02-2025   
1           1     Rohit           90             2  70 minutes  11-02-2025   
2           2   Jasprit           25             0  65 minutes  11-02-2025   
3           3  KL Rahul           70             2  60 minutes  11-02-2025   
4           4   Rishabh           75             3  72 minutes  11-02-2025   

  Team     Role  
0  RCB  Batsman  
1   MI  Batsman  
2   MI   Bowler  
3  LSG  Batsman  
4   DC  Batsman  
DataFrame after replacing unexpected values:
   Unnamed: 0 User Name  Runs in IPL  Match Wicket  Time Spent        Date  \
0           0     Virat           85             3  75 minutes  11-02-2025   
1           1     Rohit           90             2  70 minutes  11-02-2025   
2           2   Jasprit           25             0  65 minutes  11-02-2025   
3           3  KL Rahul      

6) Deleting and Removing Outliers
    Load a dataset and identify outliers in a numeric column using IQR method.
    Remove outliers and visualize the changes using a boxplot.

In [None]:
# detecting and removing outliers
df = pd.read_csv(r"Ipl_score.csv")
#identify outliers in a numeric columns using IQR method
Q1 = df['Runs in IPL'].quantile(0.25)
Q3 = df['Runs in IPL'].quantile(0.75)
IQR = Q3 - Q1
# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Filter out the outliers
df_no_outliers = df[(df['Runs in IPL'] >= lower_bound) & (df['Runs in IPL'] <= upper_bound)]
# Display the DataFrame without outliers
print("DataFrame without outliers:")
print(df_no_outliers.head())

DataFrame without outliers:
   Unnamed: 0 User Name  Runs in IPL  Match Wicket  Time Spent        Date  \
0           0     Virat           85             3  75 minutes  11-02-2025   
1           1     Rohit           90             2  70 minutes  11-02-2025   
2           2   Jasprit           25             0  65 minutes  11-02-2025   
3           3  KL Rahul           70             2  60 minutes  11-02-2025   
4           4   Rishabh           75             3  72 minutes  11-02-2025   

  Team     Role  
0  RCB  Batsman  
1   MI  Batsman  
2   MI   Bowler  
3  LSG  Batsman  
4   DC  Batsman  


7) Renaming Columns & Handling Irrelevant Features
    Load a dataset where column names are poorly formatted.
    Rename columns using a consistent naming convention.
    Drop columns that do not add value to the analysis.

In [None]:
#renaming columns & handling irrelevant features 
df = pd.read_csv(r"Ipl_score.csv")
# reaname columns using a consistent naming convention
df.rename(columns={'Runs in IPL': 'runs_in_ipl', 'Match Wicket': 'match_wicket'}, inplace=True)
print(df.columns)
# drop columns that do not add value to the analysis
df.drop(columns=['User Name'], inplace=True)
print(df.head())

Index(['Unnamed: 0', 'User Name', 'runs_in_ipl', 'match_wicket', 'Time Spent',
       'Date', 'Team', 'Role'],
      dtype='object')
   Unnamed: 0  runs_in_ipl  match_wicket  Time Spent        Date Team     Role
0           0           85             3  75 minutes  11-02-2025  RCB  Batsman
1           1           90             2  70 minutes  11-02-2025   MI  Batsman
2           2           25             0  65 minutes  11-02-2025   MI   Bowler
3           3           70             2  60 minutes  11-02-2025  LSG  Batsman
4           4           75             3  72 minutes  11-02-2025   DC  Batsman


8) Extracting All URLs from a Webpage
    Write a Python script to extract all hyperlinks (<a> tags with href attributes) from a given webpage.

In [10]:
#Extrating all url from a webpage
# URL of the webpage to scrape
url = 'https://www.britannica.com/story/what-was-the-first-book-ever-written'
req = requests.get(url)
bs = BeautifulSoup(req.text, 'html.parser')

for link in bs.find_all('a'):
    href = link.get('href')
    if href[:6] == "https:":
        print(href)

https://premium.britannica.com/premium-membership/?utm_source=premium&utm_medium=global-nav&utm_campaign=evergreen-cap
https://premium.britannica.com/premium-membership/?utm_source=premium&utm_medium=global-nav-mobile&utm_campaign=evergreen
https://premium.britannica.com/premium-membership/?utm_source=premium&utm_medium=hamburger-menu&utm_campaign=evergreen
https://kids.britannica.com/
https://www.facebook.com/BRITANNICA/
https://x.com/britannica
https://www.britannica.com/story/what-was-the-first-book-ever-written
https://www.facebook.com/BRITANNICA/
https://x.com/britannica
https://www.britannica.com/story/what-was-the-first-book-ever-written
https://cdn.britannica.com/62/121162-050-25A9F581/Flood-Tablet-epic-series-Gilgamesh-Nineveh-British.jpg
https://www.britannica.com/place/Mesopotamia-historical-region-Asia
https://www.britannica.com/topic/Epic-of-Gilgamesh
https://www.britannica.com/topic/Iliad-epic-poem-by-Homer
https://www.britannica.com/topic/Odyssey-epic-by-Homer
https://ww

9) Extracting Image URLs from a Webpage
    Write a script to scrape all image (<img>) URLs from a given webpage and save them to a list.

In [11]:
#Extrating image urls from webpages
url = 'https://www.britannica.com/story/what-was-the-first-book-ever-written'
req = requests.get(url)
bs = BeautifulSoup(req.text, 'html.parser')
for image in bs.find_all('img'):
    src = image.get('src')
    if src[:1] == "/":
        print(url + src)
    else:
        print(src)
#Save to list
image_list = []
for image in bs.find_all('img'):
    src = image.get('src')
    if src[:1] == "/":
        image_list.append(url + src)
    else:
        image_list.append(src)

https://cdn.britannica.com/mendel/eb-logo/MendelNewThistleLogo.png
https://cdn.britannica.com/mendel/eb-logo/MendelNewThistleLogo.png
https://cdn.britannica.com/28/188528-131-D58974EF/Marie-Antoinette-Louis-XVI-unrest-monarchy-overthrow-France-August-1792.jpg?w=68&h=68&s=crop
https://cdn.britannica.com/87/238587-050-A11B8CBF/Stylish-light-blue-jeans-on-wooden-background-closeup-of-inset-pocket.jpg?w=68&h=68&s=crop
https://cdn.britannica.com/27/238527-131-D73B3F08/flagpoles-world-countries.jpg?w=68&h=68&s=crop
https://cdn.britannica.com/64/164764-131-89DFC4D4/Richard-M-Nixon-scandal-press-conference-March-12-1971.jpg?w=68&h=68&s=crop
https://cdn.britannica.com/94/187194-131-4117D246/Thumbnail-flags-quiz-Russia-Iceland-Norway-Slovenia.jpg?w=68&h=68&s=crop
https://cdn.britannica.com/37/191937-131-9472ED49/Polar-bear-ice-floes-waters-Arctic-Norway.jpg?w=68&h=68&s=crop
https://cdn.britannica.com/70/62770-131-C97CD1F7/Black-widow-spider.jpg?w=68&h=68&s=crop
https://cdn.britannica.com/62/1211

10) Extracting Title & Meta Description of a Webpage (Write a script to extract and display the title and meta Description of a Webpage)

In [12]:
# URL of the webpage to scrape
url = 'https://www.britannica.com/story/what-was-the-first-book-ever-written'
req = requests.get(url)
bs = BeautifulSoup(req.text, 'html.parser')

# Extract title
title = bs.title.string if bs.title else "No title found"

# Extract meta description
meta_tag = bs.find('meta', attrs={'name': 'description'})
meta_description = meta_tag['content'] if meta_tag else "No meta description found"

# Print the title and meta description
print("Title:", title)
print("Meta Description:", meta_description)

Title: What Was the First Book Ever Written? | Britannica
Meta Description: Learn about the earliest surviving literature.


11) Removing Unwanted HTML Tags(Parse an HTML page and Remove all <script> and <style> tags, Extract the cleaned-up content.)

In [13]:
# URL of the webpage to scrape
url = 'https://www.britannica.com/story/what-was-the-first-book-ever-written'
req = requests.get(url)
bs = BeautifulSoup(req.text, 'html.parser')
# Remove all <script> and <style> tags
for script in bs(['script', 'style']):
    script.decompose()
# Get the text from the remaining HTML
text = bs.get_text()
# Print the cleaned text
print(text)
#Extract the cleaned-up content
cleaned_content = bs.find_all(text=True)
# Filter out unwanted tags
filtered_content = [content for content in cleaned_content if content.parent.name not in ['script', 'style']]
# Join the filtered content into a single string
cleaned_text = ''.join(filtered_content)
# Print the cleaned text
print(cleaned_text)
















What Was the First Book Ever Written? | Britannica











































Search Britannica





Click here to search











Search Britannica





Click here to search







   SUBSCRIBE



   SUBSCRIBE


Login

https://premium.britannica.com/premium-membership/?utm_source=premium&utm_medium=nav-login-box&utm_campaign=evergreen







  SUBSCRIBE




Home
History & Society
Science & Tech
Biographies
Animals & Nature
Geography & Travel
Arts & Culture
ProCon
Money


Games & Quizzes
Videos
On This Day
One Good Fact
Dictionary
New Articles

History & Society

Lifestyles & Social Issues
Philosophy & Religion
Politics, Law & Government
World History

Science & Tech

Health & Medicine
Science
Technology

Biographies

Browse Biographies

Animals & Nature

Birds, Reptiles & Other Vertebrates
Bugs, Mollusks & Other Invertebrates
Environment
Fossils & Geologic Time
Mammals
Plants

Geography & Travel

Geography & Travel

Arts & Culture

Entertainment & Pop C

  cleaned_content = bs.find_all(text=True)


12) Navigating & Extracting Specific HTML Elements (Given a webpage, find all <li> tags inside <ul> and navigate to the parent <ul> tag.)

In [14]:
#Navigating & extrating specific HTML elements

url = 'https://www.britannica.com/story/what-was-the-first-book-ever-written'
req = requests.get(url)
bs = BeautifulSoup(req.text, 'html.parser')
# Find all <li> tags inside <ul> tags
li_tags = bs.find_all('li')
# Print the parent <ul> tag for each <li> tag
for li in li_tags:
    parent_ul = li.find_parent('ul')
    if parent_ul:
        print("Parent <ul> tag:", parent_ul)
    else:
        print("No parent <ul> tag found for this <li> tag.")
li_texts = [li.get_text() for li in li_tags]
print("Extracted <li> text content:")
for text in li_texts:
    print(text)

Parent <ul> tag: <ul>
<li><a href="/">Home</a></li>
<li><a href="/History-Society">History &amp; Society</a></li>
<li><a href="/Science-Tech">Science &amp; Tech</a></li>
<li><a href="/Biographies">Biographies</a></li>
<li><a href="/Animals-Nature">Animals &amp; Nature</a></li>
<li><a href="/Geography-Travel">Geography &amp; Travel</a></li>
<li><a href="/Arts-Culture">Arts &amp; Culture</a></li>
<li><a href="/procon">ProCon</a></li>
<li><a href="/money">Money</a></li>
</ul>
Parent <ul> tag: <ul>
<li><a href="/">Home</a></li>
<li><a href="/History-Society">History &amp; Society</a></li>
<li><a href="/Science-Tech">Science &amp; Tech</a></li>
<li><a href="/Biographies">Biographies</a></li>
<li><a href="/Animals-Nature">Animals &amp; Nature</a></li>
<li><a href="/Geography-Travel">Geography &amp; Travel</a></li>
<li><a href="/Arts-Culture">Arts &amp; Culture</a></li>
<li><a href="/procon">ProCon</a></li>
<li><a href="/money">Money</a></li>
</ul>
Parent <ul> tag: <ul>
<li><a href="/">Home</