# Q1. Scrape the details of most viewed videos on YouTube from Wikipedia. Url
= https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_videos You need to find following details: A)
Rank
B) Name
C) Artist
D) Upload date
E) Views 

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

# Set up Selenium WebDriver
driver = webdriver.Chrome()

# Navigate to the Wikipedia page
driver.get("https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_videos")

# Extract video data
videos = []
table = driver.find_element(By.XPATH, '//table[@class="sortable wikitable sticky-header static-row-numbers sort-under col3center col4right jquery-tablesorter"]')
rows = table.find_elements(By.XPATH, './/tr')[1:]

for row in rows:
    cols = row.find_elements(By.XPATH, './/td')
    if len(cols) >= 5:
        rank = cols[0].text.strip()
        name_artist_text = cols[1].text.strip()
        name, artist = name_artist_text.split('\n') if '\n' in name_artist_text else (name_artist_text, "")
        upload_date = cols[3].text.strip()
        views = cols[2].text.strip()
        
        videos.append({
            "Rank": rank,
            "Name": name,
            "Artist": artist,
            "Upload Date": upload_date,
            "Views": views
        })

# Close the WebDriver
driver.quit()

# Save the data to a DataFrame and then to a CSV file
df = pd.DataFrame(videos)
df.to_csv("most_viewed_youtube_videos.csv", index=False)

print(df)


                                               Rank  \
0                             "Baby Shark Dance"[7]   
1                                   "Despacito"[10]   
2                        "Johny Johny Yes Papa"[18]   
3                                   "Bath Song"[19]   
4                               "See You Again"[20]   
5                                "Shape of You"[25]   
6                           "Wheels on the Bus"[28]   
7                 "Phonics Song with Two Words"[29]   
8                                 "Uptown Funk"[30]   
9                               "Gangnam Style"[31]   
10  "Learning Colors – Colorful Eggs on a Farm"[36]   
11                             "Dame Tu Cosita"[37]   
12   "Masha and the Bear – Recipe for Disaster"[38]   
13                                     "Axel F"[39]   
14                        "Baa Baa Black Sheep"[40]   
15                                      "Sugar"[41]   
16                             "Lakdi Ki Kathi"[42]   
17        

# Q2. Scrape the details team India’s international fixtures from bcci.tv.
Url = https://www.bcci.tv/.
You need to find following details:
A) Series
B) Place
C) Date
D) Time
Note: - From bcci.tv home page you have reach to the international fixture page through code. 

In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time


driver = webdriver.Chrome()
driver.get("https://www.bcci.tv/fixtures")
time.sleep(10) 

while True:
    try:
        load_more_button = driver.find_element(By.XPATH, '//*[@id="fixtures"]/div[3]/div[2]/div/button')
        load_more_button.click()
        time.sleep(5) 
    except:
        break

fixture_data = []



for card in fixture_cards:
    try:
        series = driver.find_element(By.XPATH, '//h5[@class="match-tournament-name ng-binding"]').text
    except:
        series = "N/A"

    try:
        place = driver.find_element(By.XPATH, './/div[@class="fixture-card-bottom"]/span').text.strip()
    except:
        place = "N/A"

    try:
        date = driver.find_element(By.XPATH, './/div[@class="fixture-card-date-time"]/span[1]').text.strip()
    except:
        date = "N/A"
    
    try:
        time = driver.find_element(By.XPATH, './/div[@class="fixture-card-date-time"]/span[2]').text.strip()
    except:
        time = "N/A"

    fixture_data.append({
        "Series": series,
        "Place": place,
        "Date": date,
        "Time": time
    })


df = pd.DataFrame(fixture_data)
df.to_csv("bcci_international_fixtures.csv", index=False)
print(df)


Empty DataFrame
Columns: []
Index: []


# Q 3. Scrape the details of State-wise GDP of India from statisticstime.com.
Url = http://statisticstimes.com/
You have to find following details: A) Rank
B) State
C) GSDP(18-19)- at current prices
D) GSDP(19-20)- at current prices
E) Share(18-19)
F) GDP($ billion)
Note: - From statisticstimes home page you have to reach to economy page through code. 

In [23]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get("http://statisticstimes.com/")
time.sleep(7)

try:
    economy_link = driver.find_element(By.XPATH, '//*[@id="top"]/div[2]/div[2]/button')
    economy_link.click()
    time.sleep(3)

    indian_economy_link = driver.find_element(By.XPATH, '//*[@id="top"]/div[2]/div[2]/div/a[3]')
    indian_economy_link.click()
    time.sleep(3)

    indian_states_link = driver.find_element(By.XPATH, '/html/body/div/div[2]/div[2]/ul/li[1]/a')
    indian_states_link.click()
    time.sleep(3)

    try:
        close_button = driver.find_element(By.XPATH, '//div[@id="dismiss-button"]')
        close_button.click()
        time.sleep(2)
    except:
        pass

    table = driver.find_element(By.XPATH, '//*[@id="table_id"]/tbody')
    rows = table.find_elements(By.TAG_NAME, "tr")

    gdp_data = []

    for row in rows:
        data = row.find_elements(By.TAG_NAME, "td")
        if len(data) == 6:
            rank = data[0].text
            state = data[1].text
            gsdp_18_19 = data[2].text
            gsdp_19_20 = data[3].text
            share_18_19 = data[4].text
            gdp_billion = data[5].text

            gdp_data.append({
                "Rank": rank,
                "State": state,
                "GSDP(18-19) - Current Prices": gsdp_18_19,
                "GSDP(19-20) - Current Prices": gsdp_19_20,
                "Share(18-19)": share_18_19,
                "GDP ($ Billion)": gdp_billion
            })

    driver.quit()

    df = pd.DataFrame(gdp_data)
    df.to_csv("statewise_gdp_india.csv", index=False)
    print("Data saved successfully.")

except Exception as e:
    print("An error occurred:", str(e))
    driver.quit()


An error occurred: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div/div[2]/div[2]/ul/li[1]/a"}
  (Session info: chrome=125.0.6422.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7F15D1F52+60322]
	(No symbol) [0x00007FF7F154CEC9]
	(No symbol) [0x00007FF7F1407EBA]
	(No symbol) [0x00007FF7F1457676]
	(No symbol) [0x00007FF7F145773C]
	(No symbol) [0x00007FF7F149E967]
	(No symbol) [0x00007FF7F147C25F]
	(No symbol) [0x00007FF7F149BC80]
	(No symbol) [0x00007FF7F147BFC3]
	(No symbol) [0x00007FF7F1449617]
	(No symbol) [0x00007FF7F144A211]
	GetHandleVerifier [0x00007FF7F18E94AD+3301629]
	GetHandleVerifier [0x00007FF7F19336D3+3605283]
	GetHandleVerifier [0x00007FF7F1929450+3563680]
	GetHandleVerifier [0x00007FF7F1684326+790390]
	(No symbol) [0x00007FF7F155750F]
	(No symbol) [0x00007FF7F1553404]
	(No symbol) [

# Q 4. Scrape the details of trending repositories on Github.com.
Url = https://github.com/
You have to find the following details:
A) Repository title
B) Repository description
C) Contributors count
D) Language used 

In [25]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get("https://github.com/")
time.sleep(5)  # Wait for the page to fully load

try:
 
    trending_repos_section = driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/header/div/div[2]/div/nav/ul/li[3]/div/div[3]/ul/li[2]/a')
    repos = trending_repos_section.find_elements(By.XPATH, './/article[@class="Box-row"]')

    repo_data = []

    for repo in repos:
        try:
            title = repo.find_element(By.TAG_NAME, 'h1').text.strip()
        except:
            title = "No title"

        try:
            description = repo.find_element(By.TAG_NAME, 'p').text.strip()
        except:
            description = "No description"

        try:
            language = repo.find_element(By.XPATH, './/span[@itemprop="programmingLanguage"]').text.strip()
        except:
            language = "No language"

        try:
            contributors = repo.find_element(By.XPATH, './/a[contains(@href, "/contributors")]').text.strip()
        except:
            contributors = "0"

        repo_data.append({
            "Repository Title": title,
            "Repository Description": description,
            "Contributors Count": contributors,
            "Language Used": language
        })

    driver.quit()

 
    df = pd.DataFrame(repo_data)
    df.to_csv("github_trending_repos.csv", index=False)
    print("Data saved successfully.")

except Exception as e:
    print("An error occurred:", str(e))
    driver.quit()


Data saved successfully.


# Q 5. Scrape the details of top 100 songs on billiboard.com. Url = https:/www.billboard.com/ You have to find the
following details:
A) Song name
B) Artist name
C) Last week rank
D) Peak rank
E) Weeks on board
 Note: - From the home page you have to click on the charts option then hot 100-page link through code. 

In [33]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

try:

    driver = webdriver.Chrome()
    driver.get("https://www.billboard.com/charts/hot-100/")
    time.sleep(5)  


    songs = driver.find_elements(By.XPATH, '//li[@class="chart-list__element display--flex"]')

    song_data = []

    for song in songs[:100]:  
        try:
            song_name = song.find_element(By.XPATH, './/span[@class="chart-element__information"]/span[@class="chart-element__information__song text--truncate color--primary"]').text.strip()
        except:
            song_name = "No Song Name"

        try:
            artist_name = song.find_element(By.XPATH, './/span[@class="chart-element__information"]/span[@class="chart-element__information__artist text--truncate color--secondary"]').text.strip()
        except:
            artist_name = "No Artist Name"

        try:
            last_week_rank = song.find_element(By.XPATH, './/div[@class="chart-element__meta text--center color--secondary text--last"]').text.strip()
        except:
            last_week_rank = "No Last Week Rank"

        try:
            peak_rank = song.find_element(By.XPATH, './/div[@class="chart-element__meta text--center color--secondary text--peak"]').text.strip()
        except:
            peak_rank = "No Peak Rank"

        try:
            weeks_on_board = song.find_element(By.XPATH, './/div[@class="chart-element__meta text--center color--secondary text--week"]').text.strip()
        except:
            weeks_on_board = "No Weeks on Board"

        song_data.append({
            "Song Name": song_name,
            "Artist Name": artist_name,
            "Last Week Rank": last_week_rank,
            "Peak Rank": peak_rank,
            "Weeks on Board": weeks_on_board
        })

  
    driver.quit()

 
    df = pd.DataFrame(song_data)
    df.to_csv("billboard_top_100_songs.csv", index=False)
    print("Data saved successfully.")

except Exception as e:
    print("An error occurred:", str(e))
    driver.quit()


Data saved successfully.


# Q 6. Scrape the details of Highest selling novels.
A) Book name
B) Author name
C) Volumes sold
D) Publisher
E) Genre
 Url - https://www.theguardian.com/news/datablog/2012/aug/09/best-selling-books-all-time-fifty-shades-grey-compare

In [35]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd


driver = webdriver.Chrome()
url = "https://www.theguardian.com/news/datablog/2012/aug/09/best-selling-books-all-time-fifty-shades-grey-compare"
driver.get(url)

try:
   
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//table[@class="in-article sortable"]')))


    table = driver.find_element(By.XPATH, '//table[@class="in-article sortable"]')

  
    rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  

    book_data = []

    for row in rows:
        columns = row.find_elements(By.TAG_NAME, 'td')
        
        
        if len(columns) >= 6:
            book_name = columns[1].text.strip()
            author_name = columns[2].text.strip()
            volumes_sold = columns[3].text.strip()
            publisher = columns[4].text.strip()
            genre = columns[5].text.strip()
            
           
            book_data.append({
                "Book Name": book_name,
                "Author Name": author_name,
                "Volumes Sold": volumes_sold,
                "Publisher": publisher,
                "Genre": genre
            })

   
    driver.quit()

   
    df = pd.DataFrame(book_data)
    df.to_csv("highest_selling_novels.csv", index=False)
    print("Data saved successfully.")

except Exception as e:
    print("An error occurred:", str(e))
    driver.quit()


Data saved successfully.


# Q 7. Scrape the details most watched tv series of all time from imdb.com.
Url = https://www.imdb.com/list/ls095964455/ You have
to find the following details:
A) Name
B) Year span
C) Genre
D) Run time
E) Ratings
F) Votes 

In [36]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time


driver = webdriver.Chrome()
url = "https://www.imdb.com/list/ls095964455/"
driver.get(url)
time.sleep(5)  

try:

    series_elements = driver.find_elements(By.XPATH, '//div[@class="lister-item-content"]')

    series_data = []

    for series in series_elements:
        try:
            name = series.find_element(By.XPATH, './/h3[@class="lister-item-header"]/a').text.strip()
        except:
            name = "No Name"

        try:
            year_span = series.find_element(By.XPATH, './/span[@class="lister-item-year text-muted unbold"]').text.strip()
        except:
            year_span = "No Year Span"

        try:
            genre = series.find_element(By.XPATH, './/span[@class="genre"]').text.strip()
        except:
            genre = "No Genre"

        try:
            runtime = series.find_element(By.XPATH, './/span[@class="runtime"]').text.strip()
        except:
            runtime = "No Runtime"

        try:
            ratings = series.find_element(By.XPATH, './/div[@class="ipl-rating-star small"]/span[@class="ipl-rating-star__rating"]').text.strip()
        except:
            ratings = "No Ratings"

        try:
            votes = series.find_element(By.XPATH, './/p[@class="text-muted text-small"]/span[@name="nv"]').text.strip()
        except:
            votes = "No Votes"

        series_data.append({
            "Name": name,
            "Year Span": year_span,
            "Genre": genre,
            "Run Time": runtime,
            "Ratings": ratings,
            "Votes": votes
        })

    
    driver.quit()

    
    df = pd.DataFrame(series_data)
    df.to_csv("imdb_most_watched_tv_series.csv", index=False)
    print("Data saved successfully.")

except Exception as e:
    print("An error occurred:", str(e))
    driver.quit()


Data saved successfully.


# Q 8.Details of Datasets from UCI machine learning repositories.
Url = https://archive.ics.uci.edu/ You
have to find the following details:
A) Dataset name
B) Data type
C) Task
D) Attribute type
E) No of instances
F) No of attribute G) Year
 Note: - from the home page you have to go to the Show All Dataset page through code. 

In [39]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time


driver = webdriver.Chrome()
url = "https://archive.ics.uci.edu/"
driver.get(url)
time.sleep(3)  

try:
   
    show_all_link = driver.find_element(By.XPATH, '/html/body/div/div[1]/div[1]/header/nav/ul/li[1]/a')
    show_all_link.click()
    time.sleep(3)  
    table = driver.find_element(By.XPATH, '//div[@class="flex flex-col gap-1"]')
    rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  

    dataset_data = []

    for row in rows:
        columns = row.find_elements(By.TAG_NAME, 'td')

        
        dataset_name = columns[0].text.strip()
        data_type = columns[1].text.strip()
        task = columns[2].text.strip()
        attribute_types = columns[3].text.strip()
        num_instances = columns[4].text.strip()
        num_attributes = columns[5].text.strip()
        year = columns[6].text.strip()

      
        dataset_data.append({
            "Dataset Name": dataset_name,
            "Data Type": data_type,
            "Task": task,
            "Attribute Types": attribute_types,
            "No of Instances": num_instances,
            "No of Attributes": num_attributes,
            "Year": year
        })


    driver.quit()

   
    df = pd.DataFrame(dataset_data)
    df.to_csv("uci_datasets_details.csv", index=False)
    print("Data saved successfully.")

except Exception as e:
    print("An error occurred:", str(e))
    driver.quit()


Data saved successfully.
