# In [3]:
import csv
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Example function for scraping data from a website
def _fetch_vehicle_name(reg_num, timeout):
    """Fetch the RC-details page for one registration number.

    Returns the vehicle name text found on the page, or a human-readable
    status message when the page could not be retrieved or parsed.
    Network errors raised by ``requests`` propagate to the caller.
    """
    # Example URL structure, replace with actual website URL
    url = f"https://www.carinfo.app/rc-details/{reg_num}"

    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would hang the whole batch on a single bad lookup.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return f"Failed to retrieve data, status code: {response.status_code}"

    soup = BeautifulSoup(response.content, 'html.parser')

    # Example: Extract vehicle name from scraped HTML
    div_element = soup.find("div", {"class": "MuiBox-root css-ig0coa"})
    if div_element is None:
        return "Div element not found"

    p_element = div_element.find(
        "p", {"class": "MuiTypography-root MuiTypography-body1 css-1qt56zs"}
    )
    if p_element is None:
        return "Vehicle name not found"

    return p_element.text.strip()


def scrape_data_from_website(registration_numbers, *, timeout: float = 10.0,
                             delay: float = 1.0) -> dict:
    """Scrape the vehicle name for each registration number.

    Args:
        registration_numbers: Iterable of registration numbers; each one is
            interpolated into the lookup URL.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to sleep after each request, to avoid overloading
            the server.

    Returns:
        A dict mapping every registration number to either the scraped
        vehicle name or a message describing why it could not be obtained.
        A registration number that appears more than once is fetched again
        and its entry overwritten.
    """
    scraped_data = {}

    for reg_num in registration_numbers:
        try:
            scraped_data[reg_num] = _fetch_vehicle_name(reg_num, timeout)
        except Exception as e:
            # Deliberately broad: one failing lookup (network error, timeout,
            # parser problem) must be recorded and must not abort the batch.
            scraped_data[reg_num] = f"Failed to retrieve data: {str(e)}"

        # Add a delay between requests to avoid overloading the server
        time.sleep(delay)

    return scraped_data

def split_csv_and_scrape(input_file, output_prefix, chunk_size=500):
    """Scrape a CSV of registration numbers in chunks, one output file per chunk.

    Reads ``input_file`` with pandas, takes the registration numbers from its
    ``replaced_string`` column, scrapes them ``chunk_size`` rows at a time and
    writes each chunk's results to ``{output_prefix}_{n}_scraped.csv``
    (``n`` starting at 1) as ``registration_number,result`` rows.

    Args:
        input_file: Path of the input CSV file.
        output_prefix: Prefix used to build each output file name.
        chunk_size: Maximum number of rows scraped per output file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    # Read the input CSV file
    df = pd.read_csv(input_file)

    # Stepping through the row offsets yields exactly ceil(len / chunk_size)
    # chunks without separate remainder arithmetic.
    chunk_starts = range(0, len(df), chunk_size)
    for chunk_number, start_idx in enumerate(chunk_starts, start=1):
        chunk = df.iloc[start_idx:start_idx + chunk_size]

        # Perform scraping on the registration numbers in this chunk
        registration_numbers = chunk['replaced_string'].tolist()
        scraped_data = scrape_data_from_website(registration_numbers)

        output_file = f"{output_prefix}_{chunk_number}_scraped.csv"

        # csv.writer quotes fields that contain commas or quotes; scraped
        # values (and the status-code failure message) can contain commas,
        # which would otherwise shift columns. newline='' lets the csv
        # module control line endings; "\n" keeps the previous row terminator.
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerows(scraped_data.items())

        print(f"Saved scraped data to {output_file}")

# Example usage: run the chunked scrape with the default chunk size
input_file = "Vehiclefinal2.csv"
output_prefix = "outputfinalnew2__"
split_csv_and_scrape(input_file=input_file, output_prefix=output_prefix)


# Saved scraped data to outputfinalnew2___1_scraped.csv
# Saved scraped data to outputfinalnew2___2_scraped.csv
# Saved scraped data to outputfinalnew2___3_scraped.csv
