<a href="https://colab.research.google.com/github/chetankhairnar05/Python_Automation/blob/main/scrapping_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Task1: https://www.football-data.co.uk/englandm.php this page has so many url's of csv files of englan football leuges identify the urls and download all the csv files
# First, you'll need to install the required libraries.
# You can run these commands in your terminal or a Colab cell:
# !pip install requests
# !pip install beautifulsoup4
# !pip install pandas

import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

# Define the URL of the webpage and the directory where you want to save the files.
base_url = "https://www.football-data.co.uk/"
page_url = urljoin(base_url, "englandm.php")
download_dir = "football_data_csv"

# --- Step 1: Fetch the HTML content of the page ---
try:
    response = requests.get(page_url)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    print(f"Successfully fetched HTML from {page_url}")
except requests.exceptions.RequestException as e:
    print(f"Error fetching the URL: {e}")
    exit()

# --- Step 2: Parse the HTML to find all links ---
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
print(f"Found {len(links)} links on the page.")

# Create the directory to store the CSV files if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    print(f"Created directory: {download_dir}")

# --- Step 3: Filter for CSV links and download the files ---
csv_count = 0
for link in links:
    href = link.get('href')

    # We only care about links that end with '.csv'
    if href and href.endswith('.csv'):
        # Construct the full URL for the CSV file
        csv_url = urljoin(base_url, href)

        # Get the filename from the URL
        filename = os.path.basename(href)
        save_path = os.path.join(download_dir, filename)

        try:
            # Download the CSV file
            csv_response = requests.get(csv_url)
            csv_response.raise_for_status()

            # Save the file to the specified directory
            with open(save_path, 'wb') as f:
                f.write(csv_response.content)
            print(f"Successfully downloaded and saved: {filename}")
            csv_count += 1
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {filename}: {e}")

print(f"\n--- Download complete! Downloaded {csv_count} CSV files. ---\n")

# --- Step 4: Demonstrate reading a downloaded CSV file with pandas ---
if csv_count > 0:
    # Let's read the first downloaded file as an example
    first_file = os.path.join(download_dir, os.listdir(download_dir)[0])
    try:
        df = pd.read_csv(first_file)
        print(f"Successfully read {os.path.basename(first_file)} into a pandas DataFrame.")
        print("\nFirst 5 rows of the DataFrame:")
        print(df.head())
    except Exception as e:
        print(f"Error reading the CSV file {first_file}: {e}")


Successfully fetched HTML from https://www.football-data.co.uk/englandm.php
Found 311 links on the page.
Created directory: football_data_csv
Successfully downloaded and saved: E2.csv
Successfully downloaded and saved: E3.csv
Successfully downloaded and saved: E0.csv
Successfully downloaded and saved: E1.csv
Successfully downloaded and saved: E2.csv
Successfully downloaded and saved: E3.csv
Successfully downloaded and saved: EC.csv
Successfully downloaded and saved: E0.csv
Successfully downloaded and saved: E1.csv
Successfully downloaded and saved: E2.csv
Successfully downloaded and saved: E3.csv
Successfully downloaded and saved: EC.csv
Successfully downloaded and saved: E0.csv
Successfully downloaded and saved: E1.csv
Successfully downloaded and saved: E2.csv
Successfully downloaded and saved: E3.csv
Successfully downloaded and saved: EC.csv
Successfully downloaded and saved: E0.csv
Successfully downloaded and saved: E1.csv
Successfully downloaded and saved: E2.csv
Successfully downl

In [3]:
import shutil
from google.colab import files

# Create a zip file of the directory
shutil.make_archive("football_data_csv", 'zip', "football_data_csv")

# Download the zip file
files.download("football_data_csv.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>