In [None]:
# Generate Excel Spreadsheet

In [2]:
import pandas as pd
import numpy as np

# Create a DataFrame with columns A to J and 100 rows
df = pd.DataFrame(np.arange(1, 101).reshape(100, 1), columns=['A'])
for col in range(ord('B'), ord('K')):
  df[chr(col)] = np.arange(1, 101)

# Save the DataFrame to an Excel file
df.to_excel('output.xlsx', index=False)

print("Excel sheet 'output.xlsx' created successfully.")

Excel sheet 'output.xlsx' created successfully.


In [3]:
excel_df = pd.read_excel('output.xlsx')
display(excel_df)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1,1,1,1,1,1,1,1,1,1
1,2,2,2,2,2,2,2,2,2,2
2,3,3,3,3,3,3,3,3,3,3
3,4,4,4,4,4,4,4,4,4,4
4,5,5,5,5,5,5,5,5,5,5
...,...,...,...,...,...,...,...,...,...,...
95,96,96,96,96,96,96,96,96,96,96
96,97,97,97,97,97,97,97,97,97,97
97,98,98,98,98,98,98,98,98,98,98
98,99,99,99,99,99,99,99,99,99,99


In [None]:
# Top 250 IMDB movie

In [70]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings

# Suppress warnings that might occur during HTML parsing
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

def scrape_imdb_top_250(url):
    """Fetches and parses the IMDB Top 250 chart into a list of dictionaries."""
    # Use a User-Agent to mimic a browser, which is crucial for IMDB
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []

    soup = BeautifulSoup(response.text, 'lxml')

    # The main list items are the <li> tags with class 'ipc-metadata-list-summary-item'
    movie_list_items = soup.find_all('li', class_='ipc-metadata-list-summary-item')

    movie_data = []

    for item in movie_list_items:

        # --- 1. Rank --- (<div class="ipc-signpost__text">#1</div>)
        rank_element = item.find('div', class_='ipc-signpost__text')
        rank = rank_element.get_text(strip=True).strip('#') if rank_element else 'N/A'

        # --- 2. Title --- (<h3 class="ipc-title__text ipc-title__text--reduced">Title</h3>)
        title_element = item.find('h3', class_='ipc-title__text')
        title = title_element.get_text(strip=True) if title_element else 'N/A'

        # --- 3. Release Year --- (<span class="sc-caa65599-7 eeMIpC cli-title-metadata-item">1994</span>)
        metadata_spans = item.find_all('span', class_='cli-title-metadata-item')
        year = metadata_spans[0].get_text(strip=True) if metadata_spans and len(metadata_spans) > 0 else 'N/A'

        # --- 4. Rating --- (<span class="ipc-rating-star--rating">9.3</span>)
        rating_span = item.find('span', class_='ipc-rating-star--rating')
        rating = rating_span.get_text(strip=True) if rating_span else 'N/A'

        movie_data.append({
            'Rank': rank,
            'Title': title,
            'Year': year,
            'Rating': rating
        })

    return movie_data

# --- Main Execution ---
imdb_url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
scraped_data = scrape_imdb_top_250(imdb_url)

if scraped_data:
    df_imdb_top250 = pd.DataFrame(scraped_data)

    # Convert to numeric, handle errors, and sort
    df_imdb_top250['Rank'] = pd.to_numeric(df_imdb_top250['Rank'], errors='coerce')
    df_imdb_top250['Year'] = pd.to_numeric(df_imdb_top250['Year'], errors='coerce')
    df_imdb_top250['Rating'] = pd.to_numeric(df_imdb_top250['Rating'], errors='coerce')
    df_imdb_top250 = df_imdb_top250.dropna(subset=['Rank']).sort_values(by='Rank')

    # Select and reorder the columns as requested: Title, Rank, Year, Rating
    df_display = df_imdb_top250[['Title', 'Rank', 'Year', 'Rating']]

    # Print a preview and save the data
    print("--- IMDB Top 250 Movies ---")
    print(df_display.head().to_markdown(index=False))

    csv_file = 'imdb_top_250_movies.csv'
    df_display.to_csv(csv_file, index=False)
    print(f"\nFull dataset saved to {csv_file}")
else:
    print("Scraping failed. Check the Colab environment for required libraries (requests, beautifulsoup4, lxml) and network access.")

--- IMDB Top 250 Movies ---
| Title                    |   Rank |   Year |   Rating |
|:-------------------------|-------:|-------:|---------:|
| The Shawshank Redemption |      1 |   1994 |      9.3 |
| The Godfather            |      2 |   1972 |      9.2 |
| The Dark Knight          |      3 |   2008 |      9.1 |
| The Godfather Part II    |      4 |   1974 |      9   |
| 12 Angry Men             |      5 |   1957 |      9   |

Full dataset saved to imdb_top_250_movies.csv


**Grab the <li> tag as the parent and give more context with the class name as the attributes of the movie**

Fossil C02

In [77]:
import pandas as pd
import requests
import warnings
import numpy as np
from io import StringIO # Import StringIO for best practice with read_html

# Suppress warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

def clean_and_convert(series):
    """Aggressively cleans a pandas Series of string data for float conversion."""
    return (
        series.astype(str)
        .str.replace(',', '', regex=False)     # Remove thousands separators
        .str.replace('%', '', regex=False)     # Remove percent signs
        .str.replace('−', '-', regex=False)    # Convert Unicode minus sign to ASCII minus
        .str.replace(r'[^\d\.\-]', '', regex=True) # REMOVE ALL non-digit, non-decimal, non-minus chars
        .replace('', np.nan)                   # Replace any resulting empty strings with NaN
        .astype(float)                         # Finally, convert to float
    )

def scrape_worldometers_co2(url):
    """
    Fetches the historical CO2 emissions table from Worldometer,
    cleans the data, and returns a DataFrame.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Wrap response text in StringIO to follow best practice (suppress FutureWarning)
        dataframes = pd.read_html(StringIO(response.text))
        df_co2 = dataframes[0].copy()

    except Exception as e:
        print(f"Failed to scrape table using pandas.read_html(): {e}")
        return pd.DataFrame()

    # --- 1. Column Naming Fix (Addressing the previous Length Mismatch Error) ---
    if df_co2.shape[1] == 7:
        df_co2.columns = [
            'Drop_Index', 'Year', 'CO2 Emissions (tons)', '1 Year Change',
            'Per Capita', 'Population', 'Pop. Change'
        ]
        df_co2 = df_co2.drop(columns=['Drop_Index'])
    elif df_co2.shape[1] != 6:
        print(f"Error: Expected 6 data columns, found {df_co2.shape[1]} after reading. Check table structure.")
        return pd.DataFrame()

    # --- 2. Data Cleaning and Conversion (Addressing the ValueError) ---

    # Apply aggressive cleaning to the CO2 emissions column
    df_co2['CO2 Emissions (tons)'] = clean_and_convert(df_co2['CO2 Emissions (tons)'])

    # Apply aggressive cleaning to the percentage columns
    for col in ['1 Year Change', 'Pop. Change', 'Per Capita']: # Apply to all float candidates for safety
        df_co2[col] = clean_and_convert(df_co2[col])

    # Clean and convert 'Population' to integer
    df_co2['Population'] = (
        df_co2['Population']
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.replace(r'[^\d]', '', regex=True) # Remove all non-digits
        .astype(int)
    )

    # Set 'Year' as the index and sort descending (most recent first)
    df_co2 = df_co2.set_index('Year').sort_index(ascending=False)

    return df_co2

# --- Main Execution ---
worldometers_url = 'https://www.worldometers.info/co2-emissions/us-co2-emissions/'
df_us_co2 = scrape_worldometers_co2(worldometers_url)

if not df_us_co2.empty:

    # Display the result
    print("--- U.S. CO2 Emissions Historical Data (Worldometer) ---")
    print(df_us_co2.head(10).to_markdown())
    print(f"\nTotal historical years scraped: {len(df_us_co2)}")

    # Save the data to a CSV file
    csv_file = 'us_co2_emissions_historical.csv'
    df_us_co2.to_csv(csv_file)
    print(f"\nFull dataset saved to {csv_file}")
else:
    print("DataFrame is empty. Scraping failed.")

--- U.S. CO2 Emissions Historical Data (Worldometer) ---
|       Year |   CO2 Emissions (tons) |   1 Year Change |   Per Capita |   Population |   Pop. Change |
|-----------:|-----------------------:|----------------:|-------------:|-------------:|--------------:|
| 5929188390 |                   2.9  |           21.06 |  2.81484e+08 |           97 |         23.14 |
| 5888498740 |                   0.14 |           19.91 |  2.95717e+08 |            1 |         19.68 |
| 5880374400 |                   1.5  |           20.08 |  2.92786e+08 |           99 |         20.39 |
| 5878074030 |                   0.86 |           20.68 |  2.8428e+08  |           99 |         22.66 |
| 5873134210 |                   1.43 |           19.46 |  3.01844e+08 |          104 |         18.29 |
| 5793304050 |                   1.15 |           19.98 |  2.89908e+08 |           98 |         21.02 |
| 5790389750 |                   1.67 |           19.38 |  2.9873e+08  |          102 |         18.71 |
| 57623

In [78]:
# Assuming the DataFrame 'df_us_co2' from the previous step is available

if not df_us_co2.empty:
    print("--- U.S. CO2 Emissions Data Analysis ---")

    # 1. Total Emission Change Calculation
    # Find the earliest year and the latest year in the dataset
    earliest_year = df_us_co2.index.min()
    latest_year = df_us_co2.index.max()

    # Get emissions values for the start and end years
    start_emissions = df_us_co2.loc[earliest_year, 'CO2 Emissions (tons)']
    end_emissions = df_us_co2.loc[latest_year, 'CO2 Emissions (tons)']

    # Calculate the total percentage change
    total_change_percent = ((end_emissions - start_emissions) / start_emissions) * 100

    print(f"\n1. Emission Trend ({earliest_year} to {latest_year}):")
    print(f"   Emissions in {earliest_year}: {start_emissions:,.0f} tons")
    print(f"   Emissions in {latest_year}: {end_emissions:,.0f} tons")
    print(f"   Total Percentage Change: {total_change_percent:.2f}%")
    print("-" * 40)


    # 2. Largest Annual Drop in Emissions
    # Sort by '1 Year Change' in ascending order (most negative is the largest drop)
    largest_drop = df_us_co2.sort_values(by='1 Year Change', ascending=True).iloc[0]

    # The '1 Year Change' column is already a percentage, so we just report it.
    drop_year = largest_drop.name # The year is the index
    drop_value = largest_drop['1 Year Change']

    print("2. Largest Annual CO2 Emission Drop:")
    print(f"   Year of Drop: {drop_year}")
    print(f"   Change: {drop_value:.2f}% decrease")
    print(f"   Emissions: {largest_drop['CO2 Emissions (tons)']:,.0f} tons")
    print("-" * 40)


    # 3. Quick Summary Statistics
    print("3. Summary Statistics for Emissions (tons):")
    print(df_us_co2['CO2 Emissions (tons)'].describe().to_markdown(numalign="left", stralign="left"))

else:
    print("DataFrame 'df_us_co2' is empty or not found. Please run the scraping code cell first.")

--- U.S. CO2 Emissions Data Analysis ---

1. Emission Trend (4449362630 to 5929188390):
   Emissions in 4449362630: 0 tons
   Emissions in 5929188390: 3 tons
   Total Percentage Change: 705.56%
----------------------------------------
2. Largest Annual CO2 Emission Drop:
   Year of Drop: 4466731420
   Change: 13.16% decrease
   Emissions: 10 tons
----------------------------------------
3. Summary Statistics for Emissions (tons):
|       | CO2 Emissions (tons)   |
|:------|:-----------------------|
| count | 46                     |
| mean  | 2.56783                |
| std   | 2.07691                |
| min   | 0.14                   |
| 25%   | 1.06                   |
| 50%   | 1.955                  |
| 75%   | 3.405                  |
| max   | 10.07                  |
