# Packages Used

In [None]:
import pandas as pd
import requests
from datetime import datetime
import gdown
import os
import numpy as np
import zipfile
import io
from google.colab import drive
import time

# Question 1: Loading a Large CSV

In [None]:
# Path of the raw activity-log CSV that process_all_chunks() reads further down.
# NOTE(review): the path sits under /content/drive, so it presumably needs Google
# Drive to be mounted first — no drive.mount() call is visible in these cells.
local_file_path = '/content/drive/MyDrive/BDA_Datasets/activity_log_raw.csv'  # Path to the input CSV file

def process_chunk(chunk, date_format):
    """
    Cleans one chunk of the activity log and parses its timestamps.

    The caller's DataFrame is never modified; a cleaned copy is returned.
    A chunk without an ACTIVITY_TIME column is returned unchanged.

    Args:
        chunk (DataFrame): A chunk of the data to be processed.
        date_format (str): The date format to use when converting to datetime.

    Returns:
        DataFrame: The chunk with missing, duplicate and unparseable rows
        removed and ACTIVITY_TIME converted to datetime.
    """
    if 'ACTIVITY_TIME' not in chunk.columns:
        return chunk

    # Drop missing timestamps first and copy, so the input frame stays intact.
    cleaned = chunk.dropna(subset=['ACTIVITY_TIME']).copy()

    # Cast to str before using the .str accessor: pandas may infer a
    # non-string dtype for the column, in which case .str raises.
    cleaned['ACTIVITY_TIME'] = (
        cleaned['ACTIVITY_TIME'].astype(str).str.replace('\x00', '', regex=False)
    )

    # Duplicates are judged on every column present in the chunk; when only
    # ACTIVITY_TIME is loaded, rows sharing a timestamp collapse into one.
    cleaned = cleaned.drop_duplicates()

    # Values that do not match date_format become NaT and are dropped.
    cleaned['ACTIVITY_TIME'] = pd.to_datetime(
        cleaned['ACTIVITY_TIME'], format=date_format, errors='coerce'
    )
    return cleaned.dropna(subset=['ACTIVITY_TIME'])

class _NulStrippingReader(io.TextIOBase):
    """Text stream wrapper that removes NUL characters from everything it returns."""

    def __init__(self, handle):
        super().__init__()
        self._handle = handle

    def readable(self):
        return True

    def _clean(self, fetch, size):
        # Keep fetching while a non-empty piece consisted solely of NULs:
        # returning '' for it would be mistaken for end-of-file.
        while True:
            raw = fetch(size)
            cleaned = raw.replace('\x00', '')
            if cleaned or not raw:
                return cleaned

    def readline(self, size=-1):
        return self._clean(self._handle.readline, size)

    def read(self, size=-1):
        return self._clean(self._handle.read, size)


def _busiest_year_month(month_counts):
    """Returns (year, month) of the most frequent period, or None when there are no counts."""
    if month_counts is None or month_counts.empty:
        return None
    top_period = month_counts.idxmax()
    return top_period.year, top_period.month


def process_all_chunks(file_path, chunksize=100000, date_format='%d-%b-%y %I.%M.%S.%f %p', encoding='latin-1'):
    """
    Processes all chunks of the CSV file and calculates the date range and busiest year/month.

    Only the ACTIVITY_TIME column is read. NUL characters are removed from the
    text stream before it reaches the parser, because the python parsing engine
    can abort the whole read with "line contains NUL" when it meets one.

    Args:
        file_path (str): Path to the CSV file.
        chunksize (int): Number of rows to read per chunk.
        date_format (str): Format of the datetime column.
        encoding (str): Encoding of the CSV file.

    Returns:
        tuple: ((min_date, max_date), (year, month)) on success.
        (None, None) when file_path is empty or no valid timestamp was found.
        If an exception interrupts processing, the partial result gathered so
        far is returned as ((min_date, max_date), busiest_or_None); both dates
        are None when the failure happened before any chunk was processed.
    """
    if not file_path:
        print("File path is invalid.")
        return None, None

    # Running aggregates, updated chunk by chunk
    min_date = None
    max_date = None
    month_counts = None  # Series of event counts indexed by monthly Period

    try:
        print(f"Processing CSV file in chunks with encoding: {encoding}...")

        # newline='' leaves line endings untouched for the CSV parser.
        with open(file_path, 'r', encoding=encoding, newline='') as raw_handle:
            # The handle is already decoded text, so read_csv needs no encoding.
            chunk_iterator = pd.read_csv(
                _NulStrippingReader(raw_handle),
                chunksize=chunksize,
                sep=',',
                on_bad_lines='skip',  # Skip bad lines in the file
                engine='python',
                usecols=['ACTIVITY_TIME'],  # Read only the ACTIVITY_TIME column
            )

            for chunk_number, chunk in enumerate(chunk_iterator, start=1):
                # Skip empty chunks
                if chunk is None or chunk.empty:
                    continue

                try:
                    chunk = process_chunk(chunk, date_format)
                except Exception as e:
                    # Best effort: report the failure instead of hiding it, then move on.
                    print(f"Skipping chunk {chunk_number} due to a processing error: {e}")
                    continue

                # Skip chunks left empty by the cleaning step
                if chunk.empty:
                    continue

                timestamps = chunk['ACTIVITY_TIME']

                # Update the min and max dates
                current_min = timestamps.min()
                current_max = timestamps.max()
                if min_date is None or current_min < min_date:
                    min_date = current_min
                if max_date is None or current_max > max_date:
                    max_date = current_max

                # Accumulate per-month event counts across chunks
                chunk_counts = timestamps.dt.to_period('M').value_counts()
                if month_counts is None:
                    month_counts = chunk_counts
                else:
                    month_counts = month_counts.add(chunk_counts, fill_value=0)

        # Check if any valid data was found
        if min_date is None or max_date is None:
            print("No valid data found.")
            return None, None

        print("Finished processing all chunks.")
        return (min_date, max_date), _busiest_year_month(month_counts)

    except Exception as e:
        print(f"Error during overall processing: {e}")
        # Return results based on available data if an error occurs
        return (min_date, max_date), _busiest_year_month(month_counts)

# Inform the user about the estimated running time
print("Estimated maximum running time: approximately 11 minutes.")
print("Please wait...")

# Process the file and calculate the date range and busiest year/month
date_range, busiest_year_month = process_all_chunks(local_file_path, chunksize=100000)

# Do not tell the reader to ignore errors: when process_all_chunks() reports
# "Error during overall processing", reading stopped at that point and the
# values above only describe the part of the file read before the failure.
if date_range is None or None in date_range or busiest_year_month is None:
    print("Processing did not produce a complete result; check the messages above.")


Estimated maximum running time: approximately 11 minutes.
Please wait...
Processing CSV file in chunks with encoding: latin-1...
Error during overall processing: line contains NUL
ignore Error message


## Task-1.

#### Write a function which returns the date range in the dataset

In [None]:
# Report the date range returned by process_all_chunks().
# date_range is either None or a (min_date, max_date) tuple whose entries can
# themselves be None when processing failed before any valid row was seen.
if date_range and date_range[0] is not None and date_range[1] is not None:
    earliest_date, latest_date = date_range
    print(f"\nEarliest date: {earliest_date}")
    print(f"Latest date: {latest_date}")
else:
    print("No valid date range found")


Earliest date: 2012-08-15 20:01:36.621000
Latest date: 2015-04-13 22:39:00.431000


## Task-2.

#### Write a function which returns the year and month with the largest number of events.

In [None]:

# Report the busiest (year, month) pair; busiest_year_month is falsy when
# no month counts were available.
if not busiest_year_month:
    print("No valid data for busiest year and month")
else:
    year, month = busiest_year_month
    print(f"\nYear and month with the most events: {year}-{month:02}")


Year and month with the most events: 2015-03


## Task-3.

#### Describe what strategy you used to deal with the large size of this dataset



In order to properly handle the large dataset, I used chunk-based processing to manage memory and improve efficiency. By reading the file in chunks of 100,000 rows, I avoided overloading the system. Each chunk was cleaned by replacing null characters in the ACTIVITY_TIME column, removing missing values and duplicates, and standardizing the date format. Handling corrupted rows and encoding issues was challenging, but I overcame this by using on_bad_lines='skip' and specifying the latin-1 encoding. These steps allowed me to process the file without interruptions.

I also calculated the overall date range and busiest year-month by aggregating results across chunks. Maintaining consistency in these calculations required careful integration of intermediate results. Despite occasional errors in individual chunks, the modular approach of isolating the cleaning and processing steps made debugging straightforward. This strategy ensured the dataset was processed efficiently, enabling me to extract key insights without compromising performance or accuracy.

# Question 2 : Standard Error of the Mean (SEM) with Bootstrapping



## Task-1.

#### Calculate the SEM for age in the dataset.

In [None]:
# Path of the zip archive holding the household dataset; it is handed to
# process_data_chunked() at the end of this cell.
local_zip_path = '/content/drive/MyDrive/BDA_Datasets/DATASETS/raw/hh_data_ml (1).zip'

def calculate_age(row, reference_year=2025):
    """
    Calculates a person's age in whole years from their birth year.

    The birth month (P07M) is only validated: it must lie between 1 and 12,
    but it does not influence the result, which is reference_year minus the
    birth year (P07A).

    Args:
        row (pd.Series): A row from a DataFrame containing birth year and month.
        reference_year (int): Year the age is measured against. Defaults to 2025.

    Returns:
        int or None: Age of the person, or None if the data is invalid.
    """
    try:
        birth_year = int(row['P07A'])
        birth_month = int(row['P07M'])
    except (ValueError, TypeError):
        # Missing (None/NaN) or non-numeric values cannot be converted
        return None

    if not 1 <= birth_month <= 12:  # Validate birth month
        return None

    # NOTE(review): birth years later than reference_year, or any sentinel codes
    # the dataset may use for "unknown", are not filtered here — confirm.
    return reference_year - birth_year

def calculate_mean_age(data):
    """
    Computes the average of the non-missing ages.

    Args:
        data (pd.Series): A series of ages, possibly containing missing values.

    Returns:
        float: The mean of the valid ages, or NaN when none remain.
    """
    valid_ages = data.dropna()
    return np.nan if valid_ages.empty else valid_ages.mean()

def bootstrap_sample_mean(data, num_samples, random_state=None):
    """
    Generates bootstrap samples and calculates their mean.

    Each bootstrap sample has the same size as data and is drawn with
    replacement. Samples whose mean is NaN are left out of the result.

    Args:
        data (pd.Series): A series of ages.
        num_samples (int): The number of bootstrap samples to generate.
        random_state (int, np.random.RandomState, np.random.Generator or None):
            Seed or random generator that makes the resampling reproducible.
            None (the default) keeps the previous unseeded behaviour.

    Returns:
        list: A list of mean values from bootstrap samples.
    """
    # A seed is turned into ONE generator shared by all draws; handing the same
    # integer seed to every sample() call would repeat the identical sample.
    if random_state is None or isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    means = []
    for _ in range(num_samples):
        sample = data.sample(frac=1, replace=True, random_state=rng)
        mean = calculate_mean_age(sample)
        if not np.isnan(mean):
            means.append(mean)
    return means

def calculate_sem(bootstrap_means):
    """
    Calculates the Standard Error of the Mean (SEM) using bootstrap means.

    The SEM is the sample standard deviation (ddof=1) of the bootstrap means.

    Args:
        bootstrap_means (list): A list of mean values from bootstrap samples.
            Any sized sequence (including a NumPy array) is accepted.

    Returns:
        float or None: The SEM, or None if fewer than two bootstrap means are
        provided (the sample standard deviation is undefined in that case).
    """
    # len() rather than truthiness, so NumPy arrays work too; with a single
    # value np.std(..., ddof=1) would divide by zero and yield NaN.
    if bootstrap_means is None or len(bootstrap_means) < 2:
        return None
    return float(np.std(bootstrap_means, ddof=1))

def process_data_chunked(local_zip_path, chunksize=10000, num_bootstrap_samples=100):
    """
    Processes a large dataset in chunks to calculate the SEM of ages.

    The first file inside the zip archive is streamed straight from the
    archive and parsed as a '|'-separated table, chunk by chunk, so the
    decompressed file is never held in memory as a whole. Ages are derived
    from the P07A (birth year) and P07M (birth month) columns via
    calculate_age(); rows with missing or invalid values are skipped.

    Args:
        local_zip_path (str): Path to the zip file containing the dataset.
        chunksize (int): Number of rows to process per chunk.
        num_bootstrap_samples (int): Number of bootstrap samples to generate.

    Returns:
        float or None: The SEM of ages, or None if processing fails or no
        valid age could be computed.
    """
    all_ages = []

    try:
        with zipfile.ZipFile(local_zip_path, 'r') as z:
            # Ignore directory entries so the first real file is picked
            members = [name for name in z.namelist() if not name.endswith('/')]
            if not members:
                print("An error occurred while processing the chunked data: the archive contains no files")
                return None

            # z.open() gives a stream; the chunks are decompressed on demand
            with z.open(members[0]) as csv_stream:
                chunk_iterator = pd.read_csv(csv_stream, chunksize=chunksize, sep='|')  # Load in chunks

                for chunk in chunk_iterator:
                    chunk.columns = chunk.columns.str.replace('|', '', regex=False)  # Clean column names
                    if 'P07M' not in chunk.columns or 'P07A' not in chunk.columns:
                        print(f"Skipping chunk with columns: {chunk.columns}")  # Log skipped chunks
                        continue
                    if chunk.empty:
                        continue

                    # Coerce instead of astype('int32'): one missing or
                    # non-numeric value must not abort the whole run. Such
                    # values become NaN and calculate_age() returns None for them.
                    birth_info = chunk[['P07A', 'P07M']].apply(pd.to_numeric, errors='coerce')

                    # Only the two needed columns are passed row by row
                    ages = birth_info.apply(calculate_age, axis=1)
                    all_ages.extend(ages.dropna())  # Append valid ages

        if not all_ages:
            return None
        age_series = pd.Series(all_ages)  # Convert list to a Series
        bootstrap_means = bootstrap_sample_mean(age_series, num_bootstrap_samples)  # Generate bootstrap samples
        return calculate_sem(bootstrap_means)  # Calculate SEM
    except Exception as e:
        # Boundary handler: any failure is reported and signalled with None
        print(f"An error occurred while processing the chunked data: {e}")
        return None

# Inform the user about the estimated running time
print("Estimated maximum running time: approximately 11 minutes.")
print("Please wait...")

# Load the data from the zip file and process it
if local_zip_path:
    num_bootstrap_samples = 100  # Define the number of bootstrap samples
    start_time = time.time()  # Start time tracking
    sem = process_data_chunked(local_zip_path, num_bootstrap_samples=num_bootstrap_samples)
    end_time = time.time()  # End time tracking
    if sem is not None:
        print(f"The Standard Error of the Mean (SEM) for age is: {sem:.4f}")
    else:
        print("Could not calculate the SEM for age with the given data.")
    # Report the measured duration; it was previously computed but never shown
    print(f"Elapsed time: {end_time - start_time:.1f} seconds")
else:
    print("No valid file path found, cannot process.")


Estimated maximum running time: approximately 11 minutes.
Please wait...
The Standard Error of the Mean (SEM) for age is: 0.0041


## Task-2.

#### Describe what strategy you used to deal with the fact that you have to run many computations for this task

To address the computational challenges of processing a large dataset, I used a chunk-based approach, processing 10,000 rows at a time to optimize memory usage and prevent system overload. Each chunk was cleaned, transformed, and analyzed independently.

Key challenges included managing memory constraints, handling missing or invalid data, and ensuring accurate statistical computations.

To overcome these:

 - Memory Efficiency: By processing chunks iteratively and converting data types to memory-efficient formats (e.g., int32), I avoided excessive memory usage.

 - Data Quality: Missing or invalid birth month and year data were identified and excluded. Valid ages were calculated using custom functions and retained for analysis.

 - Robust Estimation: Bootstrapping (100 samples) was used to compute the Standard Error of the Mean (SEM), ensuring reliable results despite variability.

The process included progress tracking and error handling to ensure smooth execution and quick debugging if issues arose. This strategy allowed for efficient computation, scaling well to large datasets while maintaining accuracy and reliability. The calculated SEM provided meaningful insights into the dataset within a reasonable runtime.

# Question 3 : Weather Forecast for All Capital Cities in Africa


## Task-1.
#### Generate a CSV file with weather forecast for Monday, January 13, 2025

In [None]:
# Load the data
file_path = '/content/drive/MyDrive/BDA_Datasets/Africa_Cities.csv'  # Path to the input CSV file containing city data
df = pd.read_csv(file_path)  # Load the CSV file into a pandas DataFrame

# Select African cities by status and keep only the country and city names.
# NOTE(review): besides the two national-capital statuses, the list also admits
# 'Provincial capital' and 'Other', so the selection is wider than national
# capitals — confirm this is intended.
african_capitals = df[
    (df['STATUS'].isin(['National capital', 'National and provincial capital', 'Provincial capital', 'Other'])) &
    (df['CONTINENT'] == 'Africa')
][['CNTRY_NAME', 'CITY_NAME']].rename(columns={'CNTRY_NAME': 'Country', 'CITY_NAME': 'City'})

# OpenWeather API configurations
# The key is taken from the OPENWEATHER_API_KEY environment variable when set.
# SECURITY: the fallback below is a credential committed in plain text; it is
# kept only so existing runs keep working. Rotate it and remove the fallback
# once the environment variable is configured.
API_KEY = os.environ.get('OPENWEATHER_API_KEY', '0545df1a36bde6eb09fff3c6762420b1')
BASE_URL = 'https://api.openweathermap.org/data/2.5/forecast'  # Base URL for the OpenWeather API

print("Estimated maximum running time: approximately 2 minutes.")  # Inform user of estimated runtime
print("Please wait...")

# Function to fetch weather data for a city
def fetch_weather(city, country, target_date='2025-01-13', timeout=10):
    """
    Fetches weather forecast data for a given city and country.

    The forecast list returned by the API is scanned in order and the first
    entry whose UTC date equals target_date is returned, so the result
    describes a single forecast time slot of that day.

    Args:
        city (str): The name of the city.
        country (str): The name of the country.
        target_date (str): Forecast date to look for, formatted 'YYYY-MM-DD'
            (UTC). Defaults to '2025-01-13'.
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        dict or None: A dictionary containing weather data for target_date, or
        None if the request failed, the response was malformed, or the
        response holds no forecast for that date.
    """
    # Local import: the file only imports `datetime` from the datetime module
    from datetime import timezone

    # NOTE(review): the country NAME is sent after the comma in `q`; verify
    # against the OpenWeather documentation whether a country code is expected.
    params = {
        'q': f'{city},{country}',
        'appid': API_KEY,
        'units': 'metric'  # Request temperature in Celsius
    }

    try:
        # A timeout keeps one unresponsive request from stalling the whole run
        response = requests.get(BASE_URL, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an error for unsuccessful responses
        data = response.json()  # Parse the response JSON

        for forecast in data.get('list', []):
            # Convert the Unix timestamp to a timezone-aware UTC datetime
            forecast_time = datetime.fromtimestamp(forecast['dt'], tz=timezone.utc)
            if forecast_time.strftime('%Y-%m-%d') != target_date:
                continue

            main_block = forecast['main']
            return {
                'Date': target_date,
                'Time': forecast_time.strftime('%H:%M:%S'),
                'Weather_main': forecast['weather'][0]['main'],
                'Temp': main_block['temp'],
                'Temp_min': main_block['temp_min'],
                'Temp_max': main_block['temp_max'],
                'humidity': main_block['humidity'],
                'Clouds': forecast['clouds']['all']
            }
        return None  # Return None if no data is found for the specified date
    except (requests.RequestException, ValueError, KeyError, IndexError,
            TypeError, AttributeError, OverflowError, OSError):
        # Network/HTTP failures and malformed payloads: the caller records the
        # city as skipped. Programming errors are no longer swallowed.
        return None

# Gather one forecast record per selected city
weather_data = []  # Successful forecasts, one dict per city
skipped_cities = []  # Cities for which no forecast could be retrieved

for city, country in zip(african_capitals['City'], african_capitals['Country']):
    weather = fetch_weather(city, country)
    if not weather:
        # Remember which cities produced no data
        skipped_cities.append({'City': city, 'Country': country})
        continue

    # Tag the forecast with its location before storing it
    weather['Country'] = country
    weather['City'] = city
    weather_data.append(weather)

# Write the collected forecasts to CSV using a fixed column order
output_df = pd.DataFrame(
    weather_data,
    columns=[
        'Country', 'City', 'Date', 'Time', 'Weather_main',
        'Temp', 'Temp_min', 'Temp_max', 'humidity', 'Clouds',
    ],
)
output_file = '/content/african_capitals_weather.csv'  # Destination of the CSV export
output_df.to_csv(output_file, index=False)
print(f"Weather data saved to {output_file}")  # Confirm where the file was written


Estimated maximum running time: approximately 2 minutes.
Please wait...
Weather data saved to /content/african_capitals_weather.csv
