# **Cyclistic Bike-Share Data**

This Python 3 notebook is based on the [Cyclistic bike-share analysis case study](http://https://www.coursera.org/learn/google-data-analytics-capstone), which is part of the Google Data Analytics Professional Certification capstone project.

# About Cyclistic
Cyclistic is a fictitious Chicago-based bike-sharing company. As part of the project, we will analyze the bike trip data from the last 12 months to gain insights and make data-driven recommendations to improve the company's business strategy.

# Data Source
The data used in this notebook is sourced from the [divvy-tripdata S3 bucket](https://divvy-tripdata.s3.amazonaws.com/index.html). The data is made available under a specific license provided by [Motivate International Inc](https://ride.divvybikes.com/data-license-agreement) for analysis purposes by [Divvy Bikes Sharing](https://ride.divvybikes.com/).The available data starts from April 2020.

# Purpose of the Notebook
The notebook contains code that automates the process of downloading and managing the data files for the Google Data Analytics Professional Certification project. The code ensures that the data used for analysis is up to date and the data files are downloaded, extracted, and the directory size is managed to stay within the specified limit of 15GB.

# Instructions
To use this notebook, follow these steps:

* Specify the **URL** for downloading the data files using the url variable.
* Set the **oldest file month and year** using the oldest_year and oldest_month variables.
* Set the **number of months to download** using the num_months_to_download variable.
* Set the **maximum directory size in GB** using the max_directory_size_gb variable.
* Run the code to download and manage the data files.


In [1]:
import requests
import zipfile
import io
import datetime
import os
import shutil

In [2]:
def download_and_unzip_data(url, num_months, max_directory_size_gb):
    current_date = datetime.datetime.now()
    data_folder = '/kaggle/working/data/Cyclistic/'  # Specify the folder path
    zip_folder = os.path.join(data_folder, 'zip_files')  # Folder for saving zip files
    csv_folder = os.path.join(data_folder, 'csv_files')  # Folder for saving extracted CSV files
    
    # Create the data folder if it doesn't exist
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    
    # Create the zip and csv folders if they don't exist
    if not os.path.exists(zip_folder):
        os.makedirs(zip_folder)
    if not os.path.exists(csv_folder):
        os.makedirs(csv_folder)
    
    # Check if the current month's data exists
    current_year = current_date.year
    current_month = current_date.month
    current_file_name = f'{current_year}{current_month:02d}-divvy-tripdata.zip'
    current_file_path = os.path.join(zip_folder, current_file_name)
    
    if os.path.exists(current_file_path):
        print(f'{current_file_name} already exists. Skipping download.')
    else:
        print(f'{current_file_name} not found. Downloading previous month data.')
        num_months += 1
    
    for i in range(num_months):
        # Calculate the target month and year
        target_date = current_date - datetime.timedelta(days=i*30)
        target_year = target_date.year
        target_month = target_date.month
        
        if target_year < oldest_year or (target_year == oldest_year and target_month < oldest_month):
            print(f"Data is only available from {oldest_month}/{oldest_year}. Unable to download files.")
            break
        
        # Format the URL for the data file
        data_url = url.format(target_year, target_month)
        
        # Check if the file already exists
        file_name = f'{target_year}{target_month:02d}-divvy-tripdata.zip'
        file_path = os.path.join(zip_folder, file_name)
        if os.path.exists(file_path):
            print(f'{file_name} already exists. Skipping download.')
            continue
        
        # Download the file
        response = requests.get(data_url)
        
        # Save the file
        with open(file_path, 'wb') as file:
            file.write(response.content)
        
        # Extract the ZIP file contents
        try:
            zip_file = zipfile.ZipFile(file_path)
            for member in zip_file.namelist():
                if member.endswith('.csv'):
                    zip_file.extract(member, csv_folder)
            zip_file.close()
            print(f'{file_name} downloaded and extracted successfully.')
            
            # Calculate the total size of the files in the directory
            total_size_gb = get_directory_size(data_folder) / (1024**3)
            
            # Check if the total size exceeds the maximum limit
            if total_size_gb > max_directory_size_gb:
                print(f'Total directory size exceeded. Removing oldest files.')
                remove_oldest_files(data_folder, max_directory_size_gb)
        except zipfile.BadZipFile:
            print(f'Invalid ZIP file: {file_name}. Skipping extraction.')
            os.remove(file_path)

In [3]:
def get_directory_size(directory):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(directory):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

In [4]:
def remove_oldest_files(directory, max_directory_size_gb):
    file_list = []
    total_size_gb = 0

    # Collect information about files in the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            file_list.append((file_path, os.path.getctime(file_path)))
            total_size_gb += os.path.getsize(file_path) / (1024**3)

    # Sort the files by creation time (oldest first)
    file_list.sort(key=lambda x: x[1])

    # Remove files until the directory size is within the limit
    while total_size_gb > max_directory_size_gb and file_list:
        file_path, _ = file_list.pop(0)
        file_size_gb = os.path.getsize(file_path) / (1024**3)
        total_size_gb -= file_size_gb

        # Remove the file and its corresponding extracted CSV files
        os.remove(file_path)
        csv_folder = os.path.join(directory, 'csv_files')
        csv_file_prefix = os.path.splitext(os.path.basename(file_path))[0]
        for file in os.listdir(csv_folder):
            if file.startswith(csv_file_prefix):
                csv_file_path = os.path.join(csv_folder, file)
                os.remove(csv_file_path)

        print(f'Removed file: {file_path}')

In [5]:
# Specify the URL for downloading the data files
url = 'https://divvy-tripdata.s3.amazonaws.com/{0}{1:02d}-divvy-tripdata.zip'

# Set the oldest file month and year
oldest_year = 2020
oldest_month = 4

# Set the number of months to download
num_months_to_download = 12

# Set the maximum directory size in GB
max_directory_size_gb = 15

# Call the function to download and manage the data files
download_and_unzip_data(url, num_months_to_download, max_directory_size_gb)


202306-divvy-tripdata.zip not found. Downloading previous month data.
Invalid ZIP file: 202306-divvy-tripdata.zip. Skipping extraction.
202305-divvy-tripdata.zip downloaded and extracted successfully.
202304-divvy-tripdata.zip downloaded and extracted successfully.
202303-divvy-tripdata.zip downloaded and extracted successfully.
202302-divvy-tripdata.zip downloaded and extracted successfully.
202301-divvy-tripdata.zip downloaded and extracted successfully.
202212-divvy-tripdata.zip downloaded and extracted successfully.
202211-divvy-tripdata.zip downloaded and extracted successfully.
202210-divvy-tripdata.zip downloaded and extracted successfully.
202209-divvy-tripdata.zip downloaded and extracted successfully.
202208-divvy-tripdata.zip downloaded and extracted successfully.
202207-divvy-tripdata.zip downloaded and extracted successfully.
202206-divvy-tripdata.zip downloaded and extracted successfully.
