Import the relevant libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

Part 1:<br>
load_data: Load the dataset (CSV or JSON) using file I/O<br>
preprocess_data: Preprocess the cryptocurrency data<br>
select_currencies: Filter the data to include only the selected currencies<br>
get_available_currencies: List the currency symbols available in the dataset, for use in the next steps<br>

In [None]:
def load_data(file_path):
    """Load the cryptocurrency dataset from a CSV or JSON file.

    The format is chosen from the file extension: a ``.json`` file is read
    with ``pd.read_json``; anything else is read with ``pd.read_csv``.

    Args:
        file_path: Path to the dataset file.

    Returns:
        The loaded DataFrame, or None (after printing an error message) when
        the file does not exist, cannot be read or parsed, or lacks one of
        the required columns ``symbol``, ``date``, ``close`` and ``volume``.
    """
    # Check if file exists
    if not os.path.exists(file_path):
        print(f"Error: File does not exist: {file_path}")
        return None

    is_json = os.path.splitext(file_path)[1].lower() == '.json'
    try:
        if is_json:
            df = pd.read_json(file_path)
        else:
            df = pd.read_csv(file_path, low_memory=False)
    except (OSError, ValueError) as exc:
        # pandas' EmptyDataError and ParserError derive from ValueError, as do
        # the errors read_json raises for malformed input; report them the
        # same way as the other failure modes instead of crashing.
        print(f"Error: Could not read file {file_path}: {exc}")
        return None

    # Check for required columns
    required_columns = ['symbol', 'date', 'close', 'volume']
    if not all(col in df.columns for col in required_columns):
        print(f"Error: Missing required columns. Required columns: {required_columns}")
        return None

    # Display basic dataset information
    print(f"Successfully loaded data with {len(df)} rows and {len(df.columns)} columns.")
    print(f"\nDataset columns: {', '.join(map(str, df.columns))}")

    # Display first few rows for reference
    print(f"\nFirst 5 rows:\n{df.head().to_string()}")

    return df

In [None]:
def preprocess_data(df):
    """Clean a raw price table and return it as a new DataFrame.

    The input is left untouched. On a copy, the ``date`` column is parsed to
    datetimes, the known numeric columns that are present are coerced to
    numbers (unparsable values become NaN), every row holding a missing
    value in any column is dropped, and the rows are ordered by symbol and
    then date with a fresh 0..n-1 index. Progress is reported via ``print``.

    Args:
        df: Raw DataFrame containing at least ``symbol`` and ``date``.

    Returns:
        The cleaned, sorted DataFrame.
    """
    frame = df.copy()

    print("Preprocessing data...")
    frame['date'] = pd.to_datetime(frame['date'])

    # Coerce only those numeric columns that this dataset actually has
    candidate_numeric = ('open', 'high', 'low', 'close', 'volume', 'market', 'close_ratio', 'spread')
    for column in (name for name in candidate_numeric if name in frame.columns):
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

    # Report the columns that contain missing values
    print("\nMissing values statistics:")
    na_counts = frame.isnull().sum()
    for column, count in na_counts[na_counts > 0].items():
        print(f"{column}: {count}")

    # Remove incomplete rows, then order by symbol/date and renumber
    rows_before = len(frame)
    cleaned = frame.dropna()
    print(f"\nDeleted {rows_before - len(cleaned)} rows containing missing values")
    cleaned = cleaned.sort_values(['symbol', 'date']).reset_index(drop=True)

    earliest = cleaned['date'].min()
    latest = cleaned['date'].max()
    print(f"\nDate range: {earliest} to {latest}")

    print(f"Data preprocessing completed, original rows: {rows_before}, processed rows: {len(cleaned)}")
    return cleaned

In [None]:
def select_currencies(df, currencies):
    """Return the rows of ``df`` whose symbol is one of ``currencies``.

    Matching is case-insensitive. Requested symbols that do not occur in the
    data are reported with a printed warning and otherwise ignored.

    Args:
        df: DataFrame with a string ``symbol`` column.
        currencies: Iterable of currency symbols, or a single symbol given
            as a plain string.

    Returns:
        The filtered DataFrame, or None (after printing an error) when none
        of the requested symbols is present.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(currencies, str):
        currencies = [currencies]

    # Convert currencies to uppercase for case-insensitive matching
    requested = [curr.upper() for curr in currencies]

    # Upper-case the symbol column once; it serves both the lookup and the filter
    symbols_upper = df['symbol'].str.upper()
    available_currencies = set(symbols_upper)

    # Display not found currencies (if any)
    not_found = [curr for curr in requested if curr not in available_currencies]
    if not_found:
        print(f"Warning: The following currencies were not found in the dataset: {', '.join(not_found)}")

    # Filter data
    valid_currencies = [curr for curr in requested if curr in available_currencies]
    if not valid_currencies:
        print("Error: No requested currencies were found")
        return None

    return df[symbols_upper.isin(valid_currencies)]

In [None]:
def get_available_currencies(df):
    """Return the distinct currency symbols in ``df`` as a sorted list.

    Missing symbols are skipped: a NaN/None entry cannot be ordered against
    strings and would make ``sorted`` raise a TypeError.

    Args:
        df: DataFrame with a ``symbol`` column.

    Returns:
        A sorted list of the unique non-missing symbols.
    """
    return sorted(df['symbol'].dropna().unique())

Part 2:<br>
calculate_statistics: Calculate statistics for a specific currency<br>
calculate_correlation: Calculate the correlation between two cryptocurrencies<br>
get_top_volume_currencies: Get the top 5 cryptocurrencies by average trading volume<br>
calculate_volatility: Calculate the rolling volatility for a cryptocurrency<br>

In [None]:
def calculate_statistics(df, currency):
    """Summarise price, return and volume statistics for one currency.

    Rows whose ``symbol`` equals ``currency`` are ordered by date and the
    day-over-day percentage change of ``close`` is used as the daily return.

    Args:
        df: DataFrame with ``symbol``, ``date``, ``close`` and ``volume``
            columns; a ``name`` column is used when present.
        currency: Symbol to summarise (exact match).

    Returns:
        A dict with the symbol, its name ('N/A' without a ``name`` column),
        mean/median/std of close and volume, mean/median/std of the daily
        return in percent, and ``volatility`` (return std scaled by
        sqrt(365), in percent). None when the symbol has no rows.
    """
    matches = df[df['symbol'] == currency]
    if matches.empty:
        return None

    ordered = matches.sort_values('date')
    closes = ordered['close']
    volumes = ordered['volume']
    returns = closes.pct_change()

    if 'name' in ordered.columns:
        display_name = ordered['name'].iloc[0]
    else:
        display_name = 'N/A'

    summary = {'currency': currency, 'name': display_name}

    # Close price statistics
    summary['close_mean'] = closes.mean()
    summary['close_median'] = closes.median()
    summary['close_std'] = closes.std()

    # Daily return statistics, expressed in percent
    summary['return_mean'] = returns.mean() * 100
    summary['return_median'] = returns.median() * 100
    summary['return_std'] = returns.std() * 100

    # Volume statistics
    summary['volume_mean'] = volumes.mean()
    summary['volume_median'] = volumes.median()
    summary['volume_std'] = volumes.std()

    # Volatility: standard deviation of daily returns scaled to a 365-day year
    summary['volatility'] = returns.std() * np.sqrt(365) * 100

    return summary

In [None]:
def calculate_correlation(df, currency1, currency2):
    """Return the Pearson correlation of two currencies' closing prices.

    The two close series are matched on ``date``; only dates on which both
    currencies have a price contribute to the coefficient.

    Args:
        df: DataFrame with ``symbol``, ``date`` and ``close`` columns.
        currency1: Symbol of the first currency (exact match).
        currency2: Symbol of the second currency (exact match).

    Returns:
        The correlation coefficient as a float (NaN when it is undefined,
        e.g. no common dates), or None when either symbol has no rows.
    """
    # Close prices of each currency, indexed by date
    closes1 = df.loc[df['symbol'] == currency1].set_index('date')['close']
    closes2 = df.loc[df['symbol'] == currency2].set_index('date')['close']

    if closes1.empty or closes2.empty:
        return None

    # Series.corr aligns both series on their date index. Unlike building a
    # two-column frame keyed by symbol, it also works when currency1 and
    # currency2 are the same symbol (the dict would collapse to one column).
    return closes1.corr(closes2)

In [None]:
def get_top_volume_currencies(df, top_n=5):
    """Return the ``top_n`` currencies ranked by average trading volume.

    Args:
        df: DataFrame with ``symbol`` and ``volume`` columns. A ``name``
            column is optional; when present, the first name seen for each
            symbol is carried into the result.
        top_n: Number of currencies to return.

    Returns:
        A DataFrame with ``symbol``, the mean ``volume`` and (if available)
        ``name``, sorted by mean volume in descending order.
    """
    # 'name' is not among the columns the loader requires, so aggregate it
    # only when the dataset actually provides it.
    aggregations = {'volume': 'mean'}
    if 'name' in df.columns:
        aggregations['name'] = 'first'

    # Calculate average trading volume for each cryptocurrency
    volume_stats = df.groupby('symbol').agg(aggregations).reset_index()

    # Sort by average volume in descending order and select top N
    return volume_stats.sort_values('volume', ascending=False).head(top_n)

In [None]:
def calculate_volatility(df, currency, window=30):
    """Compute the rolling volatility of one currency.

    Daily returns are the percentage change of ``close`` in date order. The
    volatility is their rolling standard deviation over ``window`` rows,
    multiplied by sqrt(365) and expressed in percent.

    Args:
        df: DataFrame with ``symbol``, ``date`` and ``close`` columns.
        currency: Symbol to analyse (exact match).
        window: Number of rows in each rolling window.

    Returns:
        A DataFrame with ``date`` and ``volatility`` columns, restricted to
        rows where both are present, or None when the symbol has no rows.
    """
    selected = df[df['symbol'] == currency]
    if selected.empty:
        return None

    ordered = selected.sort_values('date')
    returns = ordered['close'].pct_change()
    rolling_std = returns.rolling(window=window).std()

    # Attach the scaled series to the date column; rows without a full
    # window carry NaN and are removed by dropna.
    result = ordered[['date']].assign(volatility=rolling_std * np.sqrt(365) * 100)
    return result.dropna()

Part 3:<br>
plot_closing_prices: Plot the closing prices of multiple cryptocurrencies over time<br>
plot_daily_returns: Plot histograms of daily returns<br>
plot_correlation_heatmap: Plot a correlation heatmap between different cryptocurrencies<br>
plot_volatility_boxplot: Plot a boxplot comparing volatility between different currencies<br>
plot_price_volume_pair: Plot pair plots of price and volume relationships<br>

In [None]:
def plot_closing_prices(df, currencies=None, days=365):
    """Draw closing-price lines for several currencies on one chart.

    Only rows from the last ``days`` days (counted back from the newest date
    in ``df``) are plotted. Currencies without rows in that period are
    skipped.

    Args:
        df: DataFrame with ``symbol``, ``date`` and ``close`` columns; a
            ``name`` column is used for the legend when present.
        currencies: Symbols to plot. When None, the five symbols returned by
            ``get_top_volume_currencies`` are used.
        days: Length of the plotted period in days.
    """
    if currencies is None:
        # Fall back to the top 5 currencies by trading volume
        currencies = get_top_volume_currencies(df, top_n=5)['symbol'].tolist()

    # Keep only the most recent `days` days
    cutoff = df['date'].max() - pd.Timedelta(days=days)
    window_df = df[df['date'] >= cutoff]
    has_name = 'name' in window_df.columns

    plt.figure(figsize=(14, 8))

    for symbol in currencies:
        series = window_df[window_df['symbol'] == symbol]
        if series.empty:
            continue
        series = series.sort_values('date')
        label_name = series['name'].iloc[0] if has_name else symbol
        plt.plot(series['date'], series['close'], label=f"{symbol} ({label_name})")

    plt.title('Cryptocurrency Closing Prices Over Time', fontsize=16)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Closing Price (USD)', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.legend(loc='best', fontsize=12)
    plt.tight_layout()
    plt.show()

In [None]:
def plot_daily_returns(df, currencies=None, bins=50):
    """Draw one daily-return histogram per currency, stacked vertically.

    The daily return is the percentage change of ``close`` in date order,
    expressed in percent. Each currency keeps its own subplot slot, so a
    currency without rows leaves its slot empty.

    Args:
        df: DataFrame with ``symbol``, ``date`` and ``close`` columns.
        currencies: Symbols to plot. When None, the three symbols returned
            by ``get_top_volume_currencies`` are used.
        bins: Number of histogram bins.
    """
    if currencies is None:
        # Fall back to the three main currencies by trading volume
        currencies = get_top_volume_currencies(df, top_n=3)['symbol'].tolist()

    plt.figure(figsize=(14, 10))

    total_rows = len(currencies)
    for position, symbol in enumerate(currencies, start=1):
        subset = df[df['symbol'] == symbol]
        if subset.empty:
            continue

        # Percentage returns in date order; the first row has none
        pct_returns = subset.sort_values('date')['close'].pct_change() * 100
        pct_returns = pct_returns.rename('daily_return').dropna()

        plt.subplot(total_rows, 1, position)
        sns.histplot(pct_returns, bins=bins, kde=True)
        plt.title(f'{symbol} Daily Return Distribution', fontsize=14)
        plt.xlabel('Daily Return (%)', fontsize=12)
        plt.ylabel('Frequency', fontsize=12)
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
def plot_correlation_heatmap(df, currencies=None, days=365):
    """Draw a heatmap of closing-price correlations between currencies.

    Only rows from the last ``days`` days (counted back from the newest date
    in ``df``) are used. Requested currencies that have no rows in that
    period are reported with a printed warning and left out, instead of
    aborting the plot with a KeyError.

    Args:
        df: DataFrame with ``symbol``, ``date`` and ``close`` columns.
        currencies: Symbols to compare. When None, the five symbols returned
            by ``get_top_volume_currencies`` are used.
        days: Length of the analysed period in days.
    """
    # Select currencies to analyze
    if currencies is None:
        # Default to top 5 currencies by trading volume
        top_currencies = get_top_volume_currencies(df, top_n=5)
        currencies = top_currencies['symbol'].tolist()

    # Get recent data
    recent_days = df['date'].max() - pd.Timedelta(days=days)
    recent_data = df[df['date'] >= recent_days]

    # Create price pivot table (one column of close prices per symbol)
    pivot_data = recent_data.pivot_table(index='date', columns='symbol', values='close')

    # Keep only the requested currencies that actually appear in the period
    missing = [curr for curr in currencies if curr not in pivot_data.columns]
    if missing:
        print(f"Warning: No data in the selected period for: {', '.join(map(str, missing))}")
    present = [curr for curr in currencies if curr in pivot_data.columns]
    if not present:
        print("Error: Insufficient data to draw correlation heatmap")
        return

    # Calculate correlation coefficient matrix
    correlation_matrix = pivot_data[present].corr()

    # Create heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5, fmt='.3f')
    plt.title('Cryptocurrency Price Correlation Heatmap', fontsize=16)
    plt.tight_layout()
    plt.show()

In [None]:
def plot_volatility_boxplot(df, currencies=None, window=30):
    """Draw a boxplot comparing rolling volatility across currencies.

    The volatility series of each currency comes from
    ``calculate_volatility``. Currencies with no rows, or with too few rows
    to fill a single window, are left out.

    Args:
        df: DataFrame with ``symbol``, ``date`` and ``close`` columns.
        currencies: Symbols to compare. When None, the five symbols returned
            by ``get_top_volume_currencies`` are used.
        window: Rolling window length passed to ``calculate_volatility``.
    """
    # Select currencies to analyze
    if currencies is None:
        # Default to top 5 currencies by trading volume
        top_currencies = get_top_volume_currencies(df, top_n=5)
        currencies = top_currencies['symbol'].tolist()

    # Collect volatility data and labels for each currency
    data_to_plot = []
    labels = []
    for currency in currencies:
        volatility = calculate_volatility(df, currency, window=window)
        if volatility is None or volatility.empty:
            continue
        # Store as simple Python list
        data_to_plot.append(volatility['volatility'].tolist())
        labels.append(currency)

    if not data_to_plot:
        print("Error: Insufficient data to draw boxplot")
        return

    plt.figure(figsize=(14, 8))
    plt.boxplot(data_to_plot)
    plt.title(f'Cryptocurrency {window}-day Rolling Volatility Comparison', fontsize=16)
    plt.xlabel('Cryptocurrency', fontsize=14)
    plt.ylabel('Annualized Volatility (%)', fontsize=14)
    # Boxes sit at x = 1..N by default. Labelling them through xticks avoids
    # the boxplot `labels=` keyword, which Matplotlib 3.9 renamed to
    # `tick_labels`, so this works on both older and newer releases.
    plt.xticks(range(1, len(labels) + 1), labels, rotation=45, ha='right')
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()

In [None]:
def plot_price_volume_pair(df, currencies=None, days=365):
    """Draw a pair plot of price, volume and daily return per currency.

    Only rows from the last ``days`` days (counted back from the newest date
    in ``df``) are used. One figure is shown per currency; currencies with
    no usable rows in that period are skipped.

    Args:
        df: DataFrame with ``symbol``, ``date``, ``close`` and ``volume``
            columns; a ``name`` column is used in the title when present.
        currencies: Symbols to plot. When None, the two symbols returned by
            ``get_top_volume_currencies`` are used.
        days: Length of the analysed period in days.
    """
    # Select currencies to analyze
    if currencies is None:
        # Default to top 2 currencies by trading volume
        top_currencies = get_top_volume_currencies(df, top_n=2)
        currencies = top_currencies['symbol'].tolist()

    # Get recent data
    recent_days = df['date'].max() - pd.Timedelta(days=days)
    recent_data = df[df['date'] >= recent_days]

    # Create pair plots for each currency
    for currency in currencies:
        curr_data = recent_data[recent_data['symbol'] == currency]
        if curr_data.empty:
            continue

        # The daily return is a row-to-row change, so the rows must be in
        # date order even if the caller passed an unsorted frame.
        curr_data = curr_data.sort_values('date')
        currency_name = curr_data['name'].iloc[0] if 'name' in curr_data.columns else currency

        pair_data = pd.DataFrame({
            'Price': curr_data['close'],
            'Volume': curr_data['volume'],
            'Daily Return': curr_data['close'].pct_change() * 100,
        }).dropna()
        if pair_data.empty:
            print(f"Warning: Not enough data to draw pair plot for {currency}")
            continue

        # pairplot creates its own figure, which becomes the current one; an
        # extra plt.figure() call here would only leave an empty figure
        # behind. suptitle therefore lands on the pair plot.
        sns.pairplot(pair_data, diag_kind='kde')
        plt.suptitle(f'{currency} ({currency_name}) Price, Volume and Return Relationship Analysis', y=1.02, fontsize=16)
        plt.tight_layout()
        plt.show()