# Presto/Trino Database Connection Test

This notebook tests the connection to the `desktop_product_intelligence_public.halley_feedback_qualtrics_comments` table.

## Step 1: Install Required Package

First, let's install the trino package if not already installed.



In [22]:
## Step 2: Import Libraries and Set Up Connection

import pandas as pd
from trino.dbapi import connect
from trino.auth import BasicAuthentication
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Connection parameters
PRESTO_HOST = "presto-eda.adp.autodesk.com"
PRESTO_PORT = 443
PRESTO_USER = os.getenv("PRESTO_USER", "your_username_here")
PRESTO_CATALOG = os.getenv("PRESTO_CATALOG", "")  # Default to hive
PRESTO_SCHEMA = "desktop_product_intelligence_public"

print(f"Host: {PRESTO_HOST}")
print(f"Port: {PRESTO_PORT}")
print(f"User: {PRESTO_USER}")
print(f"Schema: {PRESTO_SCHEMA}")


Host: presto-eda.adp.autodesk.com
Port: 443
User: munoj
Schema: desktop_product_intelligence_public


In [None]:
## Step 3: Create Database Connection

# Establish a connection to the Trino/Presto database.

try:
    # Create connection
    conn = connect(
        host=PRESTO_HOST,
        port=PRESTO_PORT,
        user=PRESTO_USER,
        catalog=PRESTO_CATALOG,
        schema=PRESTO_SCHEMA,
        http_scheme='https',
    )
    
    print("‚úÖ Connection established successfully!")
    
    # Test with a simple query
    cursor = conn.cursor()
    cursor.execute("SELECT 1 as test")
    result = cursor.fetchone()
    print(f"Test query result: {result}")
    cursor.close()
    
except Exception as e:
    print(f"‚ùå Connection failed: {e}")
    print(f"\nPlease check:")
    print("1. Your username is correct")
    print("2. You have network access to the Presto server")
    print("3. You have proper authentication credentials")


In [None]:
## Step 4: Explore the Target Table

#to  Let's first check what columns are available in the `halley_feedback_qualtrics_comments` table.

# Query to get table schema/columns
query_schema = """
DESCRIBE desktop_product_intelligence_public.halley_feedback_qualtrics_comments
"""

try:
    cursor = conn.cursor()
    cursor.execute(query_schema)
    columns = cursor.fetchall()
    cursor.close()
    
    print("üìã Table Columns:")
    print("-" * 80)
    for col in columns:
        print(f"  {col[0]:<40} {col[1]}")
    print("-" * 80)
    
except Exception as e:
    print(f"‚ùå Error describing table: {e}")
    print("\nTrying alternative approach...")
    
    # Alternative: Just fetch first row to see columns
    try:
        query_sample = """
        SELECT * 
        FROM desktop_product_intelligence_public.halley_feedback_qualtrics_comments 
        LIMIT 1
        """
        df_sample = pd.read_sql(query_sample, conn)
        print("üìã Table Columns (from sample query):")
        print("-" * 80)
        for col in df_sample.columns:
            print(f"  {col}")
        print("-" * 80)
    except Exception as e2:
        print(f"‚ùå Error: {e2}")


In [None]:
## Step 5: Fetch Sample Data

# Retrieve a small sample of data from the table to inspect the content.

# Fetch sample data
query_sample = """
SELECT * 
FROM desktop_product_intelligence_public.halley_feedback_qualtrics_comments 
LIMIT 10
"""

try:
    print("üîç Fetching sample data...")
    df = pd.read_sql(query_sample, conn)
    
    print(f"‚úÖ Successfully fetched {len(df)} rows")
    print(f"\nDataFrame shape: {df.shape}")
    print(f"\nColumns: {df.columns.tolist()}")
    
    # Display the data
    display(df)
    
except Exception as e:
    print(f"‚ùå Error fetching data: {e}")


In [None]:
## Step 6: Get Row Count

# Check how many rows are in the table.

# Count total rows
query_count = """
SELECT COUNT(*) as total_rows
FROM desktop_product_intelligence_public.halley_feedback_qualtrics_comments
"""

try:
    cursor = conn.cursor()
    cursor.execute(query_count)
    count = cursor.fetchone()[0]
    cursor.close()
    
    print(f"üìä Total rows in table: {count:,}")
    
except Exception as e:
    print(f"‚ùå Error counting rows: {e}")


In [None]:
## Step 7: Explore Data Quality

# Check for basic data quality metrics.

# Data quality checks
if 'df' in locals() and not df.empty:
    print("üîç Data Quality Summary:")
    print("=" * 80)
    
    # Show info
    print("\nüìä DataFrame Info:")
    df.info()
    
    print("\nüìà Missing Values:")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).round(2)
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Percentage': missing_pct
    })
    display(missing_df[missing_df['Missing Count'] > 0])
    
    print("\nüìã Sample Data Types:")
    display(df.dtypes)
    
else:
    print("‚ö†Ô∏è No data loaded yet. Please run the previous cells first.")


In [None]:
## Step 8: Close Connection

# Always close the database connection when done.

# Close the connection
try:
    conn.close()
    print("‚úÖ Connection closed successfully")
except:
    print("‚ö†Ô∏è Connection was already closed or not established")
