In [1]:
# Multidimensional Data Analysis Pipeline
# A comprehensive notebook for querying, analyzing, and visualizing high-dimensional data

# Install required dependencies (run this first)

!pip install -qU pandas numpy scikit-learn plotly lux-api sqlalchemy psycopg2-binary umap-learn seaborn matplotlib jupyter ipywidgets orangewidget orange3 orange3-associate

from plotly import plotly
import pandas as pd
import numpy as np
import sqlite3
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings('ignore')

# Visualization libraries
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries for dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import umap

# Interactive widgets
import ipywidgets as widgets
from IPython.display import display, HTML

# Lux for automatic visualizations
import lux

print("All dependencies loaded successfully!")

class MultiDimAnalyzer:
    """
    A comprehensive pipeline for multidimensional data analysis.

    Intended call order:
        1. ``connect_database`` (optional) and ``query_data``
        2. ``preprocess_data``
        3. ``apply_dimensionality_reduction``
        4. ``create_interactive_plots`` / ``display_comparison_grid`` /
           ``get_feature_importance`` / ``analyze_with_lux``
    """

    # Projection keys produced by apply_dimensionality_reduction and
    # consumed by the comparison grid and the Lux helper.
    _METHODS_2D = ('PCA_2D', 'tSNE_2D', 'UMAP_2D')
    _METHODS_3D = ('PCA_3D', 'tSNE_3D', 'UMAP_3D')

    def __init__(self):
        self.data = None          # DataFrame loaded by query_data
        self.scaled_data = None   # standardized numeric matrix from preprocess_data
        self.engine = None        # SQLAlchemy engine set by connect_database
        self.scaler = None        # fitted StandardScaler from preprocess_data
        self.feature_names = []   # column names matching scaled_data's columns
        self.reducers = {}        # fitted reducer objects keyed by projection name
        self.projections = {}     # projected coordinates keyed by projection name

    def connect_database(self, connection_string="sqlite:///sample.db"):
        """
        Connect to a database.

        Args:
            connection_string: SQLAlchemy database URL.

        Returns:
            True when a connection could be opened, False otherwise.
            ``self.engine`` is only replaced on success.
        """
        try:
            engine = create_engine(connection_string)
            # create_engine() is lazy: open (and immediately release) one
            # connection so that a bad URL or unreachable server is reported
            # here rather than on the first query.
            with engine.connect():
                pass
        except Exception as e:
            print(f"Database connection failed: {e}")
            return False

        self.engine = engine
        print(f"Connected to database: {connection_string}")
        return True

    def query_data(self, query=None, table_name=None):
        """
        Query data from database.

        Either provide a custom query or a table name; with neither, sample
        data is generated for demonstration.

        Args:
            query: SQL passed to ``pd.read_sql``; takes precedence over
                ``table_name``.
            table_name: Table to read in full via ``SELECT *``.

        Returns:
            The loaded DataFrame (also stored on ``self.data``).

        Raises:
            RuntimeError: if a query or table name is given before
                ``connect_database`` has succeeded.
        """
        if query or table_name:
            if self.engine is None:
                raise RuntimeError(
                    "No database connection; call connect_database() first."
                )
            # NOTE: table_name is interpolated into the SQL text verbatim, so
            # it must come from a trusted source (never raw user input).
            sql = query if query else f"SELECT * FROM {table_name}"
            self.data = pd.read_sql(sql, self.engine)
        else:
            # Create sample data for demonstration
            print("No query provided, generating sample multidimensional data...")
            self.data = self._generate_sample_data()

        print(f"Data loaded: {self.data.shape}")
        return self.data

    def _generate_sample_data(self):
        """
        Generate sample multidimensional data for demonstration.

        Returns:
            DataFrame with 1000 rows: seven numeric features (two of them
            derived from others), two categorical columns and a ``target``.
        """
        # A private legacy generator seeded with 42 draws the same values as
        # np.random.seed(42) followed by np.random.* calls, without reseeding
        # the caller's global NumPy random state. Draw order is significant.
        rng = np.random.RandomState(42)
        n_samples = 1000

        # Create sample data with different distributions
        data = {
            'feature_1': rng.normal(0, 1, n_samples),
            'feature_2': rng.normal(0, 1.5, n_samples),
            'feature_3': rng.exponential(2, n_samples),
            'feature_4': rng.gamma(2, 2, n_samples),
            'feature_5': rng.beta(2, 5, n_samples),
        }

        # Add some correlated features
        data['feature_6'] = data['feature_1'] * 2 + rng.normal(0, 0.5, n_samples)
        data['feature_7'] = (data['feature_2'] + data['feature_3']
                             + rng.normal(0, 0.3, n_samples))

        # Add categorical variables
        data['category'] = rng.choice(['A', 'B', 'C'], n_samples, p=[0.3, 0.4, 0.3])
        data['group'] = rng.choice(['Group1', 'Group2', 'Group3'], n_samples)

        # Add a target variable
        data['target'] = (data['feature_1'] + data['feature_2'] * 0.5
                          + rng.normal(0, 0.2, n_samples))

        return pd.DataFrame(data)

    def preprocess_data(self, exclude_columns=None):
        """
        Preprocess data for dimensionality reduction.

        Keeps the numeric columns of ``self.data`` (minus ``exclude_columns``)
        and standardizes them. Missing values are not imputed here.

        Args:
            exclude_columns: Columns to drop before scaling; defaults to
                ``['category', 'group']``. Unknown names are ignored.

        Returns:
            The scaled array (also stored on ``self.scaled_data``).

        Raises:
            ValueError: if no data is loaded or no numeric column remains.
        """
        if self.data is None:
            raise ValueError("No data loaded; call query_data() first.")
        if exclude_columns is None:
            exclude_columns = ['category', 'group']  # Exclude categorical columns by default

        # Select numeric columns
        numeric_data = self.data.select_dtypes(include=[np.number])
        if exclude_columns:
            numeric_data = numeric_data.drop(columns=exclude_columns, errors='ignore')
        if numeric_data.shape[1] == 0:
            raise ValueError("No numeric columns left to preprocess.")

        # Scale the data; the fitted scaler is kept so new rows can be
        # transformed consistently later.
        self.scaler = StandardScaler()
        self.scaled_data = self.scaler.fit_transform(numeric_data)
        self.feature_names = numeric_data.columns.tolist()

        print(f"Preprocessed data shape: {self.scaled_data.shape}")
        return self.scaled_data

    def _fit_projections(self, prefix, make_reducer):
        """Fit 2D and 3D reducers built by ``make_reducer(n_components)`` and store both."""
        for n_components in (2, 3):
            key = f'{prefix}_{n_components}D'
            reducer = make_reducer(n_components)
            self.projections[key] = reducer.fit_transform(self.scaled_data)
            self.reducers[key] = reducer

    def apply_dimensionality_reduction(self):
        """
        Apply various dimensionality reduction techniques.

        Fits PCA, t-SNE and UMAP in 2 and 3 components on ``self.scaled_data``
        and stores the coordinates in ``self.projections`` and the fitted
        objects in ``self.reducers`` under ``<METHOD>_<N>D`` keys.

        Raises:
            ValueError: if ``preprocess_data`` has not been run.
        """
        if self.scaled_data is None:
            raise ValueError("No preprocessed data; call preprocess_data() first.")

        # t-SNE needs perplexity < n_samples; cap the default of 30 so small
        # datasets do not fail.
        perplexity = min(30, self.scaled_data.shape[0] - 1)

        # PCA
        self._fit_projections(
            'PCA', lambda k: PCA(n_components=k, random_state=42))

        # t-SNE
        print("Computing t-SNE projections...")
        self._fit_projections(
            'tSNE',
            lambda k: TSNE(n_components=k, random_state=42, perplexity=perplexity))

        # UMAP
        print("Computing UMAP projections...")
        self._fit_projections(
            'UMAP', lambda k: umap.UMAP(n_components=k, random_state=42))

        print("All dimensionality reductions completed!")

    def _cluster_labels(self, n_clusters=3):
        """Return KMeans cluster labels for ``self.scaled_data`` (used to color points)."""
        if self.scaled_data is None:
            raise ValueError("No preprocessed data; call preprocess_data() first.")
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        return kmeans.fit_predict(self.scaled_data)

    def create_interactive_plots(self):
        """
        Create interactive plots for all projections.

        Returns:
            Dict mapping each projection name to a Plotly figure. Names
            containing ``'2D'`` get a 2D scatter, all others a 3D scatter.
        """
        clusters = self._cluster_labels()
        # Plotly Express treats numeric color values as continuous and then
        # ignores color_discrete_sequence; string labels make it categorical.
        cluster_order = [str(c) for c in np.unique(clusters)]
        cluster_names = clusters.astype(str)

        plots = {}

        for method, projection in self.projections.items():
            n_axes = 2 if '2D' in method else 3
            axis_titles = [f'{method} Component {i}' for i in range(1, n_axes + 1)]
            frame = pd.DataFrame(projection[:, :n_axes], columns=axis_titles)
            frame['Cluster'] = cluster_names

            shared = dict(
                color='Cluster',
                category_orders={'Cluster': cluster_order},
                title=f"{method} Projection",
                color_discrete_sequence=px.colors.qualitative.Set1,
            )
            if n_axes == 2:
                fig = px.scatter(
                    frame, x=axis_titles[0], y=axis_titles[1], **shared)
            else:  # 3D
                fig = px.scatter_3d(
                    frame, x=axis_titles[0], y=axis_titles[1], z=axis_titles[2],
                    **shared)

            fig.update_layout(height=600, width=800)
            plots[method] = fig

        return plots

    def display_comparison_grid(self):
        """
        Display all projections in a comparison grid.

        Returns:
            A Plotly figure with the three 2D projections on the first row
            and the three 3D projections on the second row.

        Raises:
            KeyError: if one of the six expected projections is missing.
        """
        fig = make_subplots(
            rows=2, cols=3,
            subplot_titles=['PCA 2D', 't-SNE 2D', 'UMAP 2D', 'PCA 3D', 't-SNE 3D', 'UMAP 3D'],
            specs=[[{'type': 'scatter'}, {'type': 'scatter'}, {'type': 'scatter'}],
                   [{'type': 'scatter3d'}, {'type': 'scatter3d'}, {'type': 'scatter3d'}]]
        )

        # Add clustering
        colors = ['red', 'blue', 'green']
        clusters = self._cluster_labels(n_clusters=len(colors))

        # 2D plots
        for col, method in enumerate(self._METHODS_2D, start=1):
            projection = self.projections[method]
            for cluster, color in enumerate(colors):
                mask = clusters == cluster
                fig.add_scatter(
                    x=projection[mask, 0],
                    y=projection[mask, 1],
                    mode='markers',
                    marker=dict(color=color, size=4),
                    # Every trace keeps its name (for hover) and shares a
                    # legend group so one legend entry toggles all subplots.
                    name=f'Cluster {cluster}',
                    legendgroup=f'cluster_{cluster}',
                    showlegend=(col == 1),
                    row=1, col=col
                )

        # 3D plots
        for col, method in enumerate(self._METHODS_3D, start=1):
            projection = self.projections[method]
            for cluster, color in enumerate(colors):
                mask = clusters == cluster
                fig.add_scatter3d(
                    x=projection[mask, 0],
                    y=projection[mask, 1],
                    z=projection[mask, 2],
                    mode='markers',
                    marker=dict(color=color, size=3),
                    name=f'Cluster {cluster}',
                    legendgroup=f'cluster_{cluster}',
                    showlegend=False,
                    row=2, col=col
                )

        fig.update_layout(height=1000, title_text="Dimensionality Reduction Comparison")
        return fig

    def analyze_with_lux(self, intent_column='target'):
        """
        Use Lux for automatic visualization recommendations.

        Args:
            intent_column: Column to set as the Lux intent. The intent is
                skipped when this is None or the column does not exist.

        Returns:
            A copy of ``self.data`` with ``<METHOD>_x`` / ``<METHOD>_y``
            columns added for each 2D projection.

        Raises:
            ValueError: if no data is loaded.
            KeyError: if a 2D projection is missing.
        """
        if self.data is None:
            raise ValueError("No data loaded; call query_data() first.")

        # Combine original data with projections for Lux analysis
        analysis_df = self.data.copy()

        # Add 2D projections
        for method in self._METHODS_2D:
            coords = self.projections[method]
            analysis_df[f'{method}_x'] = coords[:, 0]
            analysis_df[f'{method}_y'] = coords[:, 1]

        # Set intent for Lux recommendations only when the column exists;
        # data loaded from a real table may not have a 'target' column.
        if intent_column is not None and intent_column in analysis_df.columns:
            analysis_df.intent = [intent_column]
        else:
            print(f"Column {intent_column!r} not found; no Lux intent set.")

        print("Lux automatic visualizations:")
        return analysis_df

    def get_feature_importance(self, method='PCA_2D'):
        """
        Get feature importance for interpretability.

        Shows a heatmap of the component loadings as a side effect.

        Args:
            method: Key of a fitted reducer in ``self.reducers``.

        Returns:
            DataFrame of loadings (features x components), or None when the
            reducer is unknown or exposes no ``components_`` attribute.
        """
        reducer = self.reducers.get(method)
        if reducer is None or not hasattr(reducer, 'components_'):
            return None

        # For PCA
        components = reducer.components_
        feature_importance = pd.DataFrame(
            components.T,
            columns=[f'Component_{i+1}' for i in range(components.shape[0])],
            index=self.feature_names
        )

        # Create heatmap
        plt.figure(figsize=(10, 6))
        sns.heatmap(feature_importance, annot=True, cmap='coolwarm', center=0)
        plt.title(f'{method} Feature Loadings')
        plt.tight_layout()
        plt.show()

        return feature_importance

# Usage Example and Demo
def run_analysis_pipeline():
    """
    Run the complete analysis pipeline on generated sample data.

    Loads data, preprocesses it, computes every projection, shows each
    individual plot and the comparison grid, prints the PCA loadings when
    available, and finally builds the Lux dataframe.

    Returns:
        Tuple of (analyzer, lux dataframe).
    """
    print("=== Multidimensional Data Analysis Pipeline ===\n")

    pipeline = MultiDimAnalyzer()

    # Step 1: no query or table is given, so sample data is generated.
    print("Step 1: Loading data...")
    loaded = pipeline.query_data()
    print(f"Data preview:\n{loaded.head()}\n")

    # Step 2: scale the numeric columns.
    print("Step 2: Preprocessing data...")
    pipeline.preprocess_data()

    # Step 3: compute all projections.
    print("Step 3: Applying dimensionality reduction techniques...")
    pipeline.apply_dimensionality_reduction()

    # Step 4: build one figure per projection and show each of them.
    print("Step 4: Creating interactive visualizations...")
    figures = pipeline.create_interactive_plots()
    for name in figures:
        print(f"\nDisplaying {name} plot:")
        figures[name].show()

    # Step 5: side-by-side comparison of all projections.
    print("Step 5: Creating comparison grid...")
    pipeline.display_comparison_grid().show()

    # Step 6: loadings are only reported when the reducer provides them.
    print("Step 6: Analyzing feature importance...")
    loadings = pipeline.get_feature_importance('PCA_2D')
    if loadings is not None:
        print("PCA Feature Loadings:")
        print(loadings)

    # Step 7: dataframe prepared for Lux.
    print("Step 7: Running Lux automatic analysis...")
    return pipeline, pipeline.analyze_with_lux()

# Interactive widget for method selection
def create_interactive_selector(analyzer):
    """
    Create interactive widgets for exploring different projections.

    Args:
        analyzer: Object exposing a ``projections`` dict that maps method
            names to coordinate arrays. Names containing ``'2D'`` are drawn
            as 2D scatters, all others as 3D scatters.

    Returns:
        An ``ipywidgets`` interactive container with a method dropdown.

    Raises:
        ValueError: if ``analyzer.projections`` is empty.
    """
    methods = list(analyzer.projections.keys())
    if not methods:
        raise ValueError(
            "analyzer.projections is empty; "
            "run apply_dimensionality_reduction() first."
        )

    # Dropdown rejects a value that is not among its options, so only
    # preselect 'PCA_2D' when that projection actually exists.
    initial_method = 'PCA_2D' if 'PCA_2D' in methods else methods[0]

    method_dropdown = widgets.Dropdown(
        options=methods,
        value=initial_method,
        description='Method:'
    )

    def update_plot(method):
        """Redraw the scatter for the selected projection."""
        projection = analyzer.projections[method]

        if '2D' in method:
            fig = px.scatter(
                x=projection[:, 0],
                y=projection[:, 1],
                title=f"Interactive {method} Projection"
            )
        else:
            fig = px.scatter_3d(
                x=projection[:, 0],
                y=projection[:, 1],
                z=projection[:, 2],
                title=f"Interactive {method} Projection"
            )

        fig.show()

    interactive_plot = widgets.interactive(update_plot, method=method_dropdown)
    return interactive_plot

# Run the pipeline
if __name__ == "__main__":
    # Entry point: run the whole pipeline, then list the follow-up options.
    print("Starting Multidimensional Data Analysis Pipeline...")
    analyzer, lux_df = run_analysis_pipeline()

    # One print call; joining with newlines yields the same output as
    # printing each message separately.
    print("\n".join((
        "\n=== Pipeline Complete! ===",
        "You can now:",
        "1. Explore the interactive plots above",
        "2. Use analyzer.data to access the original data",
        "3. Use analyzer.projections to access the reduced dimensions",
        "4. Use lux_df.intent = ['your_column'] for targeted Lux analysis",
        "5. Create custom queries with analyzer.query_data('YOUR SQL QUERY')",
    )))

ERROR: Ignored the following versions that require a different python version: 1.21.2 Requires-Python >=3.7,<3.11; 1.21.3 Requires-Python >=3.7,<3.11; 1.21.4 Requires-Python >=3.7,<3.11; 1.21.5 Requires-Python >=3.7,<3.11; 1.21.6 Requires-Python >=3.7,<3.11
ERROR: Could not find a version that satisfies the requirement orangewidget (from versions: none)
ERROR: No matching distribution found for orangewidget

ModuleNotFoundError: No module named 'plotly'