# Step 1: Crawler Question Bank

# In [12]:
import logging
import re
from dataclasses import dataclass
from typing import List, Dict, Any, Optional

# In [13]:
# ============================================================================
# Step 1: Crawler Question Bank Management
# Responsible: Receiving and storing question data crawled from various travel platforms
# ============================================================================
@dataclass
class TravelQuestion:
    """
    One travel question as delivered by the crawler.

    Attributes:
        question_id: Unique identifier of the question.
        text: The question wording.
        question_type: Kind of question (single choice, multiple choice, open-ended).
        options: Answer options; only meaningful for choice questions.
        source_platform: Platform the question was crawled from.
        original_category: Preliminary category assigned by the crawler.
    """
    question_id: str
    text: str
    question_type: str
    options: List[str]
    source_platform: str
    original_category: str

class CrawlerQuestionBank:
    """
    Question bank for crawled data.

    Holds the questions collected from the travel platforms together with the
    set of platforms they came from. The ingestion and query methods are still
    placeholders and return empty results.
    """

    def __init__(self):
        # question_id -> TravelQuestion
        self.questions = dict()
        # Names of every platform that contributed data
        self.platforms = set()

    def add_crawled_questions(self, crawled_data: List[Dict]) -> Dict[str, Any]:
        """
        Ingest a batch of crawled questions.

        Input: Raw data list provided by crawler team
        Output: Statistics dictionary with 'added_count' and 'errors'
        """
        # TODO: Data validation, deduplication, storage logic
        # Need to check data format, remove duplicate questions, generate unique question_id
        return {'added_count': 0, 'errors': []}

    def get_questions_by_platform(self, platform: str) -> List[TravelQuestion]:
        """
        Return the questions that originate from the given platform
        (for example every question crawled from Booking).
        """
        # TODO: Platform filtering logic
        return list()

    def get_question_count_by_platform(self) -> Dict[str, int]:
        """
        Return how many questions each platform contributed.

        Output example: {'Ctrip': 50, 'Booking.com': 30, ...}
        """
        # TODO: Platform statistics logic
        return dict()

# Step 2: Data Integration and Preprocessing

# In [14]:
# ============================================================================
# Step 2: Data Integration and Preprocessing
# Responsible: Cleaning and standardizing crawled raw data
# ============================================================================

class DataIntegrationPreprocessor:
    """
    Data Integration and Preprocessing Class
    Main function: Clean, deduplicate, classify, and rate the complexity of raw questions

    A raw question may be an object exposing ``text``, ``options``,
    ``question_id``, ``source_platform`` and ``original_category`` attributes,
    or a plain dictionary using the same names as keys. Both forms are read
    through ``_get_field`` so they are treated identically.
    """

    # Options substituted when a question arrives without any usable option.
    # NOTE(review): because of this fallback, integrate_and_clean never yields
    # an "open_ended" question - confirm that this is intended.
    _DEFAULT_OPTIONS = ("Yes", "No")

    # Answers that mark a two-option question as a yes/no question.
    _YES_NO_ANSWERS = ("Yes", "No", "Have", "Don't have")

    # Lower-cased terms that add to the complexity score. The Chinese entries
    # mirror the English ones, since the category keywords in
    # ``travel_keywords`` are Chinese.
    _HARD_WORDS = (
        'visa', 'policy', 'planning', 'budget', 'compare', 'why',
        '签证', '政策', '规划', '预算', '比较', '为什么',
    )

    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    # Anything that is not a word character, whitespace, or one of . ? !
    _DISALLOWED_CHARS_RE = re.compile(r'[^\w\u4e00-\u9fff\s\.\?\!]')
    _WHITESPACE_RE = re.compile(r'\s+')
    # Numbering prefix such as "A.", "1)", "B、" or "10.". The delimiter is
    # mandatory, so a plain option like "Yes" keeps its first letter. The
    # look-aheads leave decimals ("3.5 stars") and dotted abbreviations
    # ("U.S. dollars") untouched.
    _OPTION_PREFIX_RE = re.compile(
        r'^(?:[A-Za-z]|\d{1,2})[\.、\)）](?!\d)(?![A-Za-z]\.)\s*'
    )

    def __init__(self):
        # Travel domain keyword dictionary for automatic question classification;
        # needs continuous improvement based on actual questionnaire content.
        self.travel_keywords = {
            '住宿': 'accommodation', '酒店': 'accommodation', '民宿': 'accommodation',
            '交通': 'transportation', '航班': 'transportation', '机票': 'transportation',
            '火车': 'transportation', '地铁': 'transportation', '租车': 'transportation',
            '美食': 'food', '餐厅': 'food', '小吃': 'food', '早餐': 'food',
            '景点': 'attraction', '门票': 'attraction', '博物馆': 'attraction',
            '签证': 'visa', '护照': 'visa', '入境': 'visa', '证件': 'visa',
            '预算': 'budget', '价格': 'budget', '费用': 'budget', '花费': 'budget'
        }

    def integrate_and_clean(self, raw_questions: List[Any]) -> List[Dict[str, Any]]:
        """
        Main method for data integration and cleaning
        Input: Raw question data (objects or dictionaries)
        Output: List of cleaned structured records, one per unique cleaned text,
                in order of first appearance

        Each record's 'frequency' is the total number of raw questions whose
        cleaned text equals the record's text. Records that cannot be processed
        are logged and skipped so that one bad item does not abort the batch.
        """
        unique_questions: Dict[str, Dict[str, Any]] = {}  # cleaned text -> record
        text_frequency: Dict[str, int] = {}  # cleaned text -> occurrence count

        for index, raw_question in enumerate(raw_questions):
            try:
                cleaned_text = self._clean_text(self._extract_question_text(raw_question))
                if not cleaned_text:  # Empty before or after cleaning
                    continue

                # Count every occurrence, duplicates included
                text_frequency[cleaned_text] = text_frequency.get(cleaned_text, 0) + 1

                # Only the first occurrence is turned into a record
                if cleaned_text in unique_questions:
                    continue

                unique_questions[cleaned_text] = self._build_record(
                    index, raw_question, cleaned_text
                )
            except Exception as exc:
                # Best effort: report the bad record instead of hiding it
                logging.getLogger(__name__).warning(
                    "Skipping question at index %d: %s", index, exc
                )
                continue

        # Frequencies are only final once the whole batch has been seen
        for text, record in unique_questions.items():
            record['frequency'] = text_frequency[text]

        return list(unique_questions.values())

    def _build_record(self, index: int, raw_question: Any, cleaned_text: str) -> Dict[str, Any]:
        """Assemble the structured record for the first occurrence of a question."""
        options = self._standardize_options(self._extract_options(raw_question))
        question_type = self._determine_question_type(options)

        question_id = self._get_field(raw_question, 'question_id', None)
        if question_id is None or question_id == "":
            question_id = f'Q{index:04d}'  # Positional fallback ID

        return {
            'question_id': question_id,
            'text': cleaned_text,
            'options': options,
            'question_type': question_type,
            'category': self.categorize_question(cleaned_text),
            'complexity': self.calculate_complexity(cleaned_text, options, question_type),
            'frequency': 1,  # Replaced by the final count in integrate_and_clean
            'source_platform': self._get_field(raw_question, 'source_platform', 'unknown'),
            'original_category': self._get_field(raw_question, 'original_category', 'unknown')
        }

    def get_question_frequency_report(self, processed_questions: List[Dict]) -> Dict[str, Any]:
        """
        Generate question frequency statistics report
        Input: Records produced by integrate_and_clean
        Output: Dictionary with 'total_questions', 'unique_questions',
                'top_questions' (up to 10, highest frequency first) and
                'frequency_distribution' (frequency -> number of questions);
                an empty dictionary when there is nothing to report
        """
        if not processed_questions:
            return {}

        # A record without 'frequency' counts as one occurrence
        text_frequency = {}
        for question in processed_questions:
            text_frequency[question['text']] = question.get('frequency', 1)

        ranked = sorted(text_frequency.items(), key=lambda item: item[1], reverse=True)

        freq_dist = {}
        for count in text_frequency.values():
            freq_dist[count] = freq_dist.get(count, 0) + 1

        return {
            'total_questions': len(processed_questions),
            'unique_questions': len(text_frequency),
            'top_questions': [
                {'question': text, 'frequency': freq} for text, freq in ranked[:10]
            ],
            'frequency_distribution': freq_dist
        }

    @staticmethod
    def _get_field(raw_question: Any, name: str, default: Any = None) -> Any:
        """Read a field from a dictionary key or an object attribute."""
        if isinstance(raw_question, dict):
            return raw_question.get(name, default)
        return getattr(raw_question, name, default)

    def _extract_question_text(self, raw_question: Any) -> str:
        """Extract question text from raw data; empty string when absent"""
        return self._get_field(raw_question, 'text', "") or ""

    def _clean_text(self, text: str) -> str:
        """
        Clean question text: remove HTML tags, replace every character that is
        not a word character, whitespace or one of . ? ! with a space, then
        collapse whitespace. Non-string or empty input yields an empty string.
        """
        if not isinstance(text, str) or not text:
            return ""
        without_tags = self._HTML_TAG_RE.sub('', text)
        allowed_only = self._DISALLOWED_CHARS_RE.sub(' ', without_tags)
        return self._WHITESPACE_RE.sub(' ', allowed_only).strip()

    def _extract_options(self, raw_question: Any) -> List[str]:
        """Extract options from raw data; empty list when absent"""
        return self._get_field(raw_question, 'options', None) or []

    def _standardize_options(self, options: List[str]) -> List[str]:
        """
        Standardize option format: strip numbering prefixes, collapse whitespace
        Returns the default ["Yes", "No"] when no usable option remains.
        """
        standardized = []
        for option in options or []:
            if option is None:
                continue
            # Tolerate non-string options (e.g. numbers) instead of failing
            text = option if isinstance(option, str) else str(option)
            text = self._OPTION_PREFIX_RE.sub('', text.strip(), count=1)
            text = self._WHITESPACE_RE.sub(' ', text).strip()
            if text:
                standardized.append(text)

        return standardized if standardized else list(self._DEFAULT_OPTIONS)

    def _determine_question_type(self, options: List[str]) -> str:
        """
        Determine question type based on options
        0 options -> "open_ended"; 2 options with at least one yes/no answer ->
        "yes_no"; up to 4 options -> "single_choice"; otherwise "multiple_choice"
        """
        count = len(options)
        if count == 0:
            return "open_ended"
        if count == 2 and any(opt in self._YES_NO_ANSWERS for opt in options):
            return "yes_no"
        if count <= 4:
            return "single_choice"
        return "multiple_choice"

    def calculate_complexity(self, text: str, options: List[str], question_type: str) -> str:
        """
        Calculate question complexity
        Based on: text length, number of options, question type, keyword difficulty
        Output: "hard" (score >= 6), "medium" (score >= 3) or "easy"
        """
        score = 0

        # Text length scoring
        if len(text) > 60:
            score += 3
        elif len(text) > 40:
            score += 2
        elif len(text) > 20:
            score += 1

        # Option count scoring
        if len(options) > 6:
            score += 2
        elif len(options) > 4:
            score += 1

        # Question type scoring
        if question_type == "open_ended":
            score += 2
        elif question_type == "multiple_choice":
            score += 1

        # Keyword difficulty scoring; compared in lower case so "budget" and
        # "Budget" are treated alike
        lowered = text.lower()
        if any(word in lowered for word in self._HARD_WORDS):
            score += 2

        # Final classification
        if score >= 6:
            return "hard"
        if score >= 3:
            return "medium"
        return "easy"

    def categorize_question(self, text: str) -> str:
        """
        Automatically classify questions based on keywords
        Returns the category of the first keyword (in dictionary order) found
        in the text, or "general" when none matches.
        """
        if not text:
            return "general"
        for keyword, category in self.travel_keywords.items():
            if keyword in text:
                return category
        return "general"

# Step 3: Cluster Analysis

# In [15]:
# ============================================================================
# Step 3: Cluster Analysis
# Responsible: Using K-means and other algorithms to group similar questions
# ============================================================================

class QuestionClusterAnalyzer:
    """
    Cluster analysis skeleton.

    Keeps the requested number of clusters and a flag telling whether
    perform_cluster_analysis has been run. The clustering itself is not
    implemented yet; every method returns placeholder values.
    """

    def __init__(self, n_clusters: int = 6):
        # TODO: Add vectorizer, clustering model, etc.
        self.is_trained = False       # Becomes True after perform_cluster_analysis
        self.n_clusters = n_clusters  # Number of clusters: defined based on data situation

    def perform_cluster_analysis(self, processed_questions: List[Dict]) -> Dict[str, Any]:
        """
        Run the cluster analysis and mark the analyzer as trained.

        Input: Cleaned question data
        Output: Result dictionary (cluster count, number of questions processed,
                and placeholder assignments, silhouette score and keywords)
        """
        # TODO: Specific clustering steps to implement:
        # 1. Text vectorization (TF-IDF or Word2Vec)
        # 2. K-means clustering algorithm
        # 3. Cluster result analysis (silhouette score, etc.)
        # 4. Extract keywords for each cluster
        question_total = len(processed_questions)
        summary = dict()
        summary['cluster_count'] = self.n_clusters        # Number of clusters used
        summary['questions_processed'] = question_total   # Total number of questions processed
        summary['cluster_assignments'] = []               # Which cluster each question belongs to
        summary['silhouette_score'] = 0.0                 # Cluster effectiveness score
        summary['cluster_keywords'] = {}                  # Characteristic keywords for each cluster

        self.is_trained = True
        return summary

    def assign_cluster(self, question_text: str) -> int:
        """Return the cluster for a new question (currently always cluster 0)."""
        fallback_cluster = 0
        if self.is_trained:
            # TODO: Use trained model to predict cluster for new question
            return 0
        # Untrained model: fall back to the default cluster
        return fallback_cluster

    def get_cluster_keywords(self, cluster_id: int) -> List[str]:
        """Return the characteristic keywords of a cluster (currently none)."""
        # TODO: Analyze cluster centers, extract representative words
        return list()

# Step 4: Question Hierarchy and Recursive Relationships

# In [16]:
# ============================================================================
# Step 4: Question Hierarchy and Recursive Relationships
# Responsible: Establishing logical relationship network between questions, implementing personalized questionnaires
# ============================================================================

@dataclass
class QuestionNode:
    """
    Question Node Class
    Used to build hierarchical structure of questions (tree relationships)

    ``children`` defaults to None only to avoid a shared mutable default;
    after construction it is always a list owned by the instance.
    """
    question_id: str           # Question ID
    question_text: str         # Question text
    cluster_id: int            # Belonging cluster
    complexity: str            # Complexity level
    category: str              # Question category
    parent_id: Optional[str] = None            # Parent question ID (for building hierarchy)
    children: Optional[List[str]] = None       # Child question ID list ([] after init)

    def __post_init__(self):
        """Post-initialization processing: replace a missing children list with a fresh empty one"""
        if self.children is None:
            self.children = []

class QuestionHierarchyBuilder:
    """
    Builder for the question hierarchy.

    Intended to derive a relationship network between questions from the
    clustering results. Only the data containers exist so far; the building
    and recommendation logic are placeholders.
    """

    def __init__(self, cluster_analyzer: QuestionClusterAnalyzer):
        self.cluster_analyzer = cluster_analyzer
        self.question_nodes = dict()       # Store all question nodes
        self.cluster_hierarchies = dict()  # Questions grouped by cluster

    def build_hierarchy_from_clusters(self, processed_questions: List[Dict]) -> Dict[str, Any]:
        """
        Build the hierarchy from clustered questions.

        Input: Cleaned and clustered question data
        Output: Summary with the number of stored nodes, maximum depth,
                root nodes and leaf nodes
        """
        # TODO: Building strategies to implement:
        # 1. Build based on clustering: Establish relationships between questions in the same cluster, e.g., all questions about "accommodation" grouped together
        # 2. Build based on complexity: Simple → Medium → Complex progression, e.g., first ask simple "Do you like traveling?" then complex "What is your travel budget?"
        # 3. Build based on categories: Horizontal relationships between related category questions, e.g., accommodation questions and transportation questions may have logical relationships
        node_total = len(self.question_nodes)
        return {
            'total_nodes': node_total,
            'max_depth': 0,    # Maximum depth of hierarchy
            'root_nodes': [],  # List of root nodes
            'leaf_nodes': [],  # List of leaf nodes
        }

    def get_next_questions(self, current_question_id: str, user_answers: Dict[str, str]) -> List[str]:
        """
        Recommend the follow-up questions for the current question and the
        answers given so far (core of the personalized questionnaire).

        Output: List of recommended question IDs (currently always empty)
        """
        # TODO: Recommendation strategies to implement:
        # 1. Based on child node relationships, e.g., answer "Do you like traveling?" as "Yes", then recommend child question "What type of travel do you prefer?"
        # 2. Based on other questions in same cluster, e.g., user answering accommodation questions, recommend other accommodation-related questions
        # 3. Based on analysis of user answer content, e.g., user says "limited budget", recommend economy-related questions
        # 4. Based on question category associations, e.g., user answered accommodation question, recommend related transportation questions
        return list()

    def get_question_relationships(self) -> List[Dict[str, Any]]:
        """Return relationship data for all questions, for visualization (currently empty)."""
        # TODO: Extract parent-child relationships, cluster relationships, etc. for drawing relationship diagrams
        return list()

# Step 5: Personalized Survey Questionnaire Implementation

# In [17]:
# ============================================================================
# Step 5: Personalized Survey Questionnaire Implementation
# Responsible: Integrate first 4 steps, implement personalized survey questionnaire through question hierarchy relationships
# ============================================================================

class TravelSurveySystem:
    """
    Personalized survey front end.

    Meant to tie the first four steps together. At present it only keeps the
    user sessions; the component slots are unset and the workflow, answer
    handling and reporting methods return placeholders.
    """

    def __init__(self):
        # Store user session data, keyed by session ID
        self.user_sessions = {}
        # TODO: Initialize component instances from first 4 steps
        self.crawler_bank = None
        self.preprocessor = None
        self.cluster_analyzer = None
        self.hierarchy_builder = None

    def process_complete_workflow(self, crawled_data: List[Dict]) -> Dict[str, Any]:
        """
        Run the whole pipeline (Steps 1-4).

        Input: Crawler raw data
        Output: One (currently empty) result dictionary per step
        """
        # TODO: Complete workflow to implement:
        # 1. Data storage (Step 1)
        # 2. Data cleaning (Step 2)
        # 3. Cluster analysis (Step 3)
        # 4. Hierarchy building (Step 4)
        step_keys = (
            'crawl_results',          # Step 1 results
            'preprocessing_results',  # Step 2 results
            'cluster_results',        # Step 3 results
            'hierarchy_results',      # Step 4 results
        )
        return {key: {} for key in step_keys}

    def start_personalized_survey(self, user_id: str, user_profile: Dict[str, Any]) -> str:
        """
        Open a survey session for a user.

        Input: User ID and user profile (age, travel experience, etc.)
        Output: Session ID; the session is stored in user_sessions under it
        """
        # TODO: Select initial questions based on user profile
        session_id = f"user_{user_id}"
        session_state = dict(
            user_id=user_id,
            profile=user_profile,
            answers={},            # Answered questions and answers
            current_questions=[],  # Current questions to answer
        )
        self.user_sessions[session_id] = session_state
        return session_id

    def submit_answer(self, session_id: str, question_id: str, answer: str) -> List[str]:
        """
        Accept an answer and return the IDs of the next questions
        (core of the dynamic questionnaire; currently always empty).
        """
        # TODO:
        # 1. Record user answer
        # 2. Recommend next questions based on hierarchy relationships
        # 3. Update session status
        return list()

    def generate_survey_report(self, session_id: str) -> Dict[str, Any]:
        """Return the personalized survey report for a session (currently empty)."""
        # TODO: Analyze user answers, generate personalized insights and recommendations, while classifying users
        return dict()

    def get_system_statistics(self) -> Dict[str, Any]:
        """Return system operation statistics (currently empty)."""
        # TODO: Question bank size, user count, questionnaire completion rate, etc.
        return dict()

# Step 6: Data Visualization

# In [18]:
# ============================================================================
# Step 6: Data Visualization
# Responsible: Generate various charts and dashboards to display data analysis results
# ============================================================================

class DataVisualization:
    """
    Chart and dashboard generator.

    Every create_* method is a placeholder that returns the file name of the
    chart or report it is meant to produce; nothing is rendered yet.
    """

    def __init__(self):
        # Chart template configuration
        self.chart_templates = dict()

    def create_cluster_visualization(self, cluster_data: Dict[str, Any]) -> str:
        """
        Cluster result chart.
        Display: Distribution of questions in 2D space, different clusters in different colors
        Output: Chart file path
        """
        # TODO: Use matplotlib/seaborn to create scatter plot
        # Display distribution of each cluster and cluster centers
        chart_path = "cluster_visualization.png"
        return chart_path

    def create_category_distribution_chart(self, category_data: Dict[str, int]) -> str:
        """
        Question category distribution chart.
        Display: Number distribution of questions across travel question categories
        Output: Chart file path
        """
        # TODO: Create bar chart or pie chart
        # Display number of questions in accommodation, transportation, food, etc. categories
        chart_path = "category_distribution.png"
        return chart_path

    def create_complexity_analysis_chart(self, complexity_data: Dict[str, int]) -> str:
        """
        Complexity analysis chart.
        Display: Proportion distribution of easy, medium, hard questions
        Output: Chart file path
        """
        # TODO: Create stacked bar chart or donut chart
        chart_path = "complexity_analysis.png"
        return chart_path

    def create_platform_comparison_chart(self, platform_data: Dict[str, int]) -> str:
        """
        Platform data comparison chart.
        Display: Comparison of question quantity and quality across different travel platforms
        Output: Chart file path
        """
        # TODO: Create multi-series bar chart
        # Compare data characteristics of Ctrip, Booking.com, etc. platforms
        chart_path = "platform_comparison.png"
        return chart_path

    def create_survey_progress_dashboard(self, survey_data: Dict[str, Any]) -> str:
        """
        Survey progress dashboard.
        Display: Questionnaire completion status, user participation, etc.
        Output: Dashboard file path
        """
        # TODO: Create comprehensive dashboard
        dashboard_path = "survey_dashboard.html"
        return dashboard_path

    def create_question_relationship_graph(self, relationship_data: List[Dict]) -> str:
        """
        Question relationship network graph.
        Display: Hierarchical relationships and association networks between questions
        Output: Chart file path
        """
        # TODO: Use networkx to create network graph
        # Nodes represent questions, edges represent association relationships
        chart_path = "question_relationship.png"
        return chart_path

    def create_user_analysis_report(self, user_data: Dict[str, Any]) -> str:
        """
        Personalized analysis report for an individual user.
        Input: User data, including user profile, answer records, classification results, dimension scores, etc.
        Output: Report file path (visualization such as html)
        """
        # TODO: Report content: User type, classification basis (key answers), feature radar chart (displaying user scores across multiple dimensions), personalized travel recommendations (destinations, activities, budget, etc.) implement detailed analysis (such as html implementation), mainly including overall user behavior analysis, preference insights, and feedback after individual user completes all questions (user type and personalized recommendations)
        report_path = "user_analysis_report.html"
        return report_path

    def create_allusers_analysis_report(self, system_data: Dict[str, Any]) -> str:
        """
        Overall user analysis report.
        Content: Total users, classification status, completion rate, etc.
        Output: Report file path
        """
        report_path = "allusers_analysis_report.html"
        return report_path

    def export_all_visualizations(self, system_data: Dict[str, Any]) -> Dict[str, str]:
        """
        Produce every chart and report in one call.

        Input: Data from various system modules; a missing section is
               replaced by an empty container
        Output: Dictionary mapping each visualization name to its file path
        """
        # (result key, renderer, key in system_data, fallback when absent)
        # Built per call so the fallback containers are never shared.
        export_plan = (
            ('clusters', self.create_cluster_visualization, 'cluster_data', {}),
            ('categories', self.create_category_distribution_chart, 'category_data', {}),
            ('complexity', self.create_complexity_analysis_chart, 'complexity_data', {}),
            ('platforms', self.create_platform_comparison_chart, 'platform_data', {}),
            ('survey_dashboard', self.create_survey_progress_dashboard, 'survey_data', {}),
            ('relationships', self.create_question_relationship_graph, 'relationship_data', []),
            ('individual_user_report', self.create_user_analysis_report, 'user_data', {}),
            ('all_users_report', self.create_allusers_analysis_report, 'system_stats', {}),
        )
        return {
            label: render(system_data.get(data_key, fallback))
            for label, render, data_key, fallback in export_plan
        }

# Step 7: Main Program Library Integration

# In [19]:
# ============================================================================
# Main Program Library Integration
# Responsible: Integrate all components from 6 steps, provide unified interface
# ============================================================================

class TravelSurveyLibrary:
    """
    Travel Survey Questionnaire Program Library

    Creates one instance of each component from Steps 1-6 and exposes them
    as attributes.
    """

    def __init__(self):
        # Step 3 is created first as a local because Step 4 is built on top of it
        analyzer = QuestionClusterAnalyzer(n_clusters=6)

        # Step 1: Crawler Question Bank
        self.crawler_bank = CrawlerQuestionBank()

        # Step 2: Data Integration and Preprocessing
        self.data_preprocessor = DataIntegrationPreprocessor()

        # Step 3: Cluster Analysis
        self.cluster_analyzer = analyzer

        # Step 4: Question Hierarchy and Recursive Relationships
        self.hierarchy_builder = QuestionHierarchyBuilder(analyzer)

        # Step 5: Personalized Survey Questionnaire Implementation
        self.survey_system = TravelSurveySystem()

        # Step 6: Data Visualization
        self.visualization = DataVisualization()