<a href="https://colab.research.google.com/github/daisysong76/AI--Machine--learning/blob/main/DiaTweet_Advanced_Algorithm_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

class AdvancedDiabetesTweetAnalyzer:
    """Sentiment, entity and TF-IDF analysis helpers for diabetes-related tweets.

    The constructor loads a RoBERTa sequence-classification checkpoint with
    its tokenizer, a spaCy pipeline for named entity recognition, and an
    (unfitted) TF-IDF vectorizer.
    """

    # Stored lowercase because entity text is lowercased before comparison;
    # mixed-case entries such as 'CGM' or 'A1C' could never match otherwise.
    _DIABETES_KEYWORDS = (
        'insulin', 'glucose', 'blood sugar',
        'cgm', 'a1c', 'diabetes'
    )

    # Upper bound on tokens passed to the classifier.
    # NOTE(review): 512 is the usual RoBERTa-base limit -- confirm for this checkpoint.
    _MAX_TOKENS = 512

    def __init__(self):
        # Sentiment Analysis Model
        self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(
            'cardiffnlp/twitter-roberta-base-sentiment'
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            'cardiffnlp/twitter-roberta-base-sentiment'
        )

        # NER Model
        self.nlp = spacy.load('en_core_web_trf')

        # TF-IDF Vectorizer
        self.tfidf = TfidfVectorizer(max_features=5000)

    def analyze_sentiment(self, tweet):
        """Classify a single tweet and report the winning class with its probability.

        Args:
            tweet: Text of one tweet.

        Returns:
            dict with 'sentiment' (index of the highest-probability class) and
            'confidence' (softmax probability of that class).
            NOTE(review): the model card lists 0=negative, 1=neutral,
            2=positive -- confirm before mapping indices to names.
        """
        # Truncate so unusually long input cannot exceed the model's
        # position-embedding limit.
        inputs = self.tokenizer(
            tweet,
            return_tensors='pt',
            truncation=True,
            max_length=self._MAX_TOKENS
        )

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.sentiment_model(**inputs)

        # One tweet in, so row 0 holds the class distribution.
        probabilities = torch.softmax(outputs.logits, dim=1)[0]
        confidence, label = torch.max(probabilities, dim=0)

        return {
            'sentiment': label.item(),
            'confidence': confidence.item()
        }

    def extract_entities(self, tweet):
        """Run spaCy NER on a tweet and flag diabetes-related entities.

        Args:
            tweet: Text of one tweet.

        Returns:
            A list with one dict per entity found by the pipeline, holding the
            entity 'text', its spaCy 'label', and a boolean
            'diabetes_relevance'.
        """
        doc = self.nlp(tweet)
        return [
            {
                'text': ent.text,
                'label': ent.label_,
                'diabetes_relevance': self._calculate_diabetes_relevance(ent)
            } for ent in doc.ents
        ]

    def _calculate_diabetes_relevance(self, entity):
        """Return True when the entity text contains a diabetes keyword.

        Matching is a case-insensitive substring test against
        ``_DIABETES_KEYWORDS``.

        Args:
            entity: Object exposing a ``text`` attribute (e.g. a spaCy span).
        """
        text = entity.text.lower()
        return any(keyword in text for keyword in self._DIABETES_KEYWORDS)

    def extract_features(self, tweets):
        """Fit the TF-IDF vectorizer on ``tweets`` and return their feature matrix.

        The vectorizer is re-fitted on every call, so the learned vocabulary
        always reflects only the most recent batch.
        """
        return self.tfidf.fit_transform(tweets)

# Usage Example: analyze one sample tweet and print both results.
tweet = "Managing my diabetes with a new insulin pump! #diabetes #health"
analyzer = AdvancedDiabetesTweetAnalyzer()

# Sentiment is computed first, then entities, matching the print order below.
sentiment, entities = (
    analyzer.analyze_sentiment(tweet),
    analyzer.extract_entities(tweet),
)

print("Sentiment:", sentiment)
print("Entities:", entities)

# DiaTweet: Advanced Algorithm Selection

## Sentiment Analysis Strategy
1. **Primary Model**: RoBERTa fine-tuned on Twitter data (`cardiffnlp/twitter-roberta-base-sentiment`); it is not specifically trained on healthcare text
   - High accuracy in understanding context
   - Handles noisy, domain-specific text
   - Low computational overhead

2. **Ensemble Approach**
   - Combine RoBERTa with rule-based sentiment scoring
   - Create a confidence-weighted sentiment classifier
   - Improve overall sentiment detection accuracy

## Named Entity Recognition (NER)
1. **Transformer-based SpaCy Model**
   - State-of-the-art performance
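   - General-purpose English pipeline (`en_core_web_trf`); it is not pre-trained on medical/health terminology, which is why the rule-based extension below is needed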
   - Efficient entity extraction

2. **Custom Rule-based Extension**
   - Add domain-specific diabetes-related entity patterns
   - Enhance SpaCy's default entity recognition
   - Capture nuanced medical terminology

## Text Classification Pipeline
1. **Multi-stage Classification**
   - Initial coarse-grained classification (Naive Bayes)
   - Fine-grained classification (Transformer models)
   - Hierarchical classification approach

## Feature Extraction
1. **Hybrid Embedding Approach**
   - Word2Vec for general semantic understanding
   - Domain-specific embeddings for medical context
   - TF-IDF for keyword importance

## Performance Optimization
- Use lightweight models (DistilBERT, MiniLM)
- Implement caching mechanisms
- Leverage GPU acceleration
- Batch processing of tweets

In [1]:
#!/usr/bin/env python3
import os
import logging
from typing import Dict, List, Optional

# Advanced Imports
import transformers
import torch
import spacy
import elasticsearch
import pandas as pd
import numpy as np
from typing import Any

# Microsoft Cognitive Services Integration
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

class DiaTweetAnalyzer:
    """Tweet analysis pipeline: spaCy NER, transformer sentiment, Azure
    sentiment insights, and Elasticsearch indexing/search."""

    # The sentiment checkpoint reports generic class ids; translate them to
    # readable names so stored documents can be filtered by e.g. 'POSITIVE'.
    # NOTE(review): mapping follows the model card (0=negative, 1=neutral,
    # 2=positive) -- confirm against the checkpoint's config.
    _SENTIMENT_LABELS = {
        'LABEL_0': 'NEGATIVE',
        'LABEL_1': 'NEUTRAL',
        'LABEL_2': 'POSITIVE'
    }

    def __init__(self,
                 twitter_api_key: str,
                 azure_endpoint: str,
                 azure_key: str,
                 es_host: str = 'localhost'):
        """
        Initialize DiaTweet advanced analysis pipeline

        Args:
            twitter_api_key (str): Twitter API authentication. Accepted for
                interface compatibility; nothing in this class reads it.
            azure_endpoint (str): Azure Cognitive Services endpoint
            azure_key (str): Azure service key
            es_host (str): Elasticsearch host
        """
        # Logging configuration
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(levelname)s: %(message)s')
        self.logger = logging.getLogger(__name__)

        # Azure Text Analytics Client
        self.text_analytics_client = TextAnalyticsClient(
            endpoint=azure_endpoint,
            credential=AzureKeyCredential(azure_key)
        )

        # Advanced NLP Models
        self.nlp = spacy.load('en_core_web_trf')  # Transformer-based model
        self.sentiment_model = transformers.pipeline(
            'sentiment-analysis',
            model='cardiffnlp/twitter-roberta-base-sentiment'
        )

        # Elasticsearch Configuration
        self.es_client = elasticsearch.Elasticsearch([es_host])

    def preprocess_tweet(self, tweet_text: str) -> Dict[str, Any]:
        """
        Advanced preprocessing and feature extraction

        Args:
            tweet_text (str): Raw tweet text

        Returns:
            Dict with the original text, categorized entities, the sentiment
            label and score, Azure insights (empty dict when that call
            fails), and per-token text / part-of-speech lists.
        """
        # SpaCy Named Entity Recognition
        doc = self.nlp(tweet_text)

        # Extract entities with advanced categorization
        entities = [
            {
                'text': ent.text,
                'label': ent.label_,
                'type': self._categorize_entity(ent)
            } for ent in doc.ents
        ]

        # Transformer-based sentiment analysis
        sentiment_result = self.sentiment_model(tweet_text)[0]
        raw_label = sentiment_result['label']
        # Unknown labels pass through untouched so other checkpoints still work.
        sentiment_label = self._SENTIMENT_LABELS.get(raw_label, raw_label)

        # Azure Cognitive Services additional insights
        azure_insights = self._get_azure_insights(tweet_text)

        return {
            'original_text': tweet_text,
            'preprocessed_entities': entities,
            'sentiment': {
                'label': sentiment_label,
                'score': sentiment_result['score']
            },
            'azure_insights': azure_insights,
            'linguistic_features': {
                'tokens': [token.text for token in doc],
                'pos_tags': [token.pos_ for token in doc]
            }
        }

    def _categorize_entity(self, entity) -> str:
        """
        Advanced entity categorization

        Args:
            entity: SpaCy Named Entity

        Returns:
            The first category listed for the entity's label, or 'generic'
            when the label has no entry.
        """
        diabetes_categories = {
            'PRODUCT': ['medication', 'medical_device'],
            'ORG': ['medical_institution', 'research_center'],
            'PERSON': ['medical_professional', 'patient']
        }

        return diabetes_categories.get(entity.label_, ['generic'])[0]

    def _get_azure_insights(self, text: str) -> Dict[str, Any]:
        """
        Leverage Azure Cognitive Services for advanced text insights

        Args:
            text (str): Input text

        Returns:
            Dict with 'sentiment' and a plain-dict 'sentiment_score'
            (positive / neutral / negative confidences), or an empty dict
            when the service call fails or reports a document error.
        """
        try:
            # Azure text analytics. The client returns one result per input
            # document as a list; iter() makes this work for a list or any
            # other iterable, whereas next() on a bare list raises TypeError.
            response = self.text_analytics_client.analyze_sentiment([text])
            insights = next(iter(response))

            # A per-document failure comes back as an error item, not an exception.
            if getattr(insights, 'is_error', False):
                error = getattr(insights, 'error', 'unknown error')
                self.logger.error(f"Azure insights extraction failed: {error}")
                return {}

            # Flatten the SDK score object to plain floats so the processed
            # tweet can be serialized when it is indexed.
            scores = insights.confidence_scores
            return {
                'sentiment_score': {
                    'positive': scores.positive,
                    'neutral': scores.neutral,
                    'negative': scores.negative
                },
                'sentiment': insights.sentiment
            }
        except Exception as e:
            # Best effort: Azure insights are optional enrichment.
            self.logger.error(f"Azure insights extraction failed: {e}")
            return {}

    def index_tweet(self, processed_tweet: Dict[str, Any]):
        """
        Index processed tweet to Elasticsearch

        Failures are logged rather than raised.

        Args:
            processed_tweet (Dict): Processed tweet metadata
        """
        try:
            self.es_client.index(
                index='diabetes_tweets',
                body=processed_tweet
            )
        except Exception as e:
            self.logger.error(f"Elasticsearch indexing failed: {e}")

    def advanced_query(self,
                       sentiment: Optional[str] = None,
                       entity_type: Optional[str] = None) -> List[Dict]:
        """
        Advanced search with multi-dimensional filtering

        Args:
            sentiment (str, optional): Sentiment filter
            entity_type (str, optional): Entity type filter

        Returns:
            List of matching hits from the 'diabetes_tweets' index
        """
        filters: List[Dict[str, Any]] = []

        if sentiment:
            filters.append({
                "match": {"sentiment.label": sentiment}
            })

        if entity_type:
            # This class never defines a nested mapping for
            # 'preprocessed_entities', so the index is dynamically mapped and
            # a "nested" query would be rejected; a plain match on the dotted
            # field works with that mapping.
            # NOTE(review): if the index is pre-created elsewhere with a
            # nested mapping, this clause must be wrapped in "nested" again.
            filters.append({
                "match": {"preprocessed_entities.type": entity_type}
            })

        query = {
            "query": {
                "bool": {
                    "must": filters
                }
            }
        }

        results = self.es_client.search(index='diabetes_tweets', body=query)
        return results['hits']['hits']

def main():
    """Demo run: process two sample tweets, index them, then query the index."""
    # Credentials come from the environment; a missing variable becomes ''.
    credentials = {
        'twitter_api_key': os.getenv('TWITTER_API_KEY', ''),
        'azure_endpoint': os.getenv('AZURE_ENDPOINT', ''),
        'azure_key': os.getenv('AZURE_KEY', ''),
    }
    analyzer = DiaTweetAnalyzer(**credentials)

    # Sample tweets
    sample_tweets = (
        "Just got my new insulin pump! #diabetes #tech",
        "Struggling with managing blood sugar levels today 😔",
    )

    # Process, store and echo each tweet in turn.
    for text in sample_tweets:
        record = analyzer.preprocess_tweet(text)
        analyzer.index_tweet(record)
        print(f"Processed Tweet: {record}")

    # Advanced query example: positive tweets mentioning a medical device.
    matches = analyzer.advanced_query(sentiment='POSITIVE',
                                      entity_type='medical_device')
    print("Query Results:", matches)


# Run the demo only when this file is executed directly.
if __name__ == '__main__':
    main()

ModuleNotFoundError: No module named 'elasticsearch'