In [10]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

class ConceptNetJunkDetector:
    """
    Automated system to detect and filter junk entries in ConceptNet data
    """
    
    def __init__(self, df, sample_size=100000):
        """
        Initialize with ConceptNet dataframe
        
        Args:
            df: ConceptNet dataframe with columns [relation_type, start_concept, end_concept, edge_weight]
            sample_size: Size of sample for expensive operations
        """
        self.df = df
        self.sample_size = min(sample_size, len(df))
        self.junk_patterns = {}
        self.quality_scores = {}
        
        print(f"🔍 Initializing Junk Detector with {len(df):,} triples")
        print(f"   Working with sample size: {self.sample_size:,}")
    
    def analyze_all(self, verbose=True):
        """Run all analysis methods and return comprehensive report"""
        print("\n" + "="*60)
        print("🚀 RUNNING COMPREHENSIVE JUNK ANALYSIS")
        print("="*60)
        
        results = {
            'length_analysis': self.analyze_concept_lengths(),
            'character_analysis': self.analyze_character_patterns(),
            'frequency_analysis': self.analyze_concept_frequency(),
            'relation_analysis': self.analyze_relation_quality(),
            'weight_analysis': self.analyze_edge_weights(),
            'semantic_analysis': self.analyze_semantic_validity(),
            'connectivity_analysis': self.analyze_connectivity_patterns()
        }
        
        # Generate filtering recommendations
        self.recommendations = self.generate_filtering_recommendations(results)
        
        if verbose:
            self.print_analysis_report(results)
        
        return results
    
    def analyze_concept_lengths(self):
        """Analyze the length distribution of concepts"""
        print("\n📏 Analyzing concept lengths...")
        
        all_concepts = pd.concat([self.df['start_concept'], self.df['end_concept']])
        concept_lengths = all_concepts.str.len()
        
        analysis = {
            'single_char_concepts': list(all_concepts[concept_lengths == 1].value_counts().head(20).index),
            'very_short_concepts': list(all_concepts[concept_lengths <= 2].value_counts().head(20).index),
            'very_long_concepts': list(all_concepts[concept_lengths > 50].value_counts().head(10).index),
            'length_distribution': {
                '1_char': (concept_lengths == 1).sum(),
                '2-3_chars': ((concept_lengths >= 2) & (concept_lengths <= 3)).sum(),
                '4-10_chars': ((concept_lengths >= 4) & (concept_lengths <= 10)).sum(),
                '11-20_chars': ((concept_lengths >= 11) & (concept_lengths <= 20)).sum(),
                '21-50_chars': ((concept_lengths >= 21) & (concept_lengths <= 50)).sum(),
                '>50_chars': (concept_lengths > 50).sum()
            }
        }
        
        # Flag suspicious patterns
        analysis['suspicious_single_chars'] = [c for c in analysis['single_char_concepts'] 
                                               if not c.isalpha() or c.lower() not in ['a', 'i']]
        
        print(f"   Found {len(analysis['suspicious_single_chars'])} suspicious single-char concepts")
        print(f"   Examples: {analysis['suspicious_single_chars'][:10]}")
        
        return analysis
    
    def analyze_character_patterns(self):
        """Detect non-standard character patterns"""
        print("\n🔤 Analyzing character patterns...")
        
        # Sample for performance
        sample = self.df.sample(n=self.sample_size, random_state=42)
        all_concepts = pd.concat([sample['start_concept'], sample['end_concept']])
        
        patterns = {
            'only_numbers': [],
            'only_punctuation': [],
            'mixed_alphanumeric': [],
            'contains_underscore': [],
            'contains_dots': [],
            'non_ascii': [],
            'looks_like_code': [],
            'url_like': []
        }
        
        for concept in tqdm(all_concepts.unique(), desc="Scanning concepts"):
            if re.match(r'^\d+$', concept):
                patterns['only_numbers'].append(concept)
            elif re.match(r'^[^a-zA-Z0-9]+$', concept):
                patterns['only_punctuation'].append(concept)
            elif '_' in concept:
                patterns['contains_underscore'].append(concept)
            elif '.' in concept and not concept.replace('.', '').isalpha():
                patterns['contains_dots'].append(concept)
            elif re.search(r'[^\x00-\x7F]', concept):
                patterns['non_ascii'].append(concept)
            elif re.match(r'^[a-z0-9_]+\(\)$|^[A-Z_]+$|^\$\w+$', concept):
                patterns['looks_like_code'].append(concept)
            elif re.match(r'^(http|www|\.com|\.org)', concept):
                patterns['url_like'].append(concept)
        
        # Limit lists and get counts
        pattern_keys = list(patterns.keys())  # Create a copy of keys
        for key in pattern_keys:
            patterns[f'{key}_count'] = len(patterns[key])
            patterns[key] = patterns[key][:20]  # Keep only examples
        
        print(f"   Found {patterns['only_numbers_count']} pure number concepts")
        print(f"   Found {patterns['looks_like_code_count']} code-like concepts")
        
        return patterns
    
    def analyze_concept_frequency(self):
        """Analyze frequency distribution to find outliers"""
        print("\n📊 Analyzing concept frequency...")
        
        all_concepts = pd.concat([self.df['start_concept'], self.df['end_concept']])
        concept_freq = all_concepts.value_counts()
        
        analysis = {
            'top_20_concepts': concept_freq.head(20).to_dict(),
            'single_occurrence': (concept_freq == 1).sum(),
            'rare_concepts': (concept_freq <= 3).sum(),
            'frequency_stats': {
                'mean': concept_freq.mean(),
                'median': concept_freq.median(),
                'std': concept_freq.std(),
                'max': concept_freq.max()
            }
        }
        
        # Identify suspiciously frequent short concepts
        short_frequent = []
        for concept, freq in concept_freq.head(100).items():
            if len(concept) <= 2 and freq > 1000:
                short_frequent.append((concept, freq))
        
        analysis['suspicious_frequent'] = short_frequent[:20]
        
        print(f"   Total unique concepts: {len(concept_freq):,}")
        print(f"   Concepts appearing only once: {analysis['single_occurrence']:,}")
        print(f"   Top concept: '{concept_freq.index[0]}' appears {concept_freq.iloc[0]:,} times")
        
        return analysis
    
    def analyze_relation_quality(self):
        """Analyze relation types and their quality"""
        print("\n🔗 Analyzing relation quality...")
        
        relation_stats = self.df.groupby('relation_type').agg({
            'edge_weight': ['mean', 'std', 'count']
        }).round(3)
        
        # Sample some triples for each relation to check validity
        relation_examples = {}
        suspicious_relations = []
        
        for relation in self.df['relation_type'].unique():
            rel_df = self.df[self.df['relation_type'] == relation]
            examples = rel_df.sample(n=min(5, len(rel_df))).values.tolist()
            relation_examples[relation] = examples
            
            # Check for suspicious patterns
            avg_weight = rel_df['edge_weight'].mean()
            if avg_weight == 1.0 and len(rel_df) > 100:
                suspicious_relations.append(relation)
        
        analysis = {
            'relation_stats': relation_stats.to_dict(),
            'total_relation_types': self.df['relation_type'].nunique(),
            'suspicious_relations': suspicious_relations,
            'relation_examples': relation_examples
        }
        
        print(f"   Found {len(suspicious_relations)} suspicious relation types")
        print(f"   Total relation types: {analysis['total_relation_types']}")
        
        return analysis
    
    def analyze_edge_weights(self):
        """Analyze edge weight distribution"""
        print("\n⚖️ Analyzing edge weights...")
        
        weight_dist = self.df['edge_weight'].value_counts().sort_index()
        
        analysis = {
            'weight_distribution': weight_dist.to_dict(),
            'unique_weights': len(weight_dist),
            'all_same_weight': len(weight_dist) == 1,
            'mostly_default': (self.df['edge_weight'] == 1.0).sum() / len(self.df),
            'weight_stats': {
                'mean': self.df['edge_weight'].mean(),
                'std': self.df['edge_weight'].std(),
                'min': self.df['edge_weight'].min(),
                'max': self.df['edge_weight'].max()
            }
        }
        
        print(f"   Unique weight values: {analysis['unique_weights']}")
        print(f"   Percentage with weight=1.0: {analysis['mostly_default']*100:.1f}%")
        
        return analysis
    
    def analyze_semantic_validity(self):
        """Check semantic validity of relations"""
        print("\n🧠 Analyzing semantic validity...")
        
        # Sample for performance
        sample = self.df.sample(n=min(10000, len(self.df)), random_state=42)
        
        invalid_patterns = []
        
        # Check for nonsensical relations
        for _, row in tqdm(sample.iterrows(), total=len(sample), desc="Checking semantics"):
            start, rel, end = row['start_concept'], row['relation_type'], row['end_concept']
            
            # Single letter/number antonyms
            if rel == 'Antonym' and (len(start) == 1 or len(end) == 1):
                if not (start.isalpha() and end.isalpha()):
                    invalid_patterns.append(('single_char_antonym', start, rel, end))
            
            # Numbers with non-numeric relations
            elif start.isdigit() or end.isdigit():
                if rel not in ['GreaterThan', 'LessThan', 'EqualTo', 'RelatedTo']:
                    invalid_patterns.append(('number_invalid_relation', start, rel, end))
            
            # Same concept relations
            elif start == end and rel not in ['RelatedTo', 'SimilarTo']:
                invalid_patterns.append(('self_relation', start, rel, end))
        
        # Group by pattern type
        pattern_counts = Counter([p[0] for p in invalid_patterns])
        
        analysis = {
            'invalid_pattern_counts': dict(pattern_counts),
            'invalid_examples': invalid_patterns[:50],
            'estimated_invalid_percentage': len(invalid_patterns) / len(sample) * 100
        }
        
        print(f"   Estimated invalid triples: {analysis['estimated_invalid_percentage']:.1f}%")
        
        return analysis
    
    def analyze_connectivity_patterns(self):
        """Analyze graph connectivity patterns to find isolated junk"""
        print("\n🕸️ Analyzing connectivity patterns...")
        
        # Find concepts that only appear with low-weight edges
        concept_avg_weights = {}
        
        for concept in tqdm(pd.concat([self.df['start_concept'], self.df['end_concept']]).unique()[:10000], 
                           desc="Analyzing connectivity"):
            mask = (self.df['start_concept'] == concept) | (self.df['end_concept'] == concept)
            if mask.any():
                avg_weight = self.df[mask]['edge_weight'].mean()
                connection_count = mask.sum()
                concept_avg_weights[concept] = (avg_weight, connection_count)
        
        # Find poorly connected concepts
        poorly_connected = [(c, w, n) for c, (w, n) in concept_avg_weights.items() 
                           if w < 0.3 or n == 1]
        
        analysis = {
            'poorly_connected_concepts': poorly_connected[:50],
            'single_connection_concepts': len([c for c, (w, n) in concept_avg_weights.items() if n == 1]),
            'low_weight_concepts': len([c for c, (w, n) in concept_avg_weights.items() if w < 0.5])
        }
        
        print(f"   Concepts with single connection: {analysis['single_connection_concepts']:,}")
        
        return analysis
    
    def generate_filtering_recommendations(self, results):
        """Generate specific filtering recommendations based on analysis"""
        print("\n🎯 Generating filtering recommendations...")
        
        recommendations = {
            'remove_concepts': set(),
            'remove_relations': set(),
            'remove_patterns': [],
            'weight_threshold': 0.5,
            'estimated_reduction': 0
        }
        
        # Add junk concepts
        recommendations['remove_concepts'].update(results['length_analysis']['suspicious_single_chars'])
        recommendations['remove_concepts'].update([c for c, _ in results['frequency_analysis']['suspicious_frequent']])
        recommendations['remove_concepts'].update(results['character_analysis']['only_numbers'][:50])
        recommendations['remove_concepts'].update(results['character_analysis']['only_punctuation'])
        
        # Add patterns
        if results['character_analysis']['only_numbers_count'] > 100:
            recommendations['remove_patterns'].append(r'^\d+$')  # Pure numbers
        
        if results['character_analysis']['looks_like_code_count'] > 50:
            recommendations['remove_patterns'].append(r'^[a-z0-9_]+\(\)$')  # Function calls
        
        # Estimate reduction
        mask = (self.df['start_concept'].isin(recommendations['remove_concepts']) | 
                self.df['end_concept'].isin(recommendations['remove_concepts']))
        recommendations['estimated_reduction'] = mask.sum()
        
        return recommendations
    
    def apply_smart_filter(self, aggressive=False):
        """Apply recommended filters and return cleaned dataframe"""
        print("\n🧹 Applying smart filters...")
        
        original_size = len(self.df)
        filtered_df = self.df.copy()
        
        # Remove identified junk concepts
        junk_concepts = list(self.recommendations['remove_concepts'])
        filtered_df = filtered_df[~filtered_df['start_concept'].isin(junk_concepts)]
        filtered_df = filtered_df[~filtered_df['end_concept'].isin(junk_concepts)]
        
        print(f"   Removed {original_size - len(filtered_df):,} triples with junk concepts")
        
        # Apply regex patterns
        for pattern in self.recommendations['remove_patterns']:
            mask = (filtered_df['start_concept'].str.match(pattern) | 
                   filtered_df['end_concept'].str.match(pattern))
            filtered_df = filtered_df[~mask]
        
        print(f"   Removed {original_size - len(filtered_df):,} total triples after pattern matching")
        
        if aggressive:
            # Remove low-weight edges
            filtered_df = filtered_df[filtered_df['edge_weight'] >= 0.5]
            
            # Remove single-character concepts except 'a' and 'i'
            single_char_mask = ((filtered_df['start_concept'].str.len() == 1) | 
                               (filtered_df['end_concept'].str.len() == 1))
            valid_single = ['a', 'i', 'A', 'I']
            valid_mask = (filtered_df['start_concept'].isin(valid_single) | 
                         filtered_df['end_concept'].isin(valid_single))
            filtered_df = filtered_df[~(single_char_mask & ~valid_mask)]
            
            # Remove very short concepts with numbers
            short_with_numbers = (
                ((filtered_df['start_concept'].str.len() <= 3) & 
                 filtered_df['start_concept'].str.contains(r'\d', regex=True)) |
                ((filtered_df['end_concept'].str.len() <= 3) & 
                 filtered_df['end_concept'].str.contains(r'\d', regex=True))
            )
            filtered_df = filtered_df[~short_with_numbers]
        
        final_size = len(filtered_df)
        print(f"\n✅ Filtering complete!")
        print(f"   Original size: {original_size:,}")
        print(f"   Final size: {final_size:,}")
        print(f"   Reduction: {(1 - final_size/original_size)*100:.1f}%")
        
        return filtered_df
    
    def print_analysis_report(self, results):
        """Print a comprehensive analysis report"""
        print("\n" + "="*60)
        print("📋 JUNK DETECTION REPORT")
        print("="*60)
        
        print(f"\n🚨 TOP JUNK INDICATORS:")
        print(f"   • Single char concepts: {results['length_analysis']['length_distribution']['1_char']:,}")
        print(f"   • Pure number concepts: {results['character_analysis']['only_numbers_count']:,}")
        print(f"   • Code-like concepts: {results['character_analysis']['looks_like_code_count']:,}")
        print(f"   • Invalid semantic patterns: ~{results['semantic_analysis']['estimated_invalid_percentage']:.1f}%")
        
        print(f"\n📊 FILTERING RECOMMENDATIONS:")
        print(f"   • Remove {len(self.recommendations['remove_concepts']):,} specific junk concepts")
        print(f"   • Apply {len(self.recommendations['remove_patterns'])} regex filters")
        print(f"   • Estimated reduction: {self.recommendations['estimated_reduction']:,} triples")
        
        print(f"\n💡 SUGGESTED NEXT STEPS:")
        print(f"   1. Run: cleaned_df = detector.apply_smart_filter()")
        print(f"   2. For aggressive cleaning: cleaned_df = detector.apply_smart_filter(aggressive=True)")
        print(f"   3. Test on small sample first: test_df = cleaned_df.head(50000)")
    
    def visualize_analysis(self, results=None):
        """Create visualization of the analysis results"""
        if results is None:
            results = self.results if hasattr(self, 'results') else self.analyze_all(verbose=False)
            
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        
        # 1. Concept length distribution
        lengths = pd.concat([self.df['start_concept'].str.len(), 
                           self.df['end_concept'].str.len()])
        axes[0,0].hist(lengths[lengths <= 20], bins=20, edgecolor='black')
        axes[0,0].set_title('Concept Length Distribution (≤20 chars)')
        axes[0,0].set_xlabel('Length')
        axes[0,0].set_ylabel('Count')
        
        # 2. Edge weight distribution
        self.df['edge_weight'].value_counts().sort_index().plot(kind='bar', ax=axes[0,1])
        axes[0,1].set_title('Edge Weight Distribution')
        axes[0,1].set_xlabel('Weight')
        axes[0,1].set_ylabel('Count')
        
        # 3. Top 20 relations
        self.df['relation_type'].value_counts().head(20).plot(kind='barh', ax=axes[1,0])
        axes[1,0].set_title('Top 20 Relation Types')
        axes[1,0].set_xlabel('Count')
        
        # 4. Junk categories pie chart
        if hasattr(self, 'recommendations'):
            junk_categories = {
                'Single chars': results['length_analysis']['length_distribution']['1_char'],
                'Pure numbers': results['character_analysis']['only_numbers_count'],
                'Code-like': results['character_analysis']['looks_like_code_count'],
                'Valid': len(self.df) - self.recommendations['estimated_reduction']
            }
            axes[1,1].pie(junk_categories.values(), labels=junk_categories.keys(), autopct='%1.1f%%')
            axes[1,1].set_title('Data Quality Breakdown')
        else:
            axes[1,1].text(0.5, 0.5, 'Run analyze_all() first', ha='center', va='center')
            axes[1,1].set_title('Data Quality Breakdown')
        
        plt.tight_layout()
        plt.show()


# Example usage:
def analyze_and_clean_conceptnet(df, visualize=True):
    """
    One-stop function to analyze and clean ConceptNet data
    """
    # Initialize detector
    detector = ConceptNetJunkDetector(df)
    
    # Run comprehensive analysis
    results = detector.analyze_all()
    
    # Visualize if requested
    if visualize:
        detector.visualize_analysis(results)
    
    # Apply filtering
    print("\n🤖 Applying recommended filters...")
    cleaned_df = detector.apply_smart_filter(aggressive=False)
    
    # Show before/after stats
    print(f"\n📊 CLEANING SUMMARY:")
    print(f"   Original unique concepts: {pd.concat([df['start_concept'], df['end_concept']]).nunique():,}")
    print(f"   Cleaned unique concepts: {pd.concat([cleaned_df['start_concept'], cleaned_df['end_concept']]).nunique():,}")
    print(f"   Original relations: {df['relation_type'].nunique()}")
    print(f"   Cleaned relations: {cleaned_df['relation_type'].nunique()}")
    
    return cleaned_df, detector

In [11]:
import os
EN_PATH = '../Data/Input/conceptnet_en_full.csv'
your_conceptnet_df = pd.read_parquet(
        os.path.join(os.path.dirname(EN_PATH), 'conceptnet_en_full_cleaned.parquet.gzip')
    )

In [12]:
# Quick analysis and cleaning
cleaned_df, detector = analyze_and_clean_conceptnet(your_conceptnet_df)

# Or for more control:
detector = ConceptNetJunkDetector(your_conceptnet_df, sample_size=100000)
results = detector.analyze_all()

# Review the recommendations
print(detector.recommendations)

# Apply conservative cleaning
cleaned_df = detector.apply_smart_filter(aggressive=False)

# Or aggressive cleaning
cleaned_df = detector.apply_smart_filter(aggressive=True)

# Test your training loop on a small cleaned sample first
test_sample = cleaned_df.head(50000)

🔍 Initializing Junk Detector with 1,655,522 triples
   Working with sample size: 100,000

🚀 RUNNING COMPREHENSIVE JUNK ANALYSIS

📏 Analyzing concept lengths...
   Found 18 suspicious single-char concepts
   Examples: ['n', 'v', 'r', 'o', 't', 's', 'y', 'c', 'e', 'u']

🔤 Analyzing character patterns...


Scanning concepts: 100%|██████████| 99821/99821 [00:00<00:00, 641135.79it/s]


   Found 43 pure number concepts
   Found 0 code-like concepts

📊 Analyzing concept frequency...
   Total unique concepts: 754,380
   Concepts appearing only once: 426,993
   Top concept: 'n' appears 560,635 times

🔗 Analyzing relation quality...
   Found 0 suspicious relation types
   Total relation types: 47

⚖️ Analyzing edge weights...
   Unique weight values: 4912
   Percentage with weight=1.0: 79.0%

🧠 Analyzing semantic validity...


Checking semantics: 100%|██████████| 10000/10000 [00:00<00:00, 56991.54it/s]


   Estimated invalid triples: 2.5%

🕸️ Analyzing connectivity patterns...


Analyzing connectivity:   2%|▏         | 182/10000 [00:22<20:27,  8.00it/s]


KeyboardInterrupt: 