In [None]:
import copy

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load datasets
labeled_data = pd.read_csv("sampled_dataset.csv")  # Small labeled dataset
unlabeled_data = pd.read_csv("remaining_dataset.csv")  # Large unlabelled dataset

# Feature extraction: TF-IDF over the free-text column.
# TfidfVectorizer raises a ValueError on NaN documents, so missing text is
# replaced by an empty string first (an empty document simply produces an
# all-zero TF-IDF row). Rows that already hold strings are unaffected.
text_column = "crimeaditionalinfo"
vectorizer = TfidfVectorizer(max_features=5000)
X_labeled = vectorizer.fit_transform(
    labeled_data[text_column].fillna("").astype(str)
).toarray()
X_unlabeled = vectorizer.transform(
    unlabeled_data[text_column].fillna("").astype(str)
).toarray()

# Normalize features (the scaler is fitted on the labeled set only and then
# applied unchanged to the unlabeled set)
scaler = StandardScaler()
X_labeled = scaler.fit_transform(X_labeled)
X_unlabeled = scaler.transform(X_unlabeled)

# Encode string labels to integers (LabelEncoder is imported with the other
# sklearn imports at the top of the cell)
label_encoder = LabelEncoder()
y_labeled = label_encoder.fit_transform(labeled_data["final_subcategory"].values)

# Self-training configuration
confidence_threshold = 0.85  # Starting confidence required to accept a pseudo-label
min_confidence_threshold = 0.75  # Passed to adjust_confidence_threshold as its min_threshold
distribution_similarity_threshold = 0.2  # Minimum overlap between pseudo-label and labeled class distributions
cluster_purity_threshold = 0.75  # Minimum average KMeans cluster purity to accept a batch
max_iterations = 50  # Upper bound on self-training iterations
validation_size = 0.15  # Fraction of the labeled data held out for validation
n_clusters = len(np.unique(y_labeled))  # One cluster per distinct label

def tune_model_parameters(val_score, current_params):
    """Return a copy of ``current_params`` adjusted for the given validation score.

    Scores below 0.3 take a large step, scores in [0.3, 0.5) take a smaller
    step, and any other score returns an unmodified copy. A "step" grows
    ``n_estimators`` (capped at 1000) and ``max_depth`` (capped at 30) and
    shrinks ``min_samples_split`` (floor 2) and ``min_samples_leaf`` (floor 1).
    The input dictionary is never mutated.
    """
    tuned = current_params.copy()

    # Pick the step sizes for this score band; no band means no change.
    if val_score < 0.3:
        grow_trees, grow_depth, shrink_split, shrink_leaf = 200, 4, 4, 2
    elif val_score < 0.5:
        grow_trees, grow_depth, shrink_split, shrink_leaf = 100, 2, 2, 1
    else:
        return tuned

    # Apply the steps, clamped to the allowed ranges.
    tuned['n_estimators'] = min(current_params['n_estimators'] + grow_trees, 1000)
    tuned['max_depth'] = min(current_params['max_depth'] + grow_depth, 30)
    tuned['min_samples_split'] = max(current_params['min_samples_split'] - shrink_split, 2)
    tuned['min_samples_leaf'] = max(current_params['min_samples_leaf'] - shrink_leaf, 1)

    return tuned

# Build a random forest from a hyper-parameter dictionary
def create_classifier(params):
    """Return an unfitted RandomForestClassifier configured from ``params``.

    ``params`` must provide 'n_estimators', 'max_depth', 'min_samples_split'
    and 'min_samples_leaf'; a missing key raises KeyError. All remaining
    settings are fixed: sqrt feature sampling, bootstrapping, seed 42, all
    CPU cores, and 'balanced_subsample' class weights.
    """
    tunable = {
        key: params[key]
        for key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    }
    fixed = {
        'max_features': 'sqrt',
        'bootstrap': True,
        'random_state': 42,
        'n_jobs': -1,
        'class_weight': 'balanced_subsample',
    }
    return RandomForestClassifier(**tunable, **fixed)

# Iteration-dependent confidence threshold
def adjust_confidence_threshold(iteration, base_threshold=0.85, min_threshold=0.75):
    """Return the confidence threshold to use at a given iteration.

    The threshold starts at ``base_threshold`` and decays by 0.02 per
    iteration for the first ten iterations, then by 0.03 per iteration on top
    of a 0.2 offset. It never drops below ``0.9 * min_threshold``, i.e. it is
    allowed to go slightly under ``min_threshold`` itself.
    """
    if iteration > 10:
        # Past iteration 10 the decay continues from 0.2 at a steeper rate.
        decay = 0.2 + (iteration - 10) * 0.03
    else:
        decay = iteration * 0.02

    decayed = base_threshold - decay
    floor = min_threshold * 0.9
    return max(decayed, floor)

# Starting hyper-parameters for the random forest
current_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
)

# Classifier built from the starting hyper-parameters
clf = create_classifier(current_params)

# Relative class frequencies of the labeled set (reference distribution)
initial_class_distribution = np.bincount(y_labeled) / len(y_labeled)

# Bookkeeping used by the training loop
best_val_score, best_model = 0, None
iteration_metrics = []
patience = 5  # Number of iterations to wait before early stopping
consecutive_no_improvement = 0

# Stratified hold-out split of the labeled data
X_train, X_val, y_train, y_val = train_test_split(
    X_labeled,
    y_labeled,
    test_size=validation_size,
    stratify=y_labeled,
    random_state=42,
)

# Working copies of the unlabeled pool; the loop shrinks these
current_X_unlabeled = X_unlabeled.copy()
current_unlabeled_data = unlabeled_data.copy()

# Summary banner (the last entry ends in a newline to leave a blank line)
print("\n".join([
    "Starting Semi-Supervised Learning Process",
    f"Initial labeled samples: {len(X_train)}",
    f"Validation samples: {len(X_val)}",
    f"Unlabeled samples: {len(X_unlabeled)}",
    f"Number of unique categories: {n_clusters}\n",
]))

# Self-training loop. Each iteration fits the classifier on the current
# training set, pseudo-labels the remaining pool, keeps only confident
# predictions, validates that batch, and then grows the training set with it.
for iteration in range(max_iterations):
    # Early-stopping check. It sits at the top of the loop so that it is also
    # reached after iterations that end with `continue`.
    if consecutive_no_improvement >= patience:
        print(f"\nStopping early due to {patience} iterations without improvement")
        break

    # Update confidence threshold dynamically
    current_confidence_threshold = adjust_confidence_threshold(
        iteration, confidence_threshold, min_confidence_threshold
    )

    print(f"\nIteration {iteration+1}: Pseudo-Labeling")
    print(f"Current confidence threshold: {current_confidence_threshold:.3f}")
    print(f"Remaining unlabeled samples: {len(current_X_unlabeled)}")

    if len(current_X_unlabeled) == 0:
        print("All data has been labeled. Stopping the process.")
        break

    # Train model on current labeled data
    clf.fit(X_train, y_train)

    # Evaluate on validation set
    val_score = clf.score(X_val, y_val)
    print(f"Current training set size: {len(X_train)}")
    print(f"Validation accuracy: {val_score:.4f}")

    # Track the best model after every fit, not only after parameter tuning.
    # A deep copy is stored because `clf` is refit in place on later
    # iterations, which would silently change an aliased "best" model.
    if val_score > best_val_score:
        best_val_score = val_score
        best_model = copy.deepcopy(clf)
        print("New best model saved!")

    # Predict on the current unlabeled pool. The forest's predicted class is
    # the class with the highest mean probability, so the labels are derived
    # from the probabilities instead of running a second pass with predict().
    pseudo_probs = clf.predict_proba(current_X_unlabeled)
    confidence_scores = np.max(pseudo_probs, axis=1)
    pseudo_labels = clf.classes_[np.argmax(pseudo_probs, axis=1)]

    # Keep only predictions above the current confidence threshold
    high_confidence_mask = confidence_scores > current_confidence_threshold

    X_new = current_X_unlabeled[high_confidence_mask]
    y_new = pseudo_labels[high_confidence_mask]

    print(f"Found {np.sum(high_confidence_mask)} high-confidence samples")

    # Check if we need to tune parameters
    should_tune = (val_score < 0.5 or len(X_new) == 0)

    if should_tune:
        new_params = tune_model_parameters(val_score, current_params)

        # Only create new classifier if parameters actually changed
        if new_params != current_params:
            print("Adjusting model parameters:")
            for param in new_params:
                if new_params[param] != current_params[param]:
                    print(f"{param}: {current_params[param]} -> {new_params[param]}")

            current_params = new_params
            clf = create_classifier(current_params)

            # Retrain with new parameters
            clf.fit(X_train, y_train)

            # Evaluate new model
            new_val_score = clf.score(X_val, y_val)
            print(f"New validation accuracy: {new_val_score:.4f}")

            if new_val_score > val_score:
                print("Improved performance with new parameters!")
                val_score = new_val_score
                if val_score > best_val_score:
                    best_val_score = val_score
                    best_model = copy.deepcopy(clf)
                    print("New best model saved!")
            else:
                print("No improvement with parameter tuning")
                consecutive_no_improvement += 1

            # The pseudo-labels above came from the previous classifier, so
            # they are discarded and the next iteration starts over.
            continue

    # Distribution similarity check: overlap (sum of element-wise minima)
    # between the labeled class distribution and the pseudo-label distribution
    if np.sum(high_confidence_mask) > 0:
        new_distribution = np.bincount(y_new, minlength=len(initial_class_distribution))
        new_distribution = new_distribution / len(y_new)

        dist_similarity = np.sum(np.minimum(initial_class_distribution, new_distribution))

        print(f"Distribution similarity: {dist_similarity:.4f}")
        if dist_similarity < distribution_similarity_threshold:
            print("Skipping iteration due to significant class distribution shift")
            consecutive_no_improvement += 1
            continue

    if len(X_new) == 0:
        print("No high-confidence pseudo-labels found in this iteration.")
        consecutive_no_improvement += 1

        # If we're stuck, temporarily lower the confidence threshold. This
        # lives inside the empty-batch branch because that is the only place
        # where `len(X_new) == 0` can still be observed. `clf` has not changed
        # since the predictions above, so they are reused.
        if consecutive_no_improvement >= 3:
            temp_threshold = current_confidence_threshold * 0.8
            print(f"Temporarily lowering confidence threshold to {temp_threshold:.3f}")

            candidate_indices = np.flatnonzero(confidence_scores > temp_threshold)

            if len(candidate_indices) > 0:
                # Take at most 1% of the pool, choosing the most confident
                # predictions among those that pass the lowered threshold.
                max_to_add = max(int(len(current_X_unlabeled) * 0.01), 1)
                ranked = candidate_indices[np.argsort(confidence_scores[candidate_indices])]
                top_confidence_indices = ranked[-max_to_add:]

                fallback_mask = np.zeros(len(current_X_unlabeled), dtype=bool)
                fallback_mask[top_confidence_indices] = True

                X_new = current_X_unlabeled[fallback_mask]
                y_new = pseudo_labels[fallback_mask]
                print(f"Added {len(X_new)} samples with lowered threshold")

                X_train = np.vstack([X_train, X_new])
                y_train = np.concatenate([y_train, y_new])
                current_X_unlabeled = current_X_unlabeled[~fallback_mask]
                current_unlabeled_data = current_unlabeled_data[~fallback_mask]
                consecutive_no_improvement = 0

        continue

    # Clustering validation: cluster the candidate batch and require that the
    # pseudo-labels inside each cluster mostly agree
    if len(X_new) > n_clusters:
        validation_clusters = min(n_clusters, 20)
        kmeans = KMeans(n_clusters=validation_clusters, random_state=42, n_init=10)

        try:
            kmeans.fit(X_new)
            cluster_labels = kmeans.labels_

            # Purity of a cluster = share of its members with the majority label
            cluster_purity = []
            for cluster in range(validation_clusters):
                cluster_mask = cluster_labels == cluster
                if np.sum(cluster_mask) > 0:
                    labels_in_cluster = y_new[cluster_mask]
                    majority_label_count = np.max(np.bincount(labels_in_cluster))
                    purity = majority_label_count / np.sum(cluster_mask)
                    cluster_purity.append(purity)

            avg_purity = np.mean(cluster_purity)
            print(f"Average cluster purity: {avg_purity:.4f}")

            if avg_purity > cluster_purity_threshold:
                X_train = np.vstack([X_train, X_new])
                y_train = np.concatenate([y_train, y_new])
                print(f"Added {len(X_new)} validated pseudo-labeled samples")
                consecutive_no_improvement = 0  # Reset counter on successful addition
            else:
                print("Pseudo-labels rejected due to low cluster purity")
                consecutive_no_improvement += 1

        except Exception as e:
            print(f"Clustering validation failed: {e}")
            consecutive_no_improvement += 1
            continue

        # Store metrics for this iteration
        iteration_metrics.append({
            'iteration': iteration + 1,
            'validation_score': val_score,
            'pseudo_labeled_samples': len(X_new),
            'total_training_samples': len(X_train),
            'cluster_purity': avg_purity
        })

        # Update remaining unlabeled data. The batch is removed from the pool
        # whether it was accepted or rejected, so a rejected batch is not
        # proposed again.
        current_X_unlabeled = current_X_unlabeled[~high_confidence_mask]
        current_unlabeled_data = current_unlabeled_data[~high_confidence_mask]
        print(f"Remaining unlabeled samples after this iteration: {len(current_X_unlabeled)}")
    else:
        print(f"Not enough samples ({len(X_new)}) for meaningful clustering validation")
        # Small batches skip clustering and are accepted only when their mean
        # confidence is very high
        if np.mean(confidence_scores[high_confidence_mask]) > 0.98:
            X_train = np.vstack([X_train, X_new])
            y_train = np.concatenate([y_train, y_new])
            print(f"Added {len(X_new)} samples from small batch (very high confidence)")
            consecutive_no_improvement = 0  # Reset counter on successful addition
        else:
            consecutive_no_improvement += 1

        # Update remaining unlabeled data (accepted or not, as above)
        current_X_unlabeled = current_X_unlabeled[~high_confidence_mask]
        current_unlabeled_data = current_unlabeled_data[~high_confidence_mask]

# `best_model` is still None when no model was ever recorded as best. Fall
# back to the most recently trained classifier so that reporting and final
# labeling do not fail with an AttributeError on None, and report that
# classifier's own validation accuracy.
if best_model is None:
    print("\nNo best model was recorded; using the most recently trained classifier.")
    best_model = clf
    best_val_score = best_model.score(X_val, y_val)

print("\nTraining completed!")
print(f"Best validation accuracy: {best_val_score:.4f}")

# Print detailed metrics for best model
print("\nDetailed Classification Report for Best Model:")
y_val_pred = best_model.predict(X_val)
print(classification_report(y_val, y_val_pred))

# Use the best model for final predictions on the full unlabeled set
final_labels = best_model.predict(X_unlabeled)

# Convert numeric labels back to original categories
final_labels = label_encoder.inverse_transform(final_labels)

# Save the labeled dataset. This assignment replaces any existing
# "final_subcategory" column in `unlabeled_data` with the predicted labels.
unlabeled_data["final_subcategory"] = final_labels
unlabeled_data.to_csv("fully_labeled_data.csv", index=False)
print("\nFinal dataset saved!")

# Display iteration metrics (one row per iteration that reached the
# clustering validation step)
print("\nIteration Metrics:")
metrics_df = pd.DataFrame(iteration_metrics)
print(metrics_df)

Starting Semi-Supervised Learning Process
Initial labeled samples: 416
Validation samples: 74
Unlabeled samples: 103325
Number of unique categories: 49


Iteration 1: Pseudo-Labeling
Current confidence threshold: 0.850
Remaining unlabeled samples: 103325
Current training set size: 416
Validation accuracy: 0.2297
Found 0 high-confidence samples
Adjusting model parameters:
n_estimators: 200 -> 400
max_depth: 10 -> 14
min_samples_split: 10 -> 6
min_samples_leaf: 4 -> 2
New validation accuracy: 0.2838
Improved performance with new parameters!
New best model saved!

Iteration 2: Pseudo-Labeling
Current confidence threshold: 0.830
Remaining unlabeled samples: 103325
Current training set size: 416
Validation accuracy: 0.2838
Found 0 high-confidence samples
Adjusting model parameters:
n_estimators: 400 -> 600
max_depth: 14 -> 18
min_samples_split: 6 -> 2
min_samples_leaf: 2 -> 1
New validation accuracy: 0.2973
Improved performance with new parameters!
New best model saved!

Iteration 3: Pseudo

In [5]:
import pandas as pd

# Location of the source CSV — replace with your own file path
csv_file_path = "50_subcats_train_test.csv"

# Load the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Preview the first five rows
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo,final_category,final_subcategory,category_justification,subcategory_justification,confidence_score
0,0.0,Online Cyber Trafficking,Online Trafficking,SIR I HAVE GET SMS WITH PRE APPORVED LOAN IJU...,online_financial_fraud,fraud_callvishing,The content describes a fraudulent loan scheme...,The user was contacted via WhatsApp and pressu...,
1,1.0,Online Cyber Trafficking,Online Trafficking,this number frauder call me I had ordered on a...,online_financial_fraud,debitcredit_card_fraudsim_swap_fraud,The content describes a situation where the us...,The user describes a debit card being used fra...,
2,3.0,Online Cyber Trafficking,Online Trafficking,I have received a notification by chrome he sa...,online_financial_fraud,debitcredit_card_fraudsim_swap_fraud,The content describes a user losing money afte...,The user's account balance being deducted afte...,
3,6.0,Online Cyber Trafficking,Online Trafficking,The app is in playstore with name of the five ...,online_financial_fraud,attacks_or_incidents_affecting_digital_payment...,The content describes a scenario where the use...,The mention of investing money through an app ...,
4,14.0,Online Cyber Trafficking,Online Trafficking,MY TR ID SBIUPI ID mpMTiRBaQPTVUZXBAwlwhDqcsUu...,online_financial_fraud,upi_related_frauds,"The content mentions SBIUPI ID and OTP, indica...",The mention of UPI ID suggests the fraud is re...,


In [6]:
# Column holding the class labels
label_column = "final_subcategory"

# Lowercase the labels so that differently-cased variants share one group
df[label_column] = df[label_column].str.lower()

# Draw 10 rows from every label group, with a fixed seed
rows_by_label = df.groupby(label_column)
df_sampled = rows_by_label.sample(n=10, random_state=42)

# Show how many rows were drawn per label
sampled_label_counts = df_sampled[label_column].value_counts()
print(sampled_label_counts)

# (Optional) Persist the sampled rows to a new CSV
df_sampled.to_csv("sampled_dataset.csv", index=False)

final_subcategory
aadhar_enabled_payment_system_aeps_fraud                                        10
impersonating_email                                                             10
intimidating_email                                                              10
malicious_mobile_app_attacks                                                    10
malware_attack                                                                  10
online_financial_fraud                                                          10
online_gambling_betting                                                         10
online_job_fraud                                                                10
online_matrimonial_fraud                                                        10
online_trafficking                                                              10
other                                                                           10
password_attacks                                                     

In [7]:
# Rows of `df` that were not drawn into `df_sampled`, matched by index label
df_remaining = df.drop(index=df_sampled.index)
df_remaining.to_csv("remaining_dataset.csv", index=False)

In [8]:
# Row counts of the sampled and remaining splits
(len(df_sampled), len(df_remaining))

(490, 103325)