In [1]:
import json
import pandas as pd
import numpy as np
import logging
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Configure the root logger so INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)

# =======================
# Data Loading Functions
# =======================
def _read_jsonl(path, show_progress=False):
    """Parse a JSON Lines file into a list of objects, skipping blank lines."""
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising it with readlines().
        lines = tqdm(f) if show_progress else f
        # A blank line is not valid JSON and would make json.loads raise.
        return [json.loads(line) for line in lines if line.strip()]


def load_and_preprocess_data(
    classification_path=r'C:\Users\sarp2\Desktop\train-classification.csv',
    training_path=r'C:/Users/sarp2/Desktop/training-dataset.jsonl',
    test_usernames_path=r'C:\Users\sarp2\Desktop\test-classification-round3.dat',
    regression_test_path=r'C:\Users\sarp2\Desktop\test-regression-round3.jsonl',
):
    """
    Load the four input files used by the classification and regression tasks.

    All parameters are optional and default to the paths the script has always
    used, so calling the function without arguments behaves as before.

    Args:
        classification_path: header-less CSV with two columns, read as
            ``username`` and ``label``.
        training_path: JSON Lines file, one record per line.
        test_usernames_path: text file with one username per line.
        regression_test_path: JSON Lines file, one post per line.

    Returns:
        tuple: ``(classification_data, training_data, test_usernames,
        regression_test_data)`` where ``classification_data`` is a DataFrame,
        ``training_data`` and ``regression_test_data`` are lists of parsed
        JSON objects, and ``test_usernames`` is a list of stripped,
        lower-cased, non-empty usernames.

    Raises:
        Exception: any error raised while reading or parsing is logged and
        re-raised unchanged.
    """
    try:
        # Load training classification data
        logger.info("Loading classification data...")
        classification_data = pd.read_csv(
            classification_path,
            delimiter=',',
            header=None,
            names=['username', 'label']
        )

        # Load training dataset
        logger.info("Loading training dataset...")
        training_data = _read_jsonl(training_path, show_progress=True)

        # Load test usernames (blank lines would otherwise become '' entries)
        logger.info("Loading test usernames...")
        with open(test_usernames_path, 'r', encoding='utf-8') as f:
            test_usernames = [line.strip().lower() for line in f if line.strip()]

        # Load regression test data
        logger.info("Loading regression test data...")
        regression_test_data = _read_jsonl(regression_test_path)

        return classification_data, training_data, test_usernames, regression_test_data

    except Exception as e:
        logger.error("Error loading data: %s", e)
        raise

# =======================
# Classification Helpers
# =======================
def gather_text_for_user(record, max_captions=5):
    """
    Combine biography, category name, and up to `max_captions` post captions
    into a single text string for TF-IDF processing.

    Missing or null (``None``) fields contribute an empty string rather than
    the literal text "None".

    Args:
        record (dict): one user record; reads ``record['profile']`` (keys
            ``biography`` and ``category_name``) and ``record['posts']``
            (each post's ``caption``).
        max_captions (int): maximum number of leading posts whose captions
            are included; zero or negative values include none.

    Returns:
        str: ``biography + " " + category + " " + captions joined by spaces``,
        with leading/trailing whitespace stripped.
    """
    def _as_text(value):
        # JSON null arrives as None; str(None) would inject the token "None"
        # into every such user's text.
        return '' if value is None else str(value)

    # `or {}` / `or []` also cover keys that are present but null.
    profile = record.get('profile') or {}
    biography = _as_text(profile.get('biography'))
    category = _as_text(profile.get('category_name'))

    # Get up to max_captions from posts
    posts = record.get('posts') or []
    captions = [
        _as_text(post.get('caption'))
        for post in posts[:max(max_captions, 0)]
    ]

    # Concatenate everything
    full_text = biography + " " + category + " " + " ".join(captions)
    return full_text.strip()

def process_user_data_for_classification(training_data, classification_data, max_captions=5):
    """
    Build a DataFrame containing:
      - username
      - numeric features (follower_count, following_count, etc.)
      - text_data (biography + category_name + first N captions)
      - label (if available)

    Args:
        training_data (iterable of dict): user records; each is expected to
            hold a ``profile`` dict and is passed to `gather_text_for_user`.
        classification_data (DataFrame): must have ``username`` and ``label``
            columns; usernames are stripped and lower-cased before matching.
        max_captions (int): forwarded to `gather_text_for_user`.

    Returns:
        DataFrame: one row per record with a non-empty username. ``label`` is
        ``None`` for users absent from `classification_data`. The column set
        is fixed even when no rows are produced.
    """
    logger.info("Building classification DataFrame with text features...")

    # Create a lookup from username -> label. Normalise exactly like the
    # profile usernames below (strip + lower) so stray whitespace in the CSV
    # cannot prevent a match.
    classification_dict = dict(zip(
        classification_data['username'].str.strip().str.lower(),
        classification_data['label']
    ))

    columns = [
        'username',
        'follower_count',
        'following_count',
        'post_count',
        'is_private',
        'is_business_account',
        'is_verified',
        'text_data',
        'label',
    ]

    all_records = []
    for record in tqdm(training_data):
        # `or {}` / `or ''` / `or 0` treat a present-but-null JSON value the
        # same as a missing key.
        profile = record.get('profile') or {}
        username = str(profile.get('username') or '').strip().lower()
        if not username:
            continue

        all_records.append({
            'username': username,
            # Numeric features
            'follower_count': float(profile.get('follower_count') or 0),
            'following_count': float(profile.get('following_count') or 0),
            'post_count': float(profile.get('post_count') or 0),
            # Boolean flags; int(None) would raise, so null counts as 0.
            'is_private': int(profile.get('is_private') or 0),
            'is_business_account': int(profile.get('is_business_account') or 0),
            'is_verified': int(profile.get('is_verified') or 0),
            # Text features
            'text_data': gather_text_for_user(record, max_captions=max_captions),
            # Label (if exists)
            'label': classification_dict.get(username),
        })

    # Passing `columns` keeps the schema stable when `all_records` is empty.
    return pd.DataFrame(all_records, columns=columns)

def train_classification_model_with_text(df, random_state=42):
    """
    Train a classification model that uses both numeric features and TF-IDF text features.

    Args:
        df (DataFrame): must contain ``username``, ``label``, ``text_data``
            and the six numeric/boolean feature columns listed below. Rows
            whose ``label`` is missing are ignored.
        random_state (int): seed passed to the RandomForestClassifier.

    Returns:
        pipeline (Pipeline): trained pipeline
        label_encoder (LabelEncoder): to inverse transform predicted labels

    Raises:
        ValueError: if `df` contains no labelled rows.
    """
    logger.info("Training classification model with text features...")

    # Keep only rows that have a label
    df_train = df.dropna(subset=['label']).copy()
    if df_train.empty:
        # Fail early with a clear message instead of an opaque error from
        # deep inside scikit-learn.
        raise ValueError("No labelled rows available to train the classification model.")

    # Separate features / labels
    y = df_train['label'].values
    X = df_train.drop(columns=['username', 'label'])

    # Encode labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    numeric_features = [
        'follower_count',
        'following_count',
        'post_count',
        'is_private',
        'is_business_account',
        'is_verified'
    ]

    # ColumnTransformer: TF-IDF for text_data, StandardScaler for numeric.
    # 'text_data' is passed as a plain string (not a list) so the vectorizer
    # receives a 1-D column of documents.
    preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english'), 'text_data'),
            ('scaler', StandardScaler(), numeric_features)
        ],
        remainder='drop'
    )

    # Build pipeline: preprocessor -> RandomForestClassifier
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=200, random_state=random_state))
    ])

    # Fit pipeline
    pipeline.fit(X, y_encoded)
    logger.info("Model training complete.")

    return pipeline, le

def predict_user_classes(pipeline, df, label_encoder):
    """
    Run the trained pipeline over `df` and map each username to its label.

    Args:
        pipeline: fitted estimator exposing ``predict``.
        df (DataFrame): must contain a ``username`` column; ``username`` and
            ``label`` (when present) are removed before prediction.
        label_encoder: object exposing ``inverse_transform`` to turn encoded
            predictions back into label values.

    Returns:
        dict: {username: predicted_label}. If a username occurs more than
        once, the prediction for its last row is kept.
    """
    # Identifier and target columns are not model inputs.
    features = df.drop(columns=['username', 'label'], errors='ignore')

    encoded = pipeline.predict(features)
    decoded = label_encoder.inverse_transform(encoded)

    return dict(zip(df['username'], decoded))

# =======================
# Regression Helpers
# =======================
def extract_posts_data(training_data):
    """
    Collect every post that has a usable ``like_count`` across all users.

    A user record contributes only if it has a ``posts`` entry that is a
    list; a post is kept only if it has a ``like_count`` key whose value is
    not ``None``.

    Returns:
        list: the selected post objects, in input order.
    """
    logger.info("Extracting posts data for regression...")
    collected = []

    for user_record in tqdm(training_data):
        if 'posts' not in user_record:
            continue
        user_posts = user_record['posts']
        if not isinstance(user_posts, list):
            continue
        collected.extend(
            entry for entry in user_posts
            if 'like_count' in entry and entry['like_count'] is not None
        )

    return collected

def prepare_regression_features(posts, tfidf=None, is_training=True):
    """
    Turn a list of post dicts into a numeric feature DataFrame.

    Features are: ``comments_count``, the TF-IDF weights of the caption
    (columns named "0", "1", ...), and one-hot ``media_type_*`` columns.

    Note: the ``media_type_*`` columns are derived from the media types
    present in `posts`, so two calls on different data can yield different
    column sets. Callers that predict with a model fitted on another call's
    output should align the columns first.

    Args:
        posts (list of dict): posts; reads ``caption``, ``media_type``,
            ``comments_count`` and, when `is_training` is true,
            ``like_count``.
        tfidf (TfidfVectorizer | None): a fitted vectorizer to reuse. If
            ``None`` and `is_training` is true, a new one is fitted on the
            captions.
        is_training (bool): whether to also return the ``like_count`` targets.

    Returns:
        ``(features, labels, tfidf)`` when `is_training` is true, otherwise
        ``(features, tfidf)``.

    Raises:
        ValueError: if `tfidf` is ``None`` while `is_training` is false.
    """
    logger.info("Preparing regression features...")

    # Create DataFrame with basic features
    rows = []
    for post in posts:
        caption = post.get('caption', '')
        rows.append({
            # A null caption must become '' - str(None) would feed the token
            # "None" to TF-IDF.
            'caption': '' if caption is None else str(caption),
            'media_type': post.get('media_type', ''),
            # float(None) raises, so treat a null/missing count as 0.
            'comments_count': float(post.get('comments_count') or 0),
        })
    # Explicit columns keep the schema intact even for an empty `posts` list.
    features_df = pd.DataFrame(rows, columns=['caption', 'media_type', 'comments_count'])

    # Handle remaining missing values (null media_type, NaN counts)
    features_df['media_type'] = features_df['media_type'].fillna('unknown')
    features_df['comments_count'] = features_df['comments_count'].fillna(0)

    # Create TF-IDF features for captions
    if tfidf is None:
        if not is_training:
            raise ValueError(
                "A fitted TfidfVectorizer must be supplied when is_training=False."
            )
        tfidf = TfidfVectorizer(max_features=100, stop_words='english')
        tfidf_features = tfidf.fit_transform(features_df['caption'])
    else:
        tfidf_features = tfidf.transform(features_df['caption'])

    # One-hot encode media_type
    media_type_dummies = pd.get_dummies(features_df['media_type'], prefix='media_type')

    # Combine all features
    final_features = pd.concat([
        features_df[['comments_count']],
        pd.DataFrame(tfidf_features.toarray(), index=features_df.index),
        media_type_dummies
    ], axis=1)

    # Convert all column names to strings (the TF-IDF frame has integer
    # column labels)
    final_features.columns = final_features.columns.astype(str)

    if is_training:
        # Return features, labels, and the fitted tfidf
        return final_features, [float(post['like_count']) for post in posts], tfidf
    else:
        # Return only features, plus the tfidf object
        return final_features, tfidf

# =======================
# Main
# =======================
def main():
    """
    Run the full workflow: load data, train and apply the account classifier,
    then train and apply the like-count regressor.

    Side effects:
        Writes ``classification_output.json`` ({username: label}) when at
        least one test username is found in the training data, and
        ``regression_output.json`` ({post id: predicted like count}) when at
        least one training post has a like count.

    Raises:
        Exception: any failure is logged and re-raised.
    """
    try:
        # Load data
        classification_data, training_data, test_usernames, regression_test_data = load_and_preprocess_data()

        # === Classification Part ===
        logger.info("Starting classification with text incorporation...")

        # 1. Build DataFrame for classification with text
        classification_df = process_user_data_for_classification(
            training_data=training_data,
            classification_data=classification_data,
            max_captions=5
        )

        # 2. Train the classification model
        pipeline, label_encoder = train_classification_model_with_text(classification_df)

        # 3. Predict on test usernames
        test_df = classification_df[classification_df['username'].isin(test_usernames)].copy()
        if not test_df.empty:
            # Surface requested users that have no profile row; otherwise
            # they would be missing from the output file without any notice.
            requested = {name for name in test_usernames if name}
            missing = requested - set(test_df['username'])
            if missing:
                logger.warning(
                    "%d test usernames were not found in classification_df and have no prediction.",
                    len(missing),
                )

            classification_preds = predict_user_classes(pipeline, test_df, label_encoder)
            with open('classification_output.json', 'w', encoding='utf-8') as f:
                json.dump(classification_preds, f, indent=4, ensure_ascii=False)
            logger.info("Classification predictions saved to classification_output.json")
        else:
            logger.warning("No matching test usernames found in classification_df.")

        # === Regression Part ===
        logger.info("Starting regression...")
        training_posts = extract_posts_data(training_data)

        if training_posts:
            # Prepare features and train regression model
            X_train, y_train, tfidf = prepare_regression_features(training_posts, is_training=True)
            reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
            reg_model.fit(X_train, y_train)

            # Prepare test features
            X_test, _ = prepare_regression_features(regression_test_data, tfidf=tfidf, is_training=False)
            # The one-hot media_type columns depend on the values present in
            # each dataset; align the test frame to the training columns so
            # the model sees the same features in the same order.
            X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

            # Generate predictions in one batch rather than one predict()
            # call per row.
            raw_predictions = reg_model.predict(X_test)
            regression_predictions = {
                # Like counts are non-negative integers.
                post['id']: int(max(0, round(pred)))
                for post, pred in zip(regression_test_data, raw_predictions)
            }

            with open('regression_output.json', 'w', encoding='utf-8') as f:
                json.dump(regression_predictions, f, indent=4, ensure_ascii=False)

            logger.info("Regression predictions saved to regression_output.json")
            logger.info("Completed both classification and regression predictions!")
        else:
            logger.warning("No valid posts found for regression training.")

    except Exception as e:
        logger.error("An error occurred: %s", e)
        raise

# Run the workflow only when this file is executed as the main module.
if __name__ == "__main__":
    main()


INFO:__main__:Loading classification data...
INFO:__main__:Loading training dataset...
100%|████████████████████████████████████████████████████████████████████████████| 5415/5415 [00:01<00:00, 5298.44it/s]
INFO:__main__:Loading test usernames...
INFO:__main__:Loading regression test data...
INFO:__main__:Starting classification with text incorporation...
INFO:__main__:Building classification DataFrame with text features...
100%|██████████████████████████████████████████████████████████████████████████| 5415/5415 [00:00<00:00, 130483.89it/s]
INFO:__main__:Training classification model with text features...
INFO:__main__:Model training complete.
INFO:__main__:Classification predictions saved to classification_output.json
INFO:__main__:Starting regression...
INFO:__main__:Extracting posts data for regression...
100%|██████████████████████████████████████████████████████████████████████████| 5415/5415 [00:00<00:00, 277732.81it/s]
INFO:__main__:Preparing regression features...
INFO:__main_