In [2]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
import logging
from tqdm import tqdm

# Configure the root logger to emit INFO-level (and higher) records, then
# create the module-level logger that the functions in this file write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_and_preprocess_data(
    classification_path=r'C:\Users\sarp2\Desktop\train-classification.csv',
    training_path=r'C:/Users/sarp2/Desktop/training-dataset.jsonl',
    test_usernames_path=r'C:\Users\sarp2\Desktop\test-classification-round2.dat',
    regression_test_path=r'C:\Users\sarp2\Desktop\test-regression-round2.jsonl',
):
    """Load the four input files used by the pipeline.

    Every path has a default equal to the location the script was originally
    written against, so calling the function without arguments behaves as
    before; pass explicit paths to run against other files.

    Args:
        classification_path: Header-less CSV read as ``username,label``.
        training_path: JSON Lines file, one JSON document per line.
        test_usernames_path: Text file with one username per line.
        regression_test_path: JSON Lines file, one JSON document per line.

    Returns:
        A 4-tuple ``(classification_data, training_data, test_usernames,
        regression_test_data)`` where ``classification_data`` is a DataFrame
        with columns ``username`` and ``label``, ``training_data`` and
        ``regression_test_data`` are lists of decoded JSON documents, and
        ``test_usernames`` is a list of stripped, lower-cased usernames.

    Raises:
        Exception: Any error raised while reading or parsing is logged and
            re-raised unchanged.
    """
    try:
        # Load training classification data
        logger.info("Loading classification data...")
        classification_data = pd.read_csv(
            classification_path,
            delimiter=',',
            header=None,
            names=['username', 'label'],
        )

        # Load training dataset
        logger.info("Loading training dataset...")
        with open(training_path, 'r', encoding='utf-8') as f:
            # Blank lines (e.g. a trailing newline) are not valid JSON, so
            # drop them before parsing. The lines are materialised first so
            # tqdm can still display a total.
            training_lines = [line for line in f if line.strip()]
        training_data = [json.loads(line) for line in tqdm(training_lines)]

        # Load test usernames
        logger.info("Loading test usernames...")
        with open(test_usernames_path, 'r', encoding='utf-8') as f:
            normalized = (line.strip().lower() for line in f)
            # Skip empty entries so a blank line never becomes a '' username.
            test_usernames = [name for name in normalized if name]

        # Load regression test data
        logger.info("Loading regression test data...")
        with open(regression_test_path, 'r', encoding='utf-8') as f:
            regression_test_data = [json.loads(line) for line in f if line.strip()]

        return classification_data, training_data, test_usernames, regression_test_data

    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise

def _profile_value(profile, key, default):
    """Return ``profile[key]``, or *default* when the key is absent or null."""
    value = profile.get(key)
    return default if value is None else value


def process_profile_data(training_data):
    """Flatten the ``profile`` object of each record into one DataFrame row.

    Records without a ``profile`` key, or whose ``profile`` is not a dict
    (for example an explicit JSON ``null``), are skipped. Fields that are
    missing *or* null fall back to the same defaults: ``''`` for the
    username, ``0`` for the counts and ``False`` for the flags.

    Args:
        training_data: Iterable of decoded JSON records.

    Returns:
        A DataFrame with columns ``username`` (stripped, lower-cased),
        ``follower_count``, ``following_count``, ``post_count``,
        ``is_private``, ``is_business_account`` and ``is_verified``. The
        columns are present even when no record yields a row.
    """
    logger.info("Processing profile data...")
    columns = [
        'username',
        'follower_count',
        'following_count',
        'post_count',
        'is_private',
        'is_business_account',
        'is_verified',
    ]
    flattened_data = []

    for record in tqdm(training_data):
        if 'profile' not in record:
            continue
        profile = record['profile']
        if not isinstance(profile, dict):
            # A null/malformed profile carries no usable features.
            continue
        flattened_data.append({
            'username': _profile_value(profile, 'username', '').strip().lower(),
            'follower_count': _profile_value(profile, 'follower_count', 0),
            'following_count': _profile_value(profile, 'following_count', 0),
            'post_count': _profile_value(profile, 'post_count', 0),
            'is_private': _profile_value(profile, 'is_private', False),
            'is_business_account': _profile_value(profile, 'is_business_account', False),
            'is_verified': _profile_value(profile, 'is_verified', False),
        })

    # Passing the column list keeps the schema stable for empty input, so a
    # later merge on 'username' does not fail with a KeyError.
    return pd.DataFrame(flattened_data, columns=columns)

def extract_posts_data(training_data):
    """Gather every post that has a non-null ``like_count``.

    Only records whose ``posts`` entry is a list are inspected; within those,
    a post is kept when it contains the ``like_count`` key and the stored
    value is not ``None``.

    Args:
        training_data: Iterable of decoded JSON records.

    Returns:
        A flat list of the kept post objects, in input order.
    """
    logger.info("Extracting posts data...")
    labelled_posts = []

    for entry in tqdm(training_data):
        if 'posts' not in entry:
            continue
        candidate_posts = entry['posts']
        if not isinstance(candidate_posts, list):
            continue
        labelled_posts.extend(
            item
            for item in candidate_posts
            if 'like_count' in item and item['like_count'] is not None
        )

    return labelled_posts

def _post_feature_row(post):
    """Extract the raw regression inputs of one post, treating null as missing."""
    caption = post.get('caption')
    media_type = post.get('media_type', '')
    comments_count = post.get('comments_count')
    return {
        # str(None) would produce the literal caption 'None'; use '' instead.
        'caption': '' if caption is None else str(caption),
        'media_type': 'unknown' if media_type is None else media_type,
        # float(None) raises, so a null count is treated as zero.
        'comments_count': 0.0 if comments_count is None else float(comments_count),
    }


def prepare_regression_features(posts, tfidf=None, is_training=True):
    """Build the feature matrix used by the like-count regressor.

    The features are the post's ``comments_count``, TF-IDF weights of its
    ``caption`` and one-hot indicators of its ``media_type``.

    When this call fits a new vectorizer, the one-hot column names it
    produced are stored on the vectorizer as ``media_type_columns_``. Later
    calls that receive that vectorizer reindex their one-hot columns to the
    stored list, so the matrix always has the columns the model was trained
    on: categories unseen at training time are dropped and categories absent
    from the current posts are filled with zeros.

    Args:
        posts: Sequence of post dicts. ``like_count`` must be present on
            every post when ``is_training`` is true.
        tfidf: A fitted ``TfidfVectorizer`` to reuse, or ``None`` to fit a
            new one (only allowed when ``is_training`` is true).
        is_training: Whether to also return the ``like_count`` targets.

    Returns:
        ``(features, targets, tfidf)`` when ``is_training`` is true,
        otherwise ``(features, tfidf)``. ``features`` is a DataFrame with
        string column names and ``targets`` is a list of floats.

    Raises:
        ValueError: If ``tfidf`` is ``None`` while ``is_training`` is false.
    """
    logger.info("Preparing regression features...")

    if tfidf is None and not is_training:
        raise ValueError("A fitted TfidfVectorizer is required when is_training=False.")

    # Create DataFrame with basic features; explicit columns keep the schema
    # intact even when `posts` is empty.
    features_df = pd.DataFrame(
        [_post_feature_row(post) for post in posts],
        columns=['caption', 'media_type', 'comments_count'],
    )

    # Handle remaining missing values (e.g. NaN read from the JSON input)
    features_df['media_type'] = features_df['media_type'].fillna('unknown')
    features_df['comments_count'] = features_df['comments_count'].fillna(0)

    # Create TF-IDF features
    fitted_here = tfidf is None
    if fitted_here:
        tfidf = TfidfVectorizer(max_features=100, stop_words='english')
        tfidf_features = tfidf.fit_transform(features_df['caption'])
    else:
        tfidf_features = tfidf.transform(features_df['caption'])

    # One-hot encode media_type
    media_type_dummies = pd.get_dummies(
        features_df['media_type'], prefix='media_type'
    ).astype(float)
    if fitted_here:
        # Remember the training-time columns for later calls.
        tfidf.media_type_columns_ = list(media_type_dummies.columns)
    else:
        known_columns = getattr(tfidf, 'media_type_columns_', None)
        if known_columns is not None:
            media_type_dummies = media_type_dummies.reindex(
                columns=known_columns, fill_value=0.0
            )

    # Combine all features
    final_features = pd.concat([
        features_df[['comments_count']],
        pd.DataFrame(tfidf_features.toarray(), index=features_df.index),
        media_type_dummies,
    ], axis=1)

    # Convert all column names to strings
    final_features.columns = final_features.columns.astype(str)

    if is_training:
        return final_features, [float(post['like_count']) for post in posts], tfidf
    return final_features, tfidf

def train_classification_model(training_df, classification_data):
    """Train a random-forest classifier on profile features.

    Usernames in ``classification_data`` are stripped and lower-cased on a
    copy (the caller's DataFrame is left untouched) and inner-joined with
    ``training_df`` on ``username``.

    Args:
        training_df: DataFrame with a ``username`` column plus the feature
            columns ``follower_count``, ``following_count``, ``post_count``,
            ``is_private``, ``is_business_account`` and ``is_verified``.
        classification_data: DataFrame with ``username`` and ``label``
            columns.

    Returns:
        ``(clf_model, label_encoder, feature_columns)``. When the join
        produces no rows a warning is logged and ``(None, None, None)`` is
        returned, so callers can always unpack three values.
    """
    logger.info("Training classification model...")

    # Normalize username fields without mutating the caller's DataFrame
    normalized_labels = classification_data.assign(
        username=classification_data['username'].str.strip().str.lower()
    )

    # Merge classification data with training data
    merged_data = pd.merge(
        training_df,
        normalized_labels,
        on='username',
        how='inner'
    )

    if merged_data.empty:
        logger.warning("No matching usernames found between training data and classification data.")
        # Three values, matching the success path's arity.
        return None, None, None

    # Extract features and labels
    feature_columns = ['follower_count', 'following_count', 'post_count',
                      'is_private', 'is_business_account', 'is_verified']
    classification_features = merged_data[feature_columns]
    classification_labels = merged_data['label']

    # Label encoding
    le = LabelEncoder()
    encoded_labels = le.fit_transform(classification_labels)

    # Train model
    clf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    clf_model.fit(classification_features, encoded_labels)

    return clf_model, le, feature_columns

def main():
    """Run the classification and regression pipelines end to end.

    Classification: trains on the profiles that have a label, predicts a
    label for every test username that has a profile in the training data
    and writes ``{username: label}`` to ``classification_output.json``.
    Test usernames without a profile cannot be predicted; their count is
    logged as a warning.

    Regression: trains on every post with a like count, predicts a
    non-negative integer like count for each regression test post and
    writes ``{post_id: prediction}`` to ``regression_output.json``.

    Both models predict in a single batch call rather than once per row.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        # Load all data
        classification_data, training_data, test_usernames, regression_test_data = load_and_preprocess_data()

        # === Classification Part ===
        logger.info("Starting classification...")
        training_df = process_profile_data(training_data)
        clf_model, le, feature_columns = train_classification_model(training_df, classification_data)

        if clf_model is not None:
            test_df = training_df[training_df['username'].isin(test_usernames)]

            missing_usernames = set(test_usernames) - set(test_df['username'])
            if missing_usernames:
                logger.warning(
                    "%d test usernames have no profile in the training data and were skipped.",
                    len(missing_usernames),
                )

            classification_predictions = {}
            if not test_df.empty:
                encoded_predictions = clf_model.predict(test_df[feature_columns])
                # .tolist() yields native Python values that json can serialize.
                predicted_labels = le.inverse_transform(encoded_predictions).tolist()
                # Duplicate usernames keep the last prediction, as a
                # row-by-row dict assignment would.
                classification_predictions = dict(zip(test_df['username'], predicted_labels))

            with open('classification_output.json', 'w', encoding='utf-8') as f:
                json.dump(classification_predictions, f, indent=4)

        # === Regression Part ===
        logger.info("Starting regression...")
        # Extract and prepare training data for regression
        training_posts = extract_posts_data(training_data)

        if training_posts:
            # Prepare features and train regression model
            X_train, y_train, tfidf = prepare_regression_features(training_posts, is_training=True)

            reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
            reg_model.fit(X_train, y_train)

            regression_predictions = {}
            if regression_test_data:
                # Prepare test features
                X_test, _ = prepare_regression_features(regression_test_data, tfidf=tfidf, is_training=False)

                # Generate predictions
                predicted_likes = reg_model.predict(X_test)
                regression_predictions = {
                    # Ensure non-negative integer predictions
                    post['id']: int(max(0, round(pred)))
                    for post, pred in zip(regression_test_data, predicted_likes)
                }

            with open('regression_output.json', 'w', encoding='utf-8') as f:
                json.dump(regression_predictions, f, indent=4)

            logger.info("Completed both classification and regression predictions!")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

INFO:__main__:Loading classification data...
INFO:__main__:Loading training dataset...
100%|████████████████████████████████████████████████████████████████████████████| 5415/5415 [00:01<00:00, 5074.99it/s]
INFO:__main__:Loading test usernames...
INFO:__main__:Loading regression test data...
INFO:__main__:Starting classification...
INFO:__main__:Processing profile data...
100%|██████████████████████████████████████████████████████████████████████████| 5415/5415 [00:00<00:00, 833473.62it/s]
INFO:__main__:Training classification model...
INFO:__main__:Starting regression...
INFO:__main__:Extracting posts data...
100%|██████████████████████████████████████████████████████████████████████████| 5415/5415 [00:00<00:00, 251906.66it/s]
INFO:__main__:Preparing regression features...
INFO:__main__:Preparing regression features...
INFO:__main__:Completed both classification and regression predictions!
