In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import calendar
from typing import Dict, Any



In [None]:
# NOTE(review): pd.read_csv() is called here without a file path or buffer.
# pandas requires that argument, so this cell raises a TypeError as written.
# Supply the path of the raw transactions CSV before running.
data = pd.read_csv()



In [None]:


def data_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocesses transaction data for fraud detection analysis.

    The input frame is never modified: every engineered feature is written to
    a new DataFrame, so the caller's raw data keeps its original columns and
    dtypes and the result can be extended safely (no chained-assignment
    warnings when columns are added to it later).

    Args:
        df: Raw transaction DataFrame. It must contain the columns
            ``trans_num``, ``trans_date_trans_time``, ``cc_num``, ``amt``,
            ``category``, ``gender``, ``state``, ``lat``, ``long``,
            ``merch_lat``, ``merch_long``, ``city_pop``, ``dob`` and
            ``is_fraud``.

    Returns:
        A new DataFrame sharing ``df``'s index with the columns
        ``trans_num``, ``trans_date``, ``time_bucket``, ``cc_num``,
        ``amount_bkt``, ``category``, ``gender``, ``state``,
        ``latitudinal_distance``, ``longitudinal_distance``,
        ``population_bkt``, ``age``, ``age_bkt``, ``gender_encod`` and
        ``is_fraud`` (in that order).

    Raises:
        KeyError: If any required input column is missing.
    """
    required_columns = [
        "trans_num", "trans_date_trans_time", "cc_num", "amt", "category",
        "gender", "state", "lat", "long", "merch_lat", "merch_long",
        "city_pop", "dob", "is_fraud",
    ]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        # Fail before any work is done, naming every absent column at once.
        raise KeyError(f"Missing required column(s): {missing}")

    # Helper functions. Each branch only needs its upper bound because the
    # earlier branches have already excluded smaller values. A value that
    # fails every comparison (e.g. NaN) falls through to the last bucket.
    def create_amount_bucket(x: float) -> str:
        """Categorize transaction amounts into buckets"""
        if x <= 5.00:
            return "less than 5 dollar"
        if x <= 10.00:
            return "b/w 5 to 10 dollar"
        if x <= 40.00:
            return "b/w 10 to 40 dollar"
        if x <= 60.00:
            return "b/w 40 to 60 dollar"
        if x <= 80.00:
            return "b/w 60 to 80 dollar"
        if x <= 150.00:
            return "b/w 80 to 150 dollar"
        return "more than 150 dollar"

    def city_pop_cat(x: float) -> str:
        """Categorize city population"""
        if x <= 1000.00:
            return "Low_pop"
        if x <= 10000.00:
            return "Medium_pop"
        return "High_pop"

    def age_bkt(x: int) -> str:
        """Categorize customer age"""
        if x <= 25:
            return "less than 25"
        if x <= 40:
            return "b/w 25 to 40"
        if x <= 60:
            return "b/w 40 to 60"
        return "more than 60"

    # Start from a copy of the pass-through columns so that nothing below
    # writes into the caller's frame.
    out = df[
        ["trans_num", "cc_num", "category", "gender", "state", "is_fraud"]
    ].copy()

    # 1. Amount bucketing
    out["amount_bkt"] = df["amt"].apply(create_amount_bucket)

    # 2. Geographical features: absolute customer-to-merchant coordinate
    #    differences, rounded to 3 decimals.
    out["latitudinal_distance"] = (df["merch_lat"] - df["lat"]).round(3).abs()
    out["longitudinal_distance"] = (df["merch_long"] - df["long"]).round(3).abs()

    # 3. Population categorization
    out["population_bkt"] = df["city_pop"].apply(city_pop_cat)

    # 4. Date/time features: normalize() truncates each timestamp to midnight,
    #    giving the transaction date without a string round trip.
    trans_ts = pd.to_datetime(df["trans_date_trans_time"])
    out["trans_date"] = trans_ts.dt.normalize()

    # 5. Age in years at transaction date, rounded to the nearest integer.
    dob = pd.to_datetime(df["dob"])
    age_years = (out["trans_date"] - dob).dt.days / 365.25
    out["age"] = age_years.round(0).astype(int)
    out["age_bkt"] = out["age"].apply(age_bkt)

    # 6. Time buckets: four left-closed six-hour windows over the hour 0-23.
    bins = [0, 6, 12, 18, 24]
    labels = ['12AM-6AM', '6AM-12PM', '12PM-6PM', '6PM-12AM']
    out["time_bucket"] = pd.cut(
        trans_ts.dt.hour,
        bins=bins,
        labels=labels,
        right=False,
        include_lowest=True,
    )

    # 7. Gender encoding: 1 for "M", 0 for anything else.
    out["gender_encod"] = (df["gender"] == "M").astype(int)

    # 8. Select final columns in the documented order; the explicit copy
    #    hands the caller an independent frame.
    final_columns = [
        "trans_num", "trans_date", "time_bucket", "cc_num", "amount_bkt",
        "category", "gender", "state", "latitudinal_distance",
        "longitudinal_distance", "population_bkt", "age", "age_bkt",
        "gender_encod", "is_fraud",
    ]
    return out[final_columns].copy()

# Example usage:
# df = pd.read_csv('your_data.csv')
# processed_df = data_preprocessing(df)

In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder
from typing import Tuple

def preprocess_and_predict(raw_data: pd.DataFrame, model_path: str) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Complete pipeline from raw data to predictions:
    1. Preprocesses the data
    2. Encodes categorical features
    3. Loads trained model
    4. Generates predictions

    The categorical features are one-hot encoded deterministically against
    the fixed list of model input columns below. No encoder is fitted on the
    incoming batch, so the set of indicator columns (and which category acts
    as the all-zero baseline) does not depend on which categories happen to
    be present in ``raw_data``.

    Args:
        raw_data: Raw transaction DataFrame
        model_path: Path to saved model (.pkl or .joblib). The file is
            unpickled by ``joblib.load``, so only pass trusted files.

    Returns:
        Tuple containing (processed_features, predictions), where
        ``processed_features`` is the preprocessed frame extended with the
        columns ``predicted_fraud`` and ``fraud_probability`` and
        ``predictions`` is the value returned by ``model.predict``.

    Raises:
        ValueError: If the model cannot be loaded from ``model_path``; the
            original exception is attached as ``__cause__``.
    """

    # 1. Data Preprocessing
    # Work on an explicit copy so that adding the prediction columns at the
    # end never writes into a slice of another frame.
    processed_data = data_preprocessing(raw_data).copy()

    # 2. Feature Encoding
    cat_cols = ['time_bucket', 'category', 'amount_bkt', 'population_bkt', 'age_bkt']

    # Model input columns, in the order used for training. The categories
    # that do not appear here ('12AM-6AM', 'less than 5 dollar', 'Low_pop',
    # 'less than 25') are the baselines and are represented by all zeros.
    expected_columns = [
        'latitudinal_distance', 'longitudinal_distance', 'gender_encod',
        'time_bucket_6AM-12PM', 'time_bucket_12PM-6PM', 'time_bucket_6PM-12AM',
        'category_entertainment', 'category_food_dining', 'category_gas_transport',
        'category_grocery_net', 'category_grocery_pos', 'category_health_fitness',
        'category_home', 'category_kids_pets', 'category_misc_net',
        'category_misc_pos', 'category_personal_care', 'category_shopping_net',
        'category_shopping_pos', 'category_travel', 'amount_bkt_b/w 5 to 10 dollar',
        'amount_bkt_b/w 10 to 40 dollar', 'amount_bkt_b/w 40 to 60 dollar',
        'amount_bkt_b/w 60 to 80 dollar', 'amount_bkt_b/w 80 to 150 dollar',
        'amount_bkt_more than 150 dollar', 'population_bkt_Medium_pop',
        'population_bkt_High_pop', 'age_bkt_b/w 25 to 40', 'age_bkt_b/w 40 to 60',
        'age_bkt_more than 60'
    ]

    # One indicator column per observed category, named "<column>_<value>".
    # Nothing is dropped here: a data-dependent "drop first" would remove the
    # lexicographically smallest category, which is not the baseline implied
    # by expected_columns and would leave real features stuck at zero.
    encoded_df = pd.get_dummies(
        processed_data[cat_cols],
        columns=cat_cols,
        prefix_sep='_',
        dtype=float,
    )

    # Merge with the remaining features, then keep exactly the model's
    # columns in training order. Indicators absent from this batch become 0;
    # baseline indicators and non-feature columns are discarded.
    final_features = pd.concat(
        [processed_data.drop(columns=cat_cols), encoded_df], axis=1
    ).reindex(columns=expected_columns, fill_value=0)

    # 3. Load Model
    # NOTE(review): joblib.load executes pickle data; model_path must come
    # from a trusted source.
    try:
        model = joblib.load(model_path)
    except Exception as e:
        # Keep the original exception chained for debugging.
        raise ValueError(f"Error loading model: {str(e)}") from e
    print("Model loaded successfully")

    # 4. Generate Predictions
    predictions = model.predict(final_features)
    prediction_probs = model.predict_proba(final_features)[:, 1]  # Fraud probabilities

    # Add predictions to DataFrame
    processed_data['predicted_fraud'] = predictions
    processed_data['fraud_probability'] = prediction_probs

    return processed_data, predictions

# Example Usage:
# Script entry point: reads 'transactions.csv', runs the full
# preprocess-and-predict pipeline and writes the scored rows to CSV.
if __name__ == "__main__":
    # Load raw data
    # NOTE(review): this read happens outside the try block below, so a
    # missing or unreadable file raises directly instead of being reported
    # as "Pipeline failed".
    raw_df = pd.read_csv('transactions.csv')
    
    # Run pipeline
    try:
        # results_df is the preprocessed frame plus the 'predicted_fraud' and
        # 'fraud_probability' columns; predictions is the model's raw output.
        results_df, predictions = preprocess_and_predict(
            raw_data=raw_df,
            model_path='fraud_detection_model.joblib'
        )
        
        print("\nPredictions generated successfully:")
        # Show the first rows of the transaction id with its prediction.
        print(results_df[['trans_num', 'predicted_fraud', 'fraud_probability']].head())
        
        # Save results
        # index=False keeps the DataFrame index out of the output file.
        results_df.to_csv('processed_transactions_with_predictions.csv', index=False)
        print("\nResults saved to 'processed_transactions_with_predictions.csv'")
        
    except Exception as e:
        # Any failure in the pipeline or while saving is only printed; the
        # exception is not re-raised, so no traceback is shown.
        print(f"Pipeline failed: {str(e)}")