In [None]:
### SMOTE-ENN resampling; models: RandomForest, BalancedRandomForest, XGBoost, LogisticRegression

##### Cross-validation on the training data; final evaluation on a separate test dataset


import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from joblib import dump, load
import numpy as np

# Preprocessing function
def preprocess_data(X):
    """Impute and standardize numeric columns and integer-encode text columns.

    Numeric columns are median-imputed and then standardized; ``object``
    columns are label-encoded one column at a time.  Every transformer is
    fitted on ``X`` itself on each call.  Columns of any other dtype are
    left out of the result.

    Args:
        X: Feature frame to transform; it is not modified.

    Returns:
        A new DataFrame on ``X``'s index with the scaled numeric columns
        first and the encoded ``object`` columns after them.
    """
    # 'number' matches every numeric dtype, so 32-bit or unsigned columns are
    # no longer dropped silently the way a float64/int64-only filter drops them.
    numeric_cols = X.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        imputer = SimpleImputer(strategy='median')
        X_numeric_imputed = pd.DataFrame(
            imputer.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index
        )

        scaler = StandardScaler()
        X_numeric_scaled = pd.DataFrame(
            scaler.fit_transform(X_numeric_imputed), columns=numeric_cols, index=X.index
        )
    else:
        # scikit-learn estimators reject a zero-column input, so bypass them
        # when there is nothing numeric to transform.
        X_numeric_scaled = pd.DataFrame(index=X.index)

    categorical_cols = X.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        X_categorical = X[categorical_cols].apply(LabelEncoder().fit_transform)
    else:
        X_categorical = pd.DataFrame(index=X.index)

    return pd.concat([X_numeric_scaled, X_categorical], axis=1)

# Train and evaluate model with cross-validation
def train_with_cross_validation(model, model_name, train_data, feature_columns, target_column):
    """Cross-validate ``model`` with SMOTE-ENN, then fit and save the final model.

    The cross-validation scores are computed with SMOTE-ENN applied inside
    each training fold only, so every validation fold consists of original,
    un-resampled rows.  Resampling the whole training set before splitting
    lets synthetic neighbours of validation rows reach the training folds
    and inflates the reported accuracy.

    The final model is fitted on the SMOTE-ENN-resampled full training set
    and written to ``{model_name}_final_model.joblib``.

    Args:
        model: Unfitted classifier; it is fitted in place by this function.
        model_name: Label used in the printed output and in the saved file
            name.  The value ``"XGBoost"`` additionally switches on integer
            encoding of the target.
        train_data: DataFrame holding the feature and target columns.
        feature_columns: Names of the columns used as model inputs.
        target_column: Name of the label column.

    Returns:
        None.  Results are printed and the model is saved to disk.
    """
    # Imported here so the sampler can be wrapped in a pipeline for the CV
    # run; imblearn's Pipeline resamples during fit only, never when scoring.
    from imblearn.pipeline import Pipeline

    X_train = preprocess_data(train_data[feature_columns])
    y_train = train_data[target_column]

    # Encode target variable if using XGBoost
    if model_name == "XGBoost":
        label_encoder = LabelEncoder()
        y_train = label_encoder.fit_transform(y_train)

    # Cross-validate on the original rows; resampling happens per training fold.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_pipeline = Pipeline([
        ("resample", SMOTEENN(random_state=42)),
        ("model", model),
    ])
    # cross_val_score clones the pipeline, so ``model`` itself stays unfitted here.
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=skf, scoring='accuracy')

    print(f"\n{model_name} Cross-validation results on training data:")
    for i, score in enumerate(cv_scores):
        print(f"Fold {i+1}: Accuracy = {score:.4f}")
    print(f"Mean Accuracy: {np.mean(cv_scores):.4f}")
    print(f"Standard Deviation: {np.std(cv_scores):.4f}")

    # Train final model on full resampled training set
    smote_enn = SMOTEENN(random_state=42)
    X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
    model.fit(X_resampled, y_resampled)
    model_path = f"{model_name}_final_model.joblib"
    dump(model, model_path)
    print(f"Final {model_name} model saved at: {model_path}")

# Test model on separate test set
def test_model(test_data, feature_columns, target_column, model_name):
    """Evaluate the saved ``model_name`` classifier on a held-out dataset.

    Loads ``{model_name}_final_model.joblib``, preprocesses the requested
    feature columns, predicts, and prints a classification report followed
    by a confusion matrix.  Nothing is returned.

    NOTE(review): preprocess_data re-fits its imputer and scaler on the test
    frame, and for XGBoost a fresh LabelEncoder is fitted on the test
    labels.  Both line up with training only if the two datasets are
    distributed alike and contain the same label values -- confirm.

    Args:
        test_data: DataFrame holding the feature and target columns.
        feature_columns: Names of the columns used as model inputs.
        target_column: Name of the label column.
        model_name: Selects the saved model file; ``"XGBoost"`` also turns
            on integer encoding of the true labels.
    """
    clf = load(f"{model_name}_final_model.joblib")

    features = preprocess_data(test_data[feature_columns])
    truth = test_data[target_column]
    if model_name == "XGBoost":
        # the true labels are integer-coded for the XGBoost model only
        truth = LabelEncoder().fit_transform(truth)

    predictions = clf.predict(features)

    print(f"\n{model_name} Test Set Classification Report:")
    report = classification_report(truth, predictions)
    print(report)
    print("Confusion Matrix:")
    matrix = confusion_matrix(truth, predictions)
    print(matrix)

# Main Execution
if __name__ == "__main__":
    # Label column and the code-metric columns fed to every model.
    target_column = 'Risk_Group'
    feature_columns = [
        'maintainability_index',
        'complexity_score',
        'effort',
        'difficulty',
        'bugs',
        'vocabulary',
        'volume',
        'multi',
        'length',
        'comments',
        'calculated_length',
        'time',
        'blank',
    ]

    # Training and test sets live in separate CSV files; exact duplicate
    # rows are dropped from the training set only.
    train_file_path = "/../../training.csv"
    test_file_path = "/../../testing.csv"
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)
    train_data = train_data.drop_duplicates()
    train_data = train_data.reset_index(drop=True)

    # Both forest variants are built from the same hyper-parameters.
    forest_settings = dict(
        random_state=42,
        n_estimators=100,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='log2',
        bootstrap=False,
        criterion='gini',
    )
    models = {
        "BalancedRandomForest": BalancedRandomForestClassifier(**forest_settings),
        "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
        "RandomForest": RandomForestClassifier(**forest_settings),
    }

    # Each model is cross-validated, fitted and saved, then scored on the test set.
    for model_name, model in models.items():
        print(f"\nTraining {model_name} with cross-validation...")
        train_with_cross_validation(
            model, model_name, train_data, feature_columns, target_column
        )

        print(f"\nTesting {model_name} on separate test set...")
        test_model(test_data, feature_columns, target_column, model_name)


In [None]:
### No SMOTE-ENN resampling; models: RandomForest, BalancedRandomForest, XGBoost, LogisticRegression
### Cross-validation on the training data; final evaluation on a separate test dataset

import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from joblib import dump, load
import numpy as np

# Preprocessing function
def preprocess_data(X):
    """Impute and standardize numeric columns and integer-encode text columns.

    Numeric columns are median-imputed and then standardized; ``object``
    columns are label-encoded one column at a time.  Every transformer is
    fitted on ``X`` itself on each call.  Columns of any other dtype are
    left out of the result.

    Args:
        X: Feature frame to transform; it is not modified.

    Returns:
        A new DataFrame on ``X``'s index with the scaled numeric columns
        first and the encoded ``object`` columns after them.
    """
    # 'number' matches every numeric dtype, so 32-bit or unsigned columns are
    # no longer dropped silently the way a float64/int64-only filter drops them.
    numeric_cols = X.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        imputer = SimpleImputer(strategy='median')
        X_numeric_imputed = pd.DataFrame(
            imputer.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index
        )

        scaler = StandardScaler()
        X_numeric_scaled = pd.DataFrame(
            scaler.fit_transform(X_numeric_imputed), columns=numeric_cols, index=X.index
        )
    else:
        # scikit-learn estimators reject a zero-column input, so bypass them
        # when there is nothing numeric to transform.
        X_numeric_scaled = pd.DataFrame(index=X.index)

    categorical_cols = X.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        X_categorical = X[categorical_cols].apply(LabelEncoder().fit_transform)
    else:
        X_categorical = pd.DataFrame(index=X.index)

    return pd.concat([X_numeric_scaled, X_categorical], axis=1)

# Train and evaluate model with cross-validation
def train_with_cross_validation(model, model_name, train_data, feature_columns, target_column):
    """Cross-validate ``model`` on the training data, then fit and save it.

    Runs a shuffled, stratified 5-fold cross-validation scored by accuracy,
    prints the per-fold scores with their mean and standard deviation, fits
    ``model`` on the whole preprocessed training set and writes it to
    ``{model_name}_final_model.joblib``.

    Args:
        model: Unfitted classifier; it is fitted in place by this function.
        model_name: Label used in the printed output and in the saved file
            name.  The value ``"XGBoost"`` additionally switches on integer
            encoding of the target.
        train_data: DataFrame holding the feature and target columns.
        feature_columns: Names of the columns used as model inputs.
        target_column: Name of the label column.

    Returns:
        None.
    """
    features = preprocess_data(train_data[feature_columns])
    labels = train_data[target_column]
    if model_name == "XGBoost":
        # only the XGBoost model is trained on integer-coded labels
        labels = LabelEncoder().fit_transform(labels)

    # Cross-validation uses the training data only.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, features, labels, cv=folds, scoring='accuracy')

    print(f"\n{model_name} Cross-validation results on training data:")
    for i, score in enumerate(cv_scores):
        print(f"Fold {i+1}: Accuracy = {score:.4f}")
    print(f"Mean Accuracy: {np.mean(cv_scores):.4f}")
    print(f"Standard Deviation: {np.std(cv_scores):.4f}")

    # The saved model is fitted on every training row.
    model.fit(features, labels)
    model_path = f"{model_name}_final_model.joblib"
    dump(model, model_path)
    print(f"Final {model_name} model saved at: {model_path}")

# Test model on separate test set
def test_model(test_data, feature_columns, target_column, model_name):
    """Score a previously saved model against a separate test dataset.

    The classifier is read from ``{model_name}_final_model.joblib``; the
    test features go through preprocess_data before prediction.  A
    classification report and a confusion matrix are printed; nothing is
    returned.

    NOTE(review): the imputer, scaler and (for XGBoost) label encoder are
    all fitted afresh on the test data rather than reused from training --
    confirm that this is intended.

    Args:
        test_data: DataFrame holding the feature and target columns.
        feature_columns: Names of the columns used as model inputs.
        target_column: Name of the label column.
        model_name: Selects the saved model file; ``"XGBoost"`` also turns
            on integer encoding of the true labels.
    """
    saved_path = f"{model_name}_final_model.joblib"
    classifier = load(saved_path)

    X_eval = preprocess_data(test_data[feature_columns])
    y_eval = test_data[target_column]

    needs_integer_labels = model_name == "XGBoost"
    if needs_integer_labels:
        encoder = LabelEncoder()
        y_eval = encoder.fit_transform(y_eval)

    y_hat = classifier.predict(X_eval)
    print(f"\n{model_name} Test Set Classification Report:")
    print(classification_report(y_eval, y_hat))
    print("Confusion Matrix:")
    print(confusion_matrix(y_eval, y_hat))

# Main Execution
if __name__ == "__main__":
    # Code-metric columns used as model inputs, and the label column.
    feature_columns = (
        ['maintainability_index', 'complexity_score', 'effort', 'difficulty', 'bugs']
        + ['vocabulary', 'volume', 'multi', 'length', 'comments']
        + ['calculated_length', 'time', 'blank']
    )
    target_column = 'Risk_Group'

    # Training and test sets come from two separate CSV files.
    train_file_path = "/../../training.csv"
    test_file_path = "/../../testing.csv"
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Exact duplicate rows are removed from the training set only.
    train_data = train_data.drop_duplicates().reset_index(drop=True)

    # Candidate classifiers, registered in the order they are run.
    models = {}
    models["BalancedRandomForest"] = BalancedRandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='log2',
        bootstrap=False,
        random_state=42,
    )
    models["LogisticRegression"] = LogisticRegression(max_iter=1000, random_state=42)
    models["XGBoost"] = XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42,
    )
    models["RandomForest"] = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='log2',
        bootstrap=False,
        random_state=42,
    )

    # Cross-validate, fit and save each model, then score it on the test set.
    for model_name, model in models.items():
        print(f"\nTraining {model_name} with cross-validation...")
        train_with_cross_validation(model, model_name, train_data, feature_columns, target_column)

        print(f"\nTesting {model_name} on separate test set...")
        test_model(test_data, feature_columns, target_column, model_name)

In [None]:
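### No SMOTE-ENN resampling; models: RandomForest, BalancedRandomForest, XGBoost, LogisticRegression
### Train on the training data, test on a separate dataset (no cross-validation is run in this cell)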

import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from joblib import dump, load
import numpy as np

# Preprocessing function
def preprocess_data(X):
    """Impute and standardize numeric columns and integer-encode text columns.

    Numeric columns are median-imputed and then standardized; ``object``
    columns are label-encoded one column at a time.  Every transformer is
    fitted on ``X`` itself on each call.  Columns of any other dtype are
    left out of the result.

    Args:
        X: Feature frame to transform; it is not modified.

    Returns:
        A new DataFrame on ``X``'s index with the scaled numeric columns
        first and the encoded ``object`` columns after them.
    """
    # 'number' matches every numeric dtype, so 32-bit or unsigned columns are
    # no longer dropped silently the way a float64/int64-only filter drops them.
    numeric_cols = X.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        imputer = SimpleImputer(strategy='median')
        X_numeric_imputed = pd.DataFrame(
            imputer.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index
        )

        scaler = StandardScaler()
        X_numeric_scaled = pd.DataFrame(
            scaler.fit_transform(X_numeric_imputed), columns=numeric_cols, index=X.index
        )
    else:
        # scikit-learn estimators reject a zero-column input, so bypass them
        # when there is nothing numeric to transform.
        X_numeric_scaled = pd.DataFrame(index=X.index)

    categorical_cols = X.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        X_categorical = X[categorical_cols].apply(LabelEncoder().fit_transform)
    else:
        X_categorical = pd.DataFrame(index=X.index)

    return pd.concat([X_numeric_scaled, X_categorical], axis=1)

# Train and save model
def train_model(model, model_name, train_data, feature_columns, target_column):
    """Fit ``model`` on the full preprocessed training set and save it.

    The fitted model is written to ``{model_name}_final_model.joblib`` and
    the path is printed.

    Args:
        model: Unfitted classifier; it is fitted in place by this function.
        model_name: Label used in the printed output and in the saved file
            name.  The value ``"XGBoost"`` additionally switches on integer
            encoding of the target.
        train_data: DataFrame holding the feature and target columns.
        feature_columns: Names of the columns used as model inputs.
        target_column: Name of the label column.

    Returns:
        None.
    """
    features = preprocess_data(train_data[feature_columns])
    labels = train_data[target_column]

    if model_name == "XGBoost":
        # only the XGBoost model is trained on integer-coded labels
        labels = LabelEncoder().fit_transform(labels)

    model.fit(features, labels)

    model_path = f"{model_name}_final_model.joblib"
    dump(model, model_path)
    print(f"Final {model_name} model saved at: {model_path}")

# Test model on separate test set
def test_model(test_data, feature_columns, target_column, model_name):
    """Load the saved ``model_name`` model and report its test-set results.

    Reads ``{model_name}_final_model.joblib``, preprocesses the test
    features, predicts, and prints a classification report and a confusion
    matrix.  Nothing is returned.

    NOTE(review): preprocessing statistics and, for XGBoost, the label
    encoding are derived from the test data itself, not from the training
    run -- verify that the resulting codes and scales match training.

    Args:
        test_data: DataFrame holding the feature and target columns.
        feature_columns: Names of the columns used as model inputs.
        target_column: Name of the label column.
        model_name: Selects the saved model file; ``"XGBoost"`` also turns
            on integer encoding of the true labels.
    """
    fitted_model = load(f"{model_name}_final_model.joblib")

    inputs = preprocess_data(test_data[feature_columns])
    expected = test_data[target_column]
    if model_name == "XGBoost":
        # true labels are integer-coded for the XGBoost model only
        expected = LabelEncoder().fit_transform(expected)

    predicted = fitted_model.predict(inputs)

    print(f"\n{model_name} Test Set Classification Report:")
    print(classification_report(expected, predicted))
    print("Confusion Matrix:")
    print(confusion_matrix(expected, predicted))

# Main Execution
if __name__ == "__main__":
    # Label column and the code-metric columns fed to every model.
    target_column = 'Risk_Group'
    feature_columns = [
        'maintainability_index', 'complexity_score', 'effort',
        'difficulty', 'bugs', 'vocabulary',
        'volume', 'multi', 'length',
        'comments', 'calculated_length', 'time',
        'blank',
    ]

    # Separate CSV files hold the training and the test data.
    train_file_path = "/../../training.csv"
    test_file_path = "/../../testing.csv"
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Only the training set has its exact duplicate rows removed.
    deduplicated = train_data.drop_duplicates()
    train_data = deduplicated.reset_index(drop=True)

    # Hyper-parameters shared by the two forest classifiers.
    shared_forest_params = {
        'random_state': 42,
        'n_estimators': 100,
        'max_depth': 20,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'log2',
        'bootstrap': False,
        'criterion': 'gini',
    }
    models = {
        "BalancedRandomForest": BalancedRandomForestClassifier(**shared_forest_params),
        "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
        "XGBoost": XGBClassifier(
            use_label_encoder=False, eval_metric="logloss", random_state=42
        ),
        "RandomForest": RandomForestClassifier(**shared_forest_params),
    }

    # Fit and save each model, then score it on the test set.
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        train_model(model, model_name, train_data, feature_columns, target_column)

        print(f"\nTesting {model_name} on separate test set...")
        test_model(test_data, feature_columns, target_column, model_name)


In [None]:
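### SMOTE-ENN resampling; models: RandomForest, BalancedRandomForest, XGBoost, LogisticRegression
### Train on the resampled training data, test on a separate dataset (no cross-validation is run in this cell)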

import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from joblib import dump, load
import numpy as np

# Preprocessing function
def preprocess_data(X):
    """Impute and standardize numeric columns and integer-encode text columns.

    Numeric columns are median-imputed and then standardized; ``object``
    columns are label-encoded one column at a time.  Every transformer is
    fitted on ``X`` itself on each call.  Columns of any other dtype are
    left out of the result.

    Args:
        X: Feature frame to transform; it is not modified.

    Returns:
        A new DataFrame on ``X``'s index with the scaled numeric columns
        first and the encoded ``object`` columns after them.
    """
    # 'number' matches every numeric dtype, so 32-bit or unsigned columns are
    # no longer dropped silently the way a float64/int64-only filter drops them.
    numeric_cols = X.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        imputer = SimpleImputer(strategy='median')
        X_numeric_imputed = pd.DataFrame(
            imputer.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index
        )

        scaler = StandardScaler()
        X_numeric_scaled = pd.DataFrame(
            scaler.fit_transform(X_numeric_imputed), columns=numeric_cols, index=X.index
        )
    else:
        # scikit-learn estimators reject a zero-column input, so bypass them
        # when there is nothing numeric to transform.
        X_numeric_scaled = pd.DataFrame(index=X.index)

    categorical_cols = X.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        X_categorical = X[categorical_cols].apply(LabelEncoder().fit_transform)
    else:
        X_categorical = pd.DataFrame(index=X.index)

    return pd.concat([X_numeric_scaled, X_categorical], axis=1)

# Train model with SMOTE-ENN and save it
def train_model_with_smoteenn(model, model_name, train_data, feature_columns, target_column):
    """Fit ``model`` on SMOTE-ENN-resampled training data and save it.

    The preprocessed training set is resampled with SMOTEENN (seed 42), the
    model is fitted on the result and written to
    ``{model_name}_final_model_with_smoteenn.joblib``; the path is printed.

    Args:
        model: Unfitted classifier; it is fitted in place by this function.
        model_name: Label used in the printed output and in the saved file
            name.  The value ``"XGBoost"`` additionally switches on integer
            encoding of the resampled target.
        train_data: DataFrame holding the feature and target columns.
        feature_columns: Names of the columns used as model inputs.
        target_column: Name of the label column.

    Returns:
        None.
    """
    features = preprocess_data(train_data[feature_columns])
    labels = train_data[target_column]

    # Resample first; the labels are integer-coded afterwards, if at all.
    resampler = SMOTEENN(random_state=42)
    X_balanced, y_balanced = resampler.fit_resample(features, labels)

    if model_name == "XGBoost":
        # only the XGBoost model is trained on integer-coded labels
        y_balanced = LabelEncoder().fit_transform(y_balanced)

    model.fit(X_balanced, y_balanced)

    model_path = f"{model_name}_final_model_with_smoteenn.joblib"
    dump(model, model_path)
    print(f"Final {model_name} model with SMOTE-ENN saved at: {model_path}")

# Test model on separate test set
def test_model(test_data, feature_columns, target_column, model_name):
    """Evaluate the saved SMOTE-ENN model for ``model_name`` on test data.

    Loads ``{model_name}_final_model_with_smoteenn.joblib``, preprocesses
    the test features, predicts, and prints a classification report and a
    confusion matrix.  Nothing is returned.

    NOTE(review): the test frame is preprocessed with transformers fitted
    on itself, and XGBoost labels are encoded by an encoder fitted on the
    test labels -- confirm that both agree with what the model saw in
    training.

    Args:
        test_data: DataFrame holding the feature and target columns.
        feature_columns: Names of the columns used as model inputs.
        target_column: Name of the label column.
        model_name: Selects the saved model file; ``"XGBoost"`` also turns
            on integer encoding of the true labels.
    """
    artifact = f"{model_name}_final_model_with_smoteenn.joblib"
    estimator = load(artifact)

    test_features = preprocess_data(test_data[feature_columns])
    test_labels = test_data[target_column]
    if model_name == "XGBoost":
        # true labels are integer-coded for the XGBoost model only
        test_labels = LabelEncoder().fit_transform(test_labels)

    test_predictions = estimator.predict(test_features)

    print(f"\n{model_name} Test Set Classification Report:")
    summary = classification_report(test_labels, test_predictions)
    print(summary)
    print("Confusion Matrix:")
    print(confusion_matrix(test_labels, test_predictions))

# Main Execution
if __name__ == "__main__":
    # Code-metric columns used as model inputs.
    feature_columns = [
        'maintainability_index',
        'complexity_score',
        'effort',
        'difficulty',
        'bugs',
        'vocabulary',
        'volume',
        'multi',
        'length',
        'comments',
        'calculated_length',
        'time',
        'blank',
    ]
    # Column holding the class label.
    target_column = 'Risk_Group'

    # Training and test data are read from two separate CSV files.
    train_file_path = "/../../training.csv"
    test_file_path = "/../../testing.csv"
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Exact duplicate rows are dropped from the training data only.
    train_data = train_data.drop_duplicates().reset_index(drop=True)

    # One settings dict drives both forest classifiers.
    forest_options = dict(
        n_estimators=100,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='log2',
        bootstrap=False,
        criterion='gini',
        random_state=42,
    )
    balanced_forest = BalancedRandomForestClassifier(**forest_options)
    logistic = LogisticRegression(max_iter=1000, random_state=42)
    boosted = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
    plain_forest = RandomForestClassifier(**forest_options)

    models = {
        "BalancedRandomForest": balanced_forest,
        "LogisticRegression": logistic,
        "XGBoost": boosted,
        "RandomForest": plain_forest,
    }

    # Fit each model on resampled data, save it, then score it on the test set.
    for model_name, model in models.items():
        print(f"\nTraining {model_name} with SMOTE-ENN...")
        train_model_with_smoteenn(
            model, model_name, train_data, feature_columns, target_column
        )

        print(f"\nTesting {model_name} on separate test set...")
        test_model(test_data, feature_columns, target_column, model_name)
