<a href="https://colab.research.google.com/github/dhillonarman/Predictive-Analysis-/blob/main/102216076_Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

features = data.drop(columns=['Class'])
target = data['Class']

smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(features, target)

balanced_dataset = pd.concat([pd.DataFrame(X_balanced, columns=features.columns),
                              pd.DataFrame(y_balanced, columns=['Class'])], axis=1)

confidence_lvl = 0.95
error_margin = 0.05
p_hat = y_balanced.mean()
z_value = 1.96

random_sample_size = int((z_value*2 * p_hat * (1 - p_hat)) / (error_margin*2))
strata_var = balanced_dataset['Class'].value_counts(normalize=True).std()
if strata_var == 0:
    strata_var = 1
stratified_sample_size = int((z_value*2 * p_hat * (1 - p_hat)) / ((error_margin / strata_var)*2))
num_clusters = 5
cluster_sample_size = int((z_value*2 * p_hat * (1 - p_hat)) / ((error_margin / num_clusters)*2))

sampling_methods = {}

sampling_methods['Random'] = balanced_dataset.sample(n=random_sample_size, random_state=42)

sampling_methods['Stratified'] = balanced_dataset.groupby('Class').apply(
    lambda x: x.sample(int(stratified_sample_size * len(x) / len(balanced_dataset)), replace=True, random_state=42)
).reset_index(drop=True)

k = len(balanced_dataset) // random_sample_size
sampling_methods['Systematic'] = balanced_dataset.iloc[::k, :].reset_index(drop=True)

balanced_dataset['Cluster'] = pd.cut(balanced_dataset['Time'], bins=num_clusters, labels=False)
chosen_clusters = balanced_dataset['Cluster'].sample(num_clusters // 2, random_state=42).unique()
sampling_methods['Cluster'] = balanced_dataset[balanced_dataset['Cluster'].isin(chosen_clusters)].reset_index(drop=True)

sampling_methods['Bootstrap'] = balanced_dataset.sample(n=random_sample_size, replace=True, random_state=42)

ml_models = {
    'LogReg': LogisticRegression(max_iter=1000),
    'DecTree': DecisionTreeClassifier(random_state=42),
    'RandForest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),  # Removed use_label_encoder argument
    'GradBoost': GradientBoostingClassifier(random_state=42),
}

model_performance = {}
for method, sample_data in sampling_methods.items():
    X_sample = sample_data.drop(columns=['Class', 'Cluster'], errors='ignore')
    y_sample = sample_data['Class']
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

    for model_name, model in ml_models.items():
        model.fit(X_train, y_train)
        acc_score = model.score(X_test, y_test)
        if method not in model_performance:
            model_performance[method] = {}
        model_performance[method][model_name] = acc_score

for method, scores in model_performance.items():
    for model_name, score in scores.items():
        print(f"Sampling Method: {method} | Model: {model_name} | Accuracy: {score:.2f}")


best_sampling, best_model, top_accuracy = "", "", 0
for method, scores in model_performance.items():
    for model_name, score in scores.items():
        if score > top_accuracy:
            top_accuracy, best_sampling, best_model = score, method, model_name

print("\nBest Sampling Technique and Model:")
print(f"Sampling Method: {best_sampling}")
print(f"Model: {best_model}")
print(f"Accuracy: {top_accuracy:.2f}")


  sampling_methods['Stratified'] = balanced_dataset.groupby('Class').apply(


Sampling Method: Random | Model: LogReg | Accuracy: 0.50
Sampling Method: Random | Model: DecTree | Accuracy: 1.00
Sampling Method: Random | Model: RandForest | Accuracy: 0.50
Sampling Method: Random | Model: XGBoost | Accuracy: 0.50
Sampling Method: Random | Model: GradBoost | Accuracy: 0.50
Sampling Method: Stratified | Model: LogReg | Accuracy: 0.50
Sampling Method: Stratified | Model: DecTree | Accuracy: 0.50
Sampling Method: Stratified | Model: RandForest | Accuracy: 0.50
Sampling Method: Stratified | Model: XGBoost | Accuracy: 0.50
Sampling Method: Stratified | Model: GradBoost | Accuracy: 0.50
Sampling Method: Systematic | Model: LogReg | Accuracy: 1.00
Sampling Method: Systematic | Model: DecTree | Accuracy: 1.00
Sampling Method: Systematic | Model: RandForest | Accuracy: 1.00
Sampling Method: Systematic | Model: XGBoost | Accuracy: 1.00
Sampling Method: Systematic | Model: GradBoost | Accuracy: 1.00
Sampling Method: Cluster | Model: LogReg | Accuracy: 0.93
Sampling Method: Clu