# **LOAD LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, stats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks
from imblearn.over_sampling import ADASYN


# **DATA PREPARATION**

In [None]:
# Columns read from the training CSV (everything else in the file is ignored).
train_columns = [
    'sno', 'acc_info', 'duration_month', 'credit_history', 'purpose',
    'savings_acc', 'employment_st', 'personal_status', 'gurantors',
    'resident_since', 'property_type', 'installment_type', 'housing_type',
    'credits_no', 'job_type', 'liables', 'Group_no',
]
df = pd.read_csv('/content/sample_data/C&T train dataset (1).csv', usecols=train_columns)

print(df)


# **ENCODING**

In [None]:
# Learn one LabelEncoder per text column from the values that are actually
# present. Missing entries are excluded from fitting: depending on the
# scikit-learn version they would either be encoded as just another class or
# make the encoder fail, and in the first case a later dropna() could no longer
# find the affected rows.
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        label_encoders[col] = LabelEncoder().fit(df[col].dropna())

# Drop rows with missing values BEFORE transforming, so incomplete rows are
# really removed instead of being hidden behind an integer code.
df = df.dropna().copy()

# Replace every text column by its integer codes. The fitted encoders stay
# available in `label_encoders` (column name -> encoder) for later inspection.
for col, label_encoder in label_encoders.items():
    df[col] = label_encoder.transform(df[col])

print(df)



# **EXPLORATORY DATA ANALYSIS**

In [None]:
# --- duration_month per class -------------------------------------------------
print('Box Plot of Duration Month by Group_no')
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='Group_no', y='duration_month')
plt.title('Box Plot of Duration Month by Group_no')
plt.show()

# --- pairwise relationships, coloured by class --------------------------------
print('pair plot with color-coded classes')
sns.pairplot(df, hue='Group_no', diag_kind='kde')
plt.suptitle('Pair Plot with Class Color Coding')
plt.show()

# --- 3x3 grid of histograms ---------------------------------------------------
# Only the first (at most nine) columns are drawn; the last column of df is
# skipped (assumes it is 'Group_no' -- TODO confirm against the CSV layout).
print('Histograms for each feature with Group_no as hue')
plt.figure(figsize=(12, 8))
num_cols = min(len(df.columns[:-1]), 9)
for panel, col in enumerate(df.columns[:-1][:num_cols], start=1):
    plt.subplot(3, 3, panel)
    sns.histplot(data=df, x=col, hue='Group_no', kde=True, palette='Set1')
plt.tight_layout()
plt.show()

# --- 3x3 grid of box plots, one box per class ---------------------------------
print('Box plots for each feature with Group_no as x-axis')
plt.figure(figsize=(12, 8))
num_cols = min(len(df.columns[:-1]), 9)
for panel, col in enumerate(df.columns[:-1][:num_cols], start=1):
    plt.subplot(3, 3, panel)
    sns.boxplot(data=df, x='Group_no', y=col, palette='Set2')
plt.tight_layout()
plt.show()


# --- class balance ------------------------------------------------------------
sns.histplot(df['Group_no'])
plt.title('Distribution of Group_no')
plt.show()

# --- where values are missing (one heatmap cell per dataframe cell) -----------
sns.heatmap(df.isnull(), cbar=False)
plt.title('Missing Values Heatmap')
plt.show()


# --- correlation between all columns ------------------------------------------
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# **UNIVARIATE ANALYSIS**

In [None]:
# Columns treated as numeric in the analysis below.
numeric_features = ['sno', 'duration_month', 'credits_no', 'liables', 'resident_since']

# Columns treated as categorical in the analysis below.
categorical_features = [
    'acc_info', 'credit_history', 'purpose', 'savings_acc',
    'employment_st', 'personal_status', 'gurantors', 'property_type',
    'installment_type', 'housing_type', 'job_type',
]

# Work on one slice holding just the numeric columns.
numeric_frame = df[numeric_features]

# Descriptive statistics (count, mean, std, quartiles, ...).
print("Summary statistics for numeric features:")
print(numeric_frame.describe())

# One histogram per numeric column.
numeric_frame.hist(figsize=(12, 8))
plt.suptitle("Histogram of Numeric Features")
plt.show()

# All numeric columns side by side in a single box plot.
numeric_frame.boxplot(figsize=(10, 6))
plt.title("Boxplot of Numeric Features")
plt.show()

# Per categorical column: print the level counts, then draw them as a bar
# chart and as a pie chart.
for feature in categorical_features:
    level_counts = df[feature].value_counts()

    print("\nFrequency distribution for", feature)
    print(level_counts)

    level_counts.plot.bar(figsize=(8, 6))
    plt.title(f"Bar plot of {feature}")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.show()

    level_counts.plot.pie(autopct='%1.1f%%', figsize=(8, 8))
    plt.title(f"Pie chart of {feature}")
    plt.ylabel("")
    plt.show()

# **BIVARIATE ANALYSIS**

In [None]:

# ---- numeric vs numeric ------------------------------------------------------
numeric_frame = df[numeric_features]

# Pairwise scatter plots of the numeric columns.
pd.plotting.scatter_matrix(numeric_frame, figsize=(12, 12))
plt.suptitle("Scatter Plot Matrix of Numeric Features")
plt.show()

# Correlation of the numeric columns, drawn as an annotated heatmap.
correlation_matrix = numeric_frame.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title("Correlation Matrix of Numeric Features")
plt.show()

# ---- categorical vs categorical ----------------------------------------------
# Every ordered pair of distinct categorical columns gets a contingency table
# and a chi-square test; only the p-value of the test is printed.
for feature1 in categorical_features:
    for feature2 in categorical_features:
        if feature1 == feature2:
            continue
        contingency_table = pd.crosstab(df[feature1], df[feature2])
        print(f"\nContingency table for {feature1} vs {feature2}")
        print(contingency_table)
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        print("Chi-square test p-value:", p)

# ---- numeric vs categorical --------------------------------------------------
# One figure per (categorical, numeric) combination.
for feature in categorical_features:
    for num_feature in numeric_features:
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=df, x=feature, y=num_feature)
        plt.title(f"Box plot of {feature} vs {num_feature}")
        plt.show()

# **SPLITTING OF DATA**

In [None]:
# Target column on one side, every remaining column as a feature on the other.
y = df['Group_no']
X = df.drop(columns='Group_no')

# 80/20 hold-out split; stratifying on y keeps the class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features: statistics are learned from the training part
# only and then applied unchanged to the test part.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# **MODEL AND HYPERPARAMETER SETUP**

In [None]:
# Candidate classifiers as (display name, estimator) pairs.
# The tree ensembles are seeded like every other stochastic step in this
# notebook (random_state=42) so that accuracies -- and therefore the model
# picked as "best" -- are reproducible across runs.
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('Extra Trees', ExtraTreesClassifier(random_state=42)),
    ('SVM', SVC())
]

# Hyper-parameter search space for the SVM (4 * 4 * 2 = 32 combinations).
svm_param_dist = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

# Shuffled, stratified cross-validation splitter for model tuning.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)


# **CLASSIFICATION MODELS**

In [None]:
# Fit every candidate on the scaled training split and record its accuracy on
# the scaled test split. The SVM is tuned first.
results = {}
for name, model in models:
    if name == 'SVM':
        # The search space holds only 32 combinations, fewer than the 100
        # random draws previously requested, so every combination was being
        # evaluated anyway: use an exhaustive grid search. The stratified,
        # shuffled splitter `skf` prepared for tuning is used for the folds.
        svm_search = GridSearchCV(estimator=model, param_grid=svm_param_dist, cv=skf, n_jobs=-1)
        svm_search.fit(X_train_scaled, y_train)
        best_params_svm = svm_search.best_params_
        # The search works on clones; copy the winning parameters onto the
        # estimator stored in `models` so that this object is the one fitted below.
        model.set_params(**best_params_svm)

    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy}")

# Pick the winner once, after all models have been scored.
# NOTE(review): the choice is made on test-set accuracy, so that same accuracy
# is an optimistic estimate for the selected model.
best_model_name = max(results, key=results.get)
best_model = dict(models)[best_model_name]


# **MODEL SELECTION**

In [None]:
# Score the selected model once more on the held-out split and report it.
y_pred_best = best_model.predict(X_test_scaled)
best_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(f"Best Model ({best_model_name}) Accuracy: {best_model_accuracy}")

# Standardise the complete feature matrix. Note that this scaler is fitted on
# every row of X, not only on the training split.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Hold out the evaluation rows BEFORE any resampling. If the data is resampled
# first and split afterwards, duplicated / synthetic copies of test rows end up
# in the training set and the reported accuracy is inflated.
X_train_base, X_test_base, y_train_base, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training part only and reuse it for the test part.
# `scaler` is left pointing at the scaler that matches `rf_classifier`.
scaler = StandardScaler()
X_train_base_scaled = scaler.fit_transform(X_train_base)
X_test = scaler.transform(X_test_base)

# --- resampling chain, applied to the TRAINING data only ----------------------
# NOTE(review): with default sampling strategies the first over-sampler should
# already balance the classes, so the later over-samplers probably add nothing
# -- confirm that the full chain is really intended.

# Random over-sampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train_base_scaled, y_train_base)

# Random under-sampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_ros, y_ros)

# SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_rus, y_rus)

# ADASYN
adasyn = ADASYN(random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X_smote, y_smote)

# Cluster Centroids under-sampling
cc = ClusterCentroids(random_state=42)
X_cc, y_cc = cc.fit_resample(X_adasyn, y_adasyn)

# Tomek Links under-sampling
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X_cc, y_cc)

# The resampled data is the training set; the test set stays untouched.
X_train, y_train = X_tl, y_tl

# Initialize and train the RandomForest model
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the (non-resampled) test set
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy with Multiple Data Augmentation Methods: {accuracy}")

# **APPLICATION ON TEST DATASET**

In [None]:

# Load the test dataset
test_df = pd.read_csv('/content/sample_data/C&T test dataset (1).csv',usecols=['sno','acc_info','duration_month','credit_history','purpose','savings_acc','employment_st','personal_status','gurantors','resident_since','property_type','installment_type','housing_type','credits_no','job_type','liables'])

# Rows with missing values are dropped, so they receive no prediction in the output.
test_df = test_df.dropna()

# Feature matrix in exactly the column order the model was trained on.
# Work on a copy so the raw `test_df` (and its 'sno' column) stays untouched.
test_X = test_df[list(X.columns)].copy()

# Encode the text columns with the category -> integer mapping of the TRAINING
# data. Fitting a fresh LabelEncoder on the test data would silently assign
# different integers whenever the two files do not contain exactly the same
# set of categories. The mapping is rebuilt from the raw training file so this
# cell does not depend on how the training frame was encoded earlier.
train_reference = pd.read_csv('/content/sample_data/C&T train dataset (1).csv', usecols=list(X.columns))
for col in test_X.columns:
    if test_X[col].dtype == 'object':
        label_encoder = LabelEncoder().fit(train_reference[col].dropna())
        # Raises ValueError if the test data contains a category that never
        # occurs in the training data, instead of mis-encoding it.
        test_X[col] = label_encoder.transform(test_X[col])

# Feature scaling: reuse the scaler that was fitted for the trained model.
# Re-fitting a scaler on the test data would standardise the features with
# different means/variances than the ones the model has seen.
test_X_scaled = scaler.transform(test_X)

# Make predictions on the test set
y_pred = rf_classifier.predict(test_X_scaled)

# Create a DataFrame with sno and predicted group_no
result_df = pd.DataFrame({'Serial number': test_df['sno'], 'Group_no': y_pred})



# **CREATING SUBMISSION FILE**

In [None]:
# Persist the predictions as SUBMISSION.csv; the dataframe index is not written.
result_df.to_csv(path_or_buf='SUBMISSION.csv', index=False)