<a href="https://colab.research.google.com/github/deva-kumari/devakumari_cse22237/blob/main/lab7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Load the dataset
file_path = '/content/cc_embed_data (4).csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset
data.info()
print(data.head())

# Split the data into features and target
X = data.drop('Final_Marks', axis=1)
y = data['Final_Marks']

# Standardize the feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Display shapes of the training and testing sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Define the Perceptron model
perceptron = Perceptron(random_state=42)

# Define the smaller parameter grid for Perceptron
perceptron_param_grid_small = {
    'penalty': [None, 'l2'],
    'alpha': np.logspace(-3, 2, 5),  # Smaller range
    'max_iter': [1000, 1500],  # Fewer options
    'eta0': np.logspace(-3, 0, 5),  # Smaller range
    'fit_intercept': [True, False]
}

# Randomized Search for Perceptron with fewer iterations
perceptron_search_small = RandomizedSearchCV(perceptron, perceptron_param_grid_small, n_iter=10, scoring='accuracy',
                                             cv=3, random_state=42, n_jobs=-1)
perceptron_search_small.fit(X_train, y_train)
best_perceptron_params_small = perceptron_search_small.best_params_
best_perceptron_score_small = perceptron_search_small.best_score_

# Define the MLP model
mlp = MLPClassifier(random_state=42)

# Define the smaller parameter grid for MLP
mlp_param_grid_small = {
    'hidden_layer_sizes': [(50,), (100,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': np.logspace(-3, 2, 5),  # Smaller range
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [1000, 1500]  # Fewer options
}

# Randomized Search for MLP with fewer iterations
mlp_search_small = RandomizedSearchCV(mlp, mlp_param_grid_small, n_iter=10, scoring='accuracy',
                                      cv=3, random_state=42, n_jobs=-1)
mlp_search_small.fit(X_train, y_train)
best_mlp_params_small = mlp_search_small.best_params_
best_mlp_score_small = mlp_search_small.best_score_

# Display the best parameters and scores for both models
print("Best Perceptron Parameters (Optimized):", best_perceptron_params_small)
print("Best Perceptron Cross-Validation Score (Optimized):", best_perceptron_score_small)
print("Best MLP Parameters (Optimized):", best_mlp_params_small)
print("Best MLP Cross-Validation Score (Optimized):", best_mlp_score_small)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Columns: 769 entries, cc_embedding_0 to Final_Marks
dtypes: float64(768), int64(1)
memory usage: 6.9 MB
   cc_embedding_0  cc_embedding_1  cc_embedding_2  cc_embedding_3  \
0       -1.543605        1.192574        1.600436       -1.540511   
1       -1.416784        1.277785        1.634119       -1.646463   
2       -1.563877        1.208654        1.569922       -1.536870   
3       -1.493047        0.999892        1.603005       -1.372708   
4       -1.579709        1.184545        1.641025       -1.530369   

   cc_embedding_4  cc_embedding_5  cc_embedding_6  cc_embedding_7  \
0        0.049140        0.597720       -1.806607       -1.382916   
1        0.075007        0.640903       -1.820755       -1.488606   
2        0.033661        0.619467       -1.809974       -1.387395   
3        0.088974        0.586771       -1.781855       -1.537741   
4        0.069332        0.606778       -1.795736       -1.373

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the dataset
file_path = '/content/cc_embed_data (4).csv'
data = pd.read_csv(file_path)

# Split the data into features and target
X = data.drop('Final_Marks', axis=1)
y = data['Final_Marks']

# Standardize the feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define classifiers excluding CatBoost
classifiers = {
    'Perceptron': Perceptron(random_state=42, max_iter=1500),
    'MLPClassifier': MLPClassifier(random_state=42, max_iter=1500),
    'SVM': SVC(probability=True, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'NaiveBayes': GaussianNB()
}

# Initialize a dataframe to store the results
results = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'])

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    # Train the model
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Check if the classifier has a predict_proba method
    if hasattr(clf, "predict_proba"):
        y_proba = clf.predict_proba(X_test)
        # Use 'ovr' (one-vs-rest) strategy for multi-class roc-auc
        roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
    else:
        roc_auc = None  # Set as None if not applicable

    # Append the results to the dataframe using loc method
    results.loc[len(results)] = [clf_name, accuracy, precision, recall, f1, roc_auc]

# Display the results in a tabular format
results = results.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
results


  results.loc[len(results)] = [clf_name, accuracy, precision, recall, f1, roc_auc]


Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score,ROC-AUC
0,RandomForest,0.355932,0.256492,0.268751,0.256925,0.783229
1,SVM,0.338983,0.212525,0.227095,0.210365,0.773952
2,MLPClassifier,0.317797,0.253625,0.252136,0.249241,0.730454
3,XGBoost,0.305085,0.235637,0.239769,0.23213,0.752391
4,NaiveBayes,0.300847,0.252677,0.272429,0.231794,0.692969
5,DecisionTree,0.279661,0.22203,0.218512,0.215677,0.575817
6,Perceptron,0.20339,0.162195,0.194712,0.164729,
7,AdaBoost,0.190678,0.068207,0.105905,0.056402,0.522828


In [4]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
