In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier

# Load the dataset
# The CSV location can be overridden with the GLASS_CSV_PATH environment variable so the
# notebook is runnable on other machines; when it is unset, the original path is used.
DATA_PATH = os.environ.get("GLASS_CSV_PATH", "C:/Users/Lenovo/Downloads/glass.csv")
data = pd.read_csv(DATA_PATH)

## 1. Data Exploration
print("Data Exploration")
print("\nDataset shape:", data.shape)
print("\nFirst 5 rows:")
print(data.head())
print("\nData types:\n", data.dtypes)
print("\nMissing values:\n", data.isnull().sum())
print("\nClass distribution:\n", data['type'].value_counts())

# 2. Data Preprocessing
# Remove exact duplicate rows before building the feature matrix and target.
data = data.drop_duplicates()
print("\nShape after removing duplicates:", data.shape)

# Features: every column except the first and the last. Target: the last column.
# NOTE(review): skipping column 0 presumably removes an ID/index column -- confirm against
# the CSV, otherwise a real feature is being silently excluded from X.
X = data.iloc[:, 1:-1]
y = data.iloc[:, -1]

# Split data (80-20); the fixed random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [None]:
# a. Univariate selection
# Score each feature against the target using the chi2 scoring function.
bestfeatures = SelectKBest(chi2, k=5)
fit = bestfeatures.fit(X, y)

# Put the column names and their scores side by side in one table.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat((dfcolumns, dfscores), axis='columns')
featureScores.columns = ['Feature', 'Score']

# Keep the five highest-scoring rows; later cells read the 'Feature' column of this table.
top5_uni = featureScores.nlargest(n=5, columns='Score')
print("\nTop 5 features using Univariate Selection:")
print(top5_uni)

In [None]:
# b. Feature importance
# Fit a seeded extra-trees classifier on all features and read its importances.
model = ExtraTreesClassifier(random_state=10).fit(X, y)
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)

# The five largest importances, indexed by feature name.
top5_imp = feat_importances.nlargest(n=5)
print("\nTop 5 features using Feature Importance:")
print(top5_imp)

# Plot feature importance (horizontal bars, five most important features)
plt.figure(figsize=(10, 6))
top5_imp.plot.barh()
plt.title("Feature Importance")
plt.show()

In [None]:
# c. Correlation
# Correlation matrix over the selected features plus the target column.
corr_data = pd.concat([X, y], axis='columns')
corrmat = corr_data.corr()

# Rank columns by absolute correlation with 'type'. Six entries are taken because the
# target's own self-correlation is expected to be among them; it is removed afterwards.
abs_target_corr = corrmat['type'].abs()
top_corr = abs_target_corr.sort_values(ascending=False).iloc[:6]
top_corr = top_corr.drop(labels='type')
top5_corr = pd.DataFrame({
    'Feature': top_corr.index,
    'Correlation': top_corr.to_numpy(),
})
print("\nTop 5 features using Correlation:")
print(top5_corr)

# Plot correlation
# NOTE(review): the heatmap is computed from the full `data` frame (including the first
# column that was left out of X), not from `corrmat` above -- confirm this is intended.
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(), annot=True, fmt='.2f', cmap="RdYlGn")
plt.title("Correlation Matrix")
plt.show()

In [None]:
# 4. Model development and evaluation
def evaluate_model(X, y, features_used="All features", test_size=0.2, random_state=10):
    """Train and report a Gaussian Naive Bayes classifier on the given features.

    The hold-out split is made from the ``X`` and ``y`` passed in, so the reported
    accuracy and classification report reflect exactly the feature subset being
    evaluated (rather than a split prepared elsewhere in the notebook).

    Parameters
    ----------
    X : feature matrix to evaluate (e.g. a DataFrame restricted to selected columns).
    y : target labels aligned with the rows of ``X``.
    features_used : label printed in the report header.
    test_size : fraction of rows held out for testing; defaults to the notebook's 80-20 split.
    random_state : seed for the split; the default matches the notebook-level split seed.

    Returns
    -------
    None. Results are printed: hold-out accuracy, the classification report, and the
    mean and standard deviation of 8-fold cross-validation scores on the full ``X``, ``y``.
    """
    print(f"\nEvaluating model with {features_used}:")

    # Split the features that were actually passed in; using a split built from a
    # different feature matrix would make every feature subset report the same scores.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Create and train model
    model = GaussianNB()
    model.fit(X_tr, y_tr)

    # Predict and evaluate on the held-out rows
    y_pred = model.predict(X_te)
    print("Accuracy:", accuracy_score(y_te, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_te, y_pred))

    # 8-fold cross validation on the full data for this feature set
    cv_scores = cross_val_score(model, X, y, cv=8)
    print(f"\n8-fold CV Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

In [None]:
# Evaluate with all features
evaluate_model(X, y)

# Build one reduced feature matrix per selection method (top 5 columns each):
# univariate selection, feature importance, and correlation with the target.
X_uni = X[top5_uni['Feature'].values]
X_imp = X[top5_imp.index]
X_corr = X[top5_corr['Feature'].values]

# Evaluate each reduced feature set with the same routine, in the same order as above
for subset, label in (
    (X_uni, "Top 5 Univariate features"),
    (X_imp, "Top 5 Feature Importance features"),
    (X_corr, "Top 5 Correlation features"),
):
    evaluate_model(subset, y, label)