In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load dataset from CSV
data=pd.read_csv('C:/Users/Danish/Documents/proj2/emails.csv')

#sparate features (X) and target variable (y)
X=data.drop(columns=['Email No.','Prediction'])
y=data['Prediction']

#preprocessing pipeline
numeric_features=X.select_dtypes(include=['int64','float64']).columns
categorical_features=X.select_dtypes(include=['object']).columns

numeric_transformer=Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer=Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num',numeric_transformer,numeric_features),
        ('cat',categorical_transformer,categorical_features)
    ])

#Preprocess data
X_processed=preprocessor.fit_transform(X)

#split the dataset into training and testing sets
X_train, X_test, y_train, y_test=train_test_split(X_processed, y, test_size=0.2, random_state=42)


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

logistic_regression=LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train, y_train)


In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
logistic_regression.fit(X_train, y_train)
logistic_regression_predictions=logistic_regression.predict(X_test)
logistic_regression_accuracy=accuracy_score(y_test, logistic_regression_predictions)
logistic_regression_precision=precision_score(y_test, logistic_regression_predictions)
logistic_regression_recall=recall_score(y_test, logistic_regression_predictions)
logistic_regression_f1_score=f1_score(y_test, logistic_regression_predictions)
# Cross-validation for logistic regression
logistic_regression_cv=cross_val_score(logistic_regression, X_processed, y, cv=5)
print("Logistic Regression:")
print(f"Accuracy: {logistic_regression_accuracy}")
print(f"Precision: {logistic_regression_precision}")
print(f"Recall: {logistic_regression_recall}")
print(f"F1-score: {logistic_regression_f1_score}")
print(f"Cross-Validation Mean: {logistic_regression_cv.mean()}")
print(f"Cross-Validation Std: {logistic_regression_cv.std()}")
print("\n")

Logistic Regression:
Accuracy: 0.970048309178744
Precision: 0.9288025889967637
Recall: 0.9695945945945946
F1-score: 0.9487603305785124
Cross-Validation Mean: 0.9593958082209701
Cross-Validation Std: 0.0059658958554134126




In [4]:
from sklearn.tree import DecisionTreeClassifier

decision_tree=DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

decision_tree_predictions=decision_tree.predict(X_test)
decision_tree_accuracy=accuracy_score(y_test, decision_tree_predictions)
decision_tree_precision=precision_score(y_test, decision_tree_predictions)
decision_tree_recall=recall_score(y_test, decision_tree_predictions)
decision_tree_f1_score=f1_score(y_test, decision_tree_predictions)

# Cross-validation for decision tree
decision_tree_cv=cross_val_score(decision_tree, X_processed, y, cv=5)

print("Decision Tree:")
print(f"Accuracy: {decision_tree_accuracy}")
print(f"Precision: {decision_tree_precision}")
print(f"Recall: {decision_tree_recall}")
print(f"F1-score: {decision_tree_f1_score}")
print(f"Cross-Validation Mean: {decision_tree_cv.mean()}")
print(f"Cross-Validation Std: {decision_tree_cv.std()}")
print("\n")


Decision Tree:
Accuracy: 0.9246376811594202
Precision: 0.8609271523178808
Recall: 0.8783783783783784
F1-score: 0.8695652173913043
Cross-Validation Mean: 0.914536484175707
Cross-Validation Std: 0.015688935581177323




In [5]:
from sklearn.svm import SVC

svm=SVC()
svm.fit(X_train, y_train)

svm_predictions=svm.predict(X_test)
svm_accuracy=accuracy_score(y_test, svm_predictions)
svm_precision=precision_score(y_test, svm_predictions)
svm_recall=recall_score(y_test, svm_predictions)
svm_f1_score=f1_score(y_test, svm_predictions)

# Cross-validation for SVM
svm_cv=cross_val_score(svm, X_processed, y, cv=5)

print("SVM:")
print(f"Accuracy: {svm_accuracy}")
print(f"Precision: {svm_precision}")
print(f"Recall: {svm_recall}")
print(f"F1-score: {svm_f1_score}")
print(f"Cross-Validation Mean: {svm_cv.mean()}")
print(f"Cross-Validation Std: {svm_cv.std()}")
print("\n")


SVM:
Accuracy: 0.9468599033816425
Precision: 0.9958847736625515
Recall: 0.8175675675675675
F1-score: 0.8979591836734693
Cross-Validation Mean: 0.9211100832562442
Cross-Validation Std: 0.011020259928962555




In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
logistic_regression.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
y_pred = logistic_regression.predict(X_test)

# Metrics
accuracy=accuracy_score(y_test, y_pred)
precision=precision_score(y_test, y_pred)
recall=recall_score(y_test, y_pred)
f1=f1_score(y_test, y_pred)

# Cross-validation
cv_scores=cross_val_score(logistic_regression, X_processed, y, cv=5)
print("Model Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Cross-Validation Mean: {cv_scores.mean()}")
print(f"Cross-Validation Std: {cv_scores.std()}")


Model Evaluation:
Accuracy: 0.970048309178744
Precision: 0.9288025889967637
Recall: 0.9695945945945946
F1-score: 0.9487603305785124
Cross-Validation Mean: 0.9593958082209701
Cross-Validation Std: 0.0059658958554134126
