In [40]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import math

class Data:
    """ A util class to store the training and test datasets. """
    
    def __init__(self, tr_explanatory: pd.DataFrame, tr_explained: pd.DataFrame, 
                test_explanatory: pd.DataFrame, test_explained: pd.DataFrame):
        
        self.tr_explanatory = tr_explanatory
        self.tr_explained = tr_explained
        self.test_explanatory = test_explanatory
        self.test_explained = test_explained
    
    def consistency_check(self):
        print("Shape EXPLANATORY (Training set): {}".format(self.tr_explanatory.shape))
        print("Shape EXPLAINED (Training set): {}".format(self.tr_explained.shape))
        print("Shape EXPLANATORY (Test set): {}".format(self.test_explanatory.shape))
        print("Shape EXPLAINED (Test set): {}".format(self.test_explained.shape))
        
        if self.tr_explanatory.shape[0] == self.tr_explained.shape[0] and \
            self.test_explanatory.shape[0] == self.test_explained.shape[0] and \
            self.tr_explanatory.shape[1] == self.test_explanatory.shape[1] :
            print("Consistent dimensions.")
            return True
        else:
            print("ERROR: Inconsistent dimensions!")
            print(self.tr_explanatory.shape[0] == self.tr_explained.shape[0])
            print(self.test_explanatory.shape[0] == self.test_explained.shape[0])
            print(self.tr_explanatory.shape[1] == self.test_explanatory.shape[1])
            return False 

In [41]:
#Load CSV file
data = pd.read_csv('../shuffle_email_spam_classification.csv')

In [42]:
data.head(), data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


(  Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
 0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
 1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
 2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
 3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
 4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   
 
    valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
 0       0    0               0         0         0   0    0           0  
 1       0    0               0         0         0   1    0           0  
 2       0    0               0         0         0   0    0           0  
 3       0    0               0         0         0   0    0           0  
 4       0    0               0         0         0   1    0           0  
 
 [5 rows x 3002 columns],
 None)

In [43]:
#Preprocessing
# Step 1: Remove the "Email No." column
data_cleaned = data.drop(columns=["Email No."])
data_cleaned = data_cleaned.sample(frac=1).reset_index(drop=True)
n = data_cleaned.shape[0]
thr = math.floor(0.75 * n)

X_train = data_cleaned.iloc[:thr, :-1]
Y_train = data_cleaned.iloc[:thr, -1]
X_test = data_cleaned.iloc[thr:, :-1]
Y_test = data_cleaned.iloc[thr:, -1]

#Data object
yy = Data(X_train, Y_train, X_test, Y_test)

yy.consistency_check()

Shape EXPLANATORY (Training set): (3879, 3000)
Shape EXPLAINED (Training set): (3879,)
Shape EXPLANATORY (Test set): (1293, 3000)
Shape EXPLAINED (Test set): (1293,)
Consistent dimensions.


True

In [54]:
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Initialize Logistic Regression with increased max_iter and solver
logreg = LogisticRegression(max_iter=500, solver='lbfgs')

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(yy.tr_explanatory)
X_test_scaled = scaler.transform(yy.test_explanatory)

# Fit the model
logreg.fit(X_train_scaled, yy.tr_explained)

#Predict the fitted data
predictions_LR = logreg.predict(X_test_scaled)

print("All predictions: {0} \n\n\n".format(predictions_LR))
pred_vs_actual_LR_df = pd.DataFrame({
    "LogPredict": predictions_LR,
    "Actual": yy.test_explained
})

# pred_vs_actual_LR_df = pd.DataFrame(X_test_scaled, columns=[f'Feature_{i+1}' for i in range(X_test_scaled.shape[1])])
# pred_vs_actual_LR_df['LogPredict'] = predictions_LR
# pred_vs_actual_LR_df['Actual'] = yy.test_explained

print(pred_vs_actual_LR_df)
# X, Y = make_classification(random_state=42)
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
# pipe = make_pipeline(StandardScaler(), LogisticRegression())
# pipe.fit(X_train, Y_train)  # apply scaling on training data
# pipe.score(X_test, Y_test) 


All predictions: [0 0 1 ... 1 1 1] 



      LogPredict  Actual
3879           0       0
3880           0       0
3881           1       1
3882           0       0
3883           0       0
...          ...     ...
5167           0       0
5168           0       0
5169           1       1
5170           1       1
5171           1       1

[1293 rows x 2 columns]


In [None]:
import matplotlib.pyplot as plt
# Visualize both the Training and the Test set
plt.scatter(yy.tr_explanatory.iloc[:, 0], yy.tr_explanatory.iloc[:, 1], c = yy.tr_explained, marker="x")

# Create a new column with custom colors: Green for class "1"; Blue for class "0" 
pred_vs_actual_LR_df['CLass-Color'] = pred_vs_actual_LR_df['LogPredict'].apply(lambda x: 'Green' if x == 1 else 'Blue')

plt.scatter(
    pred_vs_actual_LR_df.iloc[:, 0], 
    pred_vs_actual_LR_df.iloc[:, 1], 
    c=pred_vs_actual_LR_df["CLass-Color"], 
    marker="o")

plt.show()

In [46]:
from sklearn.metrics import classification_report, confusion_matrix
confusion_mtx_LR = confusion_matrix(yy.test_explained, predictions_LR)
print("Confusion matrix: \n{}".format(confusion_mtx_LR))

# Get only the Accuracy
ACC = classification_report(yy.test_explained, predictions_LR, output_dict=True)['accuracy']
print("\n#####   Accuracy: {} %   ##### \n\n\n".format(ACC*100))

# Get all stats
print(classification_report(yy.test_explained, predictions_LR, output_dict=False))

Confusion matrix: 
[[902  31]
 [ 17 343]]

#####   Accuracy: 96.2877030162413 %   ##### 



              precision    recall  f1-score   support

           0       0.98      0.97      0.97       933
           1       0.92      0.95      0.93       360

    accuracy                           0.96      1293
   macro avg       0.95      0.96      0.95      1293
weighted avg       0.96      0.96      0.96      1293

