# *Prompt for the data file upload*

In [None]:
from google.colab import files
import io

# Open Colab's upload widget; the result is a dict of {filename: file contents}.
uploaded = files.upload()

# Fail early with a clear message when nothing was uploaded, instead of an
# opaque IndexError when the next cell looks up the first filename.
if not uploaded:
  raise ValueError("No file was uploaded; re-run this cell and choose the data file.")

Saving spambase.data to spambase (1).data


# *Load data*

In [None]:
import pandas as pd

# Take the first (and only expected) entry of the upload dict
filename = list(uploaded)[0]
payload = uploaded[filename]

# Normalise the payload into something pd.read_csv accepts. It may be:
#   - a string, treated as a file path
#   - a file-like object exposing .read() (e.g. a Flask upload or BytesIO)
#   - raw bytes (the value type of Colab's files.upload() dict)
if isinstance(payload, str):
  csv_source = payload
elif hasattr(payload, 'read'):
  csv_source = io.BytesIO(payload.read())
elif isinstance(payload, bytes):
  csv_source = io.BytesIO(payload)
else:
  raise ValueError("Unsupported file source type.")

# The data file has no header row, so columns are numbered 0..N-1 for now
df = pd.read_csv(csv_source, sep=",", header=None)

print(df.head())


     0     1     2    3     4     5     6     7     8     9   ...    48  \
0  0.00  0.64  0.64  0.0  0.32  0.00  0.00  0.00  0.00  0.00  ...  0.00   
1  0.21  0.28  0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94  ...  0.00   
2  0.06  0.00  0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25  ...  0.01   
3  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   
4  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   

      49   50     51     52     53     54   55    56  57  
0  0.000  0.0  0.778  0.000  0.000  3.756   61   278   1  
1  0.132  0.0  0.372  0.180  0.048  5.114  101  1028   1  
2  0.143  0.0  0.276  0.184  0.010  9.821  485  2259   1  
3  0.137  0.0  0.137  0.000  0.000  3.537   40   191   1  
4  0.135  0.0  0.135  0.000  0.000  3.537   40   191   1  

[5 rows x 58 columns]


# *Read in header names. Each row contains a name*

In [None]:
# Prompt for the column-names file (rows look like "name: type")
names_uploaded = files.upload()

# Stop with a clear message when nothing was uploaded, rather than hitting an
# IndexError on the filename lookup below.
if not names_uploaded:
  raise ValueError("No header file was uploaded; re-run this cell and choose the column-names file.")

# Extract filename
header_filename = list(names_uploaded.keys())[0]
header_payload = names_uploaded[header_filename]

# Normalise the payload into something pd.read_csv accepts. It may be:
#   - a string, treated as a file path
#   - a file-like object exposing .read() (e.g. a Flask upload or BytesIO)
#   - raw bytes (the value type of Colab's files.upload() dict)
if isinstance(header_payload, str):
  header_source = header_payload
elif hasattr(header_payload, 'read'):
  header_source = io.BytesIO(header_payload.read())
elif isinstance(header_payload, bytes):
  header_source = io.BytesIO(header_payload)
else:
  raise ValueError("Unsupported file source type.")

# Splitting on ":" puts the column name in field 0 and its type in field 1
names_df = pd.read_csv(header_source, sep=":", header=None)

print(names_df.head())

Saving spambase_column_headers.txt to spambase_column_headers (2).txt
                   0                       1
0     word_freq_make             continuous.
1  word_freq_address             continuous.
2      word_freq_all             continuous.
3       word_freq_3d             continuous.
4      word_freq_our             continuous.


# *Put the header names on top of the dataset*

In [None]:
# Column names are the first field of each row in the header file. Strip stray
# whitespace so later lookups such as df['class'] match exactly.
new_headers = names_df.iloc[:, 0].astype(str).str.strip().tolist()

# Give a readable error if the header file does not describe every column
if len(new_headers) != df.shape[1]:
    raise ValueError(
        f"Header file has {len(new_headers)} names but the dataset has {df.shape[1]} columns."
    )

# Assign the list to your spam dataframe
df.columns = new_headers

print(df.head())

   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  char_freq_;  char_freq_(  \
0             0.00            0.00  ...         0.00        0.000   
1 

# *Display descriptive statistics, null values, etc*

In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_selection import f_classif, mutual_info_classif
import numpy as np

# Exploratory summary of the dataset: shape, class balance, missing values,
# descriptive statistics, skewness, correlation with the target, redundant
# feature pairs, and univariate relevance scores.

# Name of the binary (0/1) label column
target_col = 'class'

# 1) Basic dataset stats
print("=== Dataset shape ===")
print(df.shape)  # (rows, cols)
print()

# 2) Class balance
print("=== Class distribution ===")
counts = df[target_col].value_counts()
pct = (counts / len(df) * 100).round(2)
print(pd.DataFrame({'count': counts, 'percent': pct}))
print()

# 3) Missing values (only columns that actually have some are listed)
print("=== Null values ===")
missing = df.isna().sum()
print(missing[missing > 0].sort_values(ascending=False) if missing.sum() else "No missing values", "\n")

print("=== Descriptive statistics ===")
print(df.describe())
print()

# 4) Skewness (feature distributions)
print("=== Feature skewness ===")
# Exclude the target itself: the column is named 'class', so filtering on
# 'label' (as before) silently kept the target among the features and made
# every feature-vs-target score below include the target.
feature_cols = [c for c in df.columns if c != target_col]
skewness = df[feature_cols].skew(numeric_only=True)
print(skewness.sort_values(ascending=False).round(4), "\n")

# 5) Correlation with target (point-biserial via Pearson with 0/1 label)
lb = LabelBinarizer()
y_bin = lb.fit_transform(df[target_col]).ravel()  # 0/1
X = df[feature_cols].copy()
corrs = {}
for col in feature_cols:
    x = X[col]
    # A constant feature has zero variance, so its correlation is undefined;
    # record NaN explicitly instead of letting np.corrcoef produce one.
    if x.nunique() > 1:
        corrs[col] = np.corrcoef(x, y_bin)[0, 1]
    else:
        corrs[col] = np.nan

corrs = pd.Series(corrs, name='point_biserial_corr')
print("=== Top features correlated with target (absolute) ===")
print(corrs.abs().sort_values(ascending=False).head(15).round(4), "\n")

# 6) Inter-feature correlations (to spot redundancy)
print("=== Strong inter-feature correlations (|r| >= 0.85) ===")
corrmat = X.corr(numeric_only=True)
strong_pairs = []
threshold = 0.85
cols = corrmat.columns
# Walk the upper triangle only, so each unordered pair is examined once
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        r = corrmat.iloc[i, j]
        if pd.notna(r) and abs(r) >= threshold:
            strong_pairs.append((cols[i], cols[j], r))
if strong_pairs:
    strong_pairs = sorted(strong_pairs, key=lambda t: abs(t[2]), reverse=True)
    for a, b, r in strong_pairs[:25]:  # print top 25 pairs
        print(f"{a}  ~  {b}:  r={r:.3f}")
else:
    print("No pairs above threshold.")
print()

# 7) Univariate relevance (optional but handy)
#    ANOVA F and Mutual Information against the binary label
print("=== Univariate relevance to target ===")
# Replace any remaining NaNs with 0 safely for these scorers
X_filled = X.fillna(0.0).values
f_vals, f_p = f_classif(X_filled, y_bin)
mi_vals = mutual_info_classif(X_filled, y_bin, discrete_features=False, random_state=80)

uni_df = pd.DataFrame({
    'feature': feature_cols,
    'F_value': f_vals,
    'F_pvalue': f_p,
    'MI': mi_vals
}).sort_values(['F_value', 'MI'], ascending=False)

print(uni_df.head(20).round(4))

=== Dataset shape ===
(4601, 58)

=== Class distribution ===
       count  percent
class                
0       2788     60.6
1       1813     39.4

=== Null values ===
No missing values 

=== Descriptive statistics ===
       word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
count     4601.000000        4601.000000    4601.000000   4601.000000   
mean         0.104553           0.213015       0.280656      0.065425   
std          0.305358           1.290575       0.504143      1.395151   
min          0.000000           0.000000       0.000000      0.000000   
25%          0.000000           0.000000       0.000000      0.000000   
50%          0.000000           0.000000       0.000000      0.000000   
75%          0.000000           0.000000       0.420000      0.000000   
max          4.540000          14.280000       5.100000     42.810000   

       word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
count    4601.000000     4601.000000       

  f = msb / msw


                       feature   F_value  F_pvalue      MI
57                       class       inf       0.0  0.6708
20              word_freq_your  791.7260       0.0  0.1623
22               word_freq_000  580.5345       0.0  0.1043
6             word_freq_remove  570.1696       0.0  0.1491
52                 char_freq_$  538.0299       0.0  0.1979
18               word_freq_you  372.2740       0.0  0.1001
15              word_freq_free  342.3462       0.0  0.1361
16          word_freq_business  342.3163       0.0  0.0633
24                word_freq_hp  324.4908       0.0  0.1062
56    capital_run_length_total  304.4177       0.0  0.1418
4                word_freq_our  285.8907       0.0  0.0897
51                 char_freq_!  285.8102       0.0  0.2100
10           word_freq_receive  267.6872       0.0  0.0666
25               word_freq_hpl  263.9304       0.0  0.0732
5               word_freq_over  263.0606       0.0  0.0616
8              word_freq_order  260.5499       0.0  0.05

# *Select features using a Variance Threshold, then split the data for training and testing*

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# Candidate features: every column except the target and 'word_freq_415'.
# NOTE(review): the reason for excluding 'word_freq_415' is not recorded here — confirm.
x = df.drop(columns=['class', 'word_freq_415'])
y = df['class']  # "class" is our target

# Keep only the features whose variance exceeds the threshold.
# The threshold was chosen by running this cell several times.
selector = VarianceThreshold(threshold=0.8)
x_reduced = selector.fit_transform(x)

# Get the mask of selected features
mask = selector.get_support()

# Get the feature names that remain
selected_features = x.columns[mask]

print(f"Number of selected features: {len(selected_features)}")
print("Selected features:")
for feature_name in selected_features:
    print(feature_name)

# Put back into a DataFrame so we can save the split data as text files if need be.
# The index comes from `x` (the frame that was just transformed) rather than from
# a variable defined in another cell, so this cell does not rely on leftover state.
X_reduced = pd.DataFrame(x_reduced, columns=selected_features, index=x.index)

# Split the data into stratified training and testing sets (80/20)
x_train, x_test, y_train, y_test = train_test_split(X_reduced, y, stratify=y, test_size=0.2, random_state=5)


Number of selected features: 12
Selected features:
word_freq_address
word_freq_3d
word_freq_you
word_freq_your
word_freq_font
word_freq_hp
word_freq_george
word_freq_re
word_freq_edu
capital_run_length_average
capital_run_length_longest
capital_run_length_total
1618    1
3446    0
2235    0
4398    0
3620    0
       ..
1215    1
2573    0
2863    0
579     1
4327    0
Name: class, Length: 921, dtype: int64


# *Create a pipeline as a base, using Logistic Regression*

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Settings for the L1-penalised logistic regression used as the baseline
logreg_kwargs = dict(
    C=100.0,
    penalty="l1",
    solver="saga",
    tol=0.001,
    warm_start=True,
    max_iter=2000,
)

# Baseline pipeline: oversample with SMOTE, standardise, then classify
pipe = Pipeline(steps=[
    ("smote", SMOTE(random_state=65, k_neighbors=2)),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(**logreg_kwargs)),
])
pipe.fit(x_train, y_train)

# Score the fitted pipeline on the held-out split
y_pred = pipe.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print(f"Accuracy: {test_accuracy}")
print(f"Classification report:\n{report_text}")


Accuracy: 0.8599348534201955
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.85      0.88       558
           1       0.79      0.88      0.83       363

    accuracy                           0.86       921
   macro avg       0.85      0.86      0.86       921
weighted avg       0.87      0.86      0.86       921



# *Set up different models*

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

# XGBoost settings, pulled out so the model table below stays compact
xgb_settings = {
    'n_estimators': 400,
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'eval_metric': 'logloss',
    'random_state': 73,
    'n_jobs': -1,
}

# Candidate classifiers, keyed by the short label used when reporting results
models = dict(
    logreg=LogisticRegression(max_iter=1000, n_jobs=None),
    svc=SVC(probability=True),
    knn=KNeighborsClassifier(n_neighbors=15),  # tune k
    dt=DecisionTreeClassifier(random_state=37),
    rf=RandomForestClassifier(n_estimators=300, random_state=5, n_jobs=-1),
    gb=GradientBoostingClassifier(random_state=99),
    xgb=XGBClassifier(**xgb_settings),
)

# Shared 5-fold stratified splitter so every model sees the same folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=60)


# *Now compare accuracies*

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

# Cross-validate each candidate inside a fresh copy of the baseline pipeline,
# swapping only the final 'clf' step, and record mean/std accuracy per model.
results = {}
for label, estimator in models.items():
    candidate = clone(pipe)
    candidate.set_params(clf=estimator)
    fold_scores = cross_val_score(candidate, x, y, cv=cv, scoring='accuracy', n_jobs=-1)
    results[label] = (fold_scores.mean(), fold_scores.std())

# One summary line per model
for label, summary in results.items():
    avg_acc, spread = summary
    print(f"{label:10s} | Accuracy: {avg_acc:.4f} | Std: {spread:.4f}")

logreg     | Accuracy: 0.9265 | Std: 0.0061
svc        | Accuracy: 0.9350 | Std: 0.0056
knn        | Accuracy: 0.9055 | Std: 0.0117
dt         | Accuracy: 0.9072 | Std: 0.0088
rf         | Accuracy: 0.9541 | Std: 0.0046
gb         | Accuracy: 0.9428 | Std: 0.0041
xgb        | Accuracy: 0.9524 | Std: 0.0065


# *Random Forest had the best cross-validated accuracy, so tune its hyperparameters*

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Random Forest pipeline to tune. Note this rebinds `pipe`, replacing the
# logistic-regression pipeline defined earlier.
pipe = Pipeline([
    # drop StandardScaler for RF; it doesn't need scaling
    ('smote', SMOTE(random_state=42)),
    ('rfc', RandomForestClassifier(random_state=86, n_jobs=-1))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=18)

# --- Broad randomized search (fast) ---
param_distributions = {
    'rfc__n_estimators': randint(200, 800),
    'rfc__max_depth': randint(4, 30),               # bounded depths only; None (unlimited) is not sampled
    'rfc__min_samples_split': randint(2, 20),
    'rfc__min_samples_leaf': randint(1, 20),
    'rfc__max_features': ['sqrt', 'log2', 0.2, 0.5],# try fractions too
    'rfc__bootstrap': [True, False],
    'rfc__class_weight': [None, 'balanced']         # with SMOTE, usually None
}

search = RandomizedSearchCV(
    pipe, param_distributions=param_distributions,
    n_iter=60, scoring='f1_macro', cv=cv, n_jobs=-1, random_state=21, verbose=1
)

# Tune on the training split only. The search previously ran on `X` from the
# statistics cell, which still contained the target column and every row of
# the held-out test set; that leakage is why the CV f1_macro came out as 1.0.
# Using x_train/y_train also tunes on the same features the final model is fit on.
search.fit(x_train, y_train)

best_model = search.best_estimator_
print("Best params:", search.best_params_)
print("Best CV f1_macro:", search.best_score_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best params: {'rfc__bootstrap': False, 'rfc__class_weight': None, 'rfc__max_depth': 6, 'rfc__max_features': 0.5, 'rfc__min_samples_leaf': 18, 'rfc__min_samples_split': 12, 'rfc__n_estimators': 647}
Best CV f1_macro: 1.0


# *Fit the final model*

In [None]:
# Build the final pipeline from the search result instead of hand-copying the
# printed hyperparameters, so it cannot drift out of sync when the search is
# re-run. clone() returns an unfitted copy with the winning parameters
# (SMOTE -> RandomForest), leaving search.best_estimator_ untouched.
final_model = clone(search.best_estimator_)

final_model.fit(x_train, y_train)

# Perform predictions and evaluate results on the held-out split
y_pred = final_model.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")

Accuracy: 0.9055374592833876
Classification report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       558
           1       0.88      0.88      0.88       363

    accuracy                           0.91       921
   macro avg       0.90      0.90      0.90       921
weighted avg       0.91      0.91      0.91       921



# *Save the final model as a Pickle file*

In [None]:
import pickle

# Serialise the fitted pipeline to disk
model_path = 'final_model.pkl'
with open(model_path, 'wb') as pkl_file:
  pickle.dump(final_model, pkl_file)

# Hand the pickle file to the browser as a download
files.download(model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# *Save the test data to a text file so a user can choose a row of values to enter on an HTML page*

In [None]:
# y_test is a pandas Series; wrap it as a one-column frame so it can sit
# beside the feature columns
labels_frame = y_test.to_frame()

# Features first, label as the last column
test_set = pd.concat([x_test, labels_frame], axis=1)

# Write as tab-delimited text (no index column) and offer it for download
export_name = "x_test_with_labels.txt"
test_set.to_csv(export_name, sep="\t", index=False)
files.download(export_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>