In [1]:
import pandas as pd
import numpy as np
# import pingouin as pg , kaggle does not support pingouin
from sklearn.preprocessing import PowerTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [4]:
spam = pd.read_csv('/home/charumathi/Desktop/Sem-6/ML-lab/A4/archive/spambase_csv.csv')
spam.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [5]:
x = spam[spam.drop('class', axis = 1).columns]
y = spam['class']
xtrain, xtest, ytrain, ytest = train_test_split(x,y, train_size = 0.7, random_state = 42)

In [6]:
def skewness_check(data, skew_cols = False, non_skew = False):
    skew_feats = data.skew().sort_values(ascending = False)
    skewness = pd.DataFrame(skew_feats, columns = ['Skew'])
    skew_dict = {'High':0, 'Moderate':0, 'None':0}
    
    if skew_cols == True:
        df = skewness[((skewness['Skew'] <= -1) | (skewness['Skew'] >= 1)) | ((skewness['Skew'] > -1) & (skewness['Skew'] <= -0.5)) | ((skewness['Skew'] >= 0.5) & (skewness['Skew'] < 1))]
        return df
    
    elif skew_cols == False and non_skew == False:
        for row in skewness['Skew']:
            if row <= -1 or row >= 1:
                skew_dict['High'] += 1
            elif (row > -1 and row <= -0.5) or (row >= 0.5 and row < 1):
                skew_dict['Moderate'] += 1
            else:
                skew_dict["None"] += 1
        return pd.DataFrame.from_dict(skew_dict, orient = 'index', columns = ['Skew'])
    
    elif non_skew == True:
        df_non_skew = skewness[((skewness['Skew']>= 0) & (skewness['Skew']< 0.5)) |((skewness['Skew']> -0.5) & (skewness['Skew']<= 0))]
        return df_non_skew
   

In [7]:
skewness_check(xtrain)

Unnamed: 0,Skew
High,57
Moderate,0
,0


In [8]:
def correlation(data, threshold = 0.75):
    col_corr = set()
    corr_matrix = data.corr(method = 'spearman')
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i,j]) >= threshold:
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return list(col_corr)

In [9]:
high_corr_columns = correlation(xtrain)
high_corr_columns

['capital_run_length_total',
 'word_freq_415',
 'word_freq_hpl',
 'capital_run_length_longest']

In [10]:
xtrain.drop(high_corr_columns, axis = 1, inplace = True)
xtest.drop(high_corr_columns, axis = 1, inplace = True)

In [11]:
def feature_selection(x,y):
    skb = SelectKBest(score_func = f_classif, k = 'all')
    skb.fit_transform(x,y)
    col_names = x.columns.values[skb.get_support()]
    scores = skb.scores_[skb.get_support()]
    col_scores = list(zip(col_names, scores))
    df = pd.DataFrame(col_scores, columns = ['Feature','Score'])
    mean_score = df['Score'].mean()
    max_score = df['Score'].max()
    filtered_df = df[(df['Score'] >= mean_score) & (df['Score'] <= max_score)]

    return filtered_df.sort_values('Score', ascending = False)

In [12]:
df = feature_selection(xtrain, ytrain)
df

Unnamed: 0,Feature,Score
20,word_freq_your,561.575835
50,char_freq_%24,476.174683
6,word_freq_remove,430.479437
22,word_freq_000,405.013696
15,word_freq_free,262.0651
18,word_freq_you,250.727794
16,word_freq_business,235.75596
8,word_freq_order,220.207729
24,word_freq_hp,207.253341
4,word_freq_our,191.047832


In [13]:
FS_xtrain = xtrain[list(df['Feature'])]
FS_xtest = xtest[list(df['Feature'])]

In [14]:
def data_transform_PT(data_train, data_test):
    
    pt = PowerTransformer(method = 'yeo-johnson',
                         standardize = False) # Using yeo-johnson because data contains values of zero.
    data_train_transformed = pd.DataFrame(pt.fit_transform(data_train),
                                         columns = data_train.columns)
    data_test_transformed = pd.DataFrame(pt.transform(data_test),
                                        columns = data_test.columns)
    return data_train_transformed, data_test_transformed

In [15]:
xtrain_PT, xtest_PT = data_transform_PT(FS_xtrain,FS_xtest)

In [16]:
skewness_check(xtrain_PT)

Unnamed: 0,Skew
High,14
Moderate,3
,2


In [17]:
def skew_comparison(x_1, x_2):
    skew_feats = x_1.skew().sort_values(ascending = False)
    skewness = pd.DataFrame(skew_feats, columns = ['Skew Before'])
    
    skew_feats_2 = x_2.skew().sort_values(ascending = False)
    skewness_2 = pd.DataFrame(skew_feats_2, columns = ['Skew After'])
    
    df = skewness.merge(skewness_2, right_index = True, left_index = True)
    df['Skew Reduction'] = -abs(df['Skew Before'] - df['Skew After'])
    
    return df

In [18]:
skew_comparison(xtrain, xtrain_PT)

Unnamed: 0,Skew Before,Skew After,Skew Reduction
char_freq_%21,19.65747,0.6834,-18.974071
word_freq_money,14.641553,1.947225,-12.694328
word_freq_internet,10.834837,1.81284,-9.021997
word_freq_free,10.236811,1.231002,-9.005809
char_freq_%24,9.173671,1.186976,-7.986695
word_freq_addresses,7.173512,3.40059,-3.772922
word_freq_over,6.483361,1.541676,-4.941685
word_freq_remove,6.34465,1.833488,-4.511162
word_freq_business,6.038427,1.578406,-4.460021
word_freq_email,6.028436,1.505507,-4.52293


In [19]:
print(ytrain.value_counts())
print(ytest.value_counts())

0    1984
1    1236
Name: class, dtype: int64
0    804
1    577
Name: class, dtype: int64


In [20]:
smt = SMOTETomek(random_state = 42)
xtrain_res, ytrain_res = smt.fit_resample(xtrain_PT, ytrain)
xtest_res, ytest_res = smt.fit_resample(xtest_PT, ytest)

print(ytrain_res.value_counts())
print(ytest_res.value_counts())

1    1940
0    1940
Name: class, dtype: int64
0    776
1    776
Name: class, dtype: int64


In [21]:
gnb = GaussianNB()
gnb.fit(xtrain_res,ytrain_res)
predictions = gnb.predict(xtest_res)
accuracy = accuracy_score(ytest_res,predictions)
f1 = f1_score(ytest_res,predictions)
auc = roc_auc_score(ytest_res,predictions)

print('accuracy: ', accuracy)
print('f1: ', f1)
print('AUC: ', auc)

accuracy:  0.9110824742268041
f1:  0.9076305220883534
AUC:  0.9110824742268041
