In [1]:
# Manpulate
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Feature Extraction
# import user_agents # get info from user_agent (browser_info)
# from ip2geotools.databases.noncommercial import DbIpCity as ip2geo # get location from ip
# from geopy.distance import great_circle # distance btn 2 (lat,long)
# from geopy.geocoders import Nominatim # geocode("place") / reverse("lat,long")
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # text feature

# Pre-Processing
from sklearn.model_selection import train_test_split # train-test-split
from sklearn.impute import SimpleImputer, KNNImputer # detect & handle NaNs
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder # Ordinal Encoding, Nominal Encoding
from category_encoders import BinaryEncoder # Nominal Encoding 
from imblearn.under_sampling import RandomUnderSampler # undersampling
from imblearn.over_sampling import RandomOverSampler, SMOTE # oversampling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler # Scaling

# Modeling
## 1) Pipeline
from sklearn.pipeline import Pipeline, make_pipeline # to make pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector # apply pipeline to each column

## 2) Regression Models
from sklearn.linear_model import LinearRegression # if data is small and small_no_features
from sklearn.linear_model import SGDRegressor # if data is large: (can have penalty=constrains)
from sklearn.preprocessing import PolynomialFeatures # for polynomial regresion (then apply scaling after it)
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV # Regularization 

## 3) Model Selection (Underfitting vs Overfitting) [bias variance tradeoff => perfect model complexity]
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV # (Train - Valid - Test) + hyperparameters tunning 
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV # if data / features is large
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error # Evaluate Model: r2=> accuracy, L2-norm: if no outliers, L1-norm: if outliers
from scipy import stats # Confidence Interval of Accuracy / Loss / Utility
import joblib # save model

In [2]:
# a) Understand Columns
df = pd.read_csv("emails.csv")
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [3]:
st= df.iloc[50,0]
st

'Subject: prop 0 sal  dear siobhan _ riskin  our company will place any business with a qualified website permanently at the top of the major search engines guaranteed never to move ( eg : yahoo ! , msn , alta vista , etc ) . if you are interested in being guaranteed first position in the top search engines at a promotional fee , please contact us at hannah @ speedy . com . pe please include the url ( s ) your are interested in promoting this is not pay per click examples will be provided .  sincerely  the search engine placement specialists  if you wish to be removed , please respond to hannah @ speedy . com . pe and type the word : remove in your subject line'

In [4]:
# most 100 common words in spam mails
from collections import Counter
result = " ".join(df['text'][df['spam'] == 1].to_list())
result = result.split(" ")
count_result = Counter(result)
count_result.most_common(n=100)

[('', 30743),
 ('.', 19839),
 ('_', 13556),
 (',', 11514),
 ('the', 8975),
 ('-', 8514),
 ('to', 8165),
 ('and', 6517),
 ('of', 5629),
 ('you', 4920),
 ('a', 4695),
 (':', 4691),
 ('in', 3879),
 ('your', 3730),
 ('!', 3310),
 ('for', 3186),
 ('is', 2977),
 ('this', 2822),
 ('/', 2578),
 ("'", 2326),
 ('$', 2242),
 ('that', 1896),
 ('i', 1772),
 ('with', 1734),
 ('we', 1724),
 ('be', 1675),
 ('"', 1655),
 (')', 1598),
 ('are', 1590),
 ('or', 1588),
 ('it', 1584),
 ('on', 1582),
 ('(', 1514),
 ('from', 1471),
 ('will', 1429),
 ('our', 1404),
 ('not', 1382),
 ('Subject:', 1368),
 ('have', 1352),
 ('s', 1333),
 ('*', 1300),
 ('as', 1201),
 ('=', 1182),
 ('by', 1038),
 ('com', 998),
 ('at', 988),
 ('?', 981),
 ('all', 979),
 ('1', 952),
 ('if', 869),
 ('business', 844),
 ('can', 830),
 ('company', 805),
 ('email', 804),
 ('an', 790),
 ('|', 789),
 ('here', 770),
 ('do', 767),
 ('information', 740),
 ('more', 727),
 ('my', 699),
 ('e', 698),
 ('no', 688),
 ('5', 687),
 ('money', 662),
 ('%',

### 3) Pre-Processing Mind Map:
* a) Detect & Handle Duplicates
* b) train_test_split
* c) Detect & Handle NaNs
* d) Detect & Handle Outliers
* e) Encoding: (Ordinal:[OrdinalEncoder, LabelEncoder] - Nominal: [< 7 uniques(OneHotEncoding), > 7 uniques (BinaryEncoder)])
* f) Imbalanced: X_train_resampled
* g) Scaling: StandardScaler, MinMaxScaler, RobustScaler: X_train_resampled_scaled

In [5]:
# a) Detect & Handle Duplicates
df.duplicated().sum()


33

In [6]:
df.drop_duplicates(inplace=True)
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5690,Subject: re : research and development charges...,0
5691,"Subject: re : receipts from visit jim , than...",0
5692,Subject: re : enron case study update wow ! a...,0
5693,"Subject: re : interest david , please , call...",0


In [7]:
df.duplicated().sum()

0

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5695 entries, 0 to 5694
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5695 non-null   object
 1   spam    5695 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.1+ KB


In [9]:
import string

In [10]:
import nltk

In [11]:
from nltk.corpus import stopwords

In [12]:
from nltk.stem import PorterStemmer

In [13]:
stemmer = PorterStemmer()
stemmer.stem('running')

'run'

In [14]:
df.text.iloc[0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [15]:
#nltk.download('stopwords')
#stopworeds_set = set(stopwords.words('english'))

In [16]:
stopworeds_set = set(stopwords.words('english'))

In [17]:
# preprocessing and cleaning
corpus= []
for i in range(len(df)):
    text = df['text'].iloc[i].lower()# to get all in lowercase
    text = text.translate(str.maketrans('','',string.punctuation)).split()# to remove punctuation
    text = [stemmer.stem(word) for word in text if word  not in stopworeds_set] # to stem all word to original words
    text = ' '.join(text)
    corpus.append(text)


In [18]:
df['text'].iloc[0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [19]:
corpus[0]

'subject natur irresist corpor ident lt realli hard recollect compani market full suqgest inform isoverwhelminq good catchi logo stylish statloneri outstand websit make task much easier promis havinq order iogo compani automaticaili becom world ieader isguit ciear without good product effect busi organ practic aim hotat nowaday market promis market effort becom much effect list clear benefit creativ hand made origin logo special done reflect distinct compani imag conveni logo stationeri provid format easi use content manag system letsyou chang websit content even structur prompt see logo draft within three busi day afford market break make gap budget 100 satisfact guarante provid unlimit amount chang extra fee surethat love result collabor look portfolio interest'

In [20]:
vectorizer= CountVectorizer()

In [21]:
X = vectorizer.fit_transform(corpus).toarray()
y = df['spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y)
X_train.shape, X_test.shape

((4556, 29254), (1139, 29254))

In [22]:
X[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [23]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(C=50, class_weight='balanced') 


lr_model.fit(X_train, y_train)
y_train_pred = lr_model.predict(X_train)
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_predict

y_valid_pred = cross_val_predict(lr_model, X_train, y_train, cv=3)
print(f"Train Accuracy: {lr_model.score(X_train, y_train) * 100}")
print(f"Validation Accuracy: {accuracy_score(y_train, y_valid_pred) * 100}")
print("-" * 50)
print(f"Validation Precision: {precision_score(y_train, y_valid_pred)}")
print(f"Validation Recall: {recall_score(y_train, y_valid_pred)}")
confusion_matrix(y_train, y_valid_pred) # care more about recall

Train Accuracy: 100.0
Validation Accuracy: 98.35381913959613
--------------------------------------------------
Validation Precision: 0.9627611262488647
Validation Recall: 0.9689213893967094


array([[3421,   41],
       [  34, 1060]], dtype=int64)

In [24]:
y_test_scores = lr_model.predict(X_test)

print(f"Test Accuracy: {accuracy_score(y_test, y_test_scores) * 100}")
print("-" * 50)
print(f"Test Precision: {precision_score(y_test, y_test_scores)}")
print(f"Test Recall: {recall_score(y_test, y_test_scores)}")
confusion_matrix(y_test, y_test_scores) # care more about recall

Test Accuracy: 98.77085162423178
--------------------------------------------------
Test Precision: 0.9676258992805755
Test Recall: 0.9817518248175182


array([[856,   9],
       [  5, 269]], dtype=int64)

In [25]:
lr_model.score(X_test,y_test)

0.9877085162423178

In [26]:
#from nltk.tokenize import word_tokenize

In [27]:
def custom_tokenizer(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    #tokens = word_tokenize(text)
    # Stem
    #stemmed = [stemmer.stem(word) for word in tokens]
    stemmed = [stemmer.stem(word) for word in text]
    return stemmed

In [28]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=custom_tokenizer)),
    ('clf', lr_model)
])

In [29]:
X = df['text']
y = df['spam']

In [30]:
X[0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [31]:
#nltk.download('punkt') 

In [32]:
pipeline.fit(X , y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
joblib.dump(pipeline, 'gpt1_pipeline.joblib')

['gpt1_pipeline.joblib']

In [None]:
model = joblib.load('gpt1_pipeline.joblib')

In [None]:
email_test= df.text.values[0]
email_test


"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [None]:
# very good
prediction = model.predict([email_test])[0]
prediction

1