# Importação das Bibliotecas

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from pycaret.classification import setup, compare_models, save_model
from pycaret.classification import predict_model
from sklearn.model_selection import train_test_split

# Importação do Dataset tratado e balanceado

In [14]:
df = pd.read_csv('phishing_dataset_CIS.csv')
df.head()

Unnamed: 0,time,sendingPeriod,subject,SubjectClear,body,BodyClear,urls,phishing
0,7,morning,[ie-rant] British Police Chief Calls For Legal...,ie rant british police chief calls for legalis...,P45 at the ready for this chap...\n\nhttp://ne...,p45 at the ready for this chap north wales pol...,1,0
1,23,evening,from Reyes Greene,from reyes greene,\n\n\n\n\n\n\nBu up yi yq ng M wvz edic pg ine...,bu up yi yq ng m wvz edic pg ine on tp line vi...,1,1
2,12,afternoon,Turning a small knob into a huge wand!,turning a small knob into a huge wand,Problems everywhere?\n\nGood business & he zu ...,problems everywhere good business he zu alth h...,0,1
3,23,evening,"[UAI] ICAPS-08 Call for Papers, Tutorial Propo...",uai icaps 08 call for papers tutorial proposal...,\nhttp://icaps08.icaps-conference.org/\n\nTuto...,tutorial workshop proposal deadlines approachi...,1,0
4,16,afternoon,[UAI] CFP: SAT 2005,uai cfp sat 2005,With apologies for multiple copies:\n\n ...,with apologies for multiple copies call for pa...,1,0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27701 entries, 0 to 27700
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   time           27701 non-null  int64 
 1   sendingPeriod  27701 non-null  object
 2   subject        27701 non-null  object
 3   SubjectClear   27701 non-null  object
 4   body           27701 non-null  object
 5   BodyClear      27701 non-null  object
 6   urls           27701 non-null  int64 
 7   phishing       27701 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 1.7+ MB


# Separação das variáveis e tratamento 

In [16]:
# Unir o assunto e o corpo do email
df['text'] = df['SubjectClear'] + ' ' + df['BodyClear']

In [17]:
# Definir features e variável alvo
text_data = df['text']
X_other = df[['time', 'urls', 'sendingPeriod']]
y = df['phishing']

In [18]:
# Vetorizarização do texto com tfidf
tfidf = TfidfVectorizer(max_features=1500)
X_text = tfidf.fit_transform(text_data).toarray()
X_text_df = pd.DataFrame(X_text, columns=[f'tfidf_{i}' for i in range(X_text.shape[1])]) # criação do dataframe com colunas tfidf

In [19]:
# Converter variável categórica em numérica
X_encoded = pd.get_dummies(X_other, columns=['sendingPeriod'])

In [20]:
# Concatenar os datasets
X_final = pd.concat([X_text_df, X_encoded], axis=1)
df_final = X_final.copy()
df_final['phishing'] = y.values

In [21]:
# Separar os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42, stratify=y)

# Criar dataframes para o pycaret
train_df = X_train.copy()
train_df['phishing'] = y_train

test_df = X_test.copy()
test_df['phishing'] = y_test

# Automl

In [22]:
# Setup do automl
clf_setup = setup(data=train_df,
                  target='phishing',
                  session_id=42,
                  preprocess=False,
                  verbose=True)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,phishing
2,Target type,Binary
3,Original data shape,"(22160, 1507)"
4,Transformed data shape,"(22160, 1507)"
5,Transformed train set shape,"(15511, 1507)"
6,Transformed test set shape,"(6649, 1507)"
7,Numeric features,1502


In [23]:
# obtenção do melhor modelo

best_model = compare_models()
save_model(best_model, 'modelo_phishing_pycaret')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9868,0.9988,0.9874,0.9868,0.9871,0.9736,0.9736,5.38
lightgbm,Light Gradient Boosting Machine,0.9855,0.9985,0.9904,0.9814,0.9859,0.971,0.971,146.227
rf,Random Forest Classifier,0.9836,0.9982,0.9897,0.9784,0.984,0.9671,0.9672,2.94
ridge,Ridge Classifier,0.9826,0.9982,0.9904,0.9759,0.9831,0.9652,0.9653,0.465
lr,Logistic Regression,0.9816,0.9976,0.988,0.9763,0.9821,0.9632,0.9633,3.749
lda,Linear Discriminant Analysis,0.9812,0.9979,0.9902,0.9735,0.9817,0.9623,0.9625,2.942
svm,SVM - Linear Kernel,0.9742,0.9974,0.9905,0.9608,0.9753,0.9483,0.9492,0.797
gbc,Gradient Boosting Classifier,0.969,0.994,0.9831,0.9574,0.9701,0.9379,0.9383,10.966
ada,Ada Boost Classifier,0.9623,0.9915,0.9734,0.9538,0.9635,0.9245,0.9248,2.32
dt,Decision Tree Classifier,0.9616,0.9616,0.9647,0.9605,0.9625,0.9232,0.9233,2.128


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('placeholder', None),
                 ('trained_model',
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='sqrt',
                                       max_leaf_nodes=None, max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       monotonic_cst=None, n_estimators=100,
                                       n_jobs=-1, oob_score=False,
                                       random_state=42, verbose=0,
                                       warm_start=False))],
          verbose=False),
 'modelo_phishing_pycaret.pkl')

In [24]:
# Avaliação em dados nunca vistos

predict_model(best_model, data=test_df)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9894,0.9993,0.9898,0.9894,0.9896,0.9787,0.9787


Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_1499,time,urls,sendingPeriod_afternoon,sendingPeriod_dawn,sendingPeriod_evening,sendingPeriod_morning,phishing,prediction_label,prediction_score
8880,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,18,1,False,False,True,False,0,0,0.93
26493,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,8,1,False,False,False,True,0,0,0.75
7767,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,14,1,True,False,False,False,1,1,0.86
18079,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,6,1,False,False,False,True,0,0,0.88
14338,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,12,1,True,False,False,False,1,1,0.81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17329,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,6,0,False,False,False,True,0,0,0.86
23653,0.000000,0.069054,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,6,0,False,False,False,True,0,0,0.86
2566,0.103776,0.000000,0.0,0.0,0.025753,0.0,0.049833,0.0,0.0,0.0,...,0.0,8,0,False,False,False,True,0,0,0.80
27526,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,7,1,False,False,False,True,0,0,0.88
