# Importação das Bibliotecas

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from pycaret.classification import setup, compare_models, save_model
from pycaret.classification import predict_model

# Importação do Dataset tratado e balanceado

In [5]:
# Load the pre-cleaned, class-balanced phishing e-mail dataset and preview the first rows.
df = pd.read_csv(filepath_or_buffer='phishing_dataset_CIS.csv')
df.head(n=5)

Unnamed: 0,time,sendingPeriod,subject,SubjectClear,body,BodyClear,urls,phishing
0,7,morning,[ie-rant] British Police Chief Calls For Legal...,ie rant british police chief calls for legalis...,P45 at the ready for this chap...\n\nhttp://ne...,p45 at the ready for this chap north wales pol...,1,0
1,23,evening,from Reyes Greene,from reyes greene,\n\n\n\n\n\n\nBu up yi yq ng M wvz edic pg ine...,bu up yi yq ng m wvz edic pg ine on tp line vi...,1,1
2,12,afternoon,Turning a small knob into a huge wand!,turning a small knob into a huge wand,Problems everywhere?\n\nGood business & he zu ...,problems everywhere good business he zu alth h...,0,1
3,23,evening,"[UAI] ICAPS-08 Call for Papers, Tutorial Propo...",uai icaps 08 call for papers tutorial proposal...,\nhttp://icaps08.icaps-conference.org/\n\nTuto...,tutorial workshop proposal deadlines approachi...,1,0
4,16,afternoon,[UAI] CFP: SAT 2005,uai cfp sat 2005,With apologies for multiple copies:\n\n ...,with apologies for multiple copies call for pa...,1,0


In [6]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27701 entries, 0 to 27700
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   time           27701 non-null  int64 
 1   sendingPeriod  27701 non-null  object
 2   subject        27701 non-null  object
 3   SubjectClear   27701 non-null  object
 4   body           27701 non-null  object
 5   BodyClear      27701 non-null  object
 6   urls           27701 non-null  int64 
 7   phishing       27701 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 1.7+ MB


# Separação das variáveis e tratamento 

In [7]:
# Build one text field per e-mail: cleaned subject, a single space, then cleaned body.
df['text'] = df['SubjectClear'].str.cat(df['BodyClear'], sep=' ')

In [8]:
# Split the frame into the target, the free-text input and the tabular metadata inputs.
y = df['phishing']
text_data = df['text']
X_other = df.loc[:, ['time', 'urls', 'sendingPeriod']]

In [9]:
# Vectorize the combined e-mail text with TF-IDF, capped at 1500 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every row before PyCaret makes its
# train/test split, so hold-out text influences the vocabulary and IDF weights.
# The fitted `tfidf` object is also not persisted in the cells visible here, so
# the saved model cannot vectorize new e-mails on its own -- confirm intent.
tfidf = TfidfVectorizer(max_features=1500)
X_text = tfidf.fit_transform(text_data).toarray()  # dense array, one column per term
# Carry over the source index: the later column-wise concat aligns on index
# labels, so a fresh RangeIndex here would silently misalign rows (and inject
# NaNs) if the input frame were ever filtered or re-indexed upstream.
X_text_df = pd.DataFrame(
    X_text,
    index=text_data.index,
    columns=[f'tfidf_{i}' for i in range(X_text.shape[1])],
)

In [10]:
# One-hot encode the categorical sending period; the other columns pass through unchanged.
X_encoded = pd.get_dummies(data=X_other, columns=['sendingPeriod'])

In [11]:
# Assemble the modelling table: TF-IDF columns first, then the metadata columns,
# and finally the target appended as the last column.
X_final = pd.concat(objs=[X_text_df, X_encoded], axis='columns')
# assign() returns a new frame, so X_final itself stays free of the label column.
df_final = X_final.assign(phishing=y.values)

# AutoML

In [12]:
# Initialise the PyCaret (AutoML) classification experiment on the assembled table.
clf_setup = setup(
    df_final,
    target='phishing',
    preprocess=False,  # preprocessing was already done by hand before this cell
    session_id=42,  # fixed session id so the run is repeatable
    verbose=True,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,phishing
2,Target type,Binary
3,Original data shape,"(27701, 1507)"
4,Transformed data shape,"(27701, 1507)"
5,Transformed train set shape,"(19390, 1507)"
6,Transformed test set shape,"(8311, 1507)"
7,Numeric features,1502


In [13]:
# Train and rank PyCaret's candidate classifiers, keep the top-ranked one,
# then persist it to disk under the name 'modelo_phishing_pycaret'.
best_model = compare_models()
save_model(model=best_model, model_name='modelo_phishing_pycaret')


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9891,0.999,0.9884,0.9903,0.9893,0.9782,0.9782,5.165
lightgbm,Light Gradient Boosting Machine,0.9865,0.9989,0.9912,0.9826,0.9869,0.9731,0.9731,181.867
rf,Random Forest Classifier,0.9843,0.9986,0.9888,0.9807,0.9847,0.9686,0.9687,3.021
ridge,Ridge Classifier,0.9832,0.9983,0.9907,0.9767,0.9837,0.9663,0.9665,0.407
lda,Linear Discriminant Analysis,0.9824,0.9981,0.9907,0.9752,0.9829,0.9647,0.9648,2.539
lr,Logistic Regression,0.9823,0.9978,0.9871,0.9785,0.9828,0.9646,0.9646,3.126
svm,SVM - Linear Kernel,0.9751,0.9974,0.9887,0.9654,0.9764,0.9501,0.9515,0.82
gbc,Gradient Boosting Classifier,0.9692,0.9942,0.9837,0.9572,0.9703,0.9384,0.9388,14.058
dt,Decision Tree Classifier,0.9668,0.9668,0.9708,0.9645,0.9677,0.9336,0.9337,2.763
ada,Ada Boost Classifier,0.9614,0.9917,0.9731,0.9525,0.9627,0.9228,0.923,2.828


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('placeholder', None),
                 ('trained_model',
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='sqrt',
                                       max_leaf_nodes=None, max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       monotonic_cst=None, n_estimators=100,
                                       n_jobs=-1, oob_score=False,
                                       random_state=42, verbose=0,
                                       warm_start=False))],
          verbose=False),
 'modelo_phishing_pycaret.pkl')

In [16]:
# Evaluate the selected model; with no `data` argument PyCaret scores the hold-out split.
predict_model(estimator=best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9895,0.9991,0.9887,0.9908,0.9897,0.9791,0.9791


Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_1499,time,urls,sendingPeriod_afternoon,sendingPeriod_dawn,sendingPeriod_evening,sendingPeriod_morning,phishing,prediction_label,prediction_score
8679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,22,0,False,False,True,False,1,1,0.99
13647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3,0,False,True,False,False,0,0,0.97
25484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,19,1,False,False,True,False,0,0,0.99
14212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,14,1,True,False,False,False,1,1,0.98
22653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,22,1,False,False,True,False,0,0,0.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,15,1,True,False,False,False,1,1,0.70
23131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,18,1,False,False,True,False,0,0,0.79
1039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10,1,False,False,False,True,1,1,0.93
13818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4,0,False,True,False,False,0,0,0.86
