**Social engg**


In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,accuracy_score
import joblib

In [17]:
df = pd.read_csv('phishing_URL.csv')
df_email = pd.read_csv('phishing_Email.csv')

In [18]:
df

Unnamed: 0,URL,Label
0,https://example.com,0
1,http://phishing.com,1
2,https://secure-login.net,0
3,http://malicious.net,1
4,https://trustedbank.com/login,0
...,...,...
89,http://phishy-streaming-login.net,1
90,https://safe-software-downloads.com,0
91,http://malware-software-update.net,1
92,https://verified-government-login.com,0


In [19]:
df_email

Unnamed: 0,Email,Label
0,Congratulations! You’ve won a prize! Claim now...,1
1,Your account has been accessed from a new devi...,1
2,Meeting rescheduled: Please check your calenda...,0
3,Urgent: Your account will be suspended. Click ...,1
4,Invoice for your recent purchase attached.,0
...,...,...
188,Special VIP offer. Upgrade now at http://fraud...,1
189,Emergency alert: Confirm your location at http...,1
190,Alert: Your account is under review. Please co...,1
191,Important tax information for you. View detail...,0


In [None]:
print(df.info())
print(df_email.info())
print(df['Label'].value_counts())
print(df_email['Label'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL     94 non-null     object
 1   Label   94 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Email   193 non-null    object
 1   Label   193 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 3.1+ KB
None
Label
0    47
1    47
Name: count, dtype: int64
Label
1    116
0     77
Name: count, dtype: int64


In [None]:

df['URL'] = df['URL'].str.lower().str.strip()
df_email['Email'] = df_email['Email'].str.lower().str.strip()


In [None]:
df['has_http'] = df['URL'].str.contains('http')
df['has_https'] = df['URL'].str.contains('https')
df['contains_login'] = df['URL'].str.contains('login')


In [None]:
df

Unnamed: 0,URL,Label,has_http,has_https,contains_login
0,https://example.com,0,True,True,False
1,http://phishing.com,1,True,False,False
2,https://secure-login.net,0,True,True,True
3,http://malicious.net,1,True,False,False
4,https://trustedbank.com/login,0,True,True,True
...,...,...,...,...,...
89,http://phishy-streaming-login.net,1,True,False,True
90,https://safe-software-downloads.com,0,True,True,False
91,http://malware-software-update.net,1,True,False,False
92,https://verified-government-login.com,0,True,True,True


In [None]:
df_email['contains_prize'] = df_email['Email'].str.contains('prize')
df_email['contains_urgent'] = df_email['Email'].str.contains('urgent')
df_email['contains_click'] = df_email['Email'].str.contains('click')


In [None]:
df_email

Unnamed: 0,Email,Label,contains_prize,contains_urgent,contains_click
0,Congratulations! You’ve won a prize! Claim now...,1,True,False,False
1,Your account has been accessed from a new devi...,1,False,False,False
2,Meeting rescheduled: Please check your calenda...,0,False,False,False
3,Urgent: Your account will be suspended. Click ...,1,False,False,False
4,Invoice for your recent purchase attached.,0,False,False,False
...,...,...,...,...,...
188,Special VIP offer. Upgrade now at http://fraud...,1,False,False,False
189,Emergency alert: Confirm your location at http...,1,False,False,False
190,Alert: Your account is under review. Please co...,1,False,False,False
191,Important tax information for you. View detail...,0,False,False,False


In [None]:
from sklearn.model_selection import train_test_split

# For URLs
X_url = df['URL']
y_url = df['Label']
X_url_train, X_url_test, y_url_train, y_url_test = train_test_split(X_url, y_url, test_size=0.2, random_state=42)

# For Emails
X_email = df_email['Email']
y_email = df_email['Label']
X_email_train, X_email_test, y_email_train, y_email_test = train_test_split(X_email, y_email, test_size=0.2, random_state=42)


In [None]:

# For URLs
url_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])
url_pipeline.fit(X_url_train, y_url_train)

# For Emails
email_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])
email_pipeline.fit(X_email_train, y_email_train)


In [None]:

# URL Model Evaluation
y_url_pred = url_pipeline.predict(X_url_test)
print(classification_report(y_url_test, y_url_pred))
print(accuracy_score(y_url_test, y_url_pred)*100)

# Email Model Evaluation
y_email_pred = email_pipeline.predict(X_email_test)
print(classification_report(y_email_test, y_email_pred))
print(accuracy_score(y_email_test, y_email_pred)*100)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00         6

    accuracy                           1.00        19
   macro avg       1.00      1.00      1.00        19
weighted avg       1.00      1.00      1.00        19

100.0
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.96      1.00      0.98        25

    accuracy                           0.97        39
   macro avg       0.98      0.96      0.97        39
weighted avg       0.98      0.97      0.97        39

97.43589743589743


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Predictions for URLs
y_url_pred = url_pipeline.predict(X_url_test)

# Confusion Matrix for URLs
cm_url = confusion_matrix(y_url_test, y_url_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_url, annot=True, fmt="d", cmap="Blues",
            xticklabels=['Legitimate', 'Phishing'],
            yticklabels=['Legitimate', 'Phishing'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for URLs')
plt.show()


# Predictions for Emails
y_email_pred = email_pipeline.predict(X_email_test)

# Confusion Matrix for Emails
cm_email = confusion_matrix(y_email_test, y_email_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_email, annot=True, fmt="d", cmap="Blues",
            xticklabels=['Legitimate', 'Phishing'],
            yticklabels=['Legitimate', 'Phishing'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Emails')
plt.show()

In [None]:
import joblib

# Save URL model
joblib.dump(url_pipeline, 'url_model.joblib')

# Save Email model
joblib.dump(email_pipeline, 'email_model.joblib')


['email_model.joblib']