<a href="https://colab.research.google.com/github/deondrae4088/ML_Project/blob/main/MiniProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import required dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Import data
df = pd.read_csv("/content/phishing .csv")
df.head()

Unnamed: 0,SFH,popUpWidnow,SSLfinal_State,Request_URL,URL_of_Anchor,web_traffic,URL_Length,age_of_domain,having_IP_Address,Result
0,1,-1,1,-1,-1,1,1,1,0,0
1,-1,-1,-1,-1,-1,0,1,1,1,1
2,1,-1,0,0,-1,0,-1,1,0,1
3,1,0,1,-1,-1,0,1,1,0,0
4,-1,-1,1,-1,0,0,-1,1,0,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1353 entries, 0 to 1352
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   SFH                1353 non-null   int64
 1   popUpWidnow        1353 non-null   int64
 2   SSLfinal_State     1353 non-null   int64
 3   Request_URL        1353 non-null   int64
 4   URL_of_Anchor      1353 non-null   int64
 5   web_traffic        1353 non-null   int64
 6   URL_Length         1353 non-null   int64
 7   age_of_domain      1353 non-null   int64
 8   having_IP_Address  1353 non-null   int64
 9   Result             1353 non-null   int64
dtypes: int64(10)
memory usage: 105.8 KB


In [6]:
# Check the value_counts of the target column
df["Result"].value_counts()

Unnamed: 0_level_0,count
Result,Unnamed: 1_level_1
-1,702
1,548
0,103


## Preprocess the data

In [7]:
# Check the data types
df.dtypes

Unnamed: 0,0
SFH,int64
popUpWidnow,int64
SSLfinal_State,int64
Request_URL,int64
URL_of_Anchor,int64
web_traffic,int64
URL_Length,int64
age_of_domain,int64
having_IP_Address,int64
Result,int64


In [8]:
# Get the target variable (the "Result" column)
y = df["Result"]
y

Unnamed: 0,Result
0,0
1,1
2,1
3,0
4,1
...,...
1348,1
1349,-1
1350,1
1351,1


In [9]:
# Get the features (everything except the "class" column)
X = df.copy()
X = X.drop(columns="Result")
X.head()

Unnamed: 0,SFH,popUpWidnow,SSLfinal_State,Request_URL,URL_of_Anchor,web_traffic,URL_Length,age_of_domain,having_IP_Address
0,1,-1,1,-1,-1,1,1,1,0
1,-1,-1,-1,-1,-1,0,1,1,1
2,1,-1,0,0,-1,0,-1,1,0
3,1,0,1,-1,-1,0,1,1,0
4,-1,-1,1,-1,0,0,-1,1,0


In [10]:
# Split data into training and test

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [11]:
# Scaling

In [12]:
# Remember that all of the columns in the DataFrame are objects
# Use a OneHotEncoder to convert the training data to numerical values
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype='int')
X_train_encoded = pd.DataFrame(data=ohe.fit_transform(X_train), columns=ohe.get_feature_names_out())
X_train_encoded

Unnamed: 0,SFH_-1,SFH_0,SFH_1,popUpWidnow_-1,popUpWidnow_0,popUpWidnow_1,SSLfinal_State_-1,SSLfinal_State_0,SSLfinal_State_1,Request_URL_-1,...,web_traffic_-1,web_traffic_0,web_traffic_1,URL_Length_-1,URL_Length_0,URL_Length_1,age_of_domain_-1,age_of_domain_1,having_IP_Address_0,having_IP_Address_1
0,0,1,0,1,0,0,0,1,0,0,...,1,0,0,0,1,0,0,1,1,0
1,1,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,0,1,1,0
2,0,0,1,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,1,0
3,1,0,0,0,1,0,0,1,0,1,...,0,1,0,1,0,0,0,1,1,0
4,1,0,0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,0,0,1,0,1,0,0,0,1,0,...,0,1,0,0,0,1,0,1,1,0
1010,0,0,1,0,1,0,0,0,1,0,...,0,0,1,0,0,1,1,0,1,0
1011,1,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,1,0,1,1,0
1012,1,0,0,1,0,0,1,0,0,0,...,0,1,0,0,1,0,0,1,1,0


In [13]:
# Encode the test data
X_test_encoded = pd.DataFrame(data=ohe.transform(X_test), columns=ohe.get_feature_names_out())
X_test_encoded

Unnamed: 0,SFH_-1,SFH_0,SFH_1,popUpWidnow_-1,popUpWidnow_0,popUpWidnow_1,SSLfinal_State_-1,SSLfinal_State_0,SSLfinal_State_1,Request_URL_-1,...,web_traffic_-1,web_traffic_0,web_traffic_1,URL_Length_-1,URL_Length_0,URL_Length_1,age_of_domain_-1,age_of_domain_1,having_IP_Address_0,having_IP_Address_1
0,0,0,1,0,0,1,1,0,0,1,...,1,0,0,0,0,1,0,1,1,0
1,1,0,0,1,0,0,0,1,0,1,...,0,0,1,1,0,0,1,0,1,0
2,0,0,1,0,1,0,0,0,1,1,...,1,0,0,0,1,0,0,1,1,0
3,0,0,1,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0,1,1,0
4,1,0,0,1,0,0,0,0,1,1,...,0,1,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,1,0,0,1,0,0,0,1,0,0,...,1,0,0,0,1,0,0,1,1,0
335,1,0,0,1,0,0,1,0,0,0,...,0,0,1,1,0,0,1,0,1,0
336,0,0,1,0,1,0,0,0,1,1,...,0,1,0,1,0,0,1,0,1,0
337,0,0,1,0,1,0,0,0,1,0,...,0,1,0,0,0,1,0,1,1,0


## Model and Fit to a Logistic Regression Classifier

In [15]:
# Create the logistic regression classifier model with a random_state of 1
lr_model = LogisticRegression(random_state=1)

# Fit the model to the training data
lr_model.fit(X_train_encoded, y_train_encoded)

NameError: name 'y_train_encoded' is not defined

In [None]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % lr_model.score(X_train_encoded, y_train_encoded))
print('Test Accuracy: %.3f' % lr_model.score(X_test_encoded, y_test_encoded))

## Model and Fit to a Support Vector Machine

In [None]:
# Create the support vector machine classifier model with a 'poly' kernel
svm_model = SVC(kernel='poly')

# Fit the model to the training data
svm_model.fit(X_train_encoded, y_train_encoded)

In [None]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % svm_model.score(X_train_encoded, y_train_encoded))
print('Test Accuracy: %.3f' % svm_model.score(X_test_encoded, y_test_encoded))

## Model and Fit to a KNN Model

In [None]:
# Create the KNN model with 9 neighbors
knn_model = KNeighborsClassifier(n_neighbors=9)

# Fit the model to the training data
knn_model.fit(X_train_encoded, y_train_encoded)

In [None]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % knn_model.score(X_train_encoded, y_train_encoded))
print('Test Accuracy: %.3f' % knn_model.score(X_test_encoded, y_test_encoded))

## Model and Fit to a Decision Tree Classifier

In [None]:
# Create the decision tree classifier model
dt_model = DecisionTreeClassifier()

# Fit the model to the training data
dt_model.fit(X_train_encoded, y_train_encoded)

In [None]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % dt_model.score(X_train_encoded, y_train_encoded))
print('Test Accuracy: %.3f' % dt_model.score(X_test_encoded, y_test_encoded))

## Model and Fit to a Random Forest Classifier

In [None]:
# Create the random forest classifier model
# with n_estimators=128 and random_state=1
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

# Fit the model to the training data
rf_model.fit(X_train_encoded, y_train_encoded)

In [None]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % rf_model.score(X_train_encoded, y_train_encoded))
print('Test Accuracy: %.3f' % rf_model.score(X_test_encoded, y_test_encoded))