Step 1 Preprocessing


In [10]:
import pandas as pd

# Load dataset without header and set column names manually
df = pd.read_csv('/content/url_data.csv', header=None, names=['index', 'url', 'category'])

# Drop the index column as it's not needed
df = df.drop(columns=['index'])

# Handle NaN values by removing them
df = df.dropna()

# Display the first few rows to verify
print(df.head())


                                                 url category
0                   http://www.liquidgeneration.com/    Adult
1                        http://www.onlineanime.org/    Adult
2  http://www.ceres.dti.ne.jp/~nekoi/senno/senfir...    Adult
3                         http://www.galeon.com/kmh/    Adult
4                        http://www.fanworkrecs.com/    Adult


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('/'), stop_words='english')
X = vectorizer.fit_transform(df['url'])

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category'])





In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


Accuracy: 0.3817239559174011
Precision: 0.6456912006872446
Recall: 0.3817239559174011
F1-Score: 0.3480906272475239


In [15]:
from sklearn.model_selection import cross_val_score, KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(nb_classifier, X, y, cv=kf, scoring='accuracy')

print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Mean Cross-Validation Accuracy: {cv_scores.mean()}')


Cross-Validation Accuracy Scores: [0.38172396 0.38240855 0.38150962 0.38307395 0.38171116]
Mean Cross-Validation Accuracy: 0.38208544602440864
