# NLP Project Tutorial

**Step 1:**

Load your dataset and do the necessary transformations on your target variable.

In [53]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [260]:
# Remote location of the URL-spam CSV; pandas downloads and parses it directly.
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
df_raw = pd.read_csv(filepath_or_buffer=url)

In [239]:
# Print column names, non-null counts and dtypes of the raw frame.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [240]:
# Preview ten random rows; the fixed seed keeps the preview identical across re-runs.
df_raw.sample(10, random_state=25)

Unnamed: 0,url,is_spam
366,https://developer.apple.com/,False
249,https://join1440.com/,True
917,https://www.morningbrew.com/the-essentials/,True
1898,https://www.abc.net.au/news/science/2020-06-29...,False
2802,https://briefingday.com/n/20200701/m#commentform,False
459,https://link.theskimm.com/manage/5uz/oc,True
516,https://www.theverge.com/2020/6/22/21299032/mi...,False
835,https://www.theskimm.com/daily-skimm,True
2418,https://www.nytimes.com/2020/06/28/nyregion/ny...,False
369,https://www.theverge.com/2020/6/18/21296180/ap...,False


In [261]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df_duplicates = df_raw.duplicated(keep='first').sum()
df_duplicates  # 630 duplicate rows were found

630

In [262]:
# Work on an independent copy so df_raw stays untouched.
df_int = df_raw.copy(deep=True)

In [263]:
# Drop exact duplicate rows (keeping the first occurrence) and renumber the index from 0.
df_int = df_int.drop_duplicates(keep='first').reset_index(drop=True)

In [264]:
#Functions for url cleaning:

def protocol(text):
    """Remove the URL scheme and an optional ``www.`` prefix from ``text``.

    Handles both ``http://`` and ``https://`` (case-insensitively) and removes
    the dot that follows ``www`` so no stray "." is left at the start of the
    host name.

    Args:
        text: URL string to clean.

    Returns:
        ``text`` with every ``http(s)://`` / ``http(s)://www.`` occurrence removed.
    """
    return re.sub(r'https?://(?:www\.)?', '', text, flags=re.IGNORECASE)

def punct(text):
    """Replace each character that is not an ASCII letter with one space."""
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', text)

def char(text):
    """Replace every run of digits and/or non-word characters with one space."""
    digit_or_symbol_run = '(\\d|\\W)+'
    return re.sub(digit_or_symbol_run, ' ', text)

def dotcom(text):
    """Remove the tokens ``com``, ``net``, ``co`` and ``org`` from ``text``.

    Word boundaries restrict the match to whole tokens, so longer words that
    merely contain one of these strings (e.g. "companies", "network") are
    left intact.

    Args:
        text: String to clean.

    Returns:
        ``text`` with the stand-alone domain tokens deleted; surrounding
        separators are left in place.
    """
    return re.sub(r'\b(?:com|net|co|org)\b', '', text)

def space(text):
    """Squeeze each run of consecutive space characters down to one space."""
    return re.sub(pattern=' +', repl=' ', string=text)


In [265]:
# Build the cleaned text column. `space` runs after `punct` so that spaces
# introduced by `punct` are collapsed as well, and the final strip removes
# leading/trailing spaces left behind by the substitutions.
df_int['clean_url'] = (
    df_int['url']
    .apply(protocol)
    .apply(char)
    .apply(punct)
    .apply(space)
    .str.strip()
)

In [266]:
# Spot-check ten random rows of the cleaned frame; the fixed seed keeps the preview stable across re-runs.
df_int.sample(10, random_state=25)

Unnamed: 0,url,is_spam,clean_url
416,https://www.theverge.com/2020/6/22/21299325/ap...,False,theverge com apple ios maps updates cycling d...
1182,https://creativemornings.com/companies/shopify,True,creativemornings com companies shopify
647,https://www.vox.com/2020/6/24/21301630/trump-r...,False,vox com trump reelection polling lead
929,https://www.cnn.com/2020/06/25/health/maskne-a...,False,cnn com health maskne acne covid masks wellne...
756,https://www.theguardian.com/us-news/2020/jun/2...,False,theguardian com us news jun texas coronavirus...
1597,https://mondaynote.com/your-next-car-will-be-e...,False,mondaynote com your next car will be electric ...
825,https://tedium.co/2020/06/23/television-test-p...,False,tedium co television test patterns history
194,https://en.wikipedia.org/wiki/Shri_Yantra,False,en wikipedia org wiki Shri Yantra
1299,https://www.fastcompany.com/90521484/how-to-de...,False,fastcompany com how to decontaminate an n mas...
480,https://www.youtube.com/watch?v=PQmDUEv939A&fe...,False,youtube com watch v PQmDUEv A feature youtu b...


In [267]:
# Class distribution of the target after de-duplication.
df_int['is_spam'].value_counts()
# Unbalanced: non-spam rows (2125) far outnumber spam rows (244).

False    2125
True      244
Name: is_spam, dtype: int64

In [268]:
# Encode the boolean target as integers (False -> 0, True -> 1) with a
# vectorized cast instead of a per-row Python lambda.
df_int['is_spam'] = df_int['is_spam'].astype('int64')

In [269]:
# Re-check the class counts after encoding; they should match the boolean counts.
df_int['is_spam'].value_counts()

0    2125
1     244
Name: is_spam, dtype: int64

In [270]:
# Independent copy of the cleaned frame, used as the modelling dataset.
df = df_int.copy(deep=True)

**Step 3:**

Use a Support Vector Machine (SVM) to build a URL spam classifier.

In [271]:
# Bag-of-words token counts for every cleaned URL.
# Note: despite its name, `vectorizer` holds the matrix returned by
# fit_transform, not the CountVectorizer instance itself.
vectorizer = CountVectorizer().fit_transform(
    raw_documents=df['clean_url']
)

In [275]:
X = df['clean_url']
y = df['is_spam']

# Split the raw text first (stratified on the target), then learn the
# vocabulary from the training rows only so it is not derived from held-out
# URLs.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 25)

# Keep the fitted CountVectorizer so unseen URLs can later be transformed
# with exactly the same vocabulary the classifier was trained on.
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(X_train_text)
X_test = count_vectorizer.transform(X_test_text)

In [276]:
# Linear-kernel SVM. `degree` and `gamma` are omitted because they only
# apply to the non-linear kernels and have no effect when kernel='linear'.
classifier = svm.SVC(C=1.0, kernel='linear')

In [277]:
# Train on the training split, predict the held-out rows, and print
# per-class precision / recall / F1.
predictions = classifier.fit(X_train, y_train).predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))


              precision    recall  f1-score   support

           0       0.95      0.97      0.96       532
           1       0.70      0.57      0.63        61

    accuracy                           0.93       593
   macro avg       0.83      0.77      0.80       593
weighted avg       0.93      0.93      0.93       593



In [279]:
# accuracy_score expects (y_true, y_pred); pass the arguments in that order,
# matching the classification_report call.
print("SVM Accuracy Score -> ", accuracy_score(y_test, predictions)*100)
# Our model reached 93% accuracy. Because the classes are unbalanced, compare
# this with the majority-class baseline: always predicting 0 would already
# score 532/593 (about 89.7%) on the test split shown in the report above.

SVM Accuracy Score ->  93.08600337268128
