In [32]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (201.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.3/201.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.26.2 xgboost-3.0.0


Imports

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# WordCloud is used for text visualization
from wordcloud import WordCloud

# NLTK provides the natural language processing utilities
import nltk
from nltk.corpus import stopwords

# Stop words carry very little meaning on their own and are commonly removed in NLP pipelines
nltk.download('stopwords') # download stopwords data

# "punkt" is the tokenizer data, used when text has to be split into sentences or words
nltk.download("punkt") # download tokenizer data



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/namlabs/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/namlabs/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Read the csv data to analyze

In [6]:
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
text = "Hello! How are you doing today?"
tokens = tokenizer.tokenize(text)

print(tokens)


['Hello', '!', 'How', 'are', 'you', 'doing', 'today', '?']


In [7]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
# Remove the unnamed trailing columns. Rebinding `df` (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run once the columns
# are already gone; the in-place version raised KeyError on a second run.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# For readability, rename the columns: 'v1' becomes 'target' and 'v2' becomes 'text'.
# The frame is rebound rather than mutated in place, so the cell is a plain
# transformation of its input and gives the same result when run again.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df1 = df.copy()

Data Preprocessing

In [13]:
from sklearn.preprocessing import LabelEncoder

# Encoder that turns the string labels in 'target' into integer codes.
encoder = LabelEncoder()

# Overwrites df1['target'] with the encoded values; in the preview produced by
# this cell the 'ham' rows show 0 and the 'spam' row shows 1.
df1['target'] = encoder.fit_transform(df1['target'])

df1.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
df1.shape

(5572, 2)

In [15]:
df1.duplicated().sum()

np.int64(403)

In [16]:
df1 = df1.drop_duplicates(keep= 'first')
df1.shape

(5169, 2)

In [17]:
df2 = df1.copy()

In [20]:
df2.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Feature Engineering

In [21]:
# this is for text stemming (ex: running converts to run etc)
from nltk.stem.porter import PorterStemmer

import string

stemmer = PorterStemmer()

In [None]:
text = "Available only in Bugis n Great World la e buffet... Cine there got Amore wat..."
print(tokenizer.tokenize(text))

['Available', 'only', 'in', 'Bugis', 'n', 'Great', 'World', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'Amore', 'wat', '...']


In [None]:
from nltk.tokenize import WordPunctTokenizer

# Test tokenization
tokenizer = WordPunctTokenizer()

_STOPWORD_SETS = {}


def _stopword_set(language='english'):
    """Return the NLTK stop words for `language` as a frozenset, loading them only once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def transform_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    The text is lowercased, split with the module-level `tokenizer`, reduced to
    alphanumeric tokens, stripped of English stop words, and each remaining
    token is stemmed with the module-level `stemmer`.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    # Previously stopwords.words('english') was called for every token and the
    # resulting list scanned linearly; a cached set gives a constant-time test.
    stop_words = _stopword_set()

    # No separate string.punctuation check is needed: a token that passes
    # isalnum() contains no punctuation characters, so that test never fired.
    return " ".join(
        stemmer.stem(token)
        for token in tokenizer.tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    )

In [24]:
transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')


'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [25]:
df2['transform_text'] = df2['text'].apply(transform_text)

In [26]:
df2.head()

Unnamed: 0,target,text,transform_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfid = TfidfVectorizer(max_features=500)

In [66]:
# Vectorize the cleaned text into a dense TF-IDF matrix X and take the labels as array y.
# NOTE(review): the vectorizer is fitted on every row of df2; in notebook order the
# train/test split only happens afterwards, so the fitted vocabulary/IDF weights also
# reflect rows that end up in the test set. Confirm whether that is acceptable here.
X = tfid.fit_transform(df2['transform_text']).toarray()
y = df2['target'].values

In [67]:
y.shape

(5169,)

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [48]:
# Candidate models for the comparison.
# Simple / single-model estimators:
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
# Ensemble estimators, all with 50 estimators and random_state=2:
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

In [35]:
# Display name -> estimator, listed in the order the models are evaluated.
clfs = dict([
    ('SVC', svc),
    ('KNN', knc),
    ('NB', mnb),
    ('DT', dtc),
    ('LR', lrc),
    ('RF', rfc),
    ('Adaboost', abc),
    ('Bgc', bc),
    ('ETC', etc),
    ('GBDT', gbdt),
    ('xgb', xgb),
])

In [36]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    """Fit one estimator and score its predictions on the held-out split.

    Parameters
    ----------
    clfs : estimator
        A single model exposing `fit` and `predict` (the name is plural, but
        one estimator is expected).
    X_train, y_train
        Training features and labels handed to `fit`.
    X_test, y_test
        Evaluation features and the true labels the predictions are scored against.

    Returns
    -------
    tuple
        `(accuracy, precision)` from `accuracy_score` and `precision_score`.
    """
    model = clfs
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
    )

In [37]:
# Train and score every model in `clfs`, collecting the metrics in evaluation order.
accuracy_scores = []
precision_scores = []
# The loop variable must not be named `clfs`: that rebinds the dictionary to the
# last estimator, so re-running this cell (or using `clfs` later) would fail.
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9796905222437138
Precision:  0.9558823529411765

For:  KNN
Accuracy:  0.9264990328820116
Precision:  0.9726027397260274

For:  NB
Accuracy:  0.9787234042553191
Precision:  0.984251968503937

For:  DT
Accuracy:  0.9400386847195358
Precision:  0.8429752066115702

For:  LR
Accuracy:  0.9700193423597679
Precision:  0.9318181818181818

For:  RF
Accuracy:  0.9796905222437138
Precision:  0.9558823529411765

For:  Adaboost
Accuracy:  0.9197292069632496
Precision:  0.8369565217391305

For:  Bgc
Accuracy:  0.9642166344294004
Precision:  0.8698630136986302

For:  ETC
Accuracy:  0.9825918762088974
Precision:  0.9568345323741008

For:  GBDT
Accuracy:  0.9535783365570599
Precision:  0.98989898989899

For:  xgb
Accuracy:  0.9729206963249516
Precision:  0.9398496240601504


In [40]:
# Reload the untouched dataset. The relative path matches the first load of this
# file in the notebook; a machine-specific absolute path would stop the notebook
# from running anywhere else.
data = pd.read_csv("spam.csv")
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [57]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
# Keep the raw columns under their own names. Binding them to `X`/`y` would replace
# the TF-IDF matrix and encoded labels with string Series, which the fold loops
# cannot index positionally or feed to a classifier.
raw_labels = data['v1']
raw_messages = data['v2']

raw_labels.shape

(5572,)

KeyError: "None of [Index([1723, 1724, 1725, 1726, 1727, 1728, 1729, 1730, 1731, 1732,\n       ...\n       5159, 5160, 5161, 5162, 5163, 5164, 5165, 5166, 5167, 5168],\n      dtype='int64', length=3446)] are in the [columns]"

ValueError: Found input variables with inconsistent numbers of samples: [5169, 1723]

In [69]:
X.shape

(5169, 500)

In [73]:
y.shape

(1723,)

In [81]:
# Rebuild the dense TF-IDF matrix and the label array from df2. This re-fits `tfid`
# and rebinds `X` and `y`, which the cross-validation cells below index by fold.
X = tfid.fit_transform(df2['transform_text']).toarray()
y = df2['target'].values

In [None]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier

# 3-fold cross-validation of k-nearest neighbours with shuffled, seeded folds.
kf = KFold(n_splits=3, shuffle=True, random_state=42)
knc = KNeighborsClassifier()

for train_idx, test_idx in kf.split(X):
    # Slice features and labels with the same positional indices for this fold.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Refit on the fold's training part, then score on its held-out part.
    knc.fit(X_train, y_train)
    accuracy = knc.score(X_test, y_test)

    print(f"KFold Accuracy: {accuracy:.4f}")


KFold Accuracy: 0.9147
KFold Accuracy: 0.9304
KFold Accuracy: 0.9298


In [None]:
from sklearn.model_selection import StratifiedKFold

# 2-fold stratified cross-validation of k-nearest neighbours; the labels are
# passed to split() so the folds are built from both X and y.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
knc = KNeighborsClassifier()

for train_idx, test_idx in skf.split(X, y):
    # Same positional indices select the fold's features and labels.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Refit on the fold's training part, then score on its held-out part.
    knc.fit(X_train, y_train)
    accuracy = knc.score(X_test, y_test)

    print(f"StratifiedKFold Accuracy: {accuracy:.4f}")


StratifiedKFold Accuracy: 0.9099
StratifiedKFold Accuracy: 0.9156


In [85]:
from sklearn.model_selection import cross_val_score

In [87]:
cross_val_score(knc, X,y, cv=2)

array([0.91528046, 0.92066563])

In [None]:
from sklearn.model_selection import StratifiedKFold

# 3-fold stratified splitter. The loop only assigns the split arrays and fits no
# model, so once it finishes the four names hold the split of the last fold.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for train_idx, test_idx in skf.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

In [None]:
# from sklearn.model_selection import KFold
# from sklearn.neighbors import KNeighborsClassifier

# kf = KFold(n_splits=3, shuffle=True, random_state=42)
# knc = KNeighborsClassifier()

# for train_idx, test_idx in kf.split(X):  # Only X is passed
#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = y[train_idx], y[test_idx]

#     knc.fit(X_train, y_train)
#     accuracy = knc.score(X_test, y_test)

#     print(f"KFold Accuracy: {accuracy:.4f}")


# Refit `knc` on each fold produced by `kf` and report the held-out accuracy.
for train, test in kf.split(X):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    knc.fit(X_train, y_train)
    # A bare expression inside a loop body is never displayed by the notebook, so
    # the score used to be computed and thrown away; bind it and print it instead.
    accuracy = knc.score(X_test, y_test)
    print(f"KFold Accuracy: {accuracy:.4f}")