# **Linear Logistic Regression**

**Dependencies**

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score, roc_auc_score, classification_report, adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from scipy.stats import mode
from imblearn.over_sampling import RandomOverSampler

import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import punkt
from nltk.stem import PorterStemmer

# Stemmer instance used by text_transform to reduce each kept token to its stem.
ps = PorterStemmer()

In [16]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset/cleaned_dataset_small.csv")
# Preview the first rows.
df.head()

Unnamed: 0,spam,preprocessed_text
0,0,martin lin rotat jim group hi paul anad pleas ...
1,0,steve leppard hi vinc hr work mid year salari ...
2,0,lei
3,0,alway say welp
4,1,smoke qrklx lookln 4 affordabowl cigarettez co...


**Re-clean the Preprocessed Data**

In [17]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [18]:
# Count missing values per column.
df.isnull().sum()

spam                 0
preprocessed_text    1
dtype: int64

In [19]:
# Discard rows whose text is missing, then confirm that no nulls remain.
df = df.dropna(subset=['preprocessed_text'])
df.isna().sum()

spam                 0
preprocessed_text    0
dtype: int64

**Transformation**

In [20]:
# Random oversampler with a fixed seed; it is applied to the training split in the split cell.
ros = RandomOverSampler(random_state=2)
# TF-IDF over unigrams and bigrams, capped at 3000 features.
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2))
# NOTE(review): the vectorizer is fitted on the whole dataset here, before the
# train/test split, so its vocabulary and IDF weights also reflect the test rows.
# Consider fitting on the training texts only and calling transform() on the rest.
# .toarray() converts the sparse TF-IDF matrix into a dense array.
X = tfidf.fit_transform(df['preprocessed_text']).toarray()
# Labels: 1 is treated as spam by the prediction cell, anything else as non-spam.
y = df['spam'].values

def text_transform(text):
  """Normalise a raw message into a space-separated string of stemmed tokens.

  The text is lowercased and tokenized with NLTK; only alphanumeric tokens that
  are not English stop words are kept, and each kept token is stemmed with the
  module-level Porter stemmer ``ps``.

  Args:
    text: Raw message text (str).

  Returns:
    str: The kept, stemmed tokens joined by single spaces. Empty string when
    no token survives the filtering.

  Note:
    Needs the NLTK tokenizer models and the English stop-word corpus to be
    available locally.
  """
  # Build the stop-word set once per call. The original evaluated
  # stopwords.words('english') and scanned the resulting list for every token.
  stop_words = set(stopwords.words('english'))
  tokens = nltk.word_tokenize(text.lower())

  # isalnum() already rejects tokens made of punctuation, so the separate
  # string.punctuation test from the original was redundant and is omitted.
  kept = [ps.stem(tok) for tok in tokens if tok.isalnum() and tok not in stop_words]

  return " ".join(kept)

**Split dataset**

In [21]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train , y_test = train_test_split(X,y,test_size=0.25, random_state=42)
# Oversample the training split only; the test split is left untouched.
# NOTE(review): X_train_bal / y_train_bal are not used by the training cell,
# which fits on X_train / y_train. Confirm which training set is intended.
X_train_bal, y_train_bal = ros.fit_resample(X_train, y_train)



**Model training**

In [22]:
# Logistic regression with class_weight='balanced' (classes weighted inversely
# to their frequency) and up to 300 solver iterations.
# NOTE(review): the model is fitted on the original training split, not on the
# oversampled X_train_bal / y_train_bal produced in the split cell. Class
# weighting is therefore the only imbalance handling in effect; confirm intent.
clf = LogisticRegression(max_iter=300, class_weight='balanced', n_jobs=None)
clf.fit(X_train, y_train)

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


**Evaluation**

In [23]:
# Hard labels, plus the probability of class 1, for the held-out split.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Metrics computed from the predicted labels, printed in a fixed order.
label_metrics = (
    ("Accuracy      :", accuracy_score),
    ("Precision     :", precision_score),
    ("Recall        :", recall_score),
    ("F1-Score      :", f1_score),
)
for caption, metric in label_metrics:
    print(caption, metric(y_test, y_pred))

# ROC-AUC is computed from the class-1 probabilities rather than the labels.
print("ROC-AUC       :", roc_auc_score(y_test, y_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy      : 0.9645283018867925
Precision     : 0.8823529411764706
Recall        : 0.9464627151051626
F1-Score      : 0.9132841328413284
ROC-AUC       : 0.9914079291922753
Confusion Matrix:
 [[2061   66]
 [  28  495]]


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


**User INPUT to Check Spam Detection**

Replace the text assigned to **mess = "*text*"** with a message of your choice to check whether it is classified as spam or non-spam.

In [25]:
# -----------------------------------------------------------------
# Message to classify: edit the string below, then re-run this cell.
mess = "Buy one get one free! Limited time offer, don't miss out!!!"
# -----------------------------------------------------------------

divider = '================================='

print(f'Message: \"{mess}\"')
print(divider)

# Clean the message, then vectorize it with the already-fitted TF-IDF vectorizer.
# NOTE(review): text_transform presumably mirrors the preprocessing that produced
# the dataset's preprocessed_text column; verify against the dataset pipeline.
mess_clean = text_transform(mess)
mess_vec = tfidf.transform([mess_clean])

prediction = clf.predict(mess_vec)[0]
proba = clf.predict_proba(mess_vec)[0]

# Class probabilities expressed as percentages (index 0 = non-spam, 1 = spam).
non_spam_pct, spam_pct = proba[0]*100, proba[1]*100
print(f"Probability of NON-SPAM: {non_spam_pct:.2f}%")
print(f"Probability of SPAM    : {spam_pct:.2f}%")
print(divider)

print(">> Predicted label: SPAM" if prediction == 1 else ">> Predicted label: NON-SPAM")

Message: "Buy one get one free! Limited time offer, don't miss out!!!"
Probability of NON-SPAM: 24.85%
Probability of SPAM    : 75.15%
>> Predicted label: SPAM
