# **SKLearn 12 - Logistic Regression pada Binary Classification Task**
#### *227328_Agus Ghanidhio Putunemachwie*

### Dataset: SMS Spam Collection Data Set

In [14]:
import csv
from pathlib import Path

import pandas as pd

# Location of the raw "SMS Spam Collection" file, relative to the directory
# the kernel was started in.
DATA_PATH = Path('./dataset/SMSSpamCollection')

# Fail early with an actionable message instead of a bare pandas traceback:
# every later cell depends on `df`, so nothing can run without this file.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}' (resolved: '{DATA_PATH.resolve()}'). "
        "Download the SMS Spam Collection from the UCI Machine Learning Repository, "
        "place the 'SMSSpamCollection' file at that path, and make sure the kernel "
        "is started from the notebook's directory."
    )

# The file is read as tab-separated with no header row, so the two columns
# are named explicitly: the class label and the message text.
# quoting=csv.QUOTE_NONE treats double quotes as ordinary characters. The raw
# file is plain text, not quoted CSV; with pandas' default quoting an
# unbalanced '"' inside a message can swallow the following line(s) into the
# same record and silently drop rows.
df = pd.read_csv(DATA_PATH,
                 sep='\t',
                 header=None,
                 names=['label', 'sms'],
                 quoting=csv.QUOTE_NONE)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: './dataset/SMSSpamCollection'

In [None]:
# Class balance: number of messages per distinct value of the 'label' column.
df.loc[:, 'label'].value_counts()

### Training & Testing Dataset

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Features are the raw message texts.
X = df['sms'].values

# Encode the string labels numerically; ravel() flattens the binarizer's
# output into a 1-D target vector.
lb = LabelBinarizer()
y = lb.fit_transform(df['label'].values).ravel()

# Show the classes the binarizer learned from the label column.
lb.classes_

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Quick look at the training messages and their encoded labels.
print(X_train, '\n')
print(y_train)

### Feature Extraction dengan TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

# Fit on the training messages only, then reuse the fitted vectorizer on the
# test messages so both matrices share the same feature columns.
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

print(X_train_tfidf)

### Binary Classification dengan Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Spot-check: print the first five test messages with their predicted value.
for pred, sms in zip(y_pred[:5], X_test[:5]):
    print(f'PRED: {pred} - SMS: {sms}\n')

### Evaluation Metrics pada Binary Classification
- Confusion Matrix
- Accuracy
- Precision & Recall
- F1 Score
- ROC

#### Terminologi Dasar
- True Positive (TP)
- True Negative (TN)
- False Positive (FP)
- False Negative (FN)

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the model's predictions.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
matrix

In [None]:
# Unpack the 2x2 matrix row by row: first row is (TN, FP), second is (FN, TP).
(tn, fp), (fn, tp) = matrix

# Report each cell of the confusion matrix on its own line.
for cell_name, cell_count in (('TN', tn), ('FP', fp), ('FN', fn), ('TP', tp)):
    print(f'{cell_name}: {cell_count}')

In [None]:
import matplotlib.pyplot as plt

# Render the confusion matrix as an image and attach a colour scale to it.
cm_image = plt.matshow(matrix)
plt.colorbar(cm_image)

# Label the axes that hold the image: rows are true labels, columns predictions.
cm_axes = cm_image.axes
cm_axes.set_title('Confusion Matrix')
cm_axes.set_xlabel('Predicted Label')
cm_axes.set_ylabel('True Label')
plt.show()

### Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the predictions against the held-out test labels.
accuracy_score(y_true=y_test, y_pred=y_pred)

### Precision & Recall

In [None]:
# Precision of the predictions on the held-out test set.
from sklearn.metrics import precision_score

precision_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Recall of the predictions on the held-out test set.
from sklearn.metrics import recall_score

recall_score(y_true=y_test, y_pred=y_pred)

### F1-Score

In [None]:
from sklearn.metrics import f1_score

# F1 score of the predictions on the held-out test set.
f1_score(y_true=y_test, y_pred=y_pred)

### ROC: Receiver Operating Characteristic

In [None]:
from sklearn.metrics import roc_curve, auc

# Probability estimates for the test set; column 1 is used as the score.
prob_estimates = model.predict_proba(X_test_tfidf)

# roc_curve also returns the thresholds; they are not used further in this cell.
fpr, tpr, threshhold = roc_curve(y_true=y_test, y_score=prob_estimates[:, 1])
nilai_auc = auc(x=fpr, y=tpr)

# Model curve in blue, with the diagonal as the random-classifier reference.
plt.plot(fpr, tpr, 'b', label=f'AUC={nilai_auc}')
plt.plot([0, 1], [0, 1], 'r--', label='Random Classifier')

plt.xlabel('Fallout or False Positive Rate')
plt.ylabel('Recall or True Positive Rate')
plt.title('ROC: Receiver Operating Characteristic')
plt.legend()
plt.show()