In [None]:
# packages
import pandas as pd
from mod02_build_bot_predictor import train_model

### Define a function to extract predictions from the model

In [None]:
def predict_bot(df, model=None):
    """
    Predict whether each account is a bot (1) or human (0).

    Args:
      df: DataFrame handed unchanged to ``model.predict``; its index is
          reused as the index of the result.
      model: object exposing ``predict(df)``. When omitted (None), a model
          is obtained by calling ``train_model()`` with no arguments.

    Returns:
      pandas Series of the model's predictions, aligned to ``df.index``.
    """
    # NOTE(review): elsewhere in this notebook train_model is called as
    # train_model(X_train, y_train) -- confirm the zero-argument call works.
    classifier = train_model() if model is None else model

    labels = classifier.predict(df)
    return pd.Series(labels, index=df.index)

### Define a function to evaluate model error

In [None]:
def confusion_matrix_and_metrics(y_true, y_pred):
    """
    Compute confusion-matrix counts and error rates for binary classification.

    Labels are compared with ``==``, so any value equal to 0 or 1 is accepted:
      0 = negative class
      1 = positive class

    Args:
      y_true: iterable of actual labels.
      y_pred: iterable of predicted labels, paired positionally with y_true.

    Returns:
      dict with:
        tp, tn, fp, fn
        misclassification_rate   (fp + fn) / total
        false_positive_rate      fp / (fp + tn)
        false_negative_rate      fn / (fn + tp)
      A rate whose denominator is zero is reported as 0.0.

    Raises:
      ValueError: if y_true and y_pred differ in length, or if any label
        is not 0 or 1.
    """
    from itertools import zip_longest

    # Plain zip() would silently drop the unmatched tail of the longer input
    # and report metrics for a partial pairing; pad with a sentinel instead so
    # a length mismatch is detected and reported.
    _missing = object()

    tn = fp = fn = tp = 0

    for yt, yp in zip_longest(y_true, y_pred, fillvalue=_missing):
        if yt is _missing or yp is _missing:
            raise ValueError("y_true and y_pred must have the same length")

        if yt == 0 and yp == 0:
            tn += 1
        elif yt == 0 and yp == 1:
            fp += 1
        elif yt == 1 and yp == 0:
            fn += 1
        elif yt == 1 and yp == 1:
            tp += 1
        else:
            raise ValueError("Labels must be 0 or 1")

    total = tn + fp + fn + tp

    # Guard each denominator so empty input (or a missing class) yields 0.0
    # instead of ZeroDivisionError.
    misclassification_rate = (fp + fn) / total if total > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "misclassification_rate": misclassification_rate,
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
    }


### Load the data

In [None]:
# Locations of the two CSV splits, relative to the working directory.
TRAIN_PATH = "mod02_data/train.csv"
TEST_PATH = "mod02_data/test.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

### Format the data by independent vs. dependent variables

In [None]:
# Separate the "is_bot" column (dependent variable) from the remaining
# columns (independent variables) for each split.
y_train = train["is_bot"]
X_train = train.drop(columns=["is_bot"])

y_test = test["is_bot"]
X_test = test.drop(columns=["is_bot"])

### Build the model on training data

In [None]:
# Build the model from the training split only; the test split is not passed in.
# train_model is imported from mod02_build_bot_predictor -- presumably it
# returns an object with a .predict method (predict_bot relies on that); confirm there.
model = train_model(X_train, y_train)

### Get the model predictions on training and test data

In [None]:
# Score both splits with the same already-built model.
y_pred_train, y_pred_test = (
    predict_bot(X_train, model),
    predict_bot(X_test, model),
)

### Check results on the training set (data used to build the model)

In [8]:
# Error metrics on the training split (the same data the model was built from).
confusion_matrix_and_metrics(y_train, y_pred_train)

{'tp': 211,
 'tn': 2627,
 'fp': 10,
 'fn': 152,
 'misclassification_rate': 0.054,
 'false_positive_rate': 0.0037921880925293893,
 'false_negative_rate': 0.418732782369146}

### Check results on the test set (new data not yet seen by the model)

In [9]:
# Error metrics on the held-out test split, which was not passed to train_model.
confusion_matrix_and_metrics(y_test, y_pred_test)

{'tp': 34,
 'tn': 847,
 'fp': 27,
 'fn': 92,
 'misclassification_rate': 0.119,
 'false_positive_rate': 0.030892448512585814,
 'false_negative_rate': 0.7301587301587301}

# Discussion Questions

### Based on the misclassification rate of your model, discuss your confidence in the ability to predict a bot. 

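With a test misclassification rate of 0.119, the model classifies about 88% of accounts correctly. At first glance this gives a high level of confidence in its ability to predict a bot, and it shows that the model learned patterns in the data and performs reliably most of the time. However, it is still not perfect, and there are both false positives and false negatives. Because only 126 of the 1,000 test accounts are bots, the overall rate is dominated by human accounts: the test false negative rate of 0.73 means the model misses most of the actual bots, so the misclassification rate alone overstates how well it detects them.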

### What are potential ramifications of false positives from the model?

False positives happen when the model incorrectly labels a real user as a bot. This can lead to real users being flagged or blocked unfairly. In a real scenario, that would not sit well with users and could cause them to lose trust in the platform. Too many false positives can damage the reputation of the system, so minimizing them is important to protect legitimate users.

### What are potential ramifications of false negatives from the model?

False negatives happen when the model fails to detect a bot and allows it to operate as if it were a real user. This is harmful because it lets automated accounts spread spam and misinformation, manipulate other users, or do whatever else they were designed to do. In a real-world setting, missed bots could reduce system security and negatively impact users. A high false negative rate means the system is less effective at stopping these bots, so reducing that rate is important for maintaining a trustworthy system.