In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## Logistic Regression

In [2]:
# Load the candy dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("candy-data.csv")
# Preview the first rows to sanity-check that the file parsed as expected.
df.head()

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(85, 13)

In [4]:
# The candy name is a free-text label rather than a numeric feature, so it is
# removed before modelling.
df = df.drop(columns=["competitorname"])

In [5]:
# Features: every remaining column except the `chocolate` flag.
x = df.drop(columns="chocolate")
# Target: the `chocolate` flag itself.
y = df["chocolate"]

In [6]:
# Columns left after dropping the name column (target + candidate features).
df.columns

Index(['chocolate', 'fruity', 'caramel', 'peanutyalmondy', 'nougat',
       'crispedricewafer', 'hard', 'bar', 'pluribus', 'sugarpercent',
       'pricepercent', 'winpercent'],
      dtype='object')

In [7]:
# Boolean mask over the columns: False only for the target column `chocolate`.
df.columns != 'chocolate'

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [8]:
# Peek at the target values.
y.head()

0    1
1    1
2    0
3    0
4    0
Name: chocolate, dtype: int64

In [9]:
# Peek at the feature matrix; `chocolate` must not appear among its columns.
x.head()

Unnamed: 0,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [10]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle deterministic, so the split -- and every score computed from it --
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [11]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()
# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

LogisticRegression()

In [12]:
# Accuracy on the training split (the data the model was fitted on).
model.score(X_train, y_train)

0.9411764705882353

In [13]:
# Accuracy on the held-out test split; compare with the training score above.
model.score(X_test, y_test)

0.7647058823529411

In [14]:
# Predicted class probabilities for the first two test rows
# (one row per sample, one column per class).
model.predict_proba(X_test.iloc[:2, :])

array([[0.97184013, 0.02815987],
       [0.46987279, 0.53012721]])

In [15]:
# Keep the probabilities of the first two test rows for the thresholding helper.
proba = model.predict_proba(X_test.iloc[:2, :])

In [16]:
def custom_threshold(proba, thresh):
    """Label each probability row by comparing its second entry to a cutoff.

    Parameters
    ----------
    proba : iterable of indexable rows
        Each row must expose the score to test at position 1
        (for example, the rows of a ``predict_proba`` result).
    thresh : float
        Cutoff value. The comparison is strict (``>``), so a score exactly
        equal to ``thresh`` gets the negative label.

    Returns
    -------
    list of str
        ``'Yes, Chocolate'`` where ``row[1] > thresh``, otherwise
        ``'No chocolate'``, in the same order as the input rows.
    """
    return [
        'Yes, Chocolate' if scores[1] > thresh else 'No chocolate'
        for scores in proba
    ]

In [17]:
# Apply a 0.5 cutoff to the two stored probability rows.
custom_threshold(proba, 0.5)

['No chocolate', 'Yes, Chocolate']

In [18]:
# Hard class predictions for the same two test rows, for comparison with the
# thresholded labels.
model.predict(X_test.iloc[:2, :])

array([0, 1])

In [19]:
# Confusion matrix on the test split; per scikit-learn's convention, rows are
# the true classes and columns are the predicted classes.
confusion_matrix(y_test, model.predict(X_test))

array([[9, 1],
       [3, 4]])

Sample confusion matrix from sklearn:
![Sample confusion matrix from sklearn](cm.png)