In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Load the raw training table; the path is resolved relative to the working directory.
csv_path = 'Customer_reboot_training_table.csv'
data = pd.read_csv(csv_path)

In [None]:
# Number of missing values in each column (shown as the cell output).
data.isnull().sum()

In [None]:
# Remove every row containing at least one missing value (mutates `data`),
# then report how many elements (rows x columns) remain.
data.dropna(axis=0, how='any', inplace=True)
print(data.size)

In [None]:
# Class balance of the NaN-free data.
# value_counts() orders its result by frequency, not by label, so the counts
# are looked up by class label here. Positional unpacking would swap the two
# printed labels whenever reboots outnumber non reboots, and would raise if
# only one class were present.
reset_counts = data['reset'].value_counts()
target0 = int(reset_counts.get(0, 0))  # rows with reset == 0 (non reboots)
target1 = int(reset_counts.get(1, 0))  # rows with reset == 1 (reboots)
total = target0 + target1
print(f'Percentage of non reboots: {target0*100/total}')
print(f'Percentage of reboots: {target1*100/total}')

### Error preprocessing
Some wifi boxes misreport packet counts (e.g. packet loss > 2 billion). To limit the effect of these reporting errors, rows above the 95th percentile of `med_PktLoss_2_4`, `med_UPkts_2_4` and `med_MPkts_2_4` were removed, with each cut-off computed in turn on the already-filtered data.

In [None]:
# Columns trimmed at their 95th percentile, in the order the cuts are applied.
percentile_cols = ['med_PktLoss_2_4', 'med_UPkts_2_4', 'med_MPkts_2_4']

# Each cut-off is computed on the frame left by the previous cut, so the
# filters are sequential rather than independent.
fdata = data
cutoffs = {}
for col in percentile_cols:
    cutoffs[col] = np.percentile(fdata[col], 95)
    fdata = fdata[fdata[col] <= cutoffs[col]]

# Keep the individual cut-offs available under their own names.
PktLoss_perc95, UPkt_perc95, MPkt_perc95 = (cutoffs[name] for name in percentile_cols)

# One histogram per trimmed column.
for col in percentile_cols:
    plt.hist(fdata[col])
    plt.title(str(col))
    plt.show()

### Imbalanced Classes
Downsampling of the majority class (non reboots) was done in order to combat the extreme class imbalance; it was brought down to a mild imbalance of roughly a 25/75 split.

In [None]:
# Class balance after the percentile filtering.
# Counts are looked up by label because value_counts() is ordered by
# frequency; positional unpacking would mislabel the classes if reboots were
# the majority and would raise if one class were missing.
reset_counts = fdata['reset'].value_counts()
target0 = int(reset_counts.get(0, 0))  # rows with reset == 0 (non reboots)
target1 = int(reset_counts.get(1, 0))  # rows with reset == 1 (reboots)
total = target0 + target1
print(f'Percentage of non reboots: {target0*100/total}')
print(f'Percentage of reboots: {target1*100/total}')

In [None]:
from sklearn.utils import resample

In [None]:
# Split by class: reset == 0 is treated as the majority, reset == 1 as the minority.
data_maj = fdata[fdata.reset == 0]
data_min = fdata[fdata.reset == 1]

# Target a 3:1 majority:minority ratio (~75/25). The size is derived from the
# minority frame itself rather than from a `target1` left behind by another
# cell, which later cells overwrite, so re-running this cell stays correct.
# Sampling without replacement cannot draw more rows than exist, hence the cap.
n_majority = min(len(data_maj), 3 * len(data_min))
data_maj_downsampled = resample(data_maj, replace=False, n_samples=n_majority, random_state=123)
data_downsampled = pd.concat([data_maj_downsampled, data_min])

# Report the resulting balance; counts are looked up by label because
# value_counts() is ordered by frequency, not by class.
reset_counts = data_downsampled['reset'].value_counts()
target0 = int(reset_counts.get(0, 0))
target1 = int(reset_counts.get(1, 0))
total = target0 + target1
print(f'Percentage of non reboots: {target0*100/total}')
print(f'Percentage of reboots: {target1*100/total}')
# Note: DataFrame.size is rows x columns, not the number of rows.
print(f'Size of dataset: {data_downsampled.size}')

In [None]:
# Select the label by name, as every other cell does, instead of assuming it
# is the first column of the CSV. A positional choice would silently train on
# the wrong target (and leak 'reset' into the features) if the column order
# ever differed.
target = 'reset'
cols = list(data_downsampled.columns)
feature_cols = [name for name in cols if name != target]
xdata = data_downsampled[feature_cols]
ydata = data_downsampled[target]

In [None]:
def heatmap(data):
    """Show the lower triangle of the pairwise correlation matrix of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column correlations (``data.corr()``) are plotted.
    """
    cm = data.corr()
    # Boolean mask covering the upper triangle and the diagonal. Building it
    # from ones (rather than from the correlation values themselves) hides
    # those cells even when a correlation is exactly 0.
    mask = np.triu(np.ones_like(cm, dtype=bool))
    sns.heatmap(cm, mask=mask)
    plt.show()

In [None]:
# Correlation overview of the downsampled frame (target column included).
heatmap(data=data_downsampled)

In [None]:
# Drop 'Enable_2_4' by rebinding instead of mutating in place; with
# errors='ignore' the cell can be re-run without raising a KeyError.
data_downsampled = data_downsampled.drop(columns=['Enable_2_4'], errors='ignore')
cols = list(data_downsampled.columns)
# Every column except the target is a feature.
feature_cols = [name for name in cols if name != target]
# .copy() makes xdata an independent frame, so later edits to it do not
# trigger SettingWithCopyWarning.
xdata = data_downsampled[feature_cols].copy()
ydata = data_downsampled[target]
heatmap(xdata)

In [None]:
# Preview the first five rows of the feature matrix.
xdata.head(n=5)

### Training Model
Decision tree classification model.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Hold out 30% of the rows for testing; the seed fixes the shuffle.
split_parts = train_test_split(
    xdata,
    ydata,
    test_size=0.3,
    random_state=123,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Class balance of each split. Counts are looked up by label because
# value_counts() is ordered by frequency, so positional unpacking could swap
# the printed labels or fail when a split contains a single class.
for position, (split_name, labels) in enumerate((('training', y_train), ('testing', y_test))):
    if position:
        print()  # blank line between the two reports
    split_counts = labels.value_counts()
    target0 = int(split_counts.get(0, 0))
    target1 = int(split_counts.get(1, 0))
    total = target0 + target1
    print(f'Percentage of non reboots in {split_name} set: {target0*100/total}')
    print(f'Percentage of reboots in {split_name} set: {target1*100/total}')

In [None]:
# Seed the tree like every other stochastic step in this notebook
# (random_state=123) so the reported metrics can be reproduced on a re-run.
model = DecisionTreeClassifier(random_state=123)

In [None]:
# Fit on the training split, then score the held-out rows:
# hard class predictions and per-class probabilities.
model.fit(x_train, y_train)
y_pred, y_prob = model.predict(x_test), model.predict_proba(x_test)

### Accuracy metrics and checking overfitting

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, plot_roc_curve, accuracy_score

In [None]:
def conf_mat(y_test, y_pred):
    """Plot an annotated confusion matrix.

    The x axis is the true class and the y axis the predicted class, so the
    cell at (x=i, y=j) shows how many samples of true class i were predicted
    as class j.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    """
    cm = confusion_matrix(y_test, y_pred)
    # confusion_matrix indexes rows by true class and columns by predicted
    # class, while matshow draws rows along the y axis. Plot the transpose so
    # the colours land on the same cells as the axis labels and the numbers.
    plt.matshow(cm.T, cmap='RdYlGn')
    plt.xlabel('True Class')
    plt.ylabel('Predicted Class')
    for (true_idx, pred_idx), value in np.ndenumerate(cm):
        plt.text(true_idx, pred_idx, f"{value}", va="center", ha="center")
    plt.title('Confusion Matrix')
    plt.show()

In [None]:
# Per-class precision/recall/F1 on the held-out split, followed by the confusion matrix plot.
report = classification_report(y_test, y_pred)
print(report)
conf_mat(y_test, y_pred)

#### Confusion Matrix:
True negatives: 6058

False negatives: 17

False positives: 68

True positives: 2049

In [None]:
# Bar chart of the fitted tree's feature importances, one bar per feature column.
feature_cols = list(xdata.columns)
importance = model.feature_importances_

fig, ax = plt.subplots()
ax.bar(feature_cols, importance)
ax.set_title('Feature Importance on Downsampled Data')
ax.set_ylabel('Importance')
ax.tick_params(axis='x', labelrotation=80)  # long column names need steep labels
plt.show()

In [None]:
# Overfitting check: train/test accuracy as a function of maximum tree depth.
depth = np.arange(1, 20)
train_scores = []
test_scores = []

for i in depth:
    # Seeded so the curve is reproducible and differences between depths are
    # not confounded by random tie-breaking inside the tree.
    # Note: this rebinds `model`; after the loop it is the depth-19 tree.
    model = DecisionTreeClassifier(max_depth=i, random_state=123)
    model.fit(x_train, y_train)

    y_pred_train = model.predict(x_train)
    train_acc = accuracy_score(y_train, y_pred_train)
    train_scores.append(train_acc)

    y_pred_test = model.predict(x_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_scores.append(test_acc)

plt.plot(depth, train_scores, '-x', label='Train')
plt.plot(depth, test_scores, '-x', label='Test')
plt.legend()
plt.xlabel('Tree Depth')
plt.ylabel('Accuracy')
plt.show()


### Checking model integrity 
This accuracy seems too high, so I used a stratified k-fold cross validation and then removed the most important feature in order to see how this impacts the model.

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# 10 shuffled folds that each preserve the class ratio; seeded for repeatable fold membership.
strKf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=123,
)

In [None]:
# Stratified k-fold cross validation of an unrestricted decision tree.
kfold = strKf.split(xdata, ydata)
scores = []

# Seeded like the other stochastic steps so the fold accuracies are reproducible.
# The cell that retrains without the top feature reuses this `model` object.
model = DecisionTreeClassifier(random_state=123)

for k, (train, test) in enumerate(kfold):
    # Fold-local names: overwriting x_train / y_train here would leave them
    # out of sync with the x_test / y_test produced by train_test_split.
    x_fold_train = xdata.iloc[train, :]
    y_fold_train = ydata.iloc[train]
    model.fit(x_fold_train, y_fold_train)
    score = model.score(xdata.iloc[test, :], ydata.iloc[test])
    scores.append(score)
    # Built-in round() works whether `score` is a Python float or a NumPy scalar.
    # np.bincount shows the class counts of this fold's training part.
    print(f'Fold {k+1}: Training/Test Split Distribution: {np.bincount(y_fold_train)}, Accuracy: {round(score, 3)}')

print()
print(f'Cross validation accuracy: {np.mean(scores).round(3)} +/- {np.std(scores).round(4)}')

#### KFold accuracy
Cross validation accuracy: 0.993 +/- 0.0012

In [None]:
# Remove 'med_MPkts_2_4' and retrain to see how much the model depends on it.
# Rebinding (instead of inplace=True) avoids mutating a frame that may be a
# slice of another one, and errors='ignore' lets the cell be re-run without a
# KeyError once the column is already gone.
xdata = xdata.drop(columns=['med_MPkts_2_4'], errors='ignore')
x_train, x_test, y_train, y_test = train_test_split(xdata, ydata, test_size=0.3, random_state=123)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
conf_mat(y_test, y_pred)

In [None]:
# Feature importances of the retrained tree, one bar per remaining feature column.
feature_cols = list(xdata.columns)
importance = model.feature_importances_

fig, ax = plt.subplots()
ax.bar(feature_cols, importance)
ax.set_title('Feature Importance on Downsampled Data')
ax.set_ylabel('Importance')
ax.tick_params(axis='x', labelrotation=80)  # long column names need steep labels
plt.show()