In [154]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import svm
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout, TimeDistributed, RepeatVector
import pandas as pd


In [155]:
# Read csv file
data = pd.read_csv('/content/Traffic.csv')
data.head()

Unnamed: 0,Time,Delay
0,00:00:00,15000
1,00:00:10,0
2,00:00:20,0
3,00:00:30,11000
4,00:00:40,19000


In [156]:
data['y'] = 1 
data.loc[data['Delay'] >= 54000, 'y'] = 2 # Anomalies

In [157]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8192 entries, 0 to 8191
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Time    8192 non-null   object
 1   Delay   8192 non-null   int64 
 2   y       8192 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 192.1+ KB


In [158]:
# Time conversion

def convert(x):
  (h, m, s) = x.split(':')
  t = int(h) * 3600 + int(m) * 60 + int(s)
  return t

data['Time'] = data['Time'].apply(convert)
data.head()

Unnamed: 0,Time,Delay,y
0,0,15000,1
1,10,0,1
2,20,0,1
3,30,11000,1
4,40,19000,1


In [159]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(data[['Time', 'Delay']], data[['y']], train_size=0.8, random_state=0)

print("Input Training:", X_train.shape)
print("Input Test:", X_test.shape)
print("Output Training:", y_train.shape)
print("Output Test:", y_test.shape)

Input Training: (6553, 2)
Input Test: (1639, 2)
Output Training: (6553, 1)
Output Test: (1639, 1)


In [160]:
X_train, X_unl, y_train, y_unl = train_test_split(X_train, y_train, test_size=0.7, random_state=0)

In [161]:
print(X_train.info())
print(X_test.info())
print(X_unl.info())

print(y_train.info())
print(y_test.info())
print(y_unl.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1965 entries, 6157 to 1530
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Time    1965 non-null   int64
 1   Delay   1965 non-null   int64
dtypes: int64(2)
memory usage: 46.1 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1639 entries, 2310 to 6485
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Time    1639 non-null   int64
 1   Delay   1639 non-null   int64
dtypes: int64(2)
memory usage: 38.4 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4588 entries, 1078 to 4830
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Time    4588 non-null   int64
 1   Delay   4588 non-null   int64
dtypes: int64(2)
memory usage: 107.5 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1965 entries, 6157 to 1530
Data columns (total 1 columns):
 #   Column

In [162]:
clf = svm.SVC(kernel='linear', probability=True, C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


1.0

In [163]:
clp = clf.predict_proba(X_unl)
lab = clf.predict(X_unl)

In [164]:
data = pd.DataFrame(clp, columns = ['C1Prob', 'C2Prob']) 
data['lab'] = lab
#data['actual'] = y_unl
data['max'] = data[["C1Prob", "C2Prob"]].max(axis=1)

In [165]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4588 entries, 0 to 4587
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   C1Prob  4588 non-null   float64
 1   C2Prob  4588 non-null   float64
 2   lab     4588 non-null   int64  
 3   max     4588 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 143.5 KB


In [166]:
# Choosing the right confidence level

nc = np.arange(.35, 1, .03)
acc = np.empty(22)
i = 0
for k in np.nditer(nc):
    conf_ind = data["max"] > k
    X_train1 = np.append(X_train, X_unl[conf_ind, :], axis=0)
    y_train1 = np.append(y_train, data.loc[conf_ind,['lab']])
    clf = svm.SVC(kernel='linear', probability=True, C=1).fit(X_train1, y_train1)
    acc[i] =  clf.score(X_test, y_test)
    i = i + 1

TypeError: ignored

In [None]:
plt.hist(data["max"])

In [None]:
import matplotlib.pyplot as plt
x = pd.Series(acc,index=nc)
x.plot()

plt.title('Confidence vs Accuracy')
plt.xlabel('Confidence')
plt.ylabel('Accuracy')
plt.show()