In [20]:
#1.1

#Sources:
#https://scikit-learn.org/stable/modules/tree.html
#https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
#https://www.datacamp.com/tutorial/decision-tree-classification-python


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the training and test sets (paths are relative to the working directory).
traindata = pd.read_csv("hw4Train.csv")
testdata = pd.read_csv("hw4Test.csv")

# In both files every column except the last is used as a feature and the
# last column is used as the target.
X, y = traindata.iloc[:, :-1], traindata.iloc[:, -1]
Xtest, ytest = testdata.iloc[:, :-1], testdata.iloc[:, -1]

# Hold out 20% of the training data as a validation set; the fixed seed keeps
# the split reproducible between runs.
Xtrain, Xval, ytrain, yval = train_test_split(
    X, y, test_size=0.2, random_state=42
)

maxdepth = 10
predefinedAcc = []  # filled below with the test accuracy of each fixed-depth tree


def _evaluate_depth(max_depth):
    """Fit a decision tree with the given depth limit and score it on the test set.

    Uses the notebook-level ``Xtrain``/``ytrain`` for fitting and
    ``Xtest``/``ytest`` for scoring, and prints the same two lines the
    individual copy-pasted blocks used to print.

    Parameters
    ----------
    max_depth : int or None
        Depth limit passed to ``DecisionTreeClassifier``. ``None`` leaves the
        depth unlimited (the estimator's default).

    Returns
    -------
    tuple
        ``(fitted_tree, test_predictions, test_accuracy)``.
    """
    label = "no" if max_depth is None else str(max_depth)
    print(f"{label.capitalize()} max depth")
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    tree.fit(Xtrain, ytrain)
    predictions = tree.predict(Xtest)
    test_accuracy = accuracy_score(ytest, predictions)
    print(f'Accuracy on test set {label} max depth: {test_accuracy:.8f}')
    return tree, predictions, test_accuracy


# Baselines: an unlimited tree, then depth limits of 10 and 25. The per-depth
# variable names are kept so that any other cell referring to them still works.
tree0, testpred0, accuracytest0 = _evaluate_depth(None)
predefinedAcc.append(accuracytest0)

tree10, testpred10, accuracytest10 = _evaluate_depth(10)
predefinedAcc.append(accuracytest10)

tree25, testpred25, accuracytest25 = _evaluate_depth(25)
predefinedAcc.append(accuracytest25)

#Searching for best depth
def depthsearch(Xtrain, ytrain, Xval, yval, depths=range(1, 25)):
    """Return the ``max_depth`` with the highest validation accuracy.

    One ``DecisionTreeClassifier(random_state=42)`` is fitted on
    ``(Xtrain, ytrain)`` per candidate depth and scored with
    ``accuracy_score`` on ``(Xval, yval)``.

    Parameters
    ----------
    Xtrain, ytrain
        Training features and labels.
    Xval, yval
        Validation features and labels used to compare the candidates.
    depths : iterable of int, optional
        Candidate depths to try, in order. Defaults to ``range(1, 25)``,
        i.e. depths 1 through 24 (the range the search has always used).

    Returns
    -------
    int or None
        The first depth that reaches the best validation accuracy (ties keep
        the earlier, shallower candidate). ``None`` only if ``depths`` is empty.
    """
    best_depth = None
    # Start below any possible accuracy so that a candidate is still selected
    # when every tree scores 0.0; otherwise None would be returned and later
    # be interpreted as "unlimited depth".
    best_accuracy = float("-inf")
    for depth in depths:
        tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
        tree.fit(Xtrain, ytrain)
        accuracy = accuracy_score(yval, tree.predict(Xval))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_depth = depth
    return best_depth


# Tune max_depth on the validation split, then refit a tree at that depth on
# the training split and report its accuracy on the test set.
best_max_depth = depthsearch(Xtrain, ytrain, Xval, yval)
print(f'Best max_depth found: {best_max_depth}')

tree2 = DecisionTreeClassifier(max_depth=best_max_depth, random_state=42).fit(Xtrain, ytrain)
testpred = tree2.predict(Xtest)
accuracy = accuracy_score(ytest, testpred)
print(f'Accuracy on test set with optimal depth: {accuracy:.8f}')





No max depth
Accuracy on test set no max depth: 0.99147430
10 max depth
Accuracy on test set 10 max depth: 0.95121287
25 max depth
Accuracy on test set 25 max depth: 0.99180946
Best max_depth found: 23
Accuracy on test set with optimal depth: 0.99193515


### 1.1a & 1.2 
Accuracy on test set no max depth: 0.99147430

Accuracy on test set with optimal depth(23): 0.99193515

As we can see from these results, the maximum depth influences accuracy, but only very slightly when the depth is in the twenties. A max depth of 10, however, reduced the accuracy much more.



Code for attempting to combine multiple tuned hyperparameters. However, using the optimal max depth found earlier together with the optimal feature count resulted in lower accuracy, indicating that the relationship between the hyperparameters is not linear.


In [18]:
#Searching for best feature count, not used
#Source:
#https://scikit-learn.org/stable/modules/tree.html
def featuresearch(Xtrain, ytrain, Xval, yval, max_depth=None):
    """Return the ``max_features`` value with the highest validation accuracy.

    One ``DecisionTreeClassifier(random_state=42)`` is fitted on
    ``(Xtrain, ytrain)`` per candidate feature count and scored with
    ``accuracy_score`` on ``(Xval, yval)``.

    Parameters
    ----------
    Xtrain, ytrain
        Training features and labels. ``Xtrain`` must expose ``.shape``
        (e.g. a DataFrame or array) so the number of columns can be read.
    Xval, yval
        Validation features and labels used to compare the candidates.
    max_depth : int, optional
        Depth limit for every candidate tree. When omitted, the notebook-level
        ``best_max_depth`` is used, so the cell that computes it must have
        been run first.

    Returns
    -------
    int or None
        The first feature count that reaches the best validation accuracy.
        ``None`` only if ``Xtrain`` has no columns.
    """
    if max_depth is None:
        # Backward compatible default: reuse the result of the depth search.
        max_depth = best_max_depth

    # Try 1..49 features as before, but never more than the data actually has;
    # larger values are not meaningful feature counts.
    upper = min(49, Xtrain.shape[1])

    best_featurecount = None
    # Start below any possible accuracy so a candidate is selected even when
    # every tree scores 0.0.
    best_accuracy = float("-inf")
    for featurecount in range(1, upper + 1):
        tree = DecisionTreeClassifier(
            max_depth=max_depth, random_state=42, max_features=featurecount
        )
        tree.fit(Xtrain, ytrain)
        accuracy = accuracy_score(yval, tree.predict(Xval))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_featurecount = featurecount
    return best_featurecount


#best_featurecount_found = featuresearch(Xtrain, ytrain, Xval, yval)
#print(f'Best feature count found: {best_featurecount_found}')
#tree4 = DecisionTreeClassifier(max_depth=best_max_depth, random_state=42, max_features=best_featurecount_found)
#tree4.fit(Xtrain, ytrain)
#testpred = tree4.predict(Xtest)
#accuracy = accuracy_score(ytest, testpred)
#print(f'Accuracy on test set with optimal featurecount: {accuracy:.4f}')
#print('Classification Report:')
#print(classification_report(ytest, testpred, zero_division=1))

#def leafsearch(Xtrain, ytrain, Xval, yval):
#    best_featurecount = None
#    best_accuracy = 0
#    for featurecount in range(1, 20):  # 
#        tree = DecisionTreeClassifier(max_depth=best_max_depth, random_state=42, max_features=featurecount)
#        tree.fit(Xtrain, ytrain)
#        valpred = tree.predict(Xval)
#        accuracy = accuracy_score(yval, valpred)
#        if accuracy > best_accuracy:
#            best_accuracy = accuracy
#            best_featurecount = featurecount
#    return best_featurecount

### 1.3

Two parameters that help prevent overfitting are max_depth and min_samples_split/min_samples_leaf.

Setting a max depth prevents the tree from becoming too complex and specialized on the training set, which would lead to poor generalization and reduced usefulness in the real world. Configuring min_samples_split or min_samples_leaf ensures that every decision in the tree is informed by multiple samples. Allowing a low number of samples per split will lead to overfitting.

In general, it is also useful to pre-process the data by reducing the number of features, which lowers the dimensionality of the data.

Source:
https://scikit-learn.org/stable/modules/tree.html

### 1.4

SVM may not be optimal for this dataset due to the high number of features, which would require kernel tuning. When working with datasets that have a large number of features, one should either use models like decision trees, which are less sensitive to high dimensionality, or apply robust feature selection and dimensionality reduction techniques before using SVM.


### 2.1

Feature selection is an important step in building a model for classifying data. It involves selecting a subset of relevant features for use in model construction, based on their importance and relevance to the target variable. Feature selection can achieve the following goals:
* Improved performance - removing unimportant features can in some cases lead to increase in performance in terms of accuracy and generalization on unseen data.
* Reducing overfitting - by removing unimportant features, noise will also be removed, which will reduce overfitting by making the model more focused on the general trends and patterns.
* Decreasing training time - fewer features lead to reduced complexity, which can reduce the time needed for the model to train and test on data.

A redundant feature is a feature that in theory contributes to the result, but in practice does not because it correlates with another, more important feature. An irrelevant feature is a feature that has no impact on the results and should be ignored.


### 2.2

Using WEKA attribute selection, I was able to rank the features of the hw4Train.csv dataset. My chosen evaluation methods were Information Gain and chi-square.

Information Gain:

<img src="infogain.png" alt="infogain" width="400"/>

Chi square:

<img src="chi.png" alt="chi square" width="400"/>

As we can see, these rankings broadly identify the same features as being more important, such as IAT and Tot size, while features like telnet and ARP have no impact on the results.

### 2.3

To test the effect of feature ranking on the effectiveness of the J48 decision tree, I removed all but the top 10 features from the Information Gain ranking and compared the results against using the full dataset.

Features included:

40  IAT

39  Tot size

42  Magnitue

37  AVG

34  Tot sum

35  Min

2   Header_Length

36  Max

3   Protocol Type

27  TCP

#### Results

=== Reduced Dataset ===

Time taken to build model: 2.96 seconds

Time taken to test model on supplied test set: 0.17 seconds
    
| Result | Number | Accuracy |
|---|---|---|
| Correctly Classified Instances | 47254 | 98.9861 % |
| Incorrectly Classified Instances  | 484  | 1.0139 %  |

=== Full Dataset ===

Time taken to build model: 11.87 seconds

Time taken to test model on supplied test set: 0.68 seconds
    
| Result | Number | Accuracy |
|---|---|---|
| Correctly Classified Instances | 47358 | 99.204  % |
| Incorrectly Classified Instances  | 380  | 0.796  %  |


As seen from the results, reducing the number of features led to a very slight reduction in accuracy, from 99.204 % to 98.986 %. While the new accuracy is still very high, whether this reduction is acceptable needs to be discussed when working with security data. The reduced dataset did, however, significantly reduce the time usage, both for building and for testing the model. Even though the original testing time (0.68 seconds) is already quite low, the new time (0.17 seconds) is four times as fast (a 75% reduction), which can be useful in latency-sensitive operations.
