In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import joblib
import pickle

In [3]:
# Load the network-flow dataframe from its pickle file.
# (The train/test split is performed in a later cell, not here.)
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df_name = "Network flow dataframe"
df = pd.read_pickle(df_name)
# Bare expression: lets the notebook render the loaded frame as the cell output.
df

Unnamed: 0,Port src,Port dst,Bytes,Duration(ms),Max PS,IP_app_converted,Category
0,38735.0,41292.0,817.0,1312.0,146.0,3303860334,Network
1,46198.0,443.0,1505.0,96.0,80.0,2398795850,Web
2,10505.0,443.0,1314.0,1261.0,78.0,3303353100,Web
3,49385.0,1813.0,2449.0,1292.0,409.0,168564441,Network
4,34851.0,443.0,105792.0,1321.0,1392.0,1682156403,Cloud
...,...,...,...,...,...,...,...
111685,38736.133724,3478.0,1141.615583,1081.140636,172.674672,1682212122,VoIP
111686,37993.304231,3478.0,16364.844054,1200.353032,1025.658017,1682708511,VoIP
111687,57361.334219,3478.0,1330.849653,1166.172697,259.415041,1682735925,VoIP
111688,49968.43213,3478.0,34398.806709,1295.878886,1062.771468,1682012299,VoIP


In [4]:
# Separate the feature matrix from the target label.
X = df.drop(columns=['Category'])
y = df['Category']

# Hold out 20% of the rows for testing.
# - random_state pins the shuffle so Restart & Run All reproduces the same
#   partition (and therefore the same reported metrics).
# - stratify=y keeps the per-class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Sanity check: samples per class in the full target column.
y.value_counts()

Streaming        11169
Email            11169
Web              11169
VoIP             11169
Network          11169
Game             11169
Cloud            11169
SocialNetwork    11169
Chat             11169
Others           11169
Name: Category, dtype: int64

In [3]:
# Base random-forest estimator handed to the grid search below.
# random_state is fixed so that bootstrap sampling and feature sub-sampling
# are repeatable; without it the selected hyperparameters and the reported
# scores change from run to run.
rfc = RandomForestClassifier(random_state=42)

In [4]:
# Hyperparameter tuning: exhaustive grid search over the random forest.
# Each key maps to the candidate values to try; all combinations are evaluated
# (1 * 4 * 2 * 3 * 3 * 4 = 288 candidates).
parameters_rfc = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000],
}

# 3-fold cross-validation per candidate; n_jobs=-1 requests all available
# processors and verbose=2 logs progress for the individual fits.
grid_search_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=parameters_rfc,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training partition only.
grid_search_rfc.fit(X_train, y_train)

# Keep the estimator configured with the best-scoring combination and show it.
best_rfc = grid_search_rfc.best_estimator_
print(best_rfc)

Fitting 3 folds for each of 288 candidates, totalling 864 fits
RandomForestClassifier(max_depth=110, max_features=2, min_samples_leaf=3,
                       min_samples_split=12)


In [5]:
# Model verification and validation: score the tuned forest on the training
# partition and on the held-out test partition.
def _as_percent(fraction):
    """Return a score scaled by 100 as a string with two decimals."""
    return "{:.2f}".format(fraction * 100)


y_train_pred = best_rfc.predict(X_train)
y_test_pred = best_rfc.predict(X_test)

# Metrics are stored as already-formatted strings.
rfc_train_accuracy = _as_percent(accuracy_score(y_train, y_train_pred))
rfc_test_accuracy = _as_percent(accuracy_score(y_test, y_test_pred))
# F1 and precision are averaged across classes, weighted by class support.
rfc_f1_score = _as_percent(f1_score(y_test, y_test_pred, average='weighted'))
rfc_precision_score = _as_percent(precision_score(y_test, y_test_pred, average='weighted'))

# One metric per line; the "\n " separator keeps the report layout unchanged
# (each line after the first starts with a single space).
report_lines = [
    f"Train Accuracy {rfc_train_accuracy} %",
    f"Test Accuracy {rfc_test_accuracy} %",
    f"F1 Score {rfc_f1_score} %",
    f"Precision {rfc_precision_score} %",
]
print("\n ".join(report_lines) + "\n")

Train Accuracy 97.23 %
 Test Accuracy 96.61 %
 F1 Score 96.61 %
 Precision 96.99 %



In [6]:
# Serialize the fitted best estimator to a file with joblib.
filename = "RF_Model.pkl"
# joblib.dump is the last expression, so its return value is shown as the cell output.
joblib.dump(value=best_rfc, filename=filename)

['RF_Model.pkl']