In [1]:
import torch
import torchaudio
from torchaudio import transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn. functional as F
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
import numpy as np
from pathlib import Path
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,classification_report
from torch.optim.lr_scheduler import LinearLR

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
CONFIGURATION= {'LEARNING_RATE': 0.001,
                'EPOCHS': 50,
                'ADAM_BETAS': (0.9, 0.999),
                'ADAM_EPS': 1e-07,
                'SEED': 42,
                'BATCH_SIZE': 16,
                'LINEAR_LR': False,
                'DROPOUTS': 0.1,
                'L2_REGULARISATION': 0.01
                }

In [23]:
from clearml import Task, Logger

task= Task.init(project_name='profanity_detection', task_name='adima_training_try')

ClearML Task: overwriting (reusing) task id=e9e10bfb460f4b089c68c00a0fa5d4c8
ClearML results page: http://13.233.63.202:8080/projects/1d29852235204d9098563d5acdf7d416/experiments/e9e10bfb460f4b089c68c00a0fa5d4c8/output/log


In [37]:
task.connect(CONFIGURATION)

{'LEARNING_RATE': 0.001,
 'EPOCHS': 50,
 'ADAM_BETAS': (0.9, 0.999),
 'ADAM_EPS': 1e-07,
 'SEED': 42,
 'BATCH_SIZE': 16,
 'LINEAR_LR': False,
 'DROPOUTS': 0.1,
 'L2_REGULARISATION': 0.01}

In [7]:
task.close()

NameError: name 'task' is not defined

### Data Pipeline

In [4]:
def time_shift(aud, shift_limit):
    sig, sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    return (sig.roll(shift_amt), sr)


def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
        aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
        aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)
    return aug_spec

class AudioDataset (Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.annotations = pd. read_csv (csv_file)
        self.root_dir = root_dir
        self.transform = transform
        
    def __len__(self) :
        return len(self.annotations)

    def __getitem__(self, index) :

        file_path = os.path.join(self. root_dir, self.annotations. iloc[index, 0])
        #print(file_path)
        audio_feature= np.load(file_path)
        
        y_label = self .annotations. iloc [index, 1]
        
        if self.transform:
            audio_feature = self.transform(audio_feature)
        
        audio_feature=torch.from_numpy(audio_feature)

        audio_feature, sr = time_shift((audio_feature, 16000), 0.1)
        audio_feature = audio_feature.unsqueeze(0)
        audio_feature = spectro_augment(audio_feature, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
        audio_feature = audio_feature.squeeze()

        max_pool, _ = torch.max(audio_feature, dim=1)

        return (max_pool, y_label)
    

combined_dataset = AudioDataset (csv_file = str(Path.home())+'/Daniyal/profanity_detection/data/features/train_val.csv', root_dir =str(Path.home())+ "/Daniyal/profanity_detection/data/features/")
combined_loader = DataLoader (dataset=combined_dataset, batch_size=CONFIGURATION['BATCH_SIZE'], shuffle=True)#, sampler=train_sampler)


#train_dataset = AudioDataset (csv_file = str(Path.home())+'/Daniyal/profanity_detection/data/features/train/np_train_oversampled.csv', root_dir =str(Path.home())+ "/Daniyal/profanity_detection/data/features/train/")
train_dataset = AudioDataset (csv_file = str(Path.home())+'/Daniyal/profanity_detection/data/features/train/np_train.csv', root_dir =str(Path.home())+ "/Daniyal/profanity_detection/data/features/train/")
val_dataset = AudioDataset (csv_file = str(Path.home())+'/Daniyal/profanity_detection/data/features/val/np_val.csv', root_dir = str(Path.home())+ "/Daniyal/profanity_detection/data/features/val/")

train_loader = DataLoader (dataset=train_dataset, batch_size=16, shuffle=True)#, sampler=train_sampler)
val_loader = DataLoader (dataset=val_dataset, batch_size=16, shuffle=True)#, sampler=val_sampler)

test_dataset = AudioDataset (csv_file = str(Path.home())+'/Daniyal/profanity_detection/data/features/test/np_test.csv', root_dir =str(Path.home())+ "/Daniyal/profanity_detection/data/features/test/")
test_loader = DataLoader(dataset=test_dataset, batch_size=CONFIGURATION['BATCH_SIZE'], shuffle=True)


In [5]:
len(train_loader), len(val_loader), len(test_loader), len(combined_loader)

(44, 8, 24, 52)

### Model

In [6]:
#Sequential model
class NN(nn.Module): #inherit n module
    def __init__(self) :
        super(NN, self).__init__()
        self.fc1 = nn. Linear (512, 256)
        self.d1= nn. Dropout (p=CONFIGURATION['DROPOUTS'])
        self. fc2 = nn. Linear (256, 128)
        self.d2= nn. Dropout (p=CONFIGURATION['DROPOUTS'])
        self. fc3 = nn. Linear (128, 2)

        nn.init.kaiming_normal_(self.fc1.weight, nonlinearity='relu')
        nn.init.kaiming_normal_(self.fc2.weight, nonlinearity='relu')
        nn.init.kaiming_normal_(self.fc3.weight, nonlinearity='relu')


    def forward (self, x):
        x = self.d1(F.gelu(self.fc1(x) ) )
        x = self.d2(F.gelu(self. fc2(x) ) )
        x= self. fc3(x)
        return x

In [7]:
model= NN().to(device)
print(model)

NN(
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (d1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (d2): Dropout(p=0.1, inplace=False)
  (fc3): Linear(in_features=128, out_features=2, bias=True)
)


### Training

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(),
                        lr=CONFIGURATION['LEARNING_RATE'], 
                        betas=CONFIGURATION['ADAM_BETAS'], 
                        eps=CONFIGURATION['ADAM_EPS'], 
                        weight_decay=CONFIGURATION['L2_REGULARISATION'])
scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.5, total_iters=50)

train_accuracy_list=[]
val_accuracy_list=[]
train_precision_list=[]
val_precision_list=[]
train_recall_list=[]
val_recall_list=[]
loss_list=[]
val_loss_list=[]

In [9]:
len(train_loader)

57

In [48]:
for epoch in range(CONFIGURATION['EPOCHS']):
  model.train()

  print(f"epoch: {epoch+1}/{CONFIGURATION['EPOCHS']}")
  
  pred=torch.tensor([]).to(device=device)
  label=torch.tensor([]).to(device=device)
  running_loss = 0.0

  for batch_idx, (data, targets) in enumerate(train_loader): #enumerate(combined_loader):

    data = data.to(device=device)
    targets = targets.to(device=device)

    data = data.reshape (data.shape [0], -1)

    scores = model(data)
    loss = criterion(scores, targets) 

    
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    running_loss += loss.item()
    _, prediction = torch.max(scores, 1)
    pred=torch.cat([pred, prediction])
    label=torch.cat([label, targets])

  num_batches = len(train_loader) #len(combined_loader)
  avg_loss = running_loss / num_batches

  loss_list.append(avg_loss)


  train_acc=accuracy_score(label.cpu().detach().numpy(), pred.cpu().detach().numpy())
  train_accuracy_list.append(train_acc)
  train_precision= precision_score(label.cpu().detach().numpy(), pred.cpu().detach().numpy(), zero_division=0, pos_label=1)
  train_precision_list.append(train_precision)
  train_recall=recall_score(label.cpu().detach().numpy(), pred.cpu().detach().numpy(), pos_label=1)
  train_recall_list.append(train_recall)
  macro_f1=f1_score(label.cpu().detach().numpy(), pred.cpu().detach().numpy(), average='macro')

  #print(f'Training: Epoch: {epoch}, loss: {avg_loss:.2f}, train_accuracy: {accuracy(pred, label):.2f}, train_precision: {precision(pred, label): .2f}, train_recall: {recall(pred, label): .2f},')
  print(f'Training: Epoch: {epoch}, loss: {avg_loss}, train_accuracy: {train_acc:.2f}, train_precision: {train_precision: .2f}, train_recall: {train_recall: .2f}, train_f1_score: {macro_f1}, ')


  val_pred=torch.tensor([]).to(device=device)
  val_label=torch.tensor([]).to(device=device)
  val_running_loss = 0.0

  model.eval()
  with torch.no_grad(): 
    for batch_idx, (data, targets) in enumerate(val_loader):

      data = data.to(device=device)
      targets = targets.to(device=device)

      #data = data.reshape (data.shape [0], -1)

      scores = model(data)
      val_loss = criterion(scores, targets)
      val_running_loss += val_loss.item()

      _, prediction = torch.max(scores, 1)

      val_pred=torch.cat([val_pred, prediction])
      val_label=torch.cat([val_label, targets])


    num_val_batches = len(val_loader)
    avg_val_loss = val_running_loss / num_val_batches
    val_loss_list.append(avg_val_loss)

    val_acc=accuracy_score(val_label.cpu().detach().numpy(), val_pred.cpu().detach().numpy() )
    val_accuracy_list.append(val_acc)
    val_precision=precision_score(val_label.cpu().detach().numpy(), val_pred.cpu().detach().numpy(), zero_division=0, pos_label=1 )
    val_precision_list.append(val_precision)
    val_recall=recall_score(val_label.cpu().detach().numpy(), val_pred.cpu().detach().numpy(), pos_label=1 )
    val_recall_list.append(val_recall)
    macro_f1 = f1_score(val_label.cpu().detach().numpy(), val_pred.cpu().detach().numpy(), average='macro')

    #print(f'val_accuracy: {accuracy(val_pred, val_label):.2f}, val_precision: {precision(val_pred, val_label): .2f}, val_recall: {recall(val_pred, val_label): .2f},')
    print(f"val_loss: {avg_val_loss}, val_accuracy: {val_acc:.2f}, val_precision: {val_precision: .2f}, val_recall: {val_recall: .2f}, val_f1_score: {macro_f1}, lr: {optimizer.param_groups[0]['lr']}")

  Logger.current_logger().report_scalar("Loss", "train", iteration=epoch, value=avg_loss)
  Logger.current_logger().report_scalar("Loss", "val", iteration=epoch, value=avg_val_loss)
  Logger.current_logger().report_scalar("Accuracy", "train", iteration=epoch, value=train_acc)
  Logger.current_logger().report_scalar("Accuracy", "val", iteration=epoch, value=val_acc)
  Logger.current_logger().report_scalar("Precision", "train", iteration=epoch, value=train_precision)
  Logger.current_logger().report_scalar("Precision", "val", iteration=epoch, value=val_precision)
  Logger.current_logger().report_scalar("Recall", "train", iteration=epoch, value=train_recall)
  Logger.current_logger().report_scalar("Recall", "val", iteration=epoch, value=val_precision)

  if CONFIGURATION['LINEAR_LR']:
    scheduler.step()
  


epoch: 1/50
Training: Epoch: 0, loss: 0.5942090734055168, train_accuracy: 0.68, train_precision:  0.66, train_recall:  0.73, train_f1_score: 0.678728609370282, 
val_loss: 0.5719836950302124, val_accuracy: 0.66, val_precision:  0.52, val_recall:  0.59, val_f1_score: 0.638437849944009, lr: 0.001
epoch: 2/50
Training: Epoch: 1, loss: 0.5303530410716408, train_accuracy: 0.72, train_precision:  0.71, train_recall:  0.76, train_f1_score: 0.7235305196219991, 
val_loss: 0.5675017721951008, val_accuracy: 0.64, val_precision:  0.50, val_recall:  0.57, val_f1_score: 0.6212206047032475, lr: 0.001
epoch: 3/50
Training: Epoch: 2, loss: 0.4936559367598149, train_accuracy: 0.76, train_precision:  0.75, train_recall:  0.78, train_f1_score: 0.7559775898790464, 
val_loss: 0.7162769660353661, val_accuracy: 0.56, val_precision:  0.44, val_recall:  0.82, val_f1_score: 0.5607142857142857, lr: 0.001
epoch: 4/50
Training: Epoch: 3, loss: 0.4703150459548883, train_accuracy: 0.77, train_precision:  0.75, train_r

In [60]:
from datetime import datetime
now = datetime.now()
dt_string = now.strftime("%d%m%Y_%H%M")

torch.save(model.state_dict(),f'/home/ubuntu/Daniyal/work/model/model_trained_on_{dt_string}.pt')

In [53]:
#test performance
model.eval()
test_pred=torch.tensor([]).to(device=device)
test_label=torch.tensor([]).to(device=device)

model.eval()
with torch.no_grad(): 
    for batch_idx, (data, targets) in enumerate(test_loader):

        data = data.to(device=device)
        targets = targets.to(device=device)
        scores = model(data)

        _, prediction = torch.max(scores, 1)

        test_pred=torch.cat([test_pred, prediction])
        test_label=torch.cat([test_label, targets])

    test_acc=accuracy_score(test_label.cpu().detach().numpy(), test_pred.cpu().detach().numpy() )
    test_precision=precision_score(test_label.cpu().detach().numpy(), test_pred.cpu().detach().numpy(), zero_division=0 )
    test_recall=recall_score(test_label.cpu().detach().numpy(), test_pred.cpu().detach().numpy() )

    #print(f'val_accuracy: {accuracy(val_pred, val_label):.2f}, val_precision: {precision(val_pred, val_label): .2f}, val_recall: {recall(val_pred, val_label): .2f},')
    print(f"test_accuracy: {val_acc:.2f}, test_precision: {val_precision: .2f}, test_recall: {val_recall: .2f}")


test_accuracy: 0.68, test_precision:  0.55, test_recall:  0.66


### Sklearn models

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [28]:
X_train=[]
y_train=[]
X_val=[]
y_val=[]
X_test=[]
y_test=[]

for au, l in combined_loader:
    X_train.append(au.cpu().detach().numpy())
    y_train.append(l.cpu().detach().numpy())

temp1, temp2= X_train[0],y_train[0]
for i, j in zip(X_train[1:], y_train[1:]):
    temp1=np.concatenate((temp1, i), axis=0)
    temp2=np.concatenate((temp2, j), axis=0)
X_train=temp1
y_train=temp2

for au, l in val_loader:
    X_val.append(au.cpu().detach().numpy())
    y_val.append(l.cpu().detach().numpy())

temp1, temp2= X_val[0],y_val[0]
for i, j in zip(X_val[1:], y_val[1:]):
    temp1=np.concatenate((temp1, i), axis=0)
    temp2=np.concatenate((temp2, j), axis=0)
X_val=temp1
y_val=temp2

for au, l in test_loader:
    X_test.append(au.cpu().detach().numpy())
    y_test.append(l.cpu().detach().numpy())

temp1, temp2= X_test[0],y_test[0]
for i, j in zip(X_test[1:], y_test[1:]):
    temp1=np.concatenate((temp1, i), axis=0)
    temp2=np.concatenate((temp2, j), axis=0)
X_test=temp1
y_test=temp2

X_val.shape, y_val.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape


((123, 512), (123,), (819, 512), (819,), (369, 512), (369,))

#### PCA

In [29]:
#PCA
#2d
pca=PCA(n_components=2)
X_train_trf=pca.fit_transform(X_train)
X_val_trf=pca.fit_transform(X_val)
X_test_trf=pca.fit_transform(X_test)
import plotly.express as px

fig = px.scatter(x=X_train_trf[:,0], y=X_train_trf[:,1],
              color=y_train)
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.show()

#### SMOTE

In [12]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority', k_neighbors=5)
X_sm, y_sm = smote.fit_resample(X_train, y_train)

In [13]:
# smote = SMOTE(sampling_strategy='all', k_neighbors=10)
# X_smm, y_smm = smote.fit_resample(X_sm, y_sm)

In [14]:
print(np.sum(y_sm==0))
print(np.sum(y_sm==1))
X_sm.shape,X_train.shape

530
530


((1060, 512), (819, 512))

In [15]:
from sklearn.decomposition import PCA
import plotly.express as px
pca=PCA(n_components=2)
X_train_trf=pca.fit_transform(X_sm)


fig = px.scatter(x=X_train_trf[:,0], y=X_train_trf[:,1],
              color=y_sm)
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.show()

#### RF

In [30]:
param_grid = {'n_estimators': [50,100,150], #1000],
               'max_depth': [5, 10, 50, 100],# 230, 340],
               'min_samples_split':[5, 10,14],
               'min_samples_leaf' : [4, 6, 8, 16],
               'criterion':['entropy','gini']
             }

clf_RF = RandomForestClassifier()

rf_grid = RandomizedSearchCV(estimator = clf_RF,
                       param_distributions = param_grid,
                       cv = 5, #for every combination 5 folds done to get better estimation
                       verbose=1,
                       n_jobs = -1,
                       random_state=42,
                       n_iter=100)
rf_grid.fit(X_sm,y_sm)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [31]:
print(rf_grid.best_params_)
print(rf_grid.best_score_)
best_grid=rf_grid.best_estimator_
best_grid

{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 100, 'criterion': 'entropy'}
0.8141509433962264


In [32]:
y_pred = best_grid.predict(X_train)
print('Train')
print(f"accuracy: {accuracy_score(y_train,y_pred)}, precision: {precision_score(y_train, y_pred, pos_label=1) }, recall: {recall_score(y_train, y_pred, pos_label=1 )}, f1_score: {f1_score(y_train, y_pred, average='macro')}")
print(confusion_matrix(y_train,y_pred))
print("Classification report: {}".format(classification_report(y_train,y_pred)))

Train
accuracy: 0.8681318681318682, precision: 0.7784615384615384, recall: 0.8754325259515571, f1_score: 0.8593177422638436
[[458  72]
 [ 36 253]]
Classification report:               precision    recall  f1-score   support

           0       0.93      0.86      0.89       530
           1       0.78      0.88      0.82       289

    accuracy                           0.87       819
   macro avg       0.85      0.87      0.86       819
weighted avg       0.87      0.87      0.87       819



In [33]:
y_pred = best_grid.predict(X_test)
print('Test')
print(f"test accuracy: {accuracy_score(y_test,y_pred)}, precision: {precision_score(y_test, y_pred, pos_label=1 )}, recall: {recall_score(y_test, y_pred, pos_label=1 )}, f1_score: {f1_score(y_test, y_pred, average='macro')}")
print(confusion_matrix(y_test,y_pred))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

Test
test accuracy: 0.7588075880758808, precision: 0.6081871345029239, recall: 0.8253968253968254, f1_score: 0.7492613206898922
[[176  67]
 [ 22 104]]
Classification report:               precision    recall  f1-score   support

           0       0.89      0.72      0.80       243
           1       0.61      0.83      0.70       126

    accuracy                           0.76       369
   macro avg       0.75      0.77      0.75       369
weighted avg       0.79      0.76      0.76       369



In [34]:
y_pred = best_grid.predict(X_val)
print('Val')
print(f"val accuracy: {accuracy_score(y_val,y_pred)}, precision: {precision_score(y_val, y_pred, pos_label=1 )}, recall: {recall_score(y_val, y_pred, pos_label=1 )}, f1_score: {f1_score(y_val, y_pred, average='macro')}")
print(confusion_matrix(y_val,y_pred))
print("Classification report: {}".format(classification_report(y_val,y_pred)))

Val
val accuracy: 0.8780487804878049, precision: 0.8222222222222222, recall: 0.8409090909090909, f1_score: 0.867959636441709
[[71  8]
 [ 7 37]]
Classification report:               precision    recall  f1-score   support

           0       0.91      0.90      0.90        79
           1       0.82      0.84      0.83        44

    accuracy                           0.88       123
   macro avg       0.87      0.87      0.87       123
weighted avg       0.88      0.88      0.88       123



In [38]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

rf= RandomForestClassifier(n_estimators= 100, min_samples_split= 5, min_samples_leaf= 4, max_depth= 100, criterion= 'entropy')
scorers = {
    'accuracy': 'accuracy',
    'precision_class_1': make_scorer(precision_score, pos_label=1),
    'recall_class_1': make_scorer(recall_score, pos_label=1),
    'macro_f1': make_scorer(f1_score, average='macro')
}

scores = cross_validate(rf, X_train, y_train, cv=10, scoring=scorers)

# Print the results
print("Cross Val score")
print("Accuracy: ", np.mean(scores['test_accuracy']))
print("Precision (Class 1): ", np.mean(scores['test_precision_class_1']))
print("Recall (Class 1): ", np.mean(scores['test_recall_class_1']))
print("Macro F1 Score: ", np.mean(scores['test_macro_f1']))


Cross Val score
Accuracy:  0.7619241192411924
Precision (Class 1):  0.7135573973664797
Recall (Class 1):  0.560344827586207
Macro F1 Score:  0.7248028467440206


#### SVM

In [None]:
param_grid = {'C': [1,10,50,100],
               'kernel': ['poly','linear','rbf','precomputer'],
             }

clf_svm = SVC()

svm_grid = GridSearchCV(estimator = clf_svm,
                       param_grid = param_grid,
                       cv = 5, #for every combination 5 folds done to get better estimation
                       verbose=1,
                       n_jobs = -1)
svm_grid.fit(X_sm,y_sm)

In [40]:
print(svm_grid.best_params_)
print(svm_grid.best_score_)
best_grid=svm_grid.best_estimator_
best_grid

{'C': 10, 'kernel': 'poly'}
0.8424528301886791


In [41]:
y_pred = best_grid.predict(X_train)
print('Train')
print(f"accuracy: {accuracy_score(y_train,y_pred)}, precision: {precision_score(y_train, y_pred, pos_label=1) }, recall: {recall_score(y_train, y_pred, pos_label=1 )}, f1_score: {f1_score(y_train, y_pred, average='macro')}")
print(confusion_matrix(y_train,y_pred))
print("Classification report: {}".format(classification_report(y_train,y_pred)))

Train
accuracy: 0.8168498168498168, precision: 0.7622641509433963, recall: 0.698961937716263, f1_score: 0.7954327467462401
[[467  63]
 [ 87 202]]
Classification report:               precision    recall  f1-score   support

           0       0.84      0.88      0.86       530
           1       0.76      0.70      0.73       289

    accuracy                           0.82       819
   macro avg       0.80      0.79      0.80       819
weighted avg       0.81      0.82      0.81       819



In [42]:
y_pred = best_grid.predict(X_test)
print('Test')
print(f"test accuracy: {accuracy_score(y_test,y_pred)}, precision: {precision_score(y_test, y_pred, pos_label=1 )}, recall: {recall_score(y_test, y_pred, pos_label=1 )}, f1_score: {f1_score(y_test, y_pred, average='macro')}")
print(confusion_matrix(y_test,y_pred))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

Test
test accuracy: 0.7100271002710027, precision: 0.5748031496062992, recall: 0.5793650793650794, f1_score: 0.6782282710565992
[[189  54]
 [ 53  73]]
Classification report:               precision    recall  f1-score   support

           0       0.78      0.78      0.78       243
           1       0.57      0.58      0.58       126

    accuracy                           0.71       369
   macro avg       0.68      0.68      0.68       369
weighted avg       0.71      0.71      0.71       369



In [43]:
y_pred = best_grid.predict(X_val)
print('Val')
print(f"val accuracy: {accuracy_score(y_val,y_pred)}, precision: {precision_score(y_val, y_pred, pos_label=1 )}, recall: {recall_score(y_val, y_pred, pos_label=1 )}, f1_score: {f1_score(y_val, y_pred, average='macro')}")
print(confusion_matrix(y_val,y_pred))
print("Classification report: {}".format(classification_report(y_val,y_pred)))

Val
val accuracy: 0.8536585365853658, precision: 0.8095238095238095, recall: 0.7727272727272727, f1_score: 0.8390988372093022
[[71  8]
 [10 34]]
Classification report:               precision    recall  f1-score   support

           0       0.88      0.90      0.89        79
           1       0.81      0.77      0.79        44

    accuracy                           0.85       123
   macro avg       0.84      0.84      0.84       123
weighted avg       0.85      0.85      0.85       123



In [44]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

svm= SVC(C= 10, kernel= 'poly')
scorers = {
    'accuracy': 'accuracy',
    'precision_class_1': make_scorer(precision_score, pos_label=1),
    'recall_class_1': make_scorer(recall_score, pos_label=1),
    'macro_f1': make_scorer(f1_score, average='macro')
}

scores = cross_validate(svm, X_train, y_train, cv=10, scoring=scorers)

# Print the results
print("Cross Val score")
print("Accuracy: ", np.mean(scores['test_accuracy']))
print("Precision (Class 1): ", np.mean(scores['test_precision_class_1']))
print("Recall (Class 1): ", np.mean(scores['test_recall_class_1']))
print("Macro F1 Score: ", np.mean(scores['test_macro_f1']))


Cross Val score
Accuracy:  0.7216199939777175
Precision (Class 1):  0.6264490408858165
Recall (Class 1):  0.536576354679803
Macro F1 Score:  0.6838045535993877


#### TPOT

In [35]:
from tpot import TPOTClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.model_selection import RandomizedSearchCV

#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators=[50,100,150]
max_features = ['log2', 'sqrt']

# max_depth = [int(x) for x in np.linspace(10, 1000,10)]
max_depth=[10,50,100]
# min_samples_split = [2, 5, 10,14]
min_samples_split=[5,10]
# min_samples_leaf = [1, 2, 4,6,8]
min_samples_leaf=[4,8]
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy', 'gini']}
# param = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#               'criterion':['entropy','gini']}
print(param)

tpot_classifier = TPOTClassifier(generations= 5, population_size= 50, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param},
                                 cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42),
                                 scoring = 'accuracy')
tpot_classifier.fit(X_train,y_train)

{'n_estimators': [50, 100, 150], 'max_features': ['log2', 'sqrt'], 'max_depth': [10, 50, 100], 'min_samples_split': [5, 10], 'min_samples_leaf': [4, 8], 'criterion': ['entropy', 'gini']}


Optimization Progress:   0%|          | 0/110 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.809119496855346

Generation 2 - Current best internal CV score: 0.809119496855346

Generation 3 - Current best internal CV score: 0.809119496855346

Generation 4 - Current best internal CV score: 0.809119496855346

Generation 5 - Current best internal CV score: 0.809119496855346

Best pipeline: RandomForestClassifier(CombineDFs(input_matrix, input_matrix), criterion=entropy, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=150)


In [37]:
accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)

0.7560975609756098


In [39]:
print(tpot_classifier.score(X_train, y_train))

1.0


In [38]:
tpot_classifier.export('tpot_best_model.py')