# Initial Testing Features for OpenSmile

OpenSmile has a large and diverse feature set that may be useful for detecting audio-based deepfakes.

### Import Packages & Setup

In [None]:
import opensmile
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import warnings
warnings.filterwarnings("ignore")

### Load the Data

In [None]:
# Shared openSMILE extractor: ComParE 2016 feature set at the functionals level.
smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.ComParE_2016,
)

In [None]:
def load_files(file_paths, label=None):
    '''
    Extract openSMILE features for every wav file and stack them in one table.

    Each path is processed with the module-level `smile` extractor and the
    per-file frames are concatenated in input order.

    Parameters:
    file_paths: list of file paths for wav files
    label: optional 1 or 0 label categorization; when given, it is written to
        a 'label' column on every row. Omit it to get the features only.

    Returns:
    DataFrame with one block of rows per input file (empty if `file_paths`
    is empty).
    '''

    frames = [smile.process_file(file_path) for file_path in file_paths]

    # Concatenate once instead of growing a frame inside the loop; pd.concat
    # rejects an empty list, so fall back to an empty frame explicitly.
    results = pd.concat(frames) if frames else pd.DataFrame()

    if label is not None:
        results['label'] = label

    return results


In [None]:
def split_data(df, label_col_name='label', test_size=0.1, random_state=None):
    '''
    Split a labelled feature table into train and test partitions.

    Parameters:
    df: DataFrame holding the feature columns plus the label column
    label_col_name: name of the column that contains the labels
    test_size: forwarded to train_test_split
    random_state: forwarded to train_test_split; pass an int for a
        reproducible split

    Returns:
    dict with the keys 'X_train', 'X_test', 'y_train' and 'y_test'
    '''

    y = df[label_col_name].copy()
    X = df.drop(columns=[label_col_name]).copy()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Keys must be strings: DataFrames and Series are unhashable, so using the
    # objects themselves as keys raises a TypeError.
    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }


In [None]:
def run_pca(X_train, X_test):
    '''
    Fit PCA on the training features and project both splits with it.

    Parameters:
    X_train: training features; the PCA is fitted on these only
    X_test: test features, transformed with the already-fitted PCA

    Returns:
    (transformed training data, transformed test data, fitted PCA object)
    '''
    reducer = PCA()
    train_projected = reducer.fit_transform(X_train)
    test_projected = reducer.transform(X_test)
    return train_projected, test_projected, reducer

In [None]:
def train_and_evaluate_model(data, model=None):
    '''
    Fit a classifier on the training split and report its accuracy.

    Parameters:
    data: mapping with the keys 'X_train', 'X_test', 'y_train' and 'y_test'
    model: optional estimator exposing fit(X, y) and score(X, y); a fresh
        LogisticRegression is used when omitted

    Returns:
    (fitted model, training accuracy, test accuracy)
    '''
    if model is None:
        model = LogisticRegression()

    model.fit(data['X_train'], data['y_train'])

    # For scikit-learn classifiers score() returns the mean accuracy.
    train_accuracy = model.score(data['X_train'], data['y_train'])
    test_accuracy = model.score(data['X_test'], data['y_test'])

    return model, train_accuracy, test_accuracy

In [None]:
# Genuine Biden recordings (label 0).
real_path = '/Users/romitbarua/Documents/Berkeley/Spring 2023/world_leaders/jb_bo_audio/biden_wav_audio'
# load_files iterates over individual file paths, so expand the directory
# into a sorted list of the wav files it contains.
real_files = sorted(
    os.path.join(dirpath, name)
    for dirpath, _, filenames in os.walk(real_path)
    for name in filenames
    if name.endswith('.wav')
)
biden_df = load_files(real_files, 0)
biden_df['label'] = 0

#obama_path = '/Users/romitbarua/Documents/Berkeley/Spring 2023/world_leaders/jb_bo_audio/obama_wav_audio'
#obama_df = load_files(obama_path)
#obama_df['label'] = 1

# Synthesized recordings (label 1).
fake_path = '/Users/romitbarua/Documents/Berkeley/Spring 2023/world_leaders/ElevenLabsDeepFakeWav'
fake_files = sorted(
    os.path.join(dirpath, name)
    for dirpath, _, filenames in os.walk(fake_path)
    for name in filenames
    if name.endswith('.wav')
)
fake_biden_df = load_files(fake_files, 1)
fake_biden_df['label'] = 1

In [None]:
# Stack the real and fake rows, move the index levels into columns, then
# drop the file/start/end columns.
df = (
    pd.concat([biden_df, fake_biden_df])
    .reset_index()
    .drop(columns=['file', 'start', 'end'])
)
df.head()

In [None]:
# Separate the feature columns from the label column, then hold out 10% of
# the rows for testing.
X = df.drop(columns=['label']).copy()
y = df['label'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [None]:
# Fit the PCA on the training split only, then project the held-out split
# with the same fitted transform.
pca = PCA()
X_train_pca = pca.fit_transform(X=X_train)
X_test_pca = pca.transform(X=X_test)

In [None]:
# A model has to be fitted on the PCA-transformed training data before it can
# predict on the transformed test data.
model = LogisticRegression()
model.fit(X_train_pca, y_train)
pred = model.predict(X_test_pca)

In [None]:
# Fraction of held-out samples whose predicted label matches the true label.
acc = accuracy_score(y_test, pred)
acc

In [None]:
# Sweep the number of leading principal components given to the classifier.
# Cap the sweep at the number of components PCA actually produced: slicing
# past the last column would silently re-evaluate the same model.
max_components = min(99, X_train_pca.shape[1])
components = np.arange(1, max_components + 1)
acc_plot = []

for component in components:
    model = LogisticRegression()
    model.fit(X_train_pca[:, :component], y_train)
    pred = model.predict(X_test_pca[:, :component])
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    acc_plot.append(acc)

    

In [None]:
# Test accuracy versus number of principal components. The classes compared
# here are real vs. fake Biden audio (the Obama data is commented out), and
# accuracy_score returns a fraction rather than a percentage.
sns.lineplot(x=components, y=acc_plot)
plt.xlabel('# of Components')
plt.ylabel('Accuracy')
plt.title('Real vs. Fake Biden Accuracy')
plt.show()

### Forward Feature Selection

In [None]:
# Greedy forward feature selection: each round fits one logistic regression
# per remaining candidate (on the already selected features plus that
# candidate) and keeps the candidate with the highest test accuracy.
# NOTE(review): candidates are ranked by test-set accuracy, so the reported
# test accuracy is optimistically biased; consider ranking on a separate
# validation split or with cross-validation.
all_features = list(X_train.columns)
num_features = 15

selected_features = []
train_acc = []
test_acc = []

best_feature_set = None

# Stop after exactly num_features picks, or earlier if candidates run out.
while len(selected_features) < num_features and all_features:

    print(f'Selecting Feature {len(selected_features)+1} out of {num_features}')

    # Start below any reachable accuracy so that a candidate is always chosen,
    # even when every candidate scores 0.
    best_test_acc = -1
    best_train_acc = 0
    best_feature = None

    for idx, feature in enumerate(all_features):

        if idx % 100 == 0:
            print(f'Currently testing feature #{idx}')

        feature_test = selected_features + [feature]

        X_train_feature = X_train[feature_test]
        X_test_feature = X_test[feature_test]

        model = LogisticRegression()
        model.fit(X_train_feature, y_train)

        pred_train = model.predict(X_train_feature)
        pred_test = model.predict(X_test_feature)

        acc_train = accuracy_score(y_true=y_train, y_pred=pred_train)
        acc_test = accuracy_score(y_true=y_test, y_pred=pred_test)

        if acc_test > best_test_acc:
            best_feature = feature
            best_train_acc = acc_train
            best_test_acc = acc_test

    selected_features.append(best_feature)
    train_acc.append(best_train_acc)
    test_acc.append(best_test_acc)

    # A chosen feature must not be offered again in later rounds.
    all_features.remove(best_feature)

    print(f'Best Feature: {best_feature}')
    print(f'Train Acc: {best_train_acc}')
    print(f'Test Acc: {best_test_acc}')
    print(f'Feaure List: {selected_features}')
    print('-----------------------------------')
    
       
        
        
        
        
        

In [None]:
# Distribution of a single feature, coloured by label.
sns.histplot(
    data=df,
    hue='label',
    x='mfcc_sma[3]_percentile1.0',
)

## Testing on New Dataset

In [None]:
# Root directory; it ends with a slash, so sub-paths are appended directly.
base_path = "/home/ubuntu/"

# Original LJSpeech recordings.
LJ_original = f'{base_path}data/LJSpeech_1.1'

# Generated audio, one directory per vocoder.
LJ_fbmelgan = f'{base_path}data/generated_audio/ljspeech_full_band_melgan'
LJ_parallel_wavegan = f'{base_path}data/generated_audio/ljspeech_parallel_wavegan'

In [None]:
real_df = pd.DataFrame()

# Absolute, sorted paths of the LJ001-* wav files found under LJ_original.
# Drop the startswith check to pick up every wav file instead.
original_files = sorted(
    os.path.abspath(os.path.join(dirpath, file))
    for dirpath, _, filenames in os.walk(LJ_original)
    for file in filenames
    if file.startswith("LJ001-") and file.endswith('.wav')
)
print(len(original_files))

In [None]:
# Absolute, sorted paths of the LJ001-* wav files found under LJ_fbmelgan.
# Drop the startswith check to pick up every wav file instead.
melgan_files = sorted(
    os.path.abspath(os.path.join(dirpath, file))
    for dirpath, _, filenames in os.walk(LJ_fbmelgan)
    for file in filenames
    if file.startswith("LJ001-") and file.endswith('.wav')
)
print(len(melgan_files))

In [None]:
def load_files(file_paths):
    '''
    Extract openSMILE features for every wav file and stack them in one table.

    Parameters:
    file_paths: list of file paths for wav files

    Returns:
    DataFrame with the per-file results of the module-level `smile`
    extractor concatenated in input order (empty if `file_paths` is empty).
    '''

    frames = [smile.process_file(file) for file in file_paths]

    # Concatenate once instead of growing a frame inside the loop; pd.concat
    # rejects an empty list, so fall back to an empty frame explicitly.
    return pd.concat(frames) if frames else pd.DataFrame()


In [None]:
# Features of the original recordings, labelled 0.
real_df = load_files(original_files).assign(label=0)

In [None]:
# Features of the generated (melgan) recordings, labelled 1.
fake_df = load_files(melgan_files).assign(label=1)

In [None]:
# Stack the real and fake rows, move the index levels into columns, then
# drop the file/start/end columns.
df = (
    pd.concat([real_df, fake_df])
    .reset_index()
    .drop(columns=['file', 'start', 'end'])
)
df.head()

In [None]:
# Separate the feature columns from the label column, then hold out 10% of
# the rows for testing.
X = df.drop(columns=['label']).copy()
y = df['label'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [None]:
# Greedy forward feature selection: each round fits one logistic regression
# per remaining candidate (on the already selected features plus that
# candidate) and keeps the candidate with the highest test accuracy.
# NOTE(review): candidates are ranked by test-set accuracy, so the reported
# test accuracy is optimistically biased; consider ranking on a separate
# validation split or with cross-validation.
all_features = list(X_train.columns)
num_features = 15

selected_features = []
train_acc = []
test_acc = []

best_feature_set = None

# Stop after exactly num_features picks, or earlier if candidates run out.
while len(selected_features) < num_features and all_features:

    print(f'Selecting Feature {len(selected_features)+1} out of {num_features}')

    # Start below any reachable accuracy so that a candidate is always chosen,
    # even when every candidate scores 0.
    best_test_acc = -1
    best_train_acc = 0
    best_feature = None

    for idx, feature in enumerate(all_features):

        if idx % 100 == 0:
            print(f'Currently testing feature #{idx}')

        feature_test = selected_features + [feature]

        X_train_feature = X_train[feature_test]
        X_test_feature = X_test[feature_test]

        model = LogisticRegression()
        model.fit(X_train_feature, y_train)

        pred_train = model.predict(X_train_feature)
        pred_test = model.predict(X_test_feature)

        acc_train = accuracy_score(y_true=y_train, y_pred=pred_train)
        acc_test = accuracy_score(y_true=y_test, y_pred=pred_test)

        if acc_test > best_test_acc:
            best_feature = feature
            best_train_acc = acc_train
            best_test_acc = acc_test

    selected_features.append(best_feature)
    train_acc.append(best_train_acc)
    test_acc.append(best_test_acc)

    # A chosen feature must not be offered again in later rounds.
    all_features.remove(best_feature)

    print(f'Best Feature: {best_feature}')
    print(f'Train Acc: {best_train_acc}')
    print(f'Test Acc: {best_test_acc}')
    print(f'Feaure List: {selected_features}')
    print('-----------------------------------')
    print()
    print()

In [None]:
# Display the features chosen by the forward selection, in the order picked.
selected_features

In [None]:
# Two features plotted against each other, coloured by label.
sns.scatterplot(
    data=df,
    hue='label',
    x='jitterDDP_sma_flatness',
    y='audSpec_Rfilt_sma[8]_lpc1',
)