In [2]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/modified_EMDAT.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id", "Part_id"], axis=1)  # Exclude "Part_id"
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = RandomForestClassifier()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['weighted avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['weighted avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)



Average Metrics:
precision : 0.5485322484466408
recall : 0.6175836602568225
f1-score : 0.558779660355334
accuracy : 0.6175836602568225


In [3]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/modified_EMDAT.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id", "Part_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = SVC()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['weighted avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['weighted avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.516292237695767
recall : 0.6203646536754774
f1-score : 0.5463310181006971
accuracy : 0.6203646536754774


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/modified_EMDAT.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id", "Part_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = GradientBoostingClassifier()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['weighted avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['weighted avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.5640689721362689
recall : 0.5766833879992282
f1-score : 0.55653971689649
accuracy : 0.5766833879992282


In [5]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/modified_EMDAT.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id", "Part_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = KNeighborsClassifier()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['weighted avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['weighted avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.541516598055434
recall : 0.5708223465603361
f1-score : 0.5503969641601241
accuracy : 0.5708223465603361


In [6]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/modified_EMDAT.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id", "Part_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = AdaBoostClassifier()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['weighted avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['weighted avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.5459648118953842
recall : 0.5538785988380817
f1-score : 0.544021637118804
accuracy : 0.5538785988380817


In [7]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/modified_EMDAT.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id", "Part_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['weighted avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['weighted avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.5753105265062418
recall : 0.6090579244110017
f1-score : 0.5763067226852828
accuracy : 0.6090579244110017


In [8]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/modified_EMDAT.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id", "Part_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam')
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['weighted avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['weighted avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)




Average Metrics:
precision : 0.5573031202665018
recall : 0.570473449525157
f1-score : 0.5554619936622013
accuracy : 0.570473449525157




In [9]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)


# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/modified_EMDAT.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id", "Part_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = DummyClassifier(strategy="stratified")
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['weighted avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['weighted avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.5655713450672435
recall : 0.5538416189679937
f1-score : 0.5564895160310402
accuracy : 0.5538416189679937
