In [1]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)


# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/merged_data.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = RandomForestClassifier()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['macro avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['macro avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.587058984533605
recall : 0.5329011366833617
f1-score : 0.5014722303025569
accuracy : 0.6398460779900101


In [2]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/merged_data.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = SVC()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['macro avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['macro avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.4552307077491588
recall : 0.5147812491502842
f1-score : 0.46123022732158414
accuracy : 0.6313996612858277


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/merged_data.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = GradientBoostingClassifier()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['macro avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['macro avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.5144859629009024
recall : 0.505412285770581
f1-score : 0.4953519339017996
accuracy : 0.5874488177160375


In [4]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/merged_data.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = KNeighborsClassifier()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['macro avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['macro avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.4906442726790294
recall : 0.49029179405939854
f1-score : 0.48431186442557
accuracy : 0.5706647801573521


In [5]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/merged_data.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = AdaBoostClassifier()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['macro avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['macro avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.45015432679019146
recall : 0.46013500613338165
f1-score : 0.4497919497903259
accuracy : 0.5203887709820567


In [6]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/merged_data.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer", "Sc_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['macro avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['macro avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)


Average Metrics:
precision : 0.5346852518853917
recall : 0.527692754036925
f1-score : 0.518520305741258
accuracy : 0.6035012969751538


In [7]:
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

np.random.seed(3112)

# Read the TSV file
data = pd.read_csv(f"C:/Users/Sina/Desktop/Stage V2/Sripts/emdat and openface clean up/merged_data.tsv", delimiter='\t')

# Separate features (X) and target variable (y)
X = data.drop(["answer_openface", "Sc_id"], axis=1)
y = data["answer"]

# Replace infinite values with np.nan
X = X.replace([np.inf, -np.inf], np.nan)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the GroupKFold cross-validator
group_kfold = GroupKFold(n_splits=5)

# Perform cross-validation
metrics = []
accuracies = []
for train_index, test_index in group_kfold.split(X, y, groups=data["Part_id"]):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the classifier
    classifier = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam')
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    metrics.append(report)
    accuracies.append(accuracy)

# Calculate average metrics across all folds
average_metrics = {}
for metric in metrics[0]['macro avg'].keys():
    if metric == 'support':
        continue
    average_metrics[metric] = np.mean([m['macro avg'][metric] for m in metrics])

average_metrics['accuracy'] = np.mean(accuracies)

# Print average metrics
print("Average Metrics:")
for metric, value in average_metrics.items():
    print(metric, ":", value)




Average Metrics:
precision : 0.48888094349510525
recall : 0.4877223091331979
f1-score : 0.4818753889556243
accuracy : 0.5592851630329925


