In [1]:
import random
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
import datetime as dt
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Annotation columns of the dataset that serve as the multi-label targets
# (selected together as `y` further down).
label_fields = [
    'label_minority_coping', 'label_prej_event', 'label_exp_reject',
    'label_identity_conceal', 'label_internal_stigma', 'label_dysphoria',
    'label_minority_stress',
]

In [3]:
# Read the annotated dataset and discard the two bookkeeping columns that are
# not used as features or targets.
df = pd.read_csv('missom_annotated.csv').drop(columns=['post_id', 'how_annotated'])

# Show how many missing values each remaining column has, then keep only the
# rows that are complete in every column.
empty_cells = df.isna().sum()
print(empty_cells)
df = df.dropna()


text                      0
label_minority_coping     0
label_prej_event          0
label_exp_reject          0
label_identity_conceal    0
label_internal_stigma     0
label_dysphoria           0
label_minority_stress     0
dtype: int64


In [4]:
# Model input is the `text` column; the targets are the label columns listed
# in `label_fields` (one column per label).
X, y = df["text"], df[label_fields]

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Alias for the frame prepared in the earlier cells.
data = df

# Target proportions of the train / validation / test partitions.
train_ratio, valid_ratio, test_ratio = 0.70, 0.15, 0.15

# Stage 1: carve off the training partition; every other row is held out.
train_df, remaining_df = train_test_split(
    df,
    test_size=1 - train_ratio,
    random_state=42,
)

# Stage 2: divide the held-out rows between validation and test according to
# their relative ratios (0.15 / 0.30, i.e. an even split with these values).
valid_df, test_df = train_test_split(
    remaining_df,
    test_size=test_ratio / (valid_ratio + test_ratio),
    random_state=42,
)

# Names used by the modelling code that follows. Note that the validation
# partition (`valid_df`) is not used anywhere in this cell.
train_data = train_df
test_data = test_df

# Target columns, one binary classification task each. The position of a name
# in this list is the column index used to slice the prediction matrices.
# Holds the same names, in the same order, as `label_fields` defined earlier
# in the notebook; keep the two in sync.
label_columns = [
    'label_minority_coping',
    'label_prej_event',
    'label_exp_reject',
    'label_identity_conceal',
    'label_internal_stigma',
    'label_dysphoria',
    'label_minority_stress',
]

# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# The vectorizer is fitted on the training texts only and then re-applied,
# unchanged, to the test texts.
train_texts, test_texts = train_data['text'], test_data['text']
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Label matrix shared by both models: one column per entry of `label_columns`.
train_labels = train_data[label_columns]

# Multi-label SVM: OneVsRestClassifier wraps a linear-kernel SVC and is fitted
# on the full label matrix.
svm_classifier = OneVsRestClassifier(SVC(kernel='linear'))
svm_pred = svm_classifier.fit(X_train, train_labels).predict(X_test)

# Multi-label Random Forest: a single seeded forest fitted on the full label
# matrix.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pred = rf_classifier.fit(X_train, train_labels).predict(X_test)

# Build one scikit-learn text report per label and per model, keyed by label
# name. Column `idx` of each prediction matrix is paired with label_columns[idx].
svm_classification_reports, rf_classification_reports = {}, {}

# NOTE(review): passing two target_names presumably requires both classes to
# occur in the true/predicted values of every label -- confirm for rare labels.
class_names = ['0', '1']

for idx, label in enumerate(label_columns):
    true_values = test_data[label]
    svm_classification_reports[label] = classification_report(
        true_values, svm_pred[:, idx], target_names=class_names
    )
    rf_classification_reports[label] = classification_report(
        true_values, rf_pred[:, idx], target_names=class_names
    )

# Print the stored reports label by label: SVM first, then Random Forest.
# Each entry is (model name shown in the heading, reports dict, divider
# printed after that model's report).
report_sections = (
    ("SVM", svm_classification_reports,
     "\n----------------------------------------------------\n"),
    ("Random Forest", rf_classification_reports,
     "\n====================================================\n"),
)

for label in label_columns:
    for model_name, reports, divider in report_sections:
        print(f"Classification Report for '{label}' using {model_name}:\n")
        print(reports[label])
        print(divider)

Classification Report for 'label_minority_coping' using SVM:

              precision    recall  f1-score   support

           0       0.86      0.99      0.92       739
           1       0.67      0.06      0.11       130

    accuracy                           0.86       869
   macro avg       0.76      0.53      0.52       869
weighted avg       0.83      0.86      0.80       869


----------------------------------------------------

Classification Report for 'label_minority_coping' using Random Forest:

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       739
           1       0.50      0.01      0.02       130

    accuracy                           0.85       869
   macro avg       0.68      0.50      0.47       869
weighted avg       0.80      0.85      0.78       869



Classification Report for 'label_prej_event' using SVM:

              precision    recall  f1-score   support

           0       0.91      0.99      0.95