<a href="https://colab.research.google.com/github/claudiarichardxx/Decoding-Personality-Types-from-Text-using-Myers-Briggs-Dimensions/blob/main/setup/datasetCreation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [None]:
%%capture
# Environment setup cell; %%capture hides the installer output.
# accelerate and transformers are upgraded to their latest releases (-U).
# iterative-stratification is presumably the package behind the `iterstrat` import used below.
# NOTE(review): no versions are pinned, so reruns may pull different releases.
!pip install -U accelerate
!pip install -U transformers
!pip install iterative-stratification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import indexable, _safe_indexing
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _validate_shuffle_split
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from itertools import chain
import pandas as pd
from datasets import load_dataset
from huggingface_hub import notebook_login

# Data Creation

In [None]:
def multilabel_train_test_split(*arrays,
                                test_size=None,
                                train_size=None,
                                random_state=None,
                                shuffle=True,
                                stratify=None):
    """
    Train test split for multilabel classification. Uses the algorithm from:
    'Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-Label Data'.

    Parameters
    ----------
    *arrays : indexable sequences of equal length
        Every array is split with the same train/test indices.
    test_size, train_size : float, int or None
        Forwarded to scikit-learn's shuffle-split validation; when both are
        None the test share defaults to 0.25.
    random_state : int, RandomState instance or None
        Seed handed to ``MultilabelStratifiedShuffleSplit``. Pass an int for a
        reproducible split.
    shuffle : bool
        Accepted only to mirror the ``train_test_split`` signature; it is not
        used, the stratified splitter always shuffles.
    stratify : array-like
        Multilabel indicator targets used for stratification. Required.

    Returns
    -------
    list
        ``[a0_train, a0_test, a1_train, a1_test, ...]`` in the order the
        arrays were passed.

    Raises
    ------
    ValueError
        If no arrays are given or ``stratify`` is None.
    """
    # Fail early with a clear message instead of an IndexError on arrays[0].
    if len(arrays) == 0:
        raise ValueError("At least one array required as input")
    # The multilabel splitter cannot run without label targets.
    if stratify is None:
        raise ValueError("stratify must be provided for a multilabel stratified split")

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])
    # Resolve fractional / None sizes into absolute sample counts.
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, train_size, default_test_size=0.25
    )
    # Honour the caller's seed rather than a hard-coded one.
    cv = MultilabelStratifiedShuffleSplit(
        test_size=n_test, train_size=n_train, random_state=random_state
    )
    train, test = next(cv.split(X=arrays[0], y=stratify))

    # Interleave the train and test parts of each array.
    return list(
        chain.from_iterable(
            (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
        )
    )

In [None]:
def _attach_posts(label_frame, post_frame):
    # Return a new frame: the label columns followed by the matching 'post' column.
    return label_frame.assign(post=post_frame['post'])


def createData(path = '/content/mbtiDf.csv'):
    """
    Split the MBTI posts CSV into stratified train/validation/test CSV files.

    The file at ``path`` must contain a ``post`` column and the four binary
    label columns ``I/E``, ``N/S``, ``T/F`` and ``J/P``. Rows with missing
    values and posts containing 'http' are removed. The data is then split
    60/40 into train and a held-out pool, and the pool is split 40/60 into
    validation and test, both stratified on the four labels.

    Writes ``train.csv``, ``validation.csv`` and ``test.csv`` to the current
    working directory (columns: the four labels, then ``post``). Returns None.
    """
    label_columns = ['I/E', 'N/S', 'T/F', 'J/P']

    df = pd.read_csv(path)
    # Drop incomplete rows before the text filter: a missing post would make
    # str.contains yield NaN, which cannot be negated with ~.
    df = df.dropna()
    df = df[~df['post'].str.contains('http')]

    # First split: 60% train, 40% held-out pool.
    X = df[['post']]
    y = df[label_columns]
    X_train, X_test, y_train, y_test = multilabel_train_test_split(X,y,stratify=y, test_size=0.40, random_state = 22)
    train = _attach_posts(y_train, X_train)
    val = _attach_posts(y_test, X_test)

    # Second split: the pool becomes 40% validation, 60% test.
    X = val[['post']]
    y = val[label_columns]
    X_train, X_test, y_train, y_test = multilabel_train_test_split(X,y,stratify=y, test_size=0.60, random_state = 22)
    validation = _attach_posts(y_train, X_train)
    test = _attach_posts(y_test, X_test)

    validation.to_csv('validation.csv', index = False)
    test.to_csv('test.csv', index = False)
    train.to_csv('train.csv', index = False)


In [None]:
# The MBTI data must be downloaded beforehand and its location passed as `path`;
# the file used here is the already-processed version.
createData(path = '/content/mbtiDf.csv')

In [None]:
# Preview of the data as it looked after processing, before upload.
# createData keeps its frames local and returns nothing, so the training split
# is reloaded from the CSV it wrote to the working directory.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,I/E,N/S,T/F,J/P,post
4,0,0,1,0,"Prozac, wellbrutin, at least thirty minutes of..."
5,0,0,1,0,Basically come up with three items you've dete...
6,0,0,1,0,All things in moderation. Sims is indeed a vi...
8,0,0,1,0,It appears to be too late. :sad:
15,0,0,1,0,"Get high in backyard, roast and eat marshmello..."


In [None]:
# Column dtypes and non-null counts of the training frame.
# NOTE(review): relies on a `train` DataFrame already existing in the notebook namespace.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142748 entries, 0 to 396508
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   I/E     142748 non-null  int64 
 1   N/S     142748 non-null  int64 
 2   T/F     142748 non-null  int64 
 3   J/P     142748 non-null  int64 
 4   post    142748 non-null  object
dtypes: int64(4), object(1)
memory usage: 6.5+ MB


In [None]:
# Summary statistics of the numeric label columns; the mean of each 0/1 label is its positive rate.
# NOTE(review): the saved outputs of the info(), describe() and load_dataset cells report
# different row counts, so they appear to come from different runs - rerun to confirm.
train.describe()

Unnamed: 0,I/E,N/S,T/F,J/P
count,168701.0,168701.0,168701.0,168701.0
mean,0.230342,0.137249,0.54238,0.604537
std,0.421053,0.344111,0.498202,0.488951
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,1.0
75%,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0


In [None]:
# Load the three CSV splits into one DatasetDict. The paths are relative to the
# working directory, which is where createData writes train/test/validation.csv,
# so the cell no longer depends on the working directory being /content.
dataset = load_dataset(
    "csv",
    data_files={
        "train": "train.csv",
        "test": "test.csv",
        "validation": "validation.csv",
    },
)
dataset

DatasetDict({
    train: Dataset({
        features: ['I/E', 'N/S', 'T/F', 'J/P', 'post'],
        num_rows: 95166
    })
    test: Dataset({
        features: ['I/E', 'N/S', 'T/F', 'J/P', 'post'],
        num_rows: 38067
    })
    validation: Dataset({
        features: ['I/E', 'N/S', 'T/F', 'J/P', 'post'],
        num_rows: 25377
    })
})

In [None]:
# Log in to the Hugging Face Hub from inside the notebook;
# presumably required so the push in the following cell can authenticate.
notebook_login()

In [None]:
# Upload the loaded dataset to the Hub repository "ClaudiaRichard/mbti_classification_v2".
dataset.push_to_hub("ClaudiaRichard/mbti_classification_v2")

In [None]:
# Every feature of the train split except the text column 'post' is a label.
labels = [name for name in dataset['train'].features if name != 'post']
# Two-way lookup between a label name and its position in `labels`.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}
labels

['I/E', 'N/S', 'T/F', 'J/P']