In [None]:
# import LogisticRegression
from sklearn.linear_model import LogisticRegression

# import LinearSVC
from sklearn.svm import LinearSVC

import numpy as np
import pandas as pd

from sklearn.feature_selection import RFECV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler

from ast import literal_eval

In [None]:
# Random seed passed to the CV splitter and the randomized search in the M_E
# experiment cell. NOTE(review): the M_f and M_we cells and select_classifier use
# the hard-coded value 24091993 instead of this constant.
seed = 3052011

In [None]:
def select_classifier(clf_name):
    """Build a fresh, unfitted classifier for a short model name.

    Parameters
    ----------
    clf_name : str
        'LogR' for LogisticRegression or 'LinSVM' for LinearSVC.

    Returns
    -------
    A new estimator created with max_iter=1000, random_state=24091993 and
    class_weight='balanced'.

    Raises
    ------
    ValueError
        If ``clf_name`` is not a supported name. Failing here avoids handing
        ``None`` to the search / cross-validation code further down.
    """
    if clf_name == 'LogR':
        return LogisticRegression(max_iter=1000, random_state=24091993, class_weight='balanced')
    if clf_name == 'LinSVM':
        return LinearSVC(max_iter=1000, random_state=24091993, class_weight='balanced')
    raise ValueError("Unknown classifier name {!r}; expected 'LogR' or 'LinSVM'".format(clf_name))
    
def select_params(clf_name):
    """Return the hyper-parameter search space for a short model name.

    Parameters
    ----------
    clf_name : str
        'LogR' or 'LinSVM'. Both models currently share the same grid.

    Returns
    -------
    dict
        ``{'C': np.logspace(-4, 4, 20)}`` -- 20 log-spaced values of the
        regularisation parameter C between 1e-4 and 1e4.

    Raises
    ------
    ValueError
        If ``clf_name`` is not a supported name (instead of returning ``None``).
    """
    if clf_name in ('LogR', 'LinSVM'):
        return {'C': np.logspace(-4, 4, 20)}
    raise ValueError("Unknown classifier name {!r}; expected 'LogR' or 'LinSVM'".format(clf_name))

In [None]:
# Dataset names; each experiment loop reads '../data/processed/<name>.csv' for
# every entry and uses its 'label' column as the target.
datasets = ['depression_severity', 'dep_sign']

# $M_f$: models trained on hand-crafted feature groups

In [None]:
# Feature-name lists. Every entry is used as a column name when selecting from the
# processed dataset frames (``data[feat]``), so the spelling must match the CSV
# headers exactly.

features_pad = [
    'Valence', 'Valence_nwords', 'Arousal', 'Arousal_nwords', 'Dominance', 'Dominance_nwords',
    'Longing_GALC', 'Lust_GALC', 'Arousal_GI',
    'Wlbgain_Lasswell',
    'Wlbloss_Lasswell', 'Wlbphys_Lasswell', 'Wlbpsyc_Lasswell', 'Wlbtot_Lasswell',
    'hu_liu_prop', 'polarity',
]

features_emotion = [
    'Admiration/Awe_GALC', 'Amusement_GALC', 'Anger_GALC', 'Anxiety_GALC',
    'Boredom_GALC', 'Contempt_GALC', 'Contentment_GALC', 'Desperation_GALC',
    'Disappointment_GALC', 'Disgust_GALC', 'Dissatisfaction_GALC', 'Envy_GALC', 'Fear_GALC',
    'Feelinglove_GALC', 'Guilt_GALC', 'Happiness_GALC', 'Hatred_GALC',
    'Hope_GALC', 'Interest/Enthusiasm_GALC', 'Irritation_GALC', 'Jealousy_GALC',
    'Joy_GALC', 'Pleasure/Enjoyment_GALC', 'Pride_GALC', 'Relaxation/Serenity_GALC', 'Relief_GALC',
    'Sadness_GALC', 'Shame_GALC', 'Tension/Stress_GALC', 'Positive_GALC', 'Negative_GALC',
    'Negativ_GI', 'Ngtv_GI', 'Hostile_GI', 'No_GI', 'Pain_GI', 'Fail_GI', 'Negate_GI',
    'Positiv_GI', 'Pstv_GI', 'Pleasur_GI', 'Yes_GI', 'Feel_GI', 'Emot_GI',
    'Affloss_Lasswell', 'Wlbloss_Lasswell', 'Endslw_Lasswell', 'Anomie_Lasswell', 'Negaff_Lasswell',
    'Notlw_Lasswell', 'Affoth_Lasswell', 'Afftot_Lasswell', 'Meanslw_Lasswell',
    'Affgain_Lasswell', 'Posaff_Lasswell',
    'hu_liu_pos_perc', 'hu_liu_neg_perc', 'hu_liu_pos_nwords', 'hu_liu_neg_nwords',
    'Anger_EmoLex', 'Disgust_EmoLex', 'Fear_EmoLex', 'Negative_EmoLex', 'Sadness_EmoLex', 'Joy_EmoLex', 'Positive_EmoLex',
    'Surprise_EmoLex', 'Anticipation_EmoLex', 'joy_component', 'fear_and_digust_component', 'Sv_GI',
    'pleasantness', 'sensitivity', 'vader_negative', 'vader_neutral', 'vader_compound', 'vader_positive',
]

sentiment = [
    'negative_adjectives_component', 'positive_adjectives_component', 'polarity_nouns_component',
    'polarity_verbs_component', 'virtue_adverbs_component', 'positive_nouns_component',
    'positive_verbs_component', 'well_being_component', 'Surelw_Lasswell', 'If_Lasswell',
]

mood = [
    'Virtue_GI', 'Vice_GI',
    'attention', 'aptitude', 'affect', 'posemo',
    'negemo', 'anx', 'anger', 'sad',
]

social = [
    'Beingtouched_GALC', 'Compassion_GALC', 'Gratitude_GALC', 'Humility_GALC', 'Surprise_GALC', 'Submit_GI',
    'Trust_EmoLex', 'Affpt_Lasswell', 'Wlbpt_Lasswell', 'Affil_GI', 'Role_GI', 'Coll_GI', 'Powcon_Lasswell', 'Powcoop_Lasswell',
    'Work_GI', 'Ritual_GI', 'Socrel_GI', 'Race_GI', 'Kin_2_GI', 'Male_GI', 'Female_GI', 'Nonadlt_GI',
    'Hu_GI', 'Social_GI', 'Rel_GI', 'Intrj_GI', 'Ipadj_GI', 'Indadj_GI', 'Powaupt_Lasswell', 'Powpt_Lasswell', 'Powdoct_Lasswell', 'Powauth_Lasswell',
    'social_order_component', 'affect_friends_and_family_component', 'respect_component', 'trust_verbs_component', 'Ptlw_Lasswell', 'Wltpt_Lasswell',
    'Active_GI', 'Passive_GI', 'Rspgain_Lasswell', 'Rsploss_Lasswell', 'Rspoth_Lasswell', 'Rsptot_Lasswell',
    'Rcethic_Lasswell', 'Rcloss_Lasswell', 'Rcgain_Lasswell',
    'social',
    'family', 'friend', 'female', 'male',
]

syntactic = [
    'Self_GI', 'Our_GI', 'You_GI', 'Name_GI', 'politeness_component', 'formlw_Lasswell', 'nwords',
    'Comnobj_GI', 'Comform_GI', 'Com_GI', 'WC', 'Analytic',
    'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr',
    'Dic', 'function', 'pronoun', 'ppron', 'i',
    'we', 'you', 'shehe', 'they', 'ipron',
    'article', 'prep', 'auxverb', 'adverb', 'conj',
    'negate', 'verb', 'adj', 'compare', 'interrog',
    'number', 'quant', 'informal', 'swear', 'netspeak', 'assent',
    'nonflu', 'filler', 'AllPunc', 'Period',
    'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam',
    'Dash', 'Quote', 'Apostro', 'Parenth',
    'OtherP',
] + [
    'readability_Kincaid',
    'readability_ARI',
    'readability_Coleman-Liau',
    'readability_FleschReadingEase',
    'readability_GunningFogIndex',
    'readability_LIX',
    'readability_SMOGIndex',
    'readability_RIX',
    'readability_DaleChallIndex',
    'readability_characters_per_word',
    'readability_syll_per_word',
    'readability_words_per_sentence',
    'readability_sentences_per_paragraph',
    'readability_type_token_ratio',
    'readability_directspeech_ratio',
    'readability_characters',
    'readability_syllables',
    'readability_words',
    'readability_wordtypes',
    'readability_sentences',
    'readability_paragraphs',
    'readability_long_words',
    'readability_complex_words',
    'readability_complex_words_dc',
    'readability_tobeverb',
    'readability_auxverb',
    'readability_conjunction',
    'readability_pronoun',
    'readability_preposition',
    'readability_nominalization',
    'readability_interrogative',
    'readability_article',
    'readability_subordination',
]

topic = [
    'Goal_GI', 'Try_GI', 'Means_GI', 'Persist_GI', 'Complet_GI', 'Finish_GI',
    'Exert_GI', 'Fetch_GI', 'Ovrst_GI', 'Undrst_GI', 'Causal_GI', 'Ought_GI', 'Powoth_Lasswell',
    'Powtot_Lasswell', 'Wlttran_Lasswell', 'Wltoth_Lasswell', 'Wlttot_Lasswell',
    'Eval_2_GI', 'Eval_GI', 'Iav_GI', 'Ani_GI', 'Aquatic_GI', 'Land_GI', 'Sky_GI', 'Object_GI', 'Tool_GI',
    'Food_GI', 'Vehicle_GI', 'Bldgpt_GI', 'Natobj_GI', 'Bodypt_GI', 'Natrpro_GI', 'Color_GI',
    'Increas_GI', 'Decreas_GI', 'Quality_GI', 'Quan_GI', 'Numb_GI', 'Ord_GI', 'Card_GI', 'Freq_GI', 'Dist_GI',
    'Place_GI', 'Region_GI', 'Route_GI', 'Begin_GI', 'Stay_GI', 'Rise_GI', 'Travel_GI', 'Fall_GI', 'Time_2_GI',
    'Time_GI', 'Space_GI', 'Pos_GI', 'Dim_GI', 'Doctrin_GI', 'Econ_2_GI', 'Exch_GI', 'Econ_GI', 'Legal_GI',
    'Milit_GI', 'Polit_2_GI', 'Polit_GI', 'Relig_GI', 'Say_GI',
    'Academ_GI', 'Exprsv_GI', 'Need_GI', 'Vary_GI', 'Think_GI', 'Know_GI', 'Perceiv_GI', 'Compare_GI', 'Solve_GI',
    'Abs_2_GI', 'Abs_GI', 'action_component', 'economy_component', 'certainty_component', 'failure_component',
    'Rcrelig_Lasswell', 'Strong_GI', 'Power_GI', 'Weak_GI', 'Powgain_Lasswell',
    'Powloss_Lasswell', 'Powends_Lasswell', 'Powaren_Lasswell',
    'Rcends_Lasswell', 'Rctot_Lasswell', 'Sklpt_Lasswell', 'Skloth_Lasswell', 'Skltot_Lasswell', 'Nation_Lasswell', 'Dav_GI',
    'Enlgain_Lasswell', 'Enlloss_Lasswell', 'Enlends_Lasswell', 'Enlpt_Lasswell', 'Enloth_Lasswell', 'Enltot_Lasswell', 'Sklasth_Lasswell', 'Timespc_Lasswell',
    'objects_component',
    'cogproc', 'insight', 'cause', 'discrep',
    'Trngain_Lasswell', 'Trnloss_Lasswell', 'Tranlw_Lasswell', 'Arenalw_Lasswell',
    'tentat', 'certain', 'differ', 'percept',
    'see', 'hear', 'feel', 'bio', 'body',
    'health', 'sexual', 'ingest', 'drives',
    'affiliation', 'achieve', 'power', 'reward',
    'risk', 'focuspast', 'focuspresent', 'focusfuture',
    'relativ', 'motion', 'space', 'time', 'work',
    'leisure', 'home', 'money', 'relig', 'death',
]

# Combined groups. 'Wlbloss_Lasswell' is listed in both features_pad and
# features_emotion; a plain concatenation would therefore select that column twice,
# and the downstream correlation filter (which drops by label) would then remove
# the feature entirely. dict.fromkeys keeps the first occurrence of every name and
# preserves the original order.
total = list(dict.fromkeys(features_pad + features_emotion + mood + sentiment + social + syntactic + topic))
affective = list(dict.fromkeys(features_pad + features_emotion + mood + sentiment))

In [None]:
# Feature sets evaluated one after another by the M_f experiment loop.
# NOTE(review): the results table has no column naming the group, so rows from
# different groups can only be told apart through their 'features' entry.
features_group = [affective, social, syntactic, topic, total]

In [None]:
# M_f experiment: for every dataset and feature group, remove highly correlated
# features, min-max scale, tune C with a randomized search and score Logistic
# Regression / linear SVM with 5-fold stratified CV -- once on all remaining
# features and once on the subset chosen by RFECV.
#
# NOTE(review): the correlation filter and the scaler are fitted on the full data
# before cross-validation, and RFECV selects features on the same samples that are
# scored afterwards, so the reported scores may be optimistic.
result_columns = ['dataset', 'clf', 'best_params', 'n_features', 'features', 'f_score', 'precision', 'recall']
scoring_metrics = ['f1_weighted', 'precision_weighted', 'recall_weighted']
clf_names = ['LogR', 'LinSVM']

# One dict per evaluated configuration; turned into a DataFrame once at the end
# instead of growing a frame with pd.concat on every iteration.
result_rows = []

for dataset in datasets:

    # load dataset (once per dataset -- the file does not depend on the feature group)
    data = pd.read_csv('../data/processed/{}.csv'.format(dataset))
    y = data['label']

    for feat in features_group:

        # A name listed twice would select the same column twice; the two copies
        # correlate perfectly and the label-based drop below would then remove
        # the feature completely. Keep the first occurrence of each name.
        feat = list(dict.fromkeys(feat))
        X = data[feat]

        # drop highly correlated features
        # Create correlation matrix
        corr_matrix = X.corr().abs()

        # Select upper triangle of correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Find features with correlation greater than 0.95
        to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

        # Drop features (new frame; no in-place mutation of a slice of `data`)
        X = X.drop(columns=to_drop)
        kept_columns = list(X.columns)

        # scale every feature to [0, 1]
        X = MinMaxScaler().fit_transform(X)

        # create cross validation (integer random_state -> identical folds on every use)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24091993)

        # Hyper-parameter search does not depend on the RFE switch, so run it once
        # per classifier instead of once per (rfe, classifier) pair.
        best_params = {}
        for clf_name in clf_names:
            rs = RandomizedSearchCV(select_classifier(clf_name), select_params(clf_name),
                                    scoring='f1_weighted', cv=cv, n_jobs=-1, random_state=24091993)
            rs.fit(X, y)
            best_params[clf_name] = rs.best_params_

        for rfe in [False, True]:
            for clf_name in clf_names:

                X_it = X
                selected_features = kept_columns

                if rfe:
                    # recursive feature elimination with the tuned estimator
                    selector_model = select_classifier(clf_name)
                    selector_model.set_params(**best_params[clf_name])

                    rfecv = RFECV(estimator=selector_model, step=1, cv=cv, scoring='f1_weighted', n_jobs=-1)

                    # fit the selector on all available data
                    rfecv.fit(X_it, y)

                    # keep only the selected columns
                    X_it = rfecv.transform(X_it)

                    # names of the selected features
                    selected_features = [name for name, keep in zip(kept_columns, rfecv.support_) if keep]

                # fresh estimator with the tuned parameters
                model = select_classifier(clf_name)
                model.set_params(**best_params[clf_name])

                # evaluate model with selected features through cross validation
                scores = cross_validate(model, X_it, y, scoring=scoring_metrics, cv=cv, n_jobs=-1)

                result_rows.append({
                    'dataset': dataset,
                    'clf': clf_name,
                    'best_params': best_params[clf_name],
                    'n_features': len(selected_features),
                    'features': selected_features,
                    'f_score': scores['test_f1_weighted'].mean(),
                    'precision': scores['test_precision_weighted'].mean(),
                    'recall': scores['test_recall_weighted'].mean(),
                })

results = pd.DataFrame(result_rows, columns=result_columns)




In [None]:
# Show the collected M_f results table as the cell output.
results

In [None]:
# Write the M_f results to disk (the 'results/' directory must already exist).
# NOTE(review): the M_E section reads a frame called `results_mf`, presumably the
# contents of this file -- confirm.
results.to_csv('results/results_mf_depressionintensity.csv', index=False)

# $M_{we}$: models trained on text embeddings

In [None]:
# M_we experiment: for every dataset and embedding column, min-max scale the
# embedding vectors, tune C with a randomized search and score Logistic
# Regression / linear SVM with 5-fold stratified CV.
#
# NOTE(review): the scaler is fitted on the full data before cross-validation.
we_columns = ['dataset', 'clf', 'best_params', 'embeddings', 'f_score', 'precision', 'recall']
we_scoring = ['f1_weighted', 'precision_weighted', 'recall_weighted']

# One dict per evaluated configuration; converted to a DataFrame once at the end.
we_rows = []

for dataset in datasets:

    # load dataset (once per dataset -- it is the same file for every embedding type)
    data = pd.read_csv('../data/processed/{}.csv'.format(dataset))
    y = data['label']

    # transformer_embeddings = all-mpnet-base-v2
    # distil_roberta_embeddings = all-distilroberta-v1
    # ft_embeddings = FastText
    for embeddings in ['transformer_embeddings', 'distil_roberta_embeddings', 'ft_embeddings']:

        # each cell holds the text form of a list of numbers; parse it into a 2-D array
        X = np.array(data[embeddings].apply(literal_eval).tolist())

        # scale every dimension to [0, 1]
        X = MinMaxScaler().fit_transform(X)

        for clf_name in ['LogR', 'LinSVM']:

            clf = select_classifier(clf_name)
            params = select_params(clf_name)

            # create cross validation
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24091993)

            # hyper-parameter search
            rs = RandomizedSearchCV(clf, params, scoring='f1_weighted', cv=cv, n_jobs=-1, random_state=24091993)
            rs.fit(X, y)

            # fresh estimator with the tuned parameters
            model = select_classifier(clf_name)
            model.set_params(**rs.best_params_)

            # evaluate the tuned model through cross validation
            scores = cross_validate(model, X, y, scoring=we_scoring, cv=cv, n_jobs=-1)

            we_rows.append({
                'dataset': dataset,
                'clf': clf_name,
                'best_params': rs.best_params_,
                'embeddings': embeddings,
                'f_score': scores['test_f1_weighted'].mean(),
                'precision': scores['test_precision_weighted'].mean(),
                'recall': scores['test_recall_weighted'].mean(),
            })

results_we = pd.DataFrame(we_rows, columns=we_columns)

In [None]:
# Write the M_we results to disk (the 'results/' directory must already exist).
results_we.to_csv('results/results_we_depressionintensity.csv', index=False)

# $M_E$: models trained on the best $M_f$ features concatenated with embeddings

In [None]:
# M_E experiment: for every dataset, concatenate the best-scoring M_f feature
# subset with one embedding column, min-max scale, tune C with a randomized
# search and score Logistic Regression / linear SVM with 5-fold stratified CV.
#
# NOTE(review): the scaler is fitted on the full data before cross-validation.
e_columns = ['dataset', 'clf', 'best_params', 'f_score', 'precision', 'recall']
e_scoring = ['f1_weighted', 'precision_weighted', 'recall_weighted']

# `results_mf` is not defined anywhere else in the notebook; load the table that
# the M_f section saved so this cell also works on a fresh kernel. Read back from
# CSV, the 'features' column holds the text form of each list, which is why it is
# parsed with literal_eval below.
results_mf = pd.read_csv('results/results_mf_depressionintensity.csv')

# Column with the all-distilroberta-v1 sentence embeddings (see the M_we section:
# distil_roberta_embeddings = all-distilroberta-v1). The model name itself is not
# one of the embedding columns that section reads.
# To choose per dataset instead:
# results_we[(results_we['dataset'] == dataset)].sort_values(by='f_score', ascending=False).iloc[0]['embeddings']
best_embeddings = 'distil_roberta_embeddings'

# One dict per evaluated configuration; converted to a DataFrame once at the end.
e_rows = []

for dataset in datasets:

    # load dataset
    data = pd.read_csv('../data/processed/{}.csv'.format(dataset))

    # feature subset of the best-scoring M_f configuration for this dataset
    best_features = results_mf[results_mf['dataset'] == dataset].sort_values(by='f_score', ascending=False).iloc[0]['features']

    X = data[literal_eval(best_features)]
    y = data['label']

    # load embeddings
    embeddings = np.array(data[best_embeddings].apply(literal_eval).tolist())

    # concatenate features and embeddings column-wise
    X = np.concatenate((X, embeddings), axis=1)

    # scale every column to [0, 1]
    X = MinMaxScaler().fit_transform(X)

    for clf_name in ['LogR', 'LinSVM']:

        clf = select_classifier(clf_name)
        params = select_params(clf_name)

        # create cross validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        # hyper-parameter search
        rs = RandomizedSearchCV(clf, params, scoring='f1_weighted', cv=cv, n_jobs=-1, random_state=seed)
        rs.fit(X, y)

        # fresh estimator with the tuned parameters
        model = select_classifier(clf_name)
        model.set_params(**rs.best_params_)

        # evaluate the tuned model through cross validation
        scores = cross_validate(model, X, y, scoring=e_scoring, cv=cv, n_jobs=-1)

        e_rows.append({
            'dataset': dataset,
            'clf': clf_name,
            'best_params': rs.best_params_,
            'f_score': scores['test_f1_weighted'].mean(),
            'precision': scores['test_precision_weighted'].mean(),
            'recall': scores['test_recall_weighted'].mean(),
        })

results_e = pd.DataFrame(e_rows, columns=e_columns)

In [None]:
# Write the M_E results to disk (the 'results/' directory must already exist).
results_e.to_csv('results/results_e_depressionintensity.csv', index=False)