# Feature Processing
This notebook loads all linguistic feature files calculated in the [linguistic feature calculation notebook](linguistic_feature_calculation.ipynb), and produces a single DataFrame containing the relevant variables for classification analysis. The complete list of features included in classification can be found in [this README](README.md). In addition, it normalizes variables with z-scoring as pre-processing for the classification algorithm.

In [66]:
# Importing necessary libraries
import pandas as pd
from scipy.stats import zscore

In [67]:
# Reading the linguistic features file; the first CSV column becomes the row index
ling_features = pd.read_csv('linguistic_features/linguistic_features.csv', index_col = 0)

# Names of the columns carried forward into the classification feature set
# ('Coherence' is kept un-normalized later on; every other column is z-scored)
selected_columns = [
    'Coherence',
    'n_words',
    'POS:PRON', 'POS:CCONJ', 'POS:SCONJ',
    'open_closed_ratio', 'type_token_ratio',
    'propositional_density', 'n_logical_operators',
    'log10_freq_mean', 'semantic_diversity_mean', 'semantic_thematic_similarity',
    'surprisal',
    'coref_local', 'coref_global',
    'n_constituents', 'mean_np_length', 'mean_vp_length',
]

# Keeping only those columns, in the order listed above
ling_features_finalized = ling_features[selected_columns]

display(ling_features_finalized)

Unnamed: 0,Coherence,n_words,POS:PRON,POS:CCONJ,POS:SCONJ,open_closed_ratio,type_token_ratio,propositional_density,n_logical_operators,log10_freq_mean,semantic_diversity_mean,semantic_thematic_similarity,surprisal,coref_local,coref_global,n_constituents,mean_np_length,mean_vp_length
0,4,210.0,0.165992,0.020243,0.024291,0.820513,0.609524,0.356275,4.0,4.682792,2.107979,0.198185,4.895232,0.550000,0.433333,152.0,2.354430,4.632653
1,5,256.0,0.162252,0.026490,0.016556,0.788732,0.535156,0.380795,7.0,4.666692,2.069585,0.192592,4.108495,0.684211,0.705263,196.0,2.516129,7.161765
2,5,758.0,0.103784,0.022703,0.028108,0.879795,0.509235,0.375135,18.0,4.396347,2.102881,0.180181,5.146196,0.293103,0.188194,599.0,3.094595,7.650273
3,4,812.0,0.119324,0.030623,0.007392,1.036082,0.492611,0.376980,30.0,4.421817,2.076899,0.176816,5.522682,0.175676,0.139820,546.0,3.010526,6.168605
4,5,155.0,0.150000,0.044444,0.033333,0.878049,0.593548,0.338889,9.0,4.596490,2.064892,0.161092,4.702588,0.600000,0.545455,123.0,2.484375,6.775000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,3,206.0,0.144068,0.055085,0.038136,1.067961,0.669903,0.453390,10.0,4.404882,2.118607,0.207649,3.565189,0.461538,0.384615,153.0,2.871429,9.181818
1535,4,96.0,0.138889,0.046296,0.037037,1.000000,0.791667,0.481481,3.0,4.578900,2.106413,0.222453,3.443641,0.600000,0.400000,64.0,1.703704,6.481481
1536,3,101.0,0.107143,0.035714,0.035714,1.061224,0.801980,0.419643,4.0,4.368046,2.086000,0.172722,4.055240,0.166667,0.285714,70.0,2.424242,7.450000
1537,5,131.0,0.151316,0.032895,0.019737,0.830986,0.633588,0.368421,3.0,4.474590,2.176371,0.196065,3.793617,0.454545,0.515152,97.0,2.204082,5.694444


In [68]:
# Reading the local coherence results file
coherence = pd.read_csv('linguistic_features/coherence_results.csv')

# Averaging every numeric column within each 'pt' group (one row per transcript).
# With as_index = False, 'pt' stays a regular column and the result gets a fresh
# 0..n-1 index whose order follows the sorted 'pt' values.
rows_by_pt = coherence.groupby('pt', as_index = False)
grouped_df = rows_by_pt.mean(numeric_only = True)

# Keeping only the mean local coherence, under a more descriptive column name.
# NOTE(review): these rows are later lined up with the other feature tables by index
# alone — this assumes sorted 'pt' order matches their row order; TODO confirm.
local_coherence = grouped_df.loc[:, ['lc']].rename({'lc': 'local_coherence'}, axis = 'columns')

display(local_coherence)

Unnamed: 0,local_coherence
0,0.274387
1,0.215685
2,0.161755
3,0.199047
4,0.375034
...,...
1534,0.268190
1535,0.323834
1536,0.295389
1537,0.258032


In [69]:
# Reading the text description features (first CSV column is the row index)
# and discarding the 'text' column so that only the remaining feature columns are kept
text_description = (
    pd.read_csv('linguistic_features/text_description_ordered.csv', index_col = 0)
    .drop(columns = 'text')
)

display(text_description)

Unnamed: 0,dependency_distance_mean,prop_adjacent_dependency_relation_mean,first_order_coherence,second_order_coherence
0,1.984713,0.442565,0.366634,0.316578
1,2.568674,0.377021,0.310618,0.329741
2,2.731996,0.413946,0.352635,0.331568
3,2.332504,0.423813,0.308754,0.263125
4,2.442240,0.426583,0.318063,0.399475
...,...,...,...,...
1534,2.708896,0.422875,0.377143,0.412284
1535,2.563026,0.405010,0.401308,0.352386
1536,2.669339,0.444015,0.319754,0.358499
1537,2.715251,0.364776,0.314997,0.391217


In [70]:
# The three per-transcript feature tables, joined side by side on their row index
feature_frames = [ling_features_finalized, local_coherence, text_description]

linguistic_feature_set = (
    pd.concat(feature_frames, axis = 1)
    # Rows with any missing value are removed to avoid issues in classification
    .dropna()
    # The constituent count is expressed per word (n_constituents / n_words);
    # the column keeps its name and its position in the frame
    .assign(n_constituents = lambda frame: frame['n_constituents'] / frame['n_words'])
)

In [71]:
# The 'Coherence' column is set aside and left un-normalized
label_name = 'Coherence'
coherence_df = linguistic_feature_set[label_name]

# Every remaining column is z-scored independently (scipy's zscore, applied column by column)
linguistic_feature_set_norm = linguistic_feature_set.drop(columns = label_name).apply(zscore)

# Re-attaching the untouched 'Coherence' column as the first column of the final set
linguistic_feature_set_final = pd.concat(
    [coherence_df, linguistic_feature_set_norm],
    axis = 'columns',
)

In [72]:
# Lifting the pandas column display limit (for the rest of the session) so that
# no column of the final feature set is elided when it is shown
pd.options.display.max_columns = None
display(linguistic_feature_set_final)

Unnamed: 0,Coherence,n_words,POS:PRON,POS:CCONJ,POS:SCONJ,open_closed_ratio,type_token_ratio,propositional_density,n_logical_operators,log10_freq_mean,semantic_diversity_mean,semantic_thematic_similarity,surprisal,coref_local,coref_global,n_constituents,mean_np_length,mean_vp_length,local_coherence,dependency_distance_mean,prop_adjacent_dependency_relation_mean,first_order_coherence,second_order_coherence
0,4,0.030298,0.626178,-0.637601,0.205818,0.010465,0.017717,-0.124776,-0.465551,0.229789,0.282633,0.143489,0.748834,0.453128,0.181288,-0.545836,-0.326540,-0.998199,-0.262331,-1.098855,0.380124,-0.396815,-0.757395
1,5,0.370347,0.538812,-0.206452,-0.341530,-0.151900,-0.551671,0.236712,0.076550,0.160041,-0.543757,-0.059565,-0.322534,0.990431,1.357669,0.115351,-0.161010,-0.006175,-0.761177,0.515349,-1.391047,-0.934960,-0.636063
2,5,4.081314,-0.826902,-0.467838,0.475884,0.313338,-0.750135,0.153273,2.064255,-1.011167,0.172905,-0.510221,1.090593,-0.575340,-0.879196,0.504524,0.431163,0.185438,-1.219473,0.966807,-0.393236,-0.531296,-0.619221
3,4,4.480501,-0.463904,0.078786,-0.990017,1.111803,-0.877415,0.180471,4.232661,-0.900825,-0.386338,-0.632424,1.603287,-1.045454,-1.088465,-1.358506,0.345102,-0.395735,-0.902561,-0.137480,-0.126593,-0.952868,-1.250109
4,5,-0.376282,0.252633,1.032676,0.845624,0.304415,-0.104598,-0.381104,0.437951,-0.144093,-0.644774,-1.203369,0.486494,0.653300,0.666330,0.556876,-0.193516,-0.157881,0.592963,0.165854,-0.051748,-0.863433,0.006718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,3,0.000729,0.114067,1.767021,1.185435,1.274671,0.480003,1.306979,0.618651,-0.974192,0.511398,0.487145,-1.062400,0.098978,-0.029468,-0.246848,0.202709,0.786174,-0.314994,0.902954,-0.151956,-0.295848,0.124786
1535,4,-0.812431,-0.006904,1.160483,1.107701,0.927459,1.412274,1.721133,-0.646252,-0.220298,0.248933,1.024702,-1.227923,0.653300,0.037087,-1.449379,-0.992686,-0.273011,0.157871,0.499735,-0.634713,-0.063700,-0.427333
1536,3,-0.775470,-0.748439,0.430161,1.014102,1.240253,1.491239,0.809449,-0.465551,-1.133775,-0.190442,-0.781072,-0.395056,-1.081521,-0.457319,-1.031901,-0.255074,0.106883,-0.083855,0.793608,0.419320,-0.847183,-0.370988
1537,5,-0.553699,0.283368,0.235569,-0.116472,0.063972,0.201960,0.054288,-0.646252,-0.672196,1.754725,0.066530,-0.751330,0.070982,0.535238,-0.282590,-0.480451,-0.581720,-0.401311,0.920519,-1.721949,-0.892888,-0.069401


In [73]:
# Writing the final linguistic feature set (row index included) to CSV
output_path = 'linguistic_features/linguistic_feature_set.csv'
linguistic_feature_set_final.to_csv(output_path)