# Feature Processing
This notebook loads the linguistic feature files calculated in the [linguistic feature calculation notebook](linguistic_feature_calculation.ipynb) and combines them into a single DataFrame containing the variables relevant for classification analysis. The complete list of features included in classification can be found in [this README](README.md). In addition, it normalizes every feature with z-scoring (the `Coherence` column is left unscaled) as pre-processing for the classification algorithm.

In [50]:
# Importing necessary libraries
import pandas as pd
from scipy.stats import zscore

In [None]:
# Reading the linguistic features file; the first CSV column becomes the row index
ling_features = pd.read_csv('linguistic_features/linguistic_features.csv', index_col=0)

# TODO: finish this!! update the features

# Names of the columns kept for classification, in the order they should appear
relevant_columns = [
    'Coherence',
    'n_words',
    'POS:NOUN',
    'POS:PRON',
    'POS:CCONJ',
    'POS:SCONJ',
    'POS:VERB',
    'open_closed_ratio',
    'type_token_ratio',
    'propositional_density',
    'n_logical_operators',
    'log10_freq_mean',
    'semantic_diversity_mean',
    'semantic_thematic_distance',
    'surprisal',
    'coref_local',
    'coref_global',
    'n_constituents',
    'mean_np_length',
    'mean_vp_length',
]

# Selecting by a list of labels keeps only those columns (a KeyError is raised if one is missing)
ling_features_finalized = ling_features[relevant_columns]

display(ling_features_finalized)

Unnamed: 0,Coherence,n_words,POS:NOUN,POS:PRON,POS:CCONJ,POS:SCONJ,POS:VERB,open_closed_ratio,type_token_ratio,propositional_density,n_logical_operators,log10_freq_mean,semantic_diversity_mean,semantic_thematic_distance,surprisal,coref_local,coref_global,n_constituents,mean_np_length,mean_vp_length
0,4,210.0,0.141700,0.165992,0.020243,0.024291,0.125506,0.820513,0.609524,0.356275,4.0,4.682792,2.107979,0.198185,4.895232,0.550000,0.433333,152.0,2.354430,4.632653
1,5,256.0,0.109272,0.162252,0.026490,0.016556,0.142384,0.788732,0.535156,0.380795,7.0,4.666692,2.069585,0.192592,4.183853,0.650000,0.638095,196.0,2.516129,7.161765
2,5,758.0,0.139459,0.103784,0.022703,0.028108,0.123243,0.879795,0.509235,0.375135,18.0,4.396347,2.102881,0.180181,5.146196,0.148148,0.096959,599.0,3.094595,7.650273
3,4,812.0,0.165787,0.119324,0.030623,0.007392,0.117212,1.036082,0.492611,0.376980,30.0,4.421817,2.076899,0.176816,5.596125,0.152941,0.105062,546.0,3.010526,6.168605
4,5,155.0,0.177778,0.150000,0.044444,0.033333,0.155556,0.878049,0.593548,0.338889,9.0,4.596490,2.064892,0.161092,4.702588,0.600000,0.545455,123.0,2.484375,6.775000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,3,206.0,0.165254,0.144068,0.055085,0.038136,0.144068,1.067961,0.669903,0.453390,10.0,4.404882,2.118607,0.207649,3.565189,0.461538,0.384615,153.0,2.871429,9.181818
1535,4,96.0,0.111111,0.138889,0.046296,0.037037,0.138889,1.000000,0.791667,0.481481,3.0,4.578900,2.106413,0.222453,3.443641,0.600000,0.400000,64.0,1.703704,6.481481
1536,3,101.0,0.214286,0.107143,0.035714,0.035714,0.133929,1.061224,0.801980,0.419643,4.0,4.368046,2.086000,0.172722,4.055240,0.166667,0.285714,70.0,2.424242,7.450000
1537,5,131.0,0.138158,0.151316,0.032895,0.019737,0.164474,0.830986,0.633588,0.368421,3.0,4.474590,2.176371,0.196065,3.690323,0.500000,0.600000,97.0,2.204082,5.694444


In [52]:
# Reading the local coherence results file
coherence = pd.read_csv('linguistic_features/coherence_results.csv')

# Averaging every numeric column within each 'pt' group; with as_index=False,
# 'pt' stays a regular column and the result gets a fresh 0..n-1 row index
grouped_df = coherence.groupby(by='pt', as_index=False).mean(numeric_only=True)

# Keeping only the mean 'lc' value per group, as a one-column DataFrame
# NOTE(review): rows are numbered 0..n-1 in sorted 'pt' order rather than labelled by 'pt';
# confirm this numbering matches the row index of the other feature tables,
# since they are later combined by index
local_coherence = grouped_df['lc'].to_frame(name='local_coherence')

display(local_coherence)

Unnamed: 0,local_coherence
0,0.274387
1,0.215685
2,0.161755
3,0.199047
4,0.375034
...,...
1534,0.268190
1535,0.323834
1536,0.295389
1537,0.258032


In [53]:
# Reading the text description features (first CSV column becomes the row index)
# and discarding the 'text' column so that only the remaining feature columns are kept
text_description = (
    pd.read_csv('linguistic_features/text_description_ordered.csv', index_col=0)
    .drop(columns=['text'])
)

display(text_description)

Unnamed: 0,dependency_distance_mean,prop_adjacent_dependency_relation_mean,first_order_coherence,second_order_coherence
0,1.984713,0.442565,0.366634,0.316578
1,2.568674,0.377021,0.310618,0.329741
2,2.731996,0.413946,0.352635,0.331568
3,2.332504,0.423813,0.308754,0.263125
4,2.442240,0.426583,0.318063,0.399475
...,...,...,...,...
1534,2.708896,0.422875,0.377143,0.412284
1535,2.563026,0.405010,0.401308,0.352386
1536,2.669339,0.444015,0.319754,0.358499
1537,2.715251,0.364776,0.314997,0.391217


In [54]:
# Concatenating the various feature DataFrames side by side; rows are matched on their index labels
linguistic_feature_set = pd.concat([ling_features_finalized, local_coherence, text_description], axis = 1)

# Normalizing the number of constituents by the number of words in a transcript
# (number of high-level constituents per word).
# A row with n_words == 0 would produce +/-inf here; those values are converted to NaN
# so that the row is removed below instead of propagating into the z-scoring step.
constituents_per_word = linguistic_feature_set['n_constituents'] / linguistic_feature_set['n_words']
linguistic_feature_set['n_constituents'] = constituents_per_word.replace(
    [float('inf'), float('-inf')], float('nan')
)

# Dropping any rows with NaN values to avoid issues in classification.
# This runs after the division so that invalid ratios are removed as well.
linguistic_feature_set = linguistic_feature_set.dropna()

In [55]:
# Setting the Coherence column aside so that it is not rescaled
coherence_df = linguistic_feature_set['Coherence']

# Z-scoring every remaining column independently (one zscore call per column)
linguistic_feature_set_norm = linguistic_feature_set.drop(columns=['Coherence']).apply(zscore)

# Putting the unscaled Coherence column back in front of the normalized features
linguistic_feature_set_final = pd.concat(
    [coherence_df, linguistic_feature_set_norm],
    axis='columns',
)

In [56]:
# Lifting the column limit so that every column of the wide table is rendered
# (this changes the pandas display option for the rest of the session)
pd.options.display.max_columns = None

display(linguistic_feature_set_final)

Unnamed: 0,Coherence,n_words,POS:NOUN,POS:PRON,POS:CCONJ,POS:SCONJ,POS:VERB,open_closed_ratio,type_token_ratio,propositional_density,n_logical_operators,log10_freq_mean,semantic_diversity_mean,semantic_thematic_distance,surprisal,coref_local,coref_global,n_constituents,mean_np_length,mean_vp_length,local_coherence,dependency_distance_mean,prop_adjacent_dependency_relation_mean,first_order_coherence,second_order_coherence
0,4,0.031521,0.007830,0.624773,-0.641477,0.204631,-0.063119,0.007916,0.012986,-0.134134,-0.465670,0.230932,0.281715,0.143735,0.759521,0.538403,0.276915,-0.555212,-0.328190,-0.997350,-0.262331,-1.087732,0.369707,-0.390776,-0.751544
1,5,0.371692,-0.719002,0.537476,-0.209887,-0.342357,0.512165,-0.154417,-0.555387,0.228985,0.076432,0.161142,-0.545229,-0.059321,-0.213751,0.915415,1.115458,0.105653,-0.162578,-0.006094,-0.761177,0.514575,-1.412746,-0.927500,-0.630440
2,5,4.083993,-0.042397,-0.827151,-0.471540,0.474520,-0.140248,0.310729,-0.753499,0.145170,2.064141,-1.010760,0.171913,-0.509978,1.102876,-0.976627,-1.100609,0.494636,0.429886,0.185372,-1.219473,0.962705,-0.408579,-0.524902,-0.613629
3,4,4.483324,0.547678,-0.464441,0.075642,-0.990419,-0.345813,1.109038,-0.880552,0.172490,4.232550,-0.900353,-0.387705,-0.632182,1.718446,-0.958557,-1.067427,-1.367487,0.343783,-0.395351,-0.902561,-0.133443,-0.140238,-0.945360,-1.243337
4,5,-0.375206,0.816435,0.251526,1.030509,0.844017,0.961112,0.301808,-0.109110,-0.391618,0.437834,-0.143172,-0.646314,-1.203129,0.495955,0.726909,0.736075,0.546963,-0.195101,-0.157682,0.592963,0.167656,-0.064916,-0.856162,0.011139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,3,0.001940,0.535744,0.113069,1.765604,1.183605,0.569554,1.271873,0.474449,1.304080,0.618535,-0.973763,0.510633,0.487392,-1.060174,0.204892,0.077405,-0.256370,0.201319,0.785644,-0.314994,0.899323,-0.165763,-0.290076,0.128986
1535,4,-0.811512,-0.677771,-0.007805,1.158446,1.105922,0.393032,0.924730,1.405061,1.720102,-0.646371,-0.219422,0.247992,1.024951,-1.226470,0.726909,0.140408,-1.458316,-0.994663,-0.272723,0.157871,0.499077,-0.651595,-0.058541,-0.422100
1536,3,-0.774537,1.634691,-0.748751,0.427377,1.012384,0.223960,1.237463,1.483884,0.804305,-0.465670,-1.133441,-0.191677,-0.780830,-0.389713,-0.906810,-0.327616,-1.041040,-0.256688,0.106877,-0.083855,0.790783,0.409152,-0.839955,-0.365860
1537,5,-0.552686,-0.071569,0.282236,0.232586,-0.117447,1.265085,0.061412,0.196901,0.045738,-0.646371,-0.671588,1.754793,0.066776,-0.888973,0.349897,0.959450,-0.292094,-0.482177,-0.581193,-0.401311,0.916759,-1.745756,-0.885538,-0.064838


In [57]:
# Writing the final linguistic feature set to disk; the row index is written as the first column
linguistic_feature_set_final.to_csv(
    path_or_buf='linguistic_features/linguistic_feature_set.csv',
    index=True,
)