In [88]:
# This cell can be skipped if the needed libraries are already installed
# !pip install transformers
# !pip install minicons

In [155]:
# Importing necessary libraries
import warnings
warnings.filterwarnings("ignore", message="NotOpenSSLWarning")
warnings.filterwarnings("ignore", message = r"\[W007\].*", category = UserWarning)
from typing import Tuple
import numpy as np
import pandas as pd
import re
import spacy
import gensim.downloader as api
from gensim.models import KeyedVectors
from minicons import scorer
import csv
import textdescriptives as td
import contractions
import stanza

### Data Loading
This section loads the narrative data necessary for the remainder of the script, including the text transcripts in a variety of formats (cleaned with punctuation, cleaned without punctuation, or separated by utterance). Additionally, it loads the language models and datasets for lexical feature analysis (SpaCy, word2vec, and minicons GPT2 surprisal scorer).

In [90]:
# Loading the processed transcripts (cleaned and formatted, but retains original punctuation for SpaCy tokenization)
spacy_transcripts = pd.read_csv('processed_data/transcripts_spacy_formatted.csv')

# Creating a linguistic feature DataFrame that includes coherence scores
linguistic_features = spacy_transcripts[['Coherence']].copy()

In [91]:
# Loading the transcripts with no punctuation
transcripts_no_punc = pd.read_csv('processed_data/transcripts_no_punc_formatted.csv', index_col = 0)

In [92]:
# Loading the utterance-based transcriptions into a list
story_lists = []
with open('processed_data/sentences.csv', 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        story_lists.append(row)

In [93]:
# Loading the SpaCy language processing model
nlp = spacy.load("en_core_web_sm")

In [94]:
# Loading the word2vec model
path = api.load("word2vec-google-news-300", return_path=True)
sem_sim_model = KeyedVectors.load_word2vec_format(path, binary=True)

In [95]:
# Loading the model for surprisal
surprisal_model = scorer.IncrementalLMScorer('gpt2')

In [119]:
# Loading the stanza model
stanza.download('en')
stanza_model = stanza.Pipeline(lang='en', processors='tokenize, pos, constituency')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2025-12-03 13:34:15 INFO: Downloaded file to /Users/emicatx/stanza_resources/resources.json
2025-12-03 13:34:15 INFO: Downloading default packages for language: en (English) ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025-12-03 13:34:17 INFO: File exists: /Users/emicatx/stanza_resources/en/default.zip
2025-12-03 13:34:18 INFO: Finished downloading models and saved to /Users/emicatx/stanza_resources
2025-12-03 13:34:18 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2025-12-03 13:34:18 INFO: Downloaded file to /Users/emicatx/stanza_resources/resources.json
2025-12-03 13:34:19 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2025-12-03 13:34:19 INFO: Using device: cpu
2025-12-03 13:34:19 INFO: Loading: tokenize
2025-12-03 13:34:19 INFO: Loading: mwt
2025-12-03 13:34:19 INFO: Loading: pos
2025-12-03 13:34:20 INFO: Loading: constituency
2025-12-03 13:34:20 INFO: Done loading processors!


### Feature Calculations
This section contains functions and code to calculate a series of linguistic features, particularly lexical and macrolinguistic metrics. It includes the following features (see [this document](README.md) for a more comprehensive description):
- *Part-of-Speech Proportions*: number of words tagged as a particular part of speech (adjectives, adpositions, adverbs, auxiliaries, coordinating and subordinating conjunctions, determiners, interjections, nouns, numerals, particles, pronouns, proper nouns, verbs, punctuation, spaces, symbols, and unidentified characters) divided by total number of words
- *Open-Closed Class Ratio*: ratio of number of open-class words (nouns, verbs, adjectives, and adverbs) to number of closed-class words (adpositions, auxiliaries, coordinating and subordinating conjunctions, determiners, interjections, numerals, particles, and pronouns)
- *Propositional Density*: number of words introducing or carrying ideas (adjectives, adverbs, adpositions, coordinating and subordinating conjunctions, verbs) divided by total number of words
- *Number of Words*: total number of words in a transcript
- *Number of Utterances*: total number of utterances in a transcript
- *Mean Words per Utterance*: mean number of words per utterance in a transcript
- *Number of Logical Operators*: total number of logical operators (and, or, not, if...then) in a transcript
- *Type-Token Ratio*: number of unique words divided by total number of words
- *Semantic Diversity*: ***finish this***
- *Age of Acquisition*: approximate age at which a word is learned, averaged across all words in a transcript with provided ratings
- *Concreteness*: the extent to which the concept a word denotes refers to a perceptible entity ***citation!!!***, averaged across all words in a transcript with provided ratings
- *Semantic Thematic Distance*: ***finish this***
- *Surprisal*: degree to which a word is unexpected based on prior words, averaged across all words in a transcript

In [96]:
def part_of_speech_features(input_df: pd.DataFrame, feature_df: pd.DataFrame) -> pd.DataFrame:

    """ 
        Calculates the proportion of part-of-speech tags, open-closed class ratio, and propositional density
        in a transcript
        
        Parameters 
        ----------
        input_df: pd.DataFrame
            Input DataFrame with column storing transcripts as 'Text'
        feature_df: pd.DataFrame
            DataFrame storing coherence scores and linguistic features

        Returns
        -------
        pd.DataFrame
            Feature DataFrame with columns for part-of-speech proportions, open-closed class ratio, and 
            propositional density
    """

    pos_tags = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SPACE', 'SYM', 'VERB', 'X']
    open_class_tags = ['NOUN', 'VERB', 'ADJ', 'ADV']
    closed_class_tags = ['ADP', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NUM', 'PART', 'PRON', 'SCONJ']
    propositional_density_tags = ['ADJ', 'ADV', 'ADP', 'CCONJ', 'SCONJ', 'VERB']

    for i in feature_df.index:  

        string = input_df['Text'].iloc[i]
        doc = nlp(string)
        testing = []
        open_class = 0
        closed_class = 0
        prop_density = 0

        for token in doc:
            testing.append(token.pos_)
            if token.pos_ in open_class_tags:
                open_class += 1
            elif token.pos_ in closed_class_tags:
                closed_class += 1
            if token.pos_ in propositional_density_tags:
                prop_density += 1
        
        for _, pos in enumerate(pos_tags):
            try:
                feature_df.loc[i, f'POS:{pos}'] = testing.count(pos)/len(testing) # TODO: make sure this covers everything, and add checks
            except ZeroDivisionError:
                print("No tokens have been identified in the input")

        try:
            feature_df.loc[i, 'open_closed_ratio'] = open_class / closed_class
        except ZeroDivisionError:
            print(f"Index {i} has no closed class words; check input")

        feature_df.loc[i, 'propositional_density'] = prop_density / len(testing)

    return feature_df

In [97]:
# Applying the part-of-speech feature function to the SpaCy-formatted transcripts (cleaned with punctuation)
linguistic_features = part_of_speech_features(spacy_transcripts ,linguistic_features)

In [98]:
def utterance_number_features(input_lists: list, feature_df: pd.DataFrame) -> pd.DataFrame:

    """ 
        Calculates the number of utterances and the mean number of words per utterance in a transcript
        
        Parameters 
        ----------
        input_lists: list
            Input lists with each sub-list containing story transcripts separated by utterance
        feature_df: pd.DataFrame
            DataFrame storing coherence scores and linguistic features
            
        Returns
        -------
        pd.DataFrame
            Feature DataFrame with columns for number of utterances and mean number of words per utterance
    """

    for i, story in enumerate(input_lists):
        
        feature_df.loc[i, 'n_utterances'] = len(story)

        sentence_lengths = []
        for sentence in story:
            replaced_sentence = re.sub(r'[^A-Za-z0-9\s]', '', sentence)
            replaced_sentence = re.sub(r' +|\n', ' ', replaced_sentence)
            split_sentence = replaced_sentence.split(' ')
            split_sentence = [word for word in split_sentence if word != '']
            length = len(split_sentence)
            sentence_lengths.append(length)

        sentence_array = np.array(sentence_lengths)
        mean = np.nanmean(sentence_array)
        feature_df.loc[i, 'n_words_per_utterance'] = mean

    return feature_df


In [99]:
linguistic_features = utterance_number_features(story_lists, linguistic_features)
display(linguistic_features)

Unnamed: 0,Coherence,POS:ADJ,POS:ADP,POS:ADV,POS:AUX,POS:CCONJ,POS:DET,POS:INTJ,POS:NOUN,POS:NUM,...,POS:PUNCT,POS:SCONJ,POS:SPACE,POS:SYM,POS:VERB,POS:X,open_closed_ratio,propositional_density,n_utterances,n_words_per_utterance
0,4,0.044534,0.064777,0.076923,0.068826,0.020243,0.093117,0.000000,0.141700,0.008097,...,0.125506,0.024291,0.0,0.0,0.125506,0.0,0.820513,0.356275,21.0,10.000000
1,5,0.043046,0.076159,0.076159,0.059603,0.026490,0.076159,0.000000,0.109272,0.009934,...,0.125828,0.016556,0.0,0.0,0.142384,0.0,0.788732,0.380795,21.0,12.190476
2,5,0.054054,0.091892,0.055135,0.060541,0.022703,0.077838,0.002162,0.139459,0.009730,...,0.152432,0.028108,0.0,0.0,0.123243,0.0,0.879795,0.375135,82.0,9.243902
3,4,0.071806,0.080253,0.069694,0.055966,0.030623,0.080253,0.001056,0.165787,0.013728,...,0.141499,0.007392,0.0,0.0,0.117212,0.0,1.036082,0.376980,86.0,9.441860
4,5,0.033333,0.038889,0.033333,0.061111,0.044444,0.094444,0.000000,0.177778,0.016667,...,0.138889,0.033333,0.0,0.0,0.155556,0.0,0.878049,0.338889,11.0,14.090909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,3,0.084746,0.059322,0.072034,0.067797,0.055085,0.042373,0.000000,0.165254,0.004237,...,0.097458,0.038136,0.0,0.0,0.144068,0.0,1.067961,0.453390,14.0,14.714286
1535,4,0.074074,0.055556,0.129630,0.074074,0.046296,0.055556,0.000000,0.111111,0.000000,...,0.092593,0.037037,0.0,0.0,0.138889,0.0,1.000000,0.481481,6.0,16.000000
1536,3,0.071429,0.098214,0.044643,0.044643,0.035714,0.107143,0.000000,0.214286,0.008929,...,0.098214,0.035714,0.0,0.0,0.133929,0.0,1.061224,0.419643,7.0,14.428571
1537,5,0.032895,0.065789,0.052632,0.072368,0.032895,0.078947,0.000000,0.138158,0.013158,...,0.118421,0.019737,0.0,0.0,0.164474,0.0,0.830986,0.368421,11.0,11.909091


In [100]:
def logical_operators(input_lists: list, feature_df: pd.DataFrame) -> pd.DataFrame:
    
    """ 
        Calculates the sum of logical operators (and, or, not, if...then) in a transcript
        
        Parameters 
        ----------
        input_lists: list
            Input lists with each sub-list containing story transcripts separated by utterance
        feature_df: pd.DataFrame
            DataFrame storing coherence scores and linguistic features
            
        Returns
        -------
        pd.DataFrame
            Feature DataFrame with column for the number of logical operators
    """

    logical_operator_list = [
        (r'^And'),
        (r' and[!?\-,.:;\'"\s]'),
        (r' not[!?\-,.:;\'"\s]'),
        (r' or[!?\-,.:;\'"\s]'),
        (r' (if) (.*?)(then)[!?\-…,.:;\'"\s]')
    ]

    for i, story in enumerate(input_lists):
        logical_operator_number = []
        for string in story:
            for pattern in logical_operator_list:
                match = re.findall(pattern, string, flags = re.IGNORECASE)
                n_operators = len(match)
                logical_operator_number.append(n_operators)
        feature_df.loc[i, 'n_logical_operators'] = np.sum(logical_operator_number)

    return feature_df

In [101]:
linguistic_features = logical_operators(story_lists, linguistic_features)

In [102]:
def string_ttr(string: str) -> Tuple[float, float]:

    """ 
        Calculates the type-token ratio (number of unique words divided by total number of words)
        of a string
        
        Parameters 
        ----------
        string: str
            String containing text without punctuation
            
        Returns
        -------
        Tuple[float, float]
            Value of type-token ratio and number of tokens in the string

        Raises
        ------
        ValueError 
            If type-token ratio is greater than 1
        ValueError
            If type-token ratio is negative
    """

    string_lower = string.lower()
    string_split = string_lower.split(' ')
    string_set = set(string_split)
    n_tokens = len(string_split)
    types = len(string_set)

    try:
        ttr = types / n_tokens
    except ZeroDivisionError:
        print(f"Input string has no tokens (empty string)")

    if ttr > 1:
        raise ValueError('Type-token ratio cannot exceed 1')
    if ttr < 0:
        raise ValueError('Type-token ratio cannot be negative')

    return ttr, n_tokens


def word_count_features(input_df: pd.DataFrame, feature_df: pd.DataFrame) -> pd.DataFrame:

    """ 
        Calculates the number of words and type-token ratio for each transcript
        
        Parameters 
        ----------
        input_df: pd.DataFrame
            Input DataFrame with column storing transcripts as 'Text'
        feature_df: pd.DataFrame
            DataFrame storing coherence scores and linguistic features

        Returns
        -------
        pd.DataFrame
            Feature DataFrame with columns for type-token ratio and total number of words
    """

    for i in input_df.index:
        string = input_df['Text'].iloc[i]
        feature_df.loc[i, 'type_token_ratio'], feature_df.loc[i, 'n_words'] = string_ttr(string)

    return feature_df

In [103]:
linguistic_features = word_count_features(transcripts_no_punc, linguistic_features)

In [104]:
def lexical_features(dictionary: dict, feature: str, feature_df: pd.DataFrame, input_df: pd.DataFrame = transcripts_no_punc) -> pd.DataFrame:

    """ 
        Base function for calculating a series of lexical features (used for concreteness, age of 
        acquisition, word frequency, and semantic diversity)
        
        Parameters 
        ----------
        dictionary: dict:
            Dictionary containing words and associated pre-defined values, imported from
            external sources
        feature: str
            Feature used to name column of feature DataFrame
        feature_df: pd.DataFrame
            DataFrame storing coherence scores and linguistic features
        input_df: pd.DataFrame
            Input DataFrame with column storing transcripts as 'Text', defaults to transcripts
            with no punctuation

        Returns
        -------
        pd.DataFrame
            Feature DataFrame with columns for the relevant feature
    """

    for i in input_df.index:
        string = input_df['Text'].iloc[i]
        string_split = (string.lower()).split(' ')
        values = []

        for item in string_split:
            value = dictionary.get(item)

            if value is not None:
                values.append(value)
        mean = np.nanmean(np.array(values))

        feature_df.loc[i, feature] = mean

    return feature_df

In [105]:
# Word Frequency (logarithm)

frequency = pd.read_csv('lexical_feature_data/SUBTLEXusExcel2007.csv')
frequency_dict_original = dict(zip(frequency['Word'], frequency['Lg10WF']))
frequency_dict = {(word.lower() if isinstance(word, str) else word): value for word, value in frequency_dict_original.items()}

linguistic_features = lexical_features(frequency_dict, 'log10_freq_mean', linguistic_features)

In [106]:
# Semantic Diversity

semantic_diversity = pd.read_csv('lexical_feature_data/13428_2012_278_MOESM1_ESM.csv', dtype = str)
semantic_diversity = (semantic_diversity.rename(columns = {'Supplementary Materials: SemD values': 'word', 'Unnamed: 2': 'sem_diversity'})).drop(0)
semantic_diversity_dict_original = dict(zip(semantic_diversity['word'], semantic_diversity['sem_diversity']))
semantic_diversity_dict = {(word.lower() if isinstance(word, str) else word): (float(value)) for word, value in semantic_diversity_dict_original.items()}


linguistic_features = lexical_features(semantic_diversity_dict, 'semantic_diversity_mean', linguistic_features)

In [107]:
# Age of Acquisition

aoa = pd.read_csv('lexical_feature_data/AoA_ratings_Kuperman_et_al_BRM.txt', sep = '\t')
aoa_dict_original = dict(zip(aoa['Word'], aoa['Rating.Mean']))
aoa_dict = {(word.lower() if isinstance(word, str) else word): (float(value)) for word, value in aoa_dict_original.items()}

linguistic_features = lexical_features(aoa_dict, 'aoa_mean', linguistic_features)

In [108]:
# Concreteness

concreteness = pd.read_csv('lexical_feature_data/concreteness_ratings.csv')
concreteness_dict_original = dict(zip(concreteness['Word'], concreteness['Conc.M']))
concreteness_dict = {(word.lower() if isinstance(word, str) else word): (float(value)) for word, value in concreteness_dict_original.items()}

linguistic_features = lexical_features(concreteness_dict, 'concreteness_mean', linguistic_features)

In [109]:
def semantic_thematic_distance(input_df: pd.DataFrame, feature_df: pd.DataFrame) -> pd.DataFrame:

    """ 
        Calculates semantic thematic distance (cosine similarity between each adjacent word pair
        in a transcript)
        
        Parameters 
        ----------
        input_df: pd.DataFrame
            Input DataFrame with column storing transcripts as 'Text'
        feature_df: pd.DataFrame
            DataFrame storing coherence scores and linguistic features

        Returns
        -------
        pd.DataFrame
            Feature DataFrame with column for semantic thematic distance
        """
   
    for i in input_df.index:  

        string = input_df['Text'].iloc[i]
        string_split = (string.lower()).split(' ')
        sem_sim = []

        for j in range(len(string_split) - 1):
            try:
                sem_sim_value = sem_sim_model.similarity(string_split[j], string_split[j + 1])
            except KeyError:
                continue
            
            sem_sim.append(float(sem_sim_value))
        
        sem_thematic_distance = np.nanmean(np.array(sem_sim))

        feature_df.loc[i, 'semantic_thematic_distance'] = sem_thematic_distance

    return feature_df

In [110]:
linguistic_features = semantic_thematic_distance(transcripts_no_punc, linguistic_features)

In [111]:
def surprisal(input_lists: list, feature_df: pd.DataFrame) -> pd.DataFrame:

    """ 
        Calculates average surprisal for a transcript
        
        Parameters 
        ----------
        input_lists: list
            Input lists with each sub-list containing story transcripts separated by utterance
        feature_df: pd.DataFrame
            DataFrame storing coherence scores and linguistic features

        Returns
        -------
        pd.DataFrame
            Feature DataFrame with column for surprisal
        """
    
    for i, story in enumerate(input_lists):
        transcript_surprisal = []
        
        for sentence in story:
            sentence_surprisal = surprisal_model.sequence_score(sentence, reduction = lambda x: -x.mean(0).item())
            transcript_surprisal.append(sentence_surprisal)
        
        transcript_surprisal_array = np.array(transcript_surprisal)
        surprisal_mean = np.nanmean(transcript_surprisal_array)

        feature_df.loc[i, 'surprisal'] = surprisal_mean
    
    return feature_df

In [112]:
linguistic_features = surprisal(story_lists, linguistic_features)

In [113]:
display(linguistic_features)

Unnamed: 0,Coherence,POS:ADJ,POS:ADP,POS:ADV,POS:AUX,POS:CCONJ,POS:DET,POS:INTJ,POS:NOUN,POS:NUM,...,n_words_per_utterance,n_logical_operators,type_token_ratio,n_words,log10_freq_mean,semantic_diversity_mean,aoa_mean,concreteness_mean,semantic_thematic_distance,surprisal
0,4,0.044534,0.064777,0.076923,0.068826,0.020243,0.093117,0.000000,0.141700,0.008097,...,10.000000,4.0,0.609524,210.0,4.682792,2.107979,4.616221,2.597958,0.198185,4.895232
1,5,0.043046,0.076159,0.076159,0.059603,0.026490,0.076159,0.000000,0.109272,0.009934,...,12.190476,7.0,0.535156,256.0,4.666692,2.069585,4.606250,2.615833,0.192592,4.183853
2,5,0.054054,0.091892,0.055135,0.060541,0.022703,0.077838,0.002162,0.139459,0.009730,...,9.243902,18.0,0.509235,758.0,4.396347,2.102881,5.347357,2.420347,0.180181,5.146196
3,4,0.071806,0.080253,0.069694,0.055966,0.030623,0.080253,0.001056,0.165787,0.013728,...,9.441860,30.0,0.492611,812.0,4.421817,2.076899,5.135905,2.673849,0.176816,5.596125
4,5,0.033333,0.038889,0.033333,0.061111,0.044444,0.094444,0.000000,0.177778,0.016667,...,14.090909,9.0,0.593548,155.0,4.596490,2.064892,5.063814,2.591143,0.161092,4.702588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,3,0.084746,0.059322,0.072034,0.067797,0.055085,0.042373,0.000000,0.165254,0.004237,...,14.714286,10.0,0.669903,206.0,4.404882,2.118607,5.127013,2.415114,0.207649,3.565189
1535,4,0.074074,0.055556,0.129630,0.074074,0.046296,0.055556,0.000000,0.111111,0.000000,...,16.000000,3.0,0.791667,96.0,4.578900,2.106413,5.049481,2.350449,0.222453,3.443641
1536,3,0.071429,0.098214,0.044643,0.044643,0.035714,0.107143,0.000000,0.214286,0.008929,...,14.428571,4.0,0.801980,101.0,4.368046,2.086000,5.140270,2.389186,0.172722,4.055240
1537,5,0.032895,0.065789,0.052632,0.072368,0.032895,0.078947,0.000000,0.138158,0.013158,...,11.909091,3.0,0.633588,131.0,4.474590,2.176371,5.095055,2.484435,0.196065,3.690323


# ***TODO: SEGMENT OUT THESE FEATURE TYPES***

In [115]:
def noun_cohesion_arrays(input_lists: list) -> np.ndarray:
    
    # TODO: type hints, docstrings
    noun_pos = ['NOUN', 'PROPN', 'PRON']

    story_lists_updated = []
    for story in input_lists:
        story_nouns = []
        for sentence in story:
            sentence_nouns = []
            sentence_str = str(sentence)
            doc = nlp(contractions.fix(sentence_str.lower()))
            for token in doc:
                if token.pos_ in noun_pos:
                    sentence_nouns.append(str(token))
            story_nouns.append(sentence_nouns)
        story_lists_updated.append(story_nouns)

        
    all_cohesion_arrays = np.ndarray((len(input_lists),), dtype = np.ndarray)

    for k in range(len(story_lists_updated)):
        string = story_lists_updated[k]
        cohesion_array = np.zeros((len(string), len(string)))
        for i in range(len(string)):
            current_string = string[i]
            for j in range(len(string)):
                new_string = string[j]
                if any(item in new_string for item in current_string):
                    cohesion_array[i, j] = 1
        all_cohesion_arrays[k] = cohesion_array

    for array in all_cohesion_arrays: # TODO: describe check for symmetry
        if not np.array_equal(array, array.T):
            raise ValueError("Array must be symmetric; check input")

    return all_cohesion_arrays

def local_global_cohesion(input_lists: list, feature_df: pd.DataFrame) -> pd.DataFrame:

    # TODO: type hints, docstrings

    input_array = noun_cohesion_arrays(input_lists)

    for n, array in enumerate(input_array):
        counter = 0
        for i in range(array.shape[0] - 1):
                counter += array[i, i+1]
        try:
            coref_local = counter / (array.shape[0] - 1)
        except ZeroDivisionError:
            coref_local = 0 

        feature_df.loc[n, 'coref_local'] = coref_local

    feature_df = feature_df.sort_index()

    np.seterr('raise')

    for i, array in enumerate(input_array):
        n = array.shape[0]
        counter = (array.sum() - np.diag(array).sum()) / 2
        try:
            coref_global = counter / (n * (n - 1) / 2)
        except Exception as e:
            coref_global = 0 

        feature_df.loc[i, 'coref_global'] = coref_global

    return feature_df

In [116]:
linguistic_features = local_global_cohesion(story_lists, linguistic_features)

In [122]:
# Segmentation

def parser(input_df: pd.DataFrame) -> list:

    parsed_transcripts = []

    for i in input_df.index:
        string = input_df['Text'].loc[i]
        doc = stanza_model(string)
        story_updated = []
        for sentence in doc.sentences:
            tree = sentence.constituency
            string_tree = str(tree).split(' ')
            story_updated.append(string_tree)

        parsed_transcripts.append(story_updated)
    
    return parsed_transcripts

In [123]:
parsed_transcripts = parser(spacy_transcripts)

In [145]:
def constituent_counter(input_lists: list, feature_df: pd.DataFrame) -> pd.DataFrame:

    # TODO: type hints, docstrings

    counter_pos = ["(NP", "(VP", "(SBAR", "(PP"]

    for i, story in enumerate(input_lists):
        counter = 0
        type_counter = [0, 0, 0, 0]
        for sentence in story:
            for item in sentence:
                if item in counter_pos:
                    counter += 1
                    idx = counter_pos.index(item)
                    type_counter[idx] += 1

        feature_df.loc[i, 'n_constituents'] = counter
        feature_df.loc[i, 'n_noun_phrases'] = type_counter[0]
        feature_df.loc[i, 'n_verb_phrases'] = type_counter[1]
        feature_df.loc[i, 'n_sub_clauses'] = type_counter[2]
        feature_df.loc[i, 'n_prep_phrases'] = type_counter[3]

    return feature_df


def phrase_parser(input_lists: list) -> Tuple[list, list]:
    
    
    # TODO: Noun and verb phrase parser

    noun_phrases_corpus = []
    verb_phrases_corpus = []
    closing_pattern = r'\)'

    phrasal_pos = ["(NP", "(VP"]

    for story in input_lists:
        noun_phrases_overall = []
        verb_phrases_overall = []
        for sentence in story:
            noun_phrases = []
            verb_phrases = []
            for i in range(len(sentence)):
                item = sentence[i]
                if item in phrasal_pos:
                    open_counter = 1
                    for j in range(i + 1, len(sentence)):
                        first_element = sentence[j][0]
                        match = re.findall(closing_pattern, sentence[j])
                        closed = len(match)
                        if first_element == '(':
                            open_counter += 1
                        open_counter -= closed
                        if open_counter <= 0:
                            break
                    phrase = [item for item in sentence[i:j + 1] if item[-1] == ')']
                    if item == phrasal_pos[0]:
                        noun_phrases.append(phrase)
                    else:
                        verb_phrases.append(phrase)

            noun_phrases_overall.append(noun_phrases)
            verb_phrases_overall.append(verb_phrases)

        noun_phrases_corpus.append(noun_phrases_overall)
        verb_phrases_corpus.append(verb_phrases_overall)

    return noun_phrases_corpus, verb_phrases_corpus

def phrase_length(phrase_list: list) -> list:
    
    # TODO: type hints + docstrings

    phrase_lengths = []

    for story in phrase_list:
        length_list = []
        for sentence in story:
            for phrase in sentence:
                length = len(phrase)
                length_list.append(length)

        length_array = np.array(length_list)
        mean = length_array.mean()
        phrase_lengths.append(float(mean))

    return phrase_lengths

def phrase_mean_length(input_lists: list, feature_df: pd.DataFrame) -> pd.DataFrame:
    
    # TODO: type hints + docstrings

    noun_phrases, verb_phrases = phrase_parser(input_lists)

    noun_phrase_lengths = phrase_length(noun_phrases)
    verb_phrase_length = phrase_length(verb_phrases)
    
    feature_df['mean_np_length'] = noun_phrase_lengths
    feature_df['mean_vp_length'] = verb_phrase_length

    return feature_df

In [146]:
linguistic_features = constituent_counter(parsed_transcripts, linguistic_features)
linguistic_features = phrase_mean_length(parsed_transcripts, linguistic_features)

In [149]:
# Saving the linguistic feature DataFrame
linguistic_features.to_csv('linguistic_features/linguistic_features.csv')
# TODO: move anything that doesn't require the spacy ling features to another section (lexical, macrolinguistic, syntactic)

In [150]:
def text_descriptions(input_df: pd.DataFrame) -> pd.DataFrame:
    
    text_description = pd.DataFrame()

    for i in input_df.index:  
        string = input_df['Text'].loc[i]
        text_description_df = td.extract_metrics(text = string, metrics = ["readability", "dependency_distance", "coherence"], spacy_model = 'en_core_web_sm')
        text_description = pd.concat([text_description_df, text_description])

    text_description_ordered = (text_description[::-1]).reset_index()
    text_description_ordered = text_description_ordered[['text', 'dependency_distance_mean', 'prop_adjacent_dependency_relation_mean', 'first_order_coherence', 'second_order_coherence']]
    
    return text_description_ordered

In [156]:
text_description_ordered = text_descriptions(spacy_transcripts)

In [158]:
text_description_ordered.to_csv('linguistic_features/text_description_ordered.csv')