In [60]:
import webvtt
import spacy
import pandas as pd

# Load the "en_core_web_sm" spaCy pipeline into the module-level `nlp` object;
# the analysis functions in this cell call `nlp(text)` to tokenize caption text.
nlp = spacy.load("en_core_web_sm")

# Pronoun vocabularies, one list per grammatical person.
# Every entry must be lowercase: count_pronouns lowercases each token before the
# membership test, so an uppercase entry such as "I" could never be matched.
# NOTE(review): the first-person list has no "mine", "our" or "ours" although the
# other lists include possessive forms -- confirm whether that is intentional.
first_person_pronouns = ["i", "me", "my", "we", "us"]
second_person_pronouns = ["you", "your", "yours"]
third_person_pronouns = ["he", "him", "his", "she", "her", "hers", "they", "them", "their", "theirs"]

# Function to count pronouns
def count_pronouns(text):
    """Count first-, second- and third-person pronouns in ``text``.

    The text is tokenized with the module-level spaCy pipeline ``nlp`` and every
    token is matched case-insensitively against the module-level pronoun lists.

    Args:
        text: Caption text to analyse.

    Returns:
        Tuple ``(first_person, second_person, third_person, total_pronouns)``;
        the total is the sum of the three per-person counts.
    """
    # Lowercase the vocabularies as well as the tokens. Tokens are lowercased
    # before the lookup, so a list entry written as "I" would otherwise never
    # match and the first-person count would silently miss every "I".
    first_vocab = {word.lower() for word in first_person_pronouns}
    second_vocab = {word.lower() for word in second_person_pronouns}
    third_vocab = {word.lower() for word in third_person_pronouns}

    first_person = second_person = third_person = 0
    for token in nlp(text):
        word = token.text.lower()
        # Independent checks (not elif): a word listed in more than one
        # vocabulary is counted once per vocabulary that contains it.
        if word in first_vocab:
            first_person += 1
        if word in second_vocab:
            second_person += 1
        if word in third_vocab:
            third_person += 1

    total_pronouns = first_person + second_person + third_person
    return first_person, second_person, third_person, total_pronouns

# Function to parse VTT file and store results in a DataFrame
def analyze_vtt(file_path):
    """Build a per-caption table of pronoun counts for one VTT file.

    Each caption read from ``file_path`` becomes one row holding the file path,
    the caption's start and end values, and the four counts returned by
    ``count_pronouns`` for the caption's stripped text.

    Args:
        file_path: Path of the VTT file; stored verbatim in the "File Name" column.

    Returns:
        A pandas DataFrame with one row per caption.
    """
    def caption_row(cue):
        # One record per caption: identifying columns first, then the counts.
        n_first, n_second, n_third, n_total = count_pronouns(cue.text.strip())
        return {
            "File Name": file_path,
            "Start Time": cue.start,
            "End Time": cue.end,
            "First-Person Pronouns": n_first,
            "Second-Person Pronouns": n_second,
            "Third-Person Pronouns": n_third,
            "Total Pronouns": n_total,
        }

    return pd.DataFrame([caption_row(cue) for cue in webvtt.read(file_path)])

# Function to handle multiple VTT files
def analyze_multiple_vtt(files, analyzer=None):
    """Run a per-file analyzer over several VTT files and stack the results.

    Args:
        files: Iterable of VTT file paths.
        analyzer: Callable that takes one file path and returns a DataFrame.
            When omitted, ``analyze_vtt`` (pronoun counts) is used, so existing
            single-argument calls behave as before.

    Returns:
        A single DataFrame holding the rows of every per-file DataFrame in the
        order the files were given, re-indexed from 0. An empty DataFrame is
        returned when ``files`` is empty.
    """
    if analyzer is None:
        # Looked up at call time so the default does not have to exist when this
        # function is defined.
        analyzer = analyze_vtt

    per_file_frames = [analyzer(file_path) for file_path in files]

    # pd.concat raises ValueError when given no objects; treat an empty file
    # list as a valid input that simply produces no rows.
    if not per_file_frames:
        return pd.DataFrame()

    return pd.concat(per_file_frames, ignore_index=True)

# Example usage: count pronouns for every caption of the listed transcripts
# and show the combined table as the cell's output.
vtt_files = [
    "HAKA3_copy.vtt",
    "meeting1_copy.vtt",
    "meeting2_copy.vtt",
]
pronoun_df = analyze_multiple_vtt(vtt_files)
pronoun_df


Unnamed: 0,File Name,Start Time,End Time,First-Person Pronouns,Second-Person Pronouns,Third-Person Pronouns,Total Pronouns
0,HAKA3_copy.vtt,00:00:04.000,00:00:08.000,0,1,0,1
1,HAKA3_copy.vtt,00:00:08.000,00:00:12.000,0,1,0,1
2,HAKA3_copy.vtt,00:00:16.000,00:00:19.000,0,1,0,1
3,HAKA3_copy.vtt,00:00:19.000,00:00:26.000,0,1,0,1
4,HAKA3_copy.vtt,00:00:26.000,00:00:30.000,1,1,0,2
5,HAKA3_copy.vtt,00:00:30.000,00:00:32.000,0,0,0,0
6,HAKA3_copy.vtt,00:00:32.000,00:00:38.000,0,1,0,1
7,HAKA3_copy.vtt,00:00:38.000,00:00:41.000,0,0,0,0
8,HAKA3_copy.vtt,00:00:41.000,00:00:51.000,0,0,0,0
9,HAKA3_copy.vtt,00:00:51.000,00:00:55.000,0,0,0,0


In [61]:
import webvtt
import spacy
import pandas as pd

# Load spaCy for NLP processing
# NOTE(review): the same "en_core_web_sm" pipeline is already loaded into `nlp`
# by the earlier cell (In [60]); this reload is redundant when the cells are run
# in order and only matters if this cell is executed on its own.
nlp = spacy.load("en_core_web_sm")

# Function to count syntactic complexity features
def count_syntactic_complexity(text):
    """Compute five token-level complexity indicators for ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Args:
        text: Caption text to analyse.

    Returns:
        Tuple ``(sentence_length, dependent_clauses, noun_phrases, verbs,
        sentence_depth)`` where

        * ``sentence_length`` is the number of tokens in the parsed text,
        * ``dependent_clauses`` counts tokens whose dependency label is one of
          ccomp, xcomp, acl, advcl or relcl,
        * ``noun_phrases`` is the number of noun chunks,
        * ``verbs`` counts tokens whose part-of-speech tag is VERB,
        * ``sentence_depth`` counts tokens whose dependency label is punct;
          it is a punctuation count used as a rough stand-in for depth, not a
          measurement of the parse tree.
    """
    parsed = nlp(text)
    clause_labels = ('ccomp', 'xcomp', 'acl', 'advcl', 'relcl')

    # Tally the three token-level counts in a single walk over the tokens.
    n_clauses = 0
    n_verbs = 0
    n_punct = 0
    for tok in parsed:
        if tok.dep_ in clause_labels:
            n_clauses += 1
        if tok.pos_ == 'VERB':
            n_verbs += 1
        if tok.dep_ == 'punct':
            n_punct += 1

    n_tokens = len(parsed)
    n_chunks = sum(1 for _ in parsed.noun_chunks)

    return n_tokens, n_clauses, n_chunks, n_verbs, n_punct

# Parse VTT file and extract text for syntactic complexity analysis
def analyze_vtt_syntactic_complexity(file_path):
    """Build a per-caption table of syntactic-complexity features for one VTT file.

    Each caption read from ``file_path`` becomes one row holding the file path,
    the caption's start and end values, and the five values returned by
    ``count_syntactic_complexity`` for the caption's stripped text.

    Args:
        file_path: Path of the VTT file; stored verbatim in the "File Name" column.

    Returns:
        A pandas DataFrame with one row per caption.
    """
    def caption_row(cue):
        # One record per caption: identifying columns first, then the features.
        n_tokens, n_clauses, n_chunks, n_verbs, n_punct = count_syntactic_complexity(cue.text.strip())
        return {
            "File Name": file_path,
            "Start Time": cue.start,
            "End Time": cue.end,
            "Sentence Length": n_tokens,
            "Dependent Clauses": n_clauses,
            "Noun Phrases": n_chunks,
            "Verbs": n_verbs,
            "Sentence Depth": n_punct,
        }

    return pd.DataFrame([caption_row(cue) for cue in webvtt.read(file_path)])

# Function to handle multiple VTT files
def analyze_multiple_vtt(files, analyzer=None):
    """Run a per-file analyzer over several VTT files and stack the results.

    Args:
        files: Iterable of VTT file paths.
        analyzer: Callable that takes one file path and returns a DataFrame.
            When omitted, ``analyze_vtt_syntactic_complexity`` is used, so
            existing single-argument calls behave as before.

    Returns:
        A single DataFrame holding the rows of every per-file DataFrame in the
        order the files were given, re-indexed from 0. An empty DataFrame is
        returned when ``files`` is empty.
    """
    if analyzer is None:
        # Looked up at call time so the default does not have to exist when this
        # function is defined.
        analyzer = analyze_vtt_syntactic_complexity

    per_file_frames = [analyzer(file_path) for file_path in files]

    # pd.concat raises ValueError when given no objects; treat an empty file
    # list as a valid input that simply produces no rows.
    if not per_file_frames:
        return pd.DataFrame()

    return pd.concat(per_file_frames, ignore_index=True)

# Example usage: compute the complexity features for every caption of the
# listed transcripts and show the combined table as the cell's output.
vtt_files = [
    "HAKA3_copy.vtt",
    "meeting1_copy.vtt",
    "meeting2_copy.vtt",
]
complexity_df = analyze_multiple_vtt(vtt_files)
complexity_df


Unnamed: 0,File Name,Start Time,End Time,Sentence Length,Dependent Clauses,Noun Phrases,Verbs,Sentence Depth
0,HAKA3_copy.vtt,00:00:04.000,00:00:08.000,14,0,2,0,4
1,HAKA3_copy.vtt,00:00:08.000,00:00:12.000,28,0,8,0,8
2,HAKA3_copy.vtt,00:00:16.000,00:00:19.000,11,0,4,0,3
3,HAKA3_copy.vtt,00:00:19.000,00:00:26.000,19,0,5,0,4
4,HAKA3_copy.vtt,00:00:26.000,00:00:30.000,13,1,5,1,4
5,HAKA3_copy.vtt,00:00:30.000,00:00:32.000,12,0,3,0,4
6,HAKA3_copy.vtt,00:00:32.000,00:00:38.000,16,0,4,2,4
7,HAKA3_copy.vtt,00:00:38.000,00:00:41.000,13,0,4,1,2
8,HAKA3_copy.vtt,00:00:41.000,00:00:51.000,26,2,7,1,5
9,HAKA3_copy.vtt,00:00:51.000,00:00:55.000,12,0,1,0,5


In [62]:
# Join the pronoun counts with the syntactic-complexity features. Rows are
# matched on file name plus start/end time; the outer join keeps rows that
# appear in only one of the two tables.
# NOTE(review): rows sharing the same three key values in both tables would be
# paired with each other -- confirm the keys are unique per caption.
pronoun_and_complexity_df = pronoun_df.merge(
    complexity_df,
    how="outer",
    on=["Start Time", "End Time", "File Name"],
)

In [63]:
# Remove the identifier columns so only the numeric feature columns remain.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution finds the columns already gone and, instead
# of raising KeyError, leaves the table unchanged.
pronoun_and_complexity_df = pronoun_and_complexity_df.drop(
    columns=['Start Time', 'End Time', 'File Name'],
    errors='ignore',
)

In [64]:
# Display the merged pronoun/complexity feature table as the cell output.
pronoun_and_complexity_df

Unnamed: 0,First-Person Pronouns,Second-Person Pronouns,Third-Person Pronouns,Total Pronouns,Sentence Length,Dependent Clauses,Noun Phrases,Verbs,Sentence Depth
0,0,0,0,0,8,0,2,0,2
1,0,0,0,0,8,0,2,0,2
2,0,1,0,1,14,0,2,0,4
3,0,0,0,0,14,2,2,3,2
4,0,0,0,0,14,2,2,3,2
5,0,1,0,1,28,0,8,0,8
6,0,0,0,0,9,1,2,2,2
7,0,0,0,0,9,1,2,2,2
8,0,1,0,1,6,0,2,1,2
9,0,1,0,1,6,0,2,1,2


In [66]:
# Write the merged feature table to a CSV file in the working directory;
# index=False leaves the DataFrame's row index out of the file.
pronoun_and_complexity_df.to_csv('pronoun_and_complexity_df.csv', index=False)

Stored 'pronoun_and_complexity_df' (DataFrame)
