In [None]:
import pandas as pd 
import spacy 
# Load the spaCy pipeline once; `nlp` is called by every DeterminerNounConstruction method below.
nlp = spacy.load('en_core_web_lg')

In [None]:
### NOTES FOR DOCUMENTATION ###
## Overlap: the number of noun types used with both determiners (a/an and the),
## divided by the total number of noun types used with either determiner (or both determiners).
## Determiner bias: we quantify the bias toward one or the other determiner for each speaker.
## For each noun in a corpus, we count and compare the number of times the noun appears
## with a/an and the number of times it occurs with "the". The ratio of the larger of
## these two counts to their total is the measure of determiner bias.

In [None]:
# Subject IDs to analyse; each is paired with every entry of `sessions` when the
# per-(subject, session) frames are built further down.
subj_ids = [
22,24,25,27,28,29,33,37,38,39,40,42,43,44,45,47,48,49,50,51,54,55,58,
59,61,62,64,65,66,68,72,73,74,75,76,77,78,79,80,81,82,83,84,85,87,88,
89,91,92,100,102,103,105,106,107,108,109,110,119,123,124,125,126,127]

# Session numbers to analyse.
# NOTE(review): the summary cell further down rebinds `sessions` to a per-group
# result list, so this cell must be re-run before the grouping cell is re-run.
sessions=[1,2,3]


In [None]:
class DeterminerNounConstruction():
    """Rule-based classifiers for noun-phrase constructions in one utterance.

    Every public method parses its string argument and inspects the tokens'
    ``pos_``, ``dep_``, ``text`` and ``i`` attributes. Determiner matching is
    case-sensitive (``'the'`` matches, ``'The'`` does not).

    Parameters
    ----------
    pipeline : callable, optional
        Callable mapping a string to a token sequence. When omitted, the
        module-level ``nlp`` pipeline is used, so ``DeterminerNounConstruction()``
        behaves as before.
    """

    INDEFINITE_DETERMINERS = ('a', 'an')
    POSSESSIVES = ('my', 'mine', 'your', 'yours', 'their', 'theirs',
                   'his', 'her', 'hers', 'its', 'ours', 'our')
    DEMONSTRATIVES = ('this', 'that', 'these', 'those')
    # Pronouns and proper nouns are treated like nouns by adj_noun_phrase / noun_phrase.
    NOMINAL_POS = ('NOUN', 'PROPN', 'PRON')

    def __init__(self, pipeline=None):
        self._pipeline = pipeline

    def _parse(self, text):
        """Parse ``text`` with the injected pipeline, or the global ``nlp`` if none was given."""
        pipeline = getattr(self, '_pipeline', None)
        if pipeline is None:
            pipeline = nlp
        return pipeline(text)

    @staticmethod
    def _preceding_determiner(doc, token):
        """Return the DET token directly before ``token``, or None.

        The first token has no predecessor. Without this guard ``doc[token.i - 1]``
        becomes ``doc[-1]`` and silently pairs an utterance-initial noun with the
        utterance-final token.
        """
        if token.i == 0:
            return None
        previous = doc[token.i - 1]
        return previous if previous.pos_ == 'DET' else None

    def det_noun_phrase(self, x):
        """Return 'noun_phrase' if any NOUN directly follows a DET, else None."""
        doc = self._parse(x)
        for token in doc:
            if token.pos_ == 'NOUN' and self._preceding_determiner(doc, token) is not None:
                return 'noun_phrase'
        return None

    def noun_phrase_indef_def(self, x):
        """Classify the utterance by the a/an vs. the determiners preceding its nouns.

        Returns
        -------
        str, list or None
            * ``"definite_&_indefinite_noun_phrases"`` when both kinds occur;
            * ``["noun_phrase_indefinite", "a <noun>"]`` when only a/an occurs
              ('an' is normalised to 'a' in the returned string);
            * ``["noun_phrase_definite", "the <noun>"]`` when only 'the' occurs;
            * ``None`` when neither occurs.

            When several nouns match, the returned string holds the last one.
        """
        doc = self._parse(x)
        indef_str = None
        def_str = None
        for token in doc:
            if token.pos_ != 'NOUN':
                continue
            determiner = self._preceding_determiner(doc, token)
            if determiner is None:
                continue
            if determiner.text in self.INDEFINITE_DETERMINERS:
                indef_str = 'a' + ' ' + token.text
            if determiner.text == 'the':
                def_str = 'the' + ' ' + token.text
        if indef_str is not None and def_str is not None:
            return "definite_&_indefinite_noun_phrases"
        if indef_str is not None:
            return ["noun_phrase_indefinite", indef_str]
        if def_str is not None:
            return ["noun_phrase_definite", def_str]
        return None

    def noun_phrase_possessives(self, x):
        """Detect possessive noun phrases.

        Returns 'noun_phrase_possessive' when a NOUN directly follows a DET whose
        text is a possessive word, 'possessive_noun_phrase' when a token with
        dependency label 'poss' is followed (at or after its position) by a NOUN,
        and None otherwise. Tokens are checked in order; the first hit wins.
        """
        doc = self._parse(x)
        for token in doc:
            if token.pos_ == 'NOUN':
                determiner = self._preceding_determiner(doc, token)
                if determiner is not None and determiner.text in self.POSSESSIVES:
                    return 'noun_phrase_possessive'
            if token.dep_ == 'poss':
                if any(part.pos_ == 'NOUN' for part in doc[token.i:]):
                    return 'possessive_noun_phrase'
        return None

    def noun_phrase_demonstratives(self, x):
        """Return 'noun_phrase_demonstratives' if a NOUN directly follows this/that/these/those, else None."""
        doc = self._parse(x)
        for token in doc:
            if token.pos_ != 'NOUN':
                continue
            determiner = self._preceding_determiner(doc, token)
            if determiner is not None and determiner.text in self.DEMONSTRATIVES:
                return 'noun_phrase_demonstratives'
        return None

    def adj_noun_phrase(self, x):
        """Count adjectives that follow determiners in the utterance.

        For every DET token, the part-of-speech tags from that token to the end
        of the utterance are examined. If that tail contains no NOUN/PRON/PROPN
        the method returns ``['', '']`` immediately. Otherwise every ADJ in the
        tail is added to the running count, so an adjective that follows two
        determiners is counted twice.

        Returns
        -------
        tuple or list
            ``(adj_count, "adj_noun_phrase")`` when at least one adjective was
            counted, else ``['', '']``.
        """
        doc = self._parse(x)
        adj_count = 0
        for i, token in enumerate(doc):
            if token.pos_ != 'DET':
                continue
            tail_tags = [seg.pos_ for seg in doc[i:]]
            if not any(tag in self.NOMINAL_POS for tag in tail_tags):
                return ['', '']
            adj_count += tail_tags.count('ADJ')
        if adj_count > 0:
            return adj_count, "adj_noun_phrase"
        return ['', '']

    def noun_phrase(self, x):
        """Label an utterance that contains at least one NOUN, PROPN or PRON.

        Returns "one_word_noun_phrase" when the utterance has exactly two tokens
        (presumably one word plus terminal punctuation; confirm against the
        transcripts), 'noun_phrase' when it has more than two tokens, and None
        when it has no nominal token or fewer than two tokens.
        """
        doc = self._parse(x)
        if not any(token.pos_ in self.NOMINAL_POS for token in doc):
            return None
        if len(doc) == 2:
            return "one_word_noun_phrase"
        if len(doc) > 2:
            return 'noun_phrase'
        return None
    

# Shared classifier instance; the cells below call it as `ndc`.
ndc = DeterminerNounConstruction()

# Example call on a single utterance; as the cell's last expression its result is displayed.
ndc.noun_phrase_indef_def('where can we find the whale')


In [None]:
def det_use(x):
    """Summarise which nouns occur with 'a', with 'the', and with both.

    Parameters
    ----------
    x : iterable of str
        Two-word phrases of the form ``'<determiner> <noun>'``. A phrase whose
        first word is exactly ``'a'`` counts as indefinite; any other first
        word is counted as 'the'.

    Returns
    -------
    list
        ``[n_both, n_a, n_the, n_types, both_nouns]`` where ``n_both`` is the
        number of distinct nouns seen with both determiners, ``n_a`` / ``n_the``
        are phrase counts (not distinct nouns), ``n_types`` is the number of
        distinct nouns overall, and ``both_nouns`` lists the shared nouns in
        order of first appearance among the 'a' phrases.

    Raises
    ------
    IndexError
        If a phrase contains fewer than two whitespace-separated words.
    """
    a_nouns = []
    the_nouns = []
    for phrase in x:
        words = phrase.split()
        determiner, noun = words[0], words[1]
        if determiner == 'a':
            a_nouns.append(noun)
        else:
            the_nouns.append(noun)

    # Set lookups keep the overlap computation linear instead of quadratic.
    the_types = set(the_nouns)
    both_nouns = []
    already_added = set()
    for noun in a_nouns:
        if noun in the_types and noun not in already_added:
            already_added.add(noun)
            both_nouns.append(noun)

    n_types = len(set(a_nouns) | the_types)
    return [len(both_nouns), len(a_nouns), len(the_nouns), n_types, both_nouns]

In [None]:
# Load the noun-phrase table from a CSV in the working directory.
# NOTE(review): the filename is spelled 'sesssions' (three s's); confirm it matches the file on disk.
df_1_3 = pd.read_csv('sesssions_1_3_noun_phrases.csv')

In [None]:
# Keep only the columns used below. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not write to
# a slice of the loaded table (which triggers SettingWithCopyWarning).
df_1_3 = df_1_3[['subject', 'session', 'p_utts_orig', 'c_utts_orig',
                'parent_noun_phrase', 'parent_noun_phrase_indef_def',
                'parent_noun_phrase_possessives', 'parent_noun_phrase_demonstratives',
                'child_noun_phrase', 'child_noun_phrase_indef_def', 'child_noun_phrase_possessives',
                'child_noun_phrase_demonstratives']].copy()

In [None]:
# Boolean flags derived from the stored parent label: True only for an exact label match.
df_1_3['parent_noun_phrase_indef'] = df_1_3['parent_noun_phrase_indef_def'].eq('noun_phrase_indefinite')

df_1_3['parent_noun_phrase_def'] = df_1_3['parent_noun_phrase_indef_def'].eq('noun_phrase_definite')


def _parent_det_phrase(utterance):
    """Re-parse a parent utterance and return its '<det> <noun>' string, or None.

    noun_phrase_indef_def returns a list only for purely definite or purely
    indefinite utterances. It returns a plain string or None otherwise, and
    indexing those with [1] would yield a stray character or raise TypeError.
    """
    result = ndc.noun_phrase_indef_def(utterance)
    return result[1] if isinstance(result, list) else None


# Only flagged rows are re-parsed; index alignment leaves NaN in all other rows.
df_1_3['parent_indefinite_string'] = df_1_3.loc[df_1_3['parent_noun_phrase_indef'], 'p_utts_orig'].map(_parent_det_phrase)

df_1_3['parent_definite_string'] = df_1_3.loc[df_1_3['parent_noun_phrase_def'], 'p_utts_orig'].map(_parent_det_phrase)

In [None]:
# Boolean flags derived from the stored child label: True only for an exact label match.
df_1_3['child_noun_phrase_indef'] = df_1_3['child_noun_phrase_indef_def'].eq('noun_phrase_indefinite')

df_1_3['child_noun_phrase_def'] = df_1_3['child_noun_phrase_indef_def'].eq('noun_phrase_definite')


def _child_det_phrase(utterance):
    """Re-parse a child utterance and return its '<det> <noun>' string, or None.

    noun_phrase_indef_def returns a list only for purely definite or purely
    indefinite utterances. It returns a plain string or None otherwise, and
    indexing those with [1] would yield a stray character or raise TypeError.
    """
    result = ndc.noun_phrase_indef_def(utterance)
    return result[1] if isinstance(result, list) else None


# Only flagged rows are re-parsed; index alignment leaves NaN in all other rows.
df_1_3['child_indefinite_string'] = df_1_3.loc[df_1_3['child_noun_phrase_indef'], 'c_utts_orig'].map(_child_det_phrase)

df_1_3['child_definite_string'] = df_1_3.loc[df_1_3['child_noun_phrase_def'], 'c_utts_orig'].map(_child_det_phrase)

In [None]:
# One frame per (subject, session) pair, in subj_ids x sessions order.
# Pairs with no matching rows produce empty frames; the summary loop skips them.
dfs = []

for i in subj_ids:
    for s in sessions:
        # Filter once per pair (the original ran the identical filter twice).
        in_group = (df_1_3['subject'] == i) & (df_1_3['session'] == s)
        df = df_1_3[in_group]
        dfs.append(df)

        

In [None]:
# Per-group summary of parent determiner use: counts of indefinite- and
# definite-flagged utterances, plus the share of noun types used with both determiners.
count = 0
the_np_counts = []
a_np_counts = []
overlap_proportion = []
# NOTE: this rebinds `sessions` (previously the list of session numbers) to the
# per-group session labels that the det_df cell reads. Re-run the cell defining
# the session numbers before rebuilding `dfs`.
sessions = []
subject = []
for df in dfs:
    # Groups with fewer than two rows are skipped.
    if len(df) <= 1:
        continue

    det_a_num = int(df['parent_noun_phrase_indef'].sum())
    det_the_num = int(df['parent_noun_phrase_def'].sum())
    a_np_counts.append(det_a_num)
    the_np_counts.append(det_the_num)

    # Every row in a group shares the same subject and session.
    sub = df.subject.tolist()[0]
    session = df.session.tolist()[0]
    subject.append(sub)
    sessions.append(session)
    count += 1

    # Definite phrases first, then indefinite ones; rows without a phrase are dropped.
    det_list = [
        phrase
        for column in ('parent_definite_string', 'parent_indefinite_string')
        for phrase in df[column].fillna('').tolist()
        if phrase != ''
    ]

    overlap_vals = det_use(det_list)

    # overlap = noun types used with both determiners / all noun types.
    # A group with no noun types gets 0. This replaces a bare `except:` that
    # hid the division by zero along with any other error.
    n_both, n_types = overlap_vals[0], overlap_vals[3]
    overlap = n_both / n_types if n_types else 0
    overlap_proportion.append(overlap)
    

        
        

In [None]:
# One row per summarised (subject, session) group, built from the parallel result lists.
det_df = pd.DataFrame(
    data={
        'subject': subject,
        'session': sessions,
        'a_nps_counts': a_np_counts,
        'the_nps_counts': the_np_counts,
        'overlap': overlap_proportion,
    }
)

In [None]:
from scipy import stats

# Split the summary table by session number.
df1, df2, df3 = (det_df[det_df['session'] == number] for number in (1, 2, 3))

# Paired t-test per session: indefinite vs. definite utterance counts for the same groups.
t_test_s1, t_test_s2, t_test_s3 = (
    stats.ttest_rel(per_session['a_nps_counts'], per_session['the_nps_counts'])
    for per_session in (df1, df2, df3)
)