In [3]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import json 
import os
import random
import sys
import string
from sklearn.feature_extraction.text import TfidfTransformer
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
sns.set_theme(style="whitegrid")
import sys

In [4]:
# Root folder of the BNC-2014 two-speaker analysis data.
# Set the EXP_REP_ANALYSIS_DIR environment variable to run the notebook on
# another machine; the original local path is kept as the fallback.
analysis_data_dir = os.environ.get(
    'EXP_REP_ANALYSIS_DIR',
    '/Users/mario/code/exp-rep/data/BNC-2014/two-speakers/analysis/',
)
# Later cells build paths by plain string concatenation, so make sure the
# folder always ends with a path separator.
analysis_data_dir = os.path.join(analysis_data_dir, '')

# Folder holding the dialign lexica; the 'nopos/' and 'pos/' subfolders are
# appended where the lexica are loaded.
dialign_output = analysis_data_dir + 'dialign-output/'

In [5]:
# Load the dialign lexica computed without POS tags. Both dictionaries are
# keyed by dialogue id, i.e. the part of the file name before the first '_'.
#   shared_lexica: rows of the '*_tsv-lexicon.tsv' files
#   self_lexica:   rows of the '*_tsv-lexicon-self-rep-A.tsv' and
#                  '*_tsv-lexicon-self-rep-B.tsv' files, concatenated
nopos_dir = os.path.join(dialign_output, 'nopos')
nopos_shared_suffix = '_tsv-lexicon.tsv'
nopos_self_suffixes = ('_tsv-lexicon-self-rep-A.tsv', '_tsv-lexicon-self-rep-B.tsv')

nopos_shared_frames = defaultdict(list)
nopos_self_frames = defaultdict(list)

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the concatenated frames and the dict insertion order reproducible.
for f in sorted(os.listdir(nopos_dir)):
    if f.startswith('.'):
        continue  # hidden files
    if f.endswith(nopos_shared_suffix):
        bucket = nopos_shared_frames
    elif f.endswith(nopos_self_suffixes):
        bucket = nopos_self_frames
    else:
        continue
    dial_id = f.split('_')[0]
    bucket[dial_id].append(pd.read_csv(os.path.join(nopos_dir, f), sep='\t', header=0))

# One concatenation per dialogue; the original row indices are kept as they are.
shared_lexica = {dial_id: pd.concat(parts) for dial_id, parts in nopos_shared_frames.items()}
self_lexica = {dial_id: pd.concat(parts) for dial_id, parts in nopos_self_frames.items()}

print(len(shared_lexica), len(self_lexica))

187 187


In [6]:
# Load the dialign lexica computed on POS-tagged input. Both dictionaries are
# keyed by dialogue id, i.e. the part of the file name before the first '_'.
#   shared_lexica_pos: rows of the '*_tsv-lexicon.tsv' files
#   self_lexica_pos:   rows of the '*_tsv-lexicon-self-rep-A.tsv' and
#                      '*_tsv-lexicon-self-rep-B.tsv' files, concatenated
pos_dir = os.path.join(dialign_output, 'pos')
pos_shared_suffix = '_tsv-lexicon.tsv'
pos_self_suffixes = ('_tsv-lexicon-self-rep-A.tsv', '_tsv-lexicon-self-rep-B.tsv')

pos_shared_frames = defaultdict(list)
pos_self_frames = defaultdict(list)

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the concatenated frames and the dict insertion order reproducible.
for f in sorted(os.listdir(pos_dir)):
    if f.startswith('.'):
        continue  # hidden files
    if f.endswith(pos_shared_suffix):
        bucket = pos_shared_frames
    elif f.endswith(pos_self_suffixes):
        bucket = pos_self_frames
    else:
        continue
    dial_id = f.split('_')[0]
    bucket[dial_id].append(pd.read_csv(os.path.join(pos_dir, f), sep='\t', header=0))

# One concatenation per dialogue; the original row indices are kept as they are.
shared_lexica_pos = {dial_id: pd.concat(parts) for dial_id, parts in pos_shared_frames.items()}
self_lexica_pos = {dial_id: pd.concat(parts) for dial_id, parts in pos_self_frames.items()}

print(len(shared_lexica_pos), len(self_lexica_pos))



187 187


In [7]:
# Load the dialogue transcripts. As used further down, contexts[dial_id][turn_id]
# is a three-element entry whose last element is the utterance text.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale.
with open(analysis_data_dir + 'contexts.json', 'r', encoding='utf-8') as f:
    contexts = json.load(f)



In [6]:
# Preview the first three rows of the POS-tagged shared lexicon of dialogue SJV7.
shared_lexica_pos['SJV7'].head(3)

Unnamed: 0,Freq.,Free Freq.,Size,Surface Form,Establishment turn,Spanning,Priming,First Speaker,Turns
0,4,4,7,you#PRON know#VERB what#PRON i#PRON mean#VERB ...,1096,892,3,S0530,"205, 356, 599, 1096"
1,2,2,7,i#PRON du#VERB n#ADV no#VERB if#CONJ you#PRON ...,643,439,1,S0530,"205, 643"
2,3,3,6,i#PRON was#VERB like#ADV oh#INTERJ my#PRON god...,815,25,1,S0530,"798, 815, 822"


In [8]:
def topical_or_referential(word_seq, pos_seq):
    """Decide whether a construction counts as topical/referential.

    A construction qualifies when at least one of its tokens is tagged
    'SUBST' and that token's word is not one of the generic nouns listed
    below. Constructions without any 'SUBST' token, or whose 'SUBST' tokens
    are all generic, do not qualify.

    Args:
        word_seq: the words of the construction, one per token.
        pos_seq: the POS tags of the construction, aligned with word_seq.

    Returns:
        True if the construction contains a non-generic 'SUBST' token,
        False otherwise.

    Raises:
        AssertionError: if word_seq and pos_seq differ in length.
    """
    assert len(word_seq) == len(pos_seq), (word_seq, pos_seq)

    generic_nouns = 'bit bunch fact god middle ones part rest side sort sorts stuff thanks loads lot lots kind kinds time times way ways problem problems thing things idea ideas reason reasons day days week weeks year years'.split(' ')

    # A single specific (non-generic) noun is enough.
    return any(
        tag == 'SUBST' and word not in generic_nouns
        for word, tag in zip(word_seq, pos_seq)
    )


def more_than_half_filled_pauses(construction):
    """Check whether filled pauses make up half or more of a construction.

    The construction is split on single spaces and every token equal to one
    of 'huh', 'uh', 'erm', 'hm', 'mm', 'er' is counted as a filled pause.

    NOTE(review): despite the function name, the comparison is inclusive, so a
    construction in which exactly half of the tokens are filled pauses also
    returns True. Confirm that this is the intended threshold.

    Args:
        construction: the construction as a space-separated string.

    Returns:
        True if the number of filled pauses is at least half the number of
        tokens, False otherwise.
    """
    filled_pauses = ('huh', 'uh', 'erm', 'hm', 'mm', 'er')
    tokens = construction.split(' ')
    n_filled = sum(1 for token in tokens if token in filled_pauses)
    return n_filled >= len(tokens) / 2
    

In [9]:
# Sanity check: a construction consisting only of filled pauses.
more_than_half_filled_pauses('mm mm mm')

True

In [11]:
# Build two lookup tables from the POS-tagged lexica. Both map a construction,
# written as its words concatenated without spaces, to its POS tag sequence:
#   pos_tagged_constructions_topical: constructions judged topical/referential
#   pos_tagged_constructions:         all remaining constructions
# If the same key occurs more than once, the last occurrence wins.
pos_tagged_constructions = {}
pos_tagged_constructions_topical = {}

for d_id in shared_lexica_pos:
    lexicon_df = pd.concat((shared_lexica_pos[d_id], self_lexica_pos[d_id]))
    for constr in lexicon_df['Surface Form']:
        # Skip cells that do not hold a string.
        if not isinstance(constr, str):
            continue

        # Re-attach a '?' that is separated by a space from its '#STOP' tag,
        # so that it is split as a single 'word#TAG' token below.
        constr = constr.replace('? #STOP', '?#STOP').strip()

        w_seq = []
        pos_seq = []
        illegal_constr = False
        for token in constr.split(' '):
            parts = token.split('#')
            if len(parts) != 2:
                # Not of the form 'word#TAG'. Stop right away instead of
                # appending a word/tag left over from a previous token.
                illegal_constr = True
                break
            w, tag = parts
            w_seq.append(w)
            pos_seq.append(tag)

        if illegal_constr:
            print('Illegal construction:', constr)
            continue   # only exception is: "made . com#SUBST"

        concat_tokens = ''.join(w_seq)

        # Referential or topical constructions?
        if topical_or_referential(w_seq, pos_seq):
            pos_tagged_constructions_topical[concat_tokens] = pos_seq
        else:
            pos_tagged_constructions[concat_tokens] = pos_seq

Illegal construction: made . com#SUBST


In [12]:
# Inspect the entry stored for turn 205 of dialogue SJV7.
contexts['SJV7']['205']

['B',
 'S0530',
 "whereas if you're kind of like i dunno if you're just like helping yourself like if you're like at a healthy weight and you're like stuff it's just like do you know what i mean ?"]

In [13]:
def find_subsequence(subsequence, sequence):
    """Find every delimited occurrence of `subsequence` inside `sequence`.

    An occurrence only counts when it is delimited on both sides: it must
    start at the very beginning of `sequence` or right after one of the
    characters in " ',.!:;?", and it must end at the very end of `sequence`
    or right before one of those characters. Occurrences embedded in a longer
    word (e.g. "you" inside "your") are therefore ignored.

    Args:
        subsequence: the text to look for.
        sequence: the text to search in.

    Returns:
        A list of (start, end) index pairs, end exclusive, in order of
        appearance.

    Raises:
        TypeError: if `subsequence` has no length (e.g. None or NaN).
            Previously this case was only printed and the function then
            failed on an undefined local variable.
    """
    try:
        length = len(subsequence)
    except TypeError:
        raise TypeError(
            'subsequence must be a string, got {!r}'.format(subsequence)
        )

    boundary_chars = " ',.!:;?"
    ranges = []
    for start in range(len(sequence)):
        end = start + length
        if sequence[start:end] != subsequence:
            continue
        # The edges of the sequence count as boundaries.
        bounded_left = start == 0 or sequence[start - 1] in boundary_chars
        bounded_right = end >= len(sequence) or sequence[end] in boundary_chars
        if bounded_left and bounded_right:
            ranges.append((start, end))
    return ranges


In [14]:
# Example: the construction opens the utterance, so the start of the string
# counts as its left boundary.
ss = "and i was just like"
s = "and i was just like oh my god"
find_subsequence(ss, s)

[(0, 19)]

In [15]:
# Example: only the first occurrence is reported. The second "bad for you" is
# followed by "r" (it is part of "your"), which is not a boundary character.
ss = "bad for you"
s = "yeah it is bad for you bad for your teeth"
find_subsequence(ss, s)

[(11, 22)]

In [16]:
# Select the constructions to analyse and collect their properties.
#   _data[dialogue id][construction]         -> dict of properties
#   corpus_counts[dialogue id][construction] -> frequency in the dialogue
corpus_counts = defaultdict(dict)
_data = defaultdict(dict)

# One flag per construction that passed conditions 1-3:
# 1 if a POS-tagged equivalent was found, 0 if not.
cnt = []
for d_id in tqdm(shared_lexica):
    lexicon_df = pd.concat((shared_lexica[d_id], self_lexica[d_id]))
    dialogue = contexts[d_id]

    for _, row in lexicon_df.iterrows():
        constr = row['Surface Form']
        if not isinstance(constr, str):
            continue
        constr = constr.strip()

        turns = row['Turns'].split(', ')

        # Recount the delimited occurrences of the construction in the turns
        # listed for it; this count is used instead of the 'Freq.' column.
        _freq = sum(
            len(find_subsequence(constr, text))
            for _, _, text in (dialogue[turn] for turn in turns)
        )
        assert _freq >= row['Freq.']

        # Condition 1: at least 3 tokens long
        # Condition 2: frequency >= 3 in the dialogue
        # Condition 3: free form frequency >= 2 in the dialogue
        if row['Size'] < 3 or _freq < 3 or row['Free Freq.'] < 2:
            continue

        # Look up the POS sequence through the space-free form of the construction.
        concat_tokens = constr.replace(' ', '')
        if concat_tokens in pos_tagged_constructions_topical:
            topical = True
            _pos_seq = pos_tagged_constructions_topical[concat_tokens]
        elif concat_tokens in pos_tagged_constructions:
            topical = False
            _pos_seq = pos_tagged_constructions[concat_tokens]
        else:
            # Skip constructions for which we find no POS-tagged equivalent
            cnt.append(0)
            continue
        cnt.append(1)

        # Condition 4: no punctuation in the construction
        # Condition 5: at least half of the construction should not correspond to filled pauses
        if "STOP" in _pos_seq or more_than_half_filled_pauses(constr):
            continue

        _data[d_id][constr] = {
            'Frequency': _freq,
            'Free frequency': row['Free Freq.'],
            'Length': row['Size'],
            'POS sequence': _pos_seq,
            'First speaker': row['First Speaker'],
            'Turns': turns,
            'Spanning turns': row['Spanning'],
            'Establishment turn': row['Establishment turn'],
            'Topical': topical,
        }
        corpus_counts[d_id][constr] = _freq


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for d_id in tqdm(shared_lexica):


  0%|          | 0/187 [00:00<?, ?it/s]

In [17]:
# Report how many of the constructions that passed the length and frequency
# filters had no POS-tagged equivalent.
n_skipped = cnt.count(0)
print('Skipped {} out of {} constructions ({:.2f}%) as we find no POS-tagged equivalent.'.format(
    n_skipped,
    len(cnt),
    n_skipped / len(cnt) * 100
))

Skipped 139 out of 19469 constructions (0.71%) as we find no POS-tagged equivalent.


In [18]:
# Construction tokens per utterance:
#   c_data[dialogue id][turn id] -> {'Topical': n, 'Non-topical': n}
# Every turn listed for a construction is credited once with the full length
# of that construction.
c_data = defaultdict(lambda: defaultdict(lambda: {'Topical': 0, 'Non-topical': 0}))

for dial_id, constructions in _data.items():
    for info in constructions.values():
        label = 'Topical' if info['Topical'] else 'Non-topical'
        for turn_id in info['Turns']:
            c_data[dial_id][turn_id][label] += info['Length']


In [19]:
# One row per utterance: its position in the dialogue and how many of its
# tokens are covered by topical / non-topical constructions, both as counts
# and as proportions of the utterance's whitespace-separated tokens.
props = []

for dial_id, dialogue in contexts.items():
    n_utts = len(dialogue)
    for utt_idx, (_, _, utt) in dialogue.items():
        total_tokens = len(utt.split())

        counts = c_data[dial_id][utt_idx]
        n_topical = counts['Topical']
        n_non_topical = counts['Non-topical']
        n_constr = n_topical + n_non_topical
        non_constr = total_tokens - n_topical - n_non_topical

        # Relative position of the utterance within its dialogue.
        position = int(utt_idx) / n_utts

        props.append((
            dial_id,
            int(utt_idx),
            position,
            position < 0.5,   # True for the first half of the dialogue
            n_topical,
            n_non_topical,
            n_constr,
            non_constr,
            n_topical / total_tokens,
            n_non_topical / total_tokens,
            n_constr / total_tokens,
            non_constr / total_tokens,
        ))

prop_df = pd.DataFrame(props, columns=['dial_id', 'uttID', 'uttIDprop', 'bin', 'nTopical', 'nNonTopical', 'nConstr', 'nRest', 'propTopical', 'propNonTopical', 'propConstr', 'propRest'])

In [20]:
# Write the per-utterance proportions table to 'proportions.csv' in the
# current working directory.
prop_df.to_csv('proportions.csv')