# Creating Our Dataset and Applying Our Transformer

In this notebook, we:
- download switch-board
- apply updated Transformer, Genderromantic2, to it
- analyze the resulting measures

In [1]:
!python3 -m pip install -r requirements.txt



In [2]:
import convokit
convokit

<module 'convokit' from '/Users/emilytseng/Cornell-Conversational-Analysis-Toolkit/convokit/__init__.py'>

In [3]:
from convokit import Corpus

# Download locally stored corpus

In [4]:
corpus = Corpus(filename='./datasets/switchboard-corpus/corpus')

# Use locally defined Transformer

In [5]:
from convokit import Genderromantic2
Genderromantic2

convokit.genderromantic2.genderromantic2.Genderromantic2

In [6]:
grr = Genderromantic2()

In [7]:
transformed_corpus = grr.fit_transform(corpus)

In [8]:
next(transformed_corpus.iter_utterances())

Utterance({'id': '4325-0', 'user': User([('name', '1632')]), 'root': '4325-0', 'reply_to': None, 'timestamp': None, 'text': 'Okay.  /', 'meta': {'tag': 'o', 'pos': 'Okay/UH ./.', 'trees': [[['Okay'], ['.'], ['E_S']]], 'female_about_male': False, 'male_about_female': False, 'contains_romantic': False}})

In [11]:
transformed_corpus.get_utterance('4325-0').user.meta

{'sex': 'FEMALE',
 'education': 2,
 'birth_year': 1962,
 'dialect_area': 'WESTERN'}

# Running Statistics on Utterance-Level

In [17]:
male_speaking = 0
male_speaking_romantic = 0
male_speaking_about_female = 0
male_speaking_about_female_romantic = 0

male_speaking_not_about_female = 0

female_speaking = 0
female_speaking_romantic = 0
female_speaking_about_male = 0
female_speaking_about_male_romantic = 0

female_speaking_not_about_male = 0

romantic = 0
not_romantic = 0


utterance_ids = transformed_corpus.get_utterance_ids()

for uid in utterance_ids:
    utt=transformed_corpus.get_utterance(uid)
    
    # First get whether it's a male or female speaker
    speaker_gender = utt.user.meta['sex']
    if speaker_gender == "MALE":
        male_speaking += 1
        # Then check whether he's saying something romantic
        if utt.meta['contains_romantic']:
            male_speaking_romantic += 1
    if speaker_gender == "FEMALE":
        female_speaking += 1
        # Then check whether she's saying something romantic
        if utt.meta['contains_romantic']:
            female_speaking_romantic += 1
            
    # Then get whether the utterance is a male speaking about a female:
    mf2=utt.meta["male_about_female"]
    if mf2:
        male_speaking_about_female += 1
        # And whether it was romantic
        if utt.meta["contains_romantic"]:
            male_speaking_about_female_romantic += 1
    else:
        male_speaking_not_about_female += 1
        
    # Then get whether the utterance is a female speaking about a male:
    fm2=utt.meta["female_about_male"]
    if fm2:
        female_speaking_about_male += 1
        # And whether it was romantic
        if utt.meta["contains_romantic"]:
            female_speaking_about_male_romantic += 1
    else:
        female_speaking_not_about_male += 1
        
    # Then register whether the utt is romantic, period.
    rom=utt.meta["contains_romantic"]
    if rom:
        romantic += 1
    else:
        not_romantic += 1

print('male_speaking: ', male_speaking)
print('pct male utterances that were romantic: ', float(male_speaking_romantic) / float(male_speaking))
print('male_speaking_about_female: ', male_speaking_about_female)
print('male_speaking_about_female_romantic: ', male_speaking_about_female_romantic)

# print('pct male utterances about females', float(male_speaking_about_female) / float(male_speaking))
# print('pct male utterances about females that are romantic', float(male_speaking_about_female_romantic) / float(male_speaking_about_female))

print('male_speaking_not_about_female: ', male_speaking_not_about_female)
print('\n')
print('female_speaking: ', female_speaking)
print('pct female utterances that were romantic: ', float(female_speaking_romantic) / float(female_speaking))
print('female_speaking_about_male: ', female_speaking_about_male)
print('female_speaking_about_male_romantic: ', female_speaking_about_male_romantic)

# print('pct female utterances about males', float(female_speaking_about_male) / float(female_speaking))
# print('pct female utterances about males that are romantic', float(female_speaking_about_male_romantic) / float(female_speaking_about_male))

print('female_speaking_not_about_male: ', female_speaking_not_about_male)
print('\n')
print('romantic: ', romantic)
print('pct romantic: ', float(romantic) / float(len(utterance_ids)))
print('not_romantic: ', not_romantic)

male_speaking:  92248
pct male utterances that were romantic:  0.027588674009192612
male_speaking_about_female:  0
male_speaking_about_female_romantic:  0
male_speaking_not_about_female:  221616


female_speaking:  129368
pct female utterances that were romantic:  0.03471492177354524
female_speaking_about_male:  0
female_speaking_about_male_romantic:  0
female_speaking_not_about_male:  221616


romantic:  7036
pct romantic:  0.03174861020864919
not_romantic:  214580


In [18]:
len(utterance_ids)

221616

# Running Statistics on Conversation-Level

In [197]:
convo={}
mf=[]
fm=[]
rom=[]
id2=[]
total_utt=0
utt_count=0
utterance_ids = transformed_corpus.get_utterance_ids()
current_scene=utterance_ids[0][:11]
current_scene2=1
for uid in utterance_ids:
    utt=transformed_corpus.get_utterance(uid)
    if uid[:11]==current_scene:
        total_utt=total_utt+1
        mf2=utt.meta["male_about_female"]
        if mf2==True:
            mf.append(mf2)
        fm2=utt.meta["female_about_male"]
        if fm2==True:
            fm.append(fm2)
        rom2=utt.meta["contains_romantic"]
        if rom2==True:    
            rom.append(rom2)
    else:
        current_scene=uid[:11]
        current_scene2=current_scene2+1
        mf=[]
        fm=[]
        rom=[]
        total_utt=0
    convo[current_scene2] = {'total utterances': total_utt, 'male-female': mf, 'female-male': fm, 'romantic': rom}
    

In [185]:
#convo.keys()

In [212]:
r= [(current_scene2, total_utt, len(convo[current_scene2]['romantic']), len(convo[current_scene2]['male-female']), len(convo[current_scene2]['female-male'])) for current_scene2 in convo.keys()]

In [213]:
import pandas as pd
r=pd.DataFrame.from_dict(r)
r.columns=['scene', 'total_utterances', 'romantic', 'male-female', 'female-male']
print(r)

      scene  total_utterances  romantic  male-female  female-male
0         1                20         6            4            9
1         2                20         6            5            9
2         3                20         0            0            0
3         4                20         0            0            0
4         5                20         0            0            2
...     ...               ...       ...          ...          ...
3094   3095                20         1            0            0
3095   3096                20         3            0            0
3096   3097                20         2            0            0
3097   3098                20         3            0            0
3098   3099                20         2            0            0

[3099 rows x 5 columns]


# Average Number of Romantic Utterances

In [214]:
def calculate_average(row):
    return ((row['romantic'])/(row['total_utterances']))*100

r.apply(calculate_average, axis=1)
r['avg_rom'] = r.apply(calculate_average, axis=1)

avg_rom=[]
x= r['avg_rom'].mean()
avg_rom.append(x)

y= r['avg_rom'].min()
avg_rom.append(y)

z= r['avg_rom'].max()
avg_rom.append(z)


print(avg_rom)

[8.092933204259438, 0.0, 170.0]


In [215]:
#Female 

#Total number of female-male only conversations
f_all = len(r[(r['male-female']==0) & (r['female-male']>=1)]) # (255)
print(f_all)
#Total number of female-male only conversations with no romantic 
f_nr = len(r[(r['male-female']==0) & (r['female-male']>=1)] & (r['romantic']==0))
print(f_nr)
#Total number of female-male only conversations with some romantic
f_r = len(r[(r['male-female']==0) & (r['female-male']>=1)] & (r['romantic']>=1))
print(f_r)

#Male

#Total number of male-female only conversations
f_all = len(r[(r['female-male']==0) & (r['male-female']>=1)]) # (255)
print(f_all)
#Total number of male-female only conversations with no romantic 
f_nr = len(r[(r['female-male']==0) & (r['male-female']>=1)] & (r['romantic']==0))
print(f_nr)
#Total number of male-female only conversations with some romantic
f_r = len(r[(r['female-male']==0) & (r['male-female']>=1)] & (r['romantic']>=1))
print(f_r)


413


TypeError: cannot compare a dtyped [float64] array with a scalar of type [bool]