# Creating Our Dataset and Applying Our Transformer

In this notebook, we:
- download friends-corpus
- apply our Transformer, Genderromantic, to it
- analyze the resulting measures

In [2]:
!python3 -m pip install -r requirements.txt

[33mYou are using pip version 19.0.3, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import convokit
convokit

<module 'convokit' from '/home/kcs3au/is4/Cornell-Conversational-Analysis-Toolkit/convokit/__init__.py'>

In [4]:
from convokit import Corpus

# Download locally stored corpus

In [5]:
corpus = Corpus(filename='./datasets/friends-corpus/corpus')

# Use locally defined Transformer

In [6]:
from convokit import Genderromantic
Genderromantic

convokit.genderromantic.genderromantic.Genderromantic

In [7]:
grr = Genderromantic()

In [8]:
transformed_corpus = grr.fit_transform(corpus)

In [9]:
next(transformed_corpus.iter_utterances())

Utterance({'id': 's01_e01_c01_u001', 'user': User([('name', 'Monica Geller')]), 'root': 's01_e01_c01_u001', 'reply_to': None, 'timestamp': None, 'text': "There's nothing to tell! He's just some guy I work with!", 'meta': {'tokens': [['There', "'s", 'nothing', 'to', 'tell', '!'], ['He', "'s", 'just', 'some', 'guy', 'I', 'work', 'with', '!']], 'character_entities': [[], [[0, 1, 'Paul the Wine Guy'], [4, 5, 'Paul the Wine Guy'], [5, 6, 'Monica Geller']]], 'emotion': None, 'caption': None, 'transcript_with_note': None, 'tokens_with_note': None, 'female_about_male': True, 'male_about_female': False, 'contains_romantic': False}})

In [10]:
transformed_corpus.get_utterance('s01_e01_c01_u001')

Utterance({'id': 's01_e01_c01_u001', 'user': User([('name', 'Monica Geller')]), 'root': 's01_e01_c01_u001', 'reply_to': None, 'timestamp': None, 'text': "There's nothing to tell! He's just some guy I work with!", 'meta': {'tokens': [['There', "'s", 'nothing', 'to', 'tell', '!'], ['He', "'s", 'just', 'some', 'guy', 'I', 'work', 'with', '!']], 'character_entities': [[], [[0, 1, 'Paul the Wine Guy'], [4, 5, 'Paul the Wine Guy'], [5, 6, 'Monica Geller']]], 'emotion': None, 'caption': None, 'transcript_with_note': None, 'tokens_with_note': None, 'female_about_male': True, 'male_about_female': False, 'contains_romantic': False}})

# Running Statistics on Utterance-Level

In [11]:
mf=[]
fm=[]
rom=[]
utt_count=0
utterance_ids = transformed_corpus.get_utterance_ids()
for uid in utterance_ids:
    utt=transformed_corpus.get_utterance(uid)
    utt_count=utt_count+1 
    mf2=utt.meta["male_about_female"]
    mf.append(mf2)
    fm2=utt.meta["female_about_male"]
    fm.append(fm2)
    rom2=utt.meta["contains_romantic"]
    rom.append(rom2)
print(utt_count)
counts=[]
x=mf.count(True)
counts.append(x)
xx=mf.count(False)
counts.append(xx)
y=fm.count(True)
counts.append(y)
yy=fm.count(False)
counts.append(yy)
z=rom.count(True)
counts.append(z)
zz=rom.count(False)
counts.append(zz)

print(counts)

61338
[2026, 59312, 3651, 57687, 5279, 56059]


# Running Statistics on Conversation-Level

In [12]:
convo={}
mf=[]
fm=[]
rom=[]
id2=[]
total_utt=0
utt_count=0
utterance_ids = transformed_corpus.get_utterance_ids()
current_scene=utterance_ids[0][:11]
current_scene2=1
for uid in utterance_ids:
    utt=transformed_corpus.get_utterance(uid)
    if uid[:11]==current_scene:
        total_utt=total_utt+1
        mf2=utt.meta["male_about_female"]
        if mf2==True:
            mf.append(mf2)
        fm2=utt.meta["female_about_male"]
        if fm2==True:
            fm.append(fm2)
        rom2=utt.meta["contains_romantic"]
        if rom2==True:    
            rom.append(rom2)
    else:
        current_scene=uid[:11]
        current_scene2=current_scene2+1
        mf=[]
        fm=[]
        rom=[]
        total_utt=0
    convo[current_scene2] = {'total utterances': total_utt, 'male-female': mf, 'female-male': fm, 'romantic': rom}
    print(total_utt)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
0
1
2
3
4
5
6
7
8
0
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
9
10
11
12
13
0
1
0
1
2
3
4
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
0
1
2
3
4
5
6
7
8
9
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

In [17]:
#convo.keys()
print(convo[1])

{'total utterances': 52, 'male-female': [True, True, True, True], 'female-male': [True, True, True, True, True, True, True, True, True], 'romantic': [True, True, True, True, True, True]}


In [19]:
r= [(current_scene2, convo[current_scene2]['total utterances'], len(convo[current_scene2]['romantic']), len(convo[current_scene2]['male-female']), len(convo[current_scene2]['female-male'])) for current_scene2 in convo.keys()]

In [20]:
import pandas as pd
r=pd.DataFrame.from_dict(r)
r.columns=['scene', 'total_utterances', 'romantic', 'male-female', 'female-male']
print(r)

      scene  total_utterances  romantic  male-female  female-male
0         1                52         6            4            9
1         2                55         6            5            9
2         3                 0         0            0            0
3         4                16         0            0            0
4         5                 8         0            0            2
...     ...               ...       ...          ...          ...
3094   3095                19         1            0            0
3095   3096                34         3            0            0
3096   3097                11         2            0            0
3097   3098                20         3            0            0
3098   3099                20         2            0            0

[3099 rows x 5 columns]


# Average Number of Romantic Utterances

In [24]:
def calculate_average(row):
    return ((row['romantic'])/(row['total_utterances']))*100

r.apply(calculate_average, axis=1)
r['avg_rom'] = r.apply(calculate_average, axis=1)

avg_rom=[]
x= r['avg_rom'].mean()
avg_rom.append(x)

y= r['avg_rom'].min()
avg_rom.append(y)

z= r['avg_rom'].max()
avg_rom.append(z)


print(avg_rom)

[8.924339681201685, 0.0, 100.0]


  


In [25]:
print(r['romantic'])

0       6
1       6
2       0
3       0
4       0
       ..
3094    1
3095    3
3096    2
3097    3
3098    2
Name: romantic, Length: 3099, dtype: int64


In [26]:
#Female 

#Total number of female-male only conversations
f_all = len(r[(r['male-female']==0) & (r['female-male']>=1)]) # (255)
print(f_all)
#Total number of female-male only conversations with no romantic 
f_nr = len(r[(r['male-female']==0) & (r['female-male']>=1) & (r['romantic']==0)])
print(f_nr)
#Total number of female-male only conversations with some romantic
f_r = len(r[(r['male-female']==0) & (r['female-male']>=1) & (r['romantic']>=1)])
print(f_r)

#Male

#Total number of male-female only conversations
f_all = len(r[(r['female-male']==0) & (r['male-female']>=1)]) # (255)
print(f_all)
#Total number of male-female only conversations with no romantic 
f_nr = len(r[(r['female-male']==0) & (r['male-female']>=1) & (r['romantic']==0)])
print(f_nr)
#Total number of male-female only conversations with some romantic
f_r = len(r[(r['female-male']==0) & (r['male-female']>=1) & (r['romantic']>=1)])
print(f_r)


413
155
258
116
51
65
