# Character metadata

In [102]:
import pandas as pd

characters_columns = [
    "character_id",
    "character_name",
    "movie_id",
    "movie_title",
    "gender",
    "position",
]
characters_metadata = pd.read_csv(
    "./data/raw/cornell movie-dialogs corpus/movie_characters_metadata.txt",
    error_bad_lines=False,
    header=None,
    names=characters_columns,
    sep=" \+\+\+\$\+\+\+ ",
    encoding="latin-1",
)
characters_metadata.gender = characters_metadata.gender.str.lower()
characters_metadata_with_gender = characters_metadata[
    characters_metadata.gender.str.lower().isin(["m", "f"])
]
characters_metadata_with_gender_and_position = characters_metadata_with_gender[
    characters_metadata_with_gender.position != "?"
]
characters_metadata_with_gender_and_position.position = (
    characters_metadata_with_gender_and_position.position.astype(int)
)





The error_bad_lines argument has been deprecated and will be removed in a future version.





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [103]:
import plotly_express as px
px.histogram(characters_metadata_with_gender, x="gender")

In [104]:
px.histogram(characters_metadata_with_gender_and_position[characters_metadata_with_gender_and_position.position < 100],
             x="position" , color="gender", barmode="stack")

In [79]:
px.histogram(characters_metadata_with_gender_and_position[characters_metadata_with_gender_and_position.position < 100],
             x="position" , color="gender", barmode="stack", barnorm="percent", nbins=50)

In [23]:
len(characters_metadata_with_gender_and_position)

2661

# Movie metadata

In [59]:
movie_columns = [
    "movie_id",
    "movie_title",
    "movie_year",
    "imdb_rating",
    "imdb_votes",
    "genres",
]

movie_metadata = pd.read_csv(
    "./data/raw/cornell movie-dialogs corpus/movie_titles_metadata.txt",
    error_bad_lines=False,
    header=None,
    names=movie_columns,
    sep=" \+\+\+\$\+\+\+ ",
    encoding="latin-1",
)
movie_metadata.movie_year = movie_metadata.movie_year.str.replace("/I", "").astype(int)
movie_metadata.genres = movie_metadata.genres.apply(eval)




The error_bad_lines argument has been deprecated and will be removed in a future version.





In [60]:
px.histogram(movie_metadata, x="movie_year")

In [61]:
px.histogram(movie_metadata, x="imdb_rating")

In [62]:
px.histogram(movie_metadata, x="imdb_votes")

# Metadata

In [105]:
metadata = characters_metadata.merge(movie_metadata)

In [64]:
px.histogram(metadata[metadata.gender != "?"], x="movie_year", color="gender")

In [78]:
fig = px.histogram(metadata[metadata.gender != "?"], x="movie_year", color="gender", barmode="stack", barnorm="percent", nbins=50)
fig.add_hline(y=50)

=> no clear trend between year and gender representativity

In [77]:
fig = px.histogram(metadata[metadata.gender != "?"], x="imdb_rating", color="gender", barmode="stack", barnorm="percent", nbins=50)
fig.add_hline(y=50)

In [72]:
px.histogram(metadata.explode("genres"), x="genres").update_xaxes(categoryorder="total descending")

In [76]:
fig = px.histogram(metadata[metadata.gender != "?"].explode("genres"), x="genres", color="gender", barmode="stack", barnorm="percent")
fig.add_hline(y=50)

# Conversations

In [81]:
lines_columns = ["line_id", "character_id", "movie_id", "character_name", "text"]

lines = pd.read_csv(
    "./data/raw/cornell movie-dialogs corpus/movie_lines.txt",
    error_bad_lines=False,
    header=None,
    names=lines_columns,
    sep=" \+\+\+\$\+\+\+ ",
    encoding="latin-1",
)




The error_bad_lines argument has been deprecated and will be removed in a future version.





In [97]:
lines

Unnamed: 0,line_id,character_id,movie_id,character_name,text
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.
...,...,...,...,...,...
304708,L666371,u9030,m616,DURNFORD,Lord Chelmsford seems to want me to stay back ...
304709,L666370,u9034,m616,VEREKER,I'm to take the Sikali with the main column to...
304710,L666369,u9030,m616,DURNFORD,"Your orders, Mr Vereker?"
304711,L666257,u9030,m616,DURNFORD,"Good ones, yes, Mr Vereker. Gentlemen who can ..."


In [88]:
conversations_columns = [
    "first_character_id",
    "second_character_id",
    "movie_id",
    "text_order",
]

conversations = pd.read_csv(
    "./data/raw/cornell movie-dialogs corpus/movie_conversations.txt",
    error_bad_lines=False,
    header=None,
    names=conversations_columns,
    sep=" \+\+\+\$\+\+\+ ",
    encoding="latin-1",
)
conversations.text_order = conversations.text_order.apply(eval)





The error_bad_lines argument has been deprecated and will be removed in a future version.





In [109]:
conversations_flat = conversations.explode("text_order").rename_axis('conversation_id').reset_index()

In [110]:
corpus = conversations_flat.merge(lines, left_on="text_order", right_on="line_id").merge(characters_metadata_with_gender, left_on="character_id", right_on="character_id")

In [111]:
corpus.head()

Unnamed: 0,conversation_id,first_character_id,second_character_id,movie_id_x,text_order,line_id,character_id,movie_id_y,character_name_x,text,character_name_y,movie_id,movie_title,gender,position
0,0,u0,u2,m0,L194,L194,u0,m0,BIANCA,Can we make this quick? Roxanne Korrine and A...,BIANCA,m0,10 things i hate about you,f,4
1,0,u0,u2,m0,L196,L196,u0,m0,BIANCA,Not the hacking and gagging and spitting part....,BIANCA,m0,10 things i hate about you,f,4
2,1,u0,u2,m0,L198,L198,u0,m0,BIANCA,You're asking me out. That's so cute. What's ...,BIANCA,m0,10 things i hate about you,f,4
3,2,u0,u2,m0,L200,L200,u0,m0,BIANCA,"No, no, it's my fault -- we didn't have a prop...",BIANCA,m0,10 things i hate about you,f,4
4,2,u0,u2,m0,L202,L202,u0,m0,BIANCA,"The thing is, Cameron -- I'm at the mercy of a...",BIANCA,m0,10 things i hate about you,f,4


In [122]:
corpus.groupby(["movie_id", "conversation_id"]).gender.nunique()

movie_id  conversation_id
m0        0                  2
          1                  2
          2                  2
          3                  2
          4                  2
                            ..
m99       13647              2
          13648              1
          13649              1
          13650              1
          13651              1
Name: gender, Length: 76535, dtype: int64