## The New Bechdel test!
Analysis on the new Bechdel test using the Cornell Movie-Dialog Corpus

In [69]:
import csv
import pandas as pd
import re
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from unidecode import unidecode

In [12]:
# Import all the datasets first
# Movie Titles Metadata.txt
mtm = 'cornell movie-dialogs corpus/movie_titles_metadata.txt'
# CSV for transforming the data
mtm_csv = 'mtm_csv.csv'

In [13]:
# Add header rows
with open(mtm_csv, 'wt') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames = ["Movie_Number", "Movie_Name", "Year_of_Release", 
                                                   "IMDB_Rating", "Number_of_Votes", "Genres"])
    writer.writeheader()

In [14]:
# Open the .txt file and write it onto the dataframe
with open(mtm, 'r') as mtmeta:
    for lines in mtmeta:
        text = str(lines)
        line_list = text.split('+++$+++')
        
        with open(mtm_csv, 'a') as write_csv:
            writer = csv.writer(write_csv)
            writer.writerow([line_list[0].strip(), line_list[1].strip(), line_list[2].strip(),
                            line_list[3].strip(), line_list[4].strip(), line_list[5].strip()])

In [15]:
# Check the content of the csv file
mtm_df = pd.read_csv(mtm_csv, index_col = False)
print(mtm_df.shape)
mtm_df.head(5)

(617, 6)


Unnamed: 0,Movie_Number,Movie_Name,Year_of_Release,IMDB_Rating,Number_of_Votes,Genres
0,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
1,m1,1492: conquest of paradise,1992,6.2,10421,"['adventure', 'biography', 'drama', 'history']"
2,m2,15 minutes,2001,6.1,25854,"['action', 'crime', 'drama', 'thriller']"
3,m3,2001: a space odyssey,1968,8.4,163227,"['adventure', 'mystery', 'sci-fi']"
4,m4,48 hrs.,1982,6.9,22289,"['action', 'comedy', 'crime', 'drama', 'thrill..."


In [16]:
mtm_df.to_csv(mtm_csv, sep=',', index = False)
print("Saved to csv..")

Saved to csv..


In [17]:
# Movie Characters Metadata.txt
mcm = 'cornell movie-dialogs corpus/movie_characters_metadata.txt'
# CSV for transforming the data
mcm_csv = 'mcm_csv.csv'

In [18]:
# Add header rows
with open(mcm_csv, 'wt') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames = ["Character_Id", "Character_Name", "Movie_Number", 
                                                   "Movie_Title", "Gender", "Position_in_Credits"])
    writer.writeheader()

In [19]:
# Open the second .txt file and write it onto the next dataframe
with open(mcm, 'r') as mcmeta:
    for lines in mcmeta:
        text = str(lines)
        line_list = text.split('+++$+++')

        with open(mcm_csv, 'a') as write_csv:
            writer = csv.writer(write_csv)
            writer.writerow([line_list[0].strip(), line_list[1].strip(), line_list[2].strip(),
                            line_list[3].strip(), line_list[4].strip(), line_list[5].strip()])

In [20]:
mcm_df = pd.read_csv(mcm_csv)
print(mcm_df.shape)
mcm_df.head(5)

(9035, 6)


Unnamed: 0,Character_Id,Character_Name,Movie_Number,Movie_Title,Gender,Position_in_Credits
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6


In [21]:
mcm_df.to_csv(mcm_csv, sep=',', index = False)
print("Saved to csv..")

Saved to csv..


In [22]:
# Movie Characters Metadata.txt
ml = 'cornell movie-dialogs corpus/movie_lines.txt'
# CSV for transforming the data
ml_csv = 'ml_csv.csv'

In [23]:
# Add header rows
with open(ml_csv, 'wt') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames = ["Line_Id", "Character_Id", "Movie_Number", 
                                                   "Character_Name", "Dialogue"])
    writer.writeheader()

In [24]:
# Open the second .txt file and write it onto the next dataframe
with open(ml, 'r') as mlines:
    for lines in mlines:
        text = str(lines)
        line_list = text.split('+++$+++')

        with open(ml_csv, 'a') as write_csv:
            writer = csv.writer(write_csv)
            writer.writerow([line_list[0].strip(), line_list[1].strip(), line_list[2].strip(),
                            line_list[3].strip(), line_list[4].strip()])

In [25]:
ml_df = pd.read_csv('ml_csv.csv')
print(ml_df.shape)
ml_df.head(5)

(304713, 5)


Unnamed: 0,Line_Id,Character_Id,Movie_Number,Character_Name,Dialogue
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [26]:
ml_df.to_csv(ml_csv, sep = ',', index = False)
print('Saved to csv...')

Saved to csv...


In [27]:
# Movie Conversations.txt
mc = 'cornell movie-dialogs corpus/movie_conversations.txt'
# CSV for transforming the data
mc_csv = 'mc_csv.csv'

In [28]:
# Add header rows
with open(mc_csv, 'wt') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames = ['Character_Id1', 'Character_Id2', 'Movie_Number',
                                                   'List_of_Utterance'])
    writer.writeheader()

In [29]:
# Open the second .txt file and write it onto the next dataframe
with open(mc, 'r') as mconversations:
    for lines in mconversations:
        line = str(lines)
        line_list = line.split('+++$+++')
        
        with open(mc_csv, 'a') as mconv:
            writer = csv.writer(mconv)
            writer.writerow([line_list[0].strip(), line_list[1].strip(), line_list[2].strip(),
                            line_list[3].strip()])

In [30]:
mc_df = pd.read_csv('mc_csv.csv')
print(mc_df.shape)
mc_df.head(5)

(83097, 4)


Unnamed: 0,Character_Id1,Character_Id2,Movie_Number,List_of_Utterance
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"


In [31]:
mc_df.to_csv(mc_csv, sep = ',', index = False)
print('Saved to csv...')

Saved to csv...


## Onto Data Cleaning
There are different aspects to this part and all the data in these csv files must be analysed in order for them to be of a consistent data

In [32]:
# Read all the csv files
mtm_df = pd.read_csv('mtm_csv.csv')
mcm_df = pd.read_csv('mcm_csv.csv')
mc_df = pd.read_csv('mc_csv.csv')
ml_df = pd.read_csv('ml_csv.csv')

In [59]:
# Convert the years to consistent format -- elegant way
mtm_df['Year_of_Release'] = mtm_df['Year_of_Release'].apply(lambda x: st.join(re.findall(r'[0-9]', x)))

In [62]:
# Convert them to Integers
mtm_df['Year_of_Release'] = mtm_df['Year_of_Release'].astype(int)
mtm_df['IMDB_Rating'] = mtm_df['IMDB_Rating'].astype(int)
mtm_df['Number_of_Votes'] = mtm_df['Number_of_Votes'].astype(int)
mtm_df.to_csv(mtm_csv, sep=',', index = False)
print("Saved to csv..")

## Onto Data Exploration 🙌 🙌 
Using plotly to do visualization of all the data so that we get meaningful info out of it

In [63]:
mtm_df.describe()

Unnamed: 0,Year_of_Release,IMDB_Rating,Number_of_Votes
count,617.0,617.0,617.0
mean,1988.575365,6.403566,49820.962723
std,16.589229,1.255192,61880.609145
min,1927.0,2.0,9.0
25%,1984.0,6.0,9919.0
50%,1994.0,7.0,27112.0
75%,1999.0,7.0,66781.0
max,2010.0,9.0,419312.0


In [105]:
data = [go.Histogram(
            x=mtm_df['Year_of_Release'],
            marker = dict(color = 'rgb(17, 157, 100)'),
            hoverlabel = dict(bordercolor = 'rgb(0, 0, 0)')
        )]
layout = go.Layout(
    title='Year of Releases',
    xaxis=dict(
        title='Years'
    ),
    yaxis=dict(
        title='Counts of Movie Releases'
    ),
    bargap=0.2
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-histogram')

In [107]:
data = [go.Histogram(
            y=mtm_df['IMDB_Rating'],
            marker = dict(color = 'rgb(17, 157, 255)'),
            hoverlabel = dict(bordercolor = 'rgb(0, 0, 0)')
        )]
layout = go.Layout(
    title='IMDB Ratings',
    xaxis=dict(
        title='Rating Scores'
    ),
    yaxis=dict(
        title='Number of Movies'
    ),
    bargap=0.2
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-histogram')