# The New Bechdel test!
Analysis on the new Bechdel test using the Cornell Movie-Dialog Corpus

In [1]:
import csv
import pandas as pd
import re
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from unidecode import unidecode
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Import all the datasets first
# Movie Titles Metadata.txt
mtm = 'cornell movie-dialogs corpus/movie_titles_metadata.txt'
# CSV for transforming the data
mtm_csv = 'mtm_csv.csv'

In [3]:
# Add header rows
with open(mtm_csv, 'wt') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames = ["Movie_Number", "Movie_Name", "Year_of_Release", 
                                                   "IMDB_Rating", "Number_of_Votes", "Genres"])
    writer.writeheader()

In [4]:
# Open the .txt file and write it onto the dataframe
with open(mtm, 'r', encoding = 'latin-1') as mtmeta:
    for lines in mtmeta:
        text = str(lines)
        line_list = text.split('+++$+++')
        
        with open(mtm_csv, 'a') as write_csv:
            writer = csv.writer(write_csv)
            writer.writerow([line_list[0].strip(), line_list[1].strip(), line_list[2].strip(),
                            line_list[3].strip(), line_list[4].strip(), line_list[5].strip()])

In [5]:
# Check the content of the csv file
mtm_df = pd.read_csv(mtm_csv, index_col = False)
print(mtm_df.shape)
mtm_df.head(5)

(617, 6)


Unnamed: 0,Movie_Number,Movie_Name,Year_of_Release,IMDB_Rating,Number_of_Votes,Genres
0,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
1,m1,1492: conquest of paradise,1992,6.2,10421,"['adventure', 'biography', 'drama', 'history']"
2,m2,15 minutes,2001,6.1,25854,"['action', 'crime', 'drama', 'thriller']"
3,m3,2001: a space odyssey,1968,8.4,163227,"['adventure', 'mystery', 'sci-fi']"
4,m4,48 hrs.,1982,6.9,22289,"['action', 'comedy', 'crime', 'drama', 'thrill..."


In [6]:
mtm_df.to_csv(mtm_csv, sep=',', index = False)
print("Saved to csv..")

Saved to csv..


In [7]:
# Movie Characters Metadata.txt
mcm = 'cornell movie-dialogs corpus/movie_characters_metadata.txt'
# CSV for transforming the data
mcm_csv = 'mcm_csv.csv'

In [8]:
# Add header rows
with open(mcm_csv, 'wt') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames = ["Character_Id", "Character_Name", "Movie_Number", 
                                                   "Movie_Title", "Gender", "Position_in_Credits"])
    writer.writeheader()

In [9]:
# Open the second .txt file and write it onto the next dataframe
with open(mcm, 'r', encoding = 'latin-1') as mcmeta:
    for lines in mcmeta:
        text = str(lines)
        line_list = text.split('+++$+++')

        with open(mcm_csv, 'a') as write_csv:
            writer = csv.writer(write_csv)
            writer.writerow([line_list[0].strip(), line_list[1].strip(), line_list[2].strip(),
                            line_list[3].strip(), line_list[4].strip(), line_list[5].strip()])

In [10]:
mcm_df = pd.read_csv(mcm_csv)
print(mcm_df.shape)
mcm_df.head(5)

(9035, 6)


Unnamed: 0,Character_Id,Character_Name,Movie_Number,Movie_Title,Gender,Position_in_Credits
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6


In [11]:
mcm_df.to_csv(mcm_csv, sep=',', index = False)
print("Saved to csv..")

Saved to csv..


In [12]:
# Movie Characters Metadata.txt
ml = 'cornell movie-dialogs corpus/movie_lines.txt'
# CSV for transforming the data
ml_csv = 'ml_csv.csv'

In [13]:
# Add header rows
with open(ml_csv, 'wt') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames = ["Line_Id", "Character_Id", "Movie_Number", 
                                                   "Character_Name", "Dialogue"])
    writer.writeheader()

In [14]:
# Open the second .txt file and write it onto the next dataframe
with open(ml, 'r', encoding = 'latin-1') as mlines:
    for lines in mlines:
        text = str(lines)
        line_list = text.split('+++$+++')

        with open(ml_csv, 'a') as write_csv:
            writer = csv.writer(write_csv)
            writer.writerow([line_list[0].strip(), line_list[1].strip(), line_list[2].strip(),
                            line_list[3].strip(), line_list[4].strip()])

In [15]:
ml_df = pd.read_csv('ml_csv.csv')
print(ml_df.shape)
ml_df.head(5)

(304713, 5)


Unnamed: 0,Line_Id,Character_Id,Movie_Number,Character_Name,Dialogue
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [16]:
ml_df.to_csv(ml_csv, sep = ',', index = False)
print('Saved to csv...')

Saved to csv...


In [17]:
# Movie Conversations.txt
mc = 'cornell movie-dialogs corpus/movie_conversations.txt'
# CSV for transforming the data
mc_csv = 'mc_csv.csv'

In [18]:
# Add header rows
with open(mc_csv, 'wt') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames = ['Character_Id1', 'Character_Id2', 'Movie_Number',
                                                   'List_of_Utterance'])
    writer.writeheader()

In [19]:
# Open the second .txt file and write it onto the next dataframe
with open(mc, 'r', encoding = 'latin-1') as mconversations:
    for lines in mconversations:
        line = str(lines)
        line_list = line.split('+++$+++')
        
        with open(mc_csv, 'a') as mconv:
            writer = csv.writer(mconv)
            writer.writerow([line_list[0].strip(), line_list[1].strip(), line_list[2].strip(),
                            line_list[3].strip()])

In [20]:
mc_df = pd.read_csv('mc_csv.csv')
print(mc_df.shape)
mc_df.head(5)

(83097, 4)


Unnamed: 0,Character_Id1,Character_Id2,Movie_Number,List_of_Utterance
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"


In [21]:
mc_df.to_csv(mc_csv, sep = ',', index = False)
print('Saved to csv...')

Saved to csv...


# Onto Data Cleaning
There are different aspects to this part and all the data in these csv files must be analysed in order for them to be of a consistent data

In [22]:
# Read all the csv files
mtm_df = pd.read_csv('mtm_csv.csv')
mcm_df = pd.read_csv('mcm_csv.csv')
mc_df = pd.read_csv('mc_csv.csv')
ml_df = pd.read_csv('ml_csv.csv')

In [23]:
# On the mtm dataset
# To check for any NULL values in the data
mtm_df.isnull().sum()

Movie_Number       0
Movie_Name         0
Year_of_Release    0
IMDB_Rating        0
Number_of_Votes    0
Genres             0
dtype: int64

In [24]:
# Convert the years to consistent format -- elegant way
st = ""
mtm_df['Year_of_Release'] = mtm_df['Year_of_Release'].apply(lambda x: st.join(re.findall(r'[0-9]', x)))

In [25]:
# Convert them to Integers
mtm_df['Year_of_Release'] = mtm_df['Year_of_Release'].astype(int)
mtm_df['IMDB_Rating'] = mtm_df['IMDB_Rating'].astype(int)
mtm_df['Number_of_Votes'] = mtm_df['Number_of_Votes'].astype(int)

In [26]:
# Now describe the dataset
mtm_df.describe()

Unnamed: 0,Year_of_Release,IMDB_Rating,Number_of_Votes
count,617.0,617.0,617.0
mean,1988.575365,6.403566,49820.962723
std,16.589229,1.255192,61880.609145
min,1927.0,2.0,9.0
25%,1984.0,6.0,9919.0
50%,1994.0,7.0,27112.0
75%,1999.0,7.0,66781.0
max,2010.0,9.0,419312.0


In [27]:
# More information on the dataframe
mtm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 6 columns):
Movie_Number       617 non-null object
Movie_Name         617 non-null object
Year_of_Release    617 non-null int64
IMDB_Rating        617 non-null int64
Number_of_Votes    617 non-null int64
Genres             617 non-null object
dtypes: int64(3), object(3)
memory usage: 29.0+ KB


In [28]:
mtm_df.to_csv(mtm_csv, sep=',', index = False)
print("Saved to csv..")
# Cleaning done on the mtm dataset

Saved to csv..


In [29]:
# Onto the MCM dataset
mcm_df.isnull().sum()

Character_Id           0
Character_Name         2
Movie_Number           0
Movie_Title            0
Gender                 0
Position_in_Credits    0
dtype: int64

In [30]:
# The rows which has NULL values
mcm_df[mcm_df.isnull().any(axis=1)]

Unnamed: 0,Character_Id,Character_Name,Movie_Number,Movie_Title,Gender,Position_in_Credits
3764,u3764,,m248,arctic blue,?,?
6558,u6558,,m436,memento,?,?


In [31]:
# Drop NULL records
print('Old size: %d' % len(mcm_df))
mcm_df = mcm_df.dropna(how = 'any', axis = 'rows')
print('New size: %d' % len(mcm_df))

Old size: 9035
New size: 9033


In [32]:
mcm_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9033 entries, 0 to 9034
Data columns (total 6 columns):
Character_Id           9033 non-null object
Character_Name         9033 non-null object
Movie_Number           9033 non-null object
Movie_Title            9033 non-null object
Gender                 9033 non-null object
Position_in_Credits    9033 non-null object
dtypes: object(6)
memory usage: 494.0+ KB


Since, the data in the 'Gender' column has bad data, inorder to make the data consistent :-
# A quick Gender Recognition model
Grabbed from [nlpforhackers](https://nlpforhackers.io/introduction-machine-learning/) webpage.
1. Firstly convert the dataset into a numpy array to keep only gender and names
2. Set the feature parameters which takes in different parameters
3. Vectorize the parametes
4. Get varied train, test split and test it for validity by checking out the count of the train test split
5. Transform lists of feature-value mappings to vectors. (When feature values are strings, this transformer will do a binary one-hot (aka one-of-K) coding: one boolean-valued feature is constructed for each of the possible string values that the feature can take on)
6. Train a decision tree classifier on this and save the model as a pickle file

In [11]:
names = pd.read_csv('gender_recognition/names_dataset.csv')
# print(names.head(10))
 
# print("%d names in dataset" % len(names))

# Get the data out of the dataframe into a numpy matrix and keep only the name and gender columns
names = names.as_matrix()[:, 1:]
# print(names)
 
# We're using 90% of the data for training
TRAIN_SPLIT = 0.90

def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1], # Last letter
        'last2-letters': name[-2:], # Last 2 letters
        'last3-letters': name[-3:], # Last 3 letters
    }

# Feature Extraction
# print(features("Alex"))

# Vectorize the features function
features = np.vectorize(features)
# print(features(["Anna", "Hannah", "Paul"]))
# [ array({'first2-letters': 'an', 'last-letter': 'a', 'first-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna', 'first3-letters': 'ann'}, dtype=object)
#   array({'first2-letters': 'ha', 'last-letter': 'h', 'first-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah', 'first3-letters': 'han'}, dtype=object)
#   array({'first2-letters': 'pa', 'last-letter': 'l', 'first-letter': 'p', 'last2-letters': 'ul', 'last3-letters': 'aul', 'first3-letters': 'pau'}, dtype=object)]
 
# Extract the features for the whole dataset
X = features(names[:, 0]) # X contains the features
 
# Get the gender column
y = names[:, 1]           # y contains the targets
 
# Test if we built the dataset correctly
# print("\n\nName: %s, features=%s, gender=%s" % (names[0][0], X[0], y[0]))

X, y = shuffle(X, y)
X_train, X_test = X[:int(TRAIN_SPLIT * len(X))], X[int(TRAIN_SPLIT * len(X)):]
y_train, y_test = y[:int(TRAIN_SPLIT * len(y))], y[int(TRAIN_SPLIT * len(y)):]

# Check to see if the datasets add up
print (len(X_train), len(X_test), len(y_train), len(y_test))
# print(X_train)

# Transforms lists of feature-value mappings to vectors.
vectorizer = DictVectorizer()
vectorizer.fit(X_train)
transformed = vectorizer.transform(features(["Mary", "John"]))
print (vectorizer.transform(X_train), X_train[0])

clf = DecisionTreeClassifier(criterion = 'gini')
clf.fit(vectorizer.transform(X_train), y_train)

# Accuracy on training set
print (clf.score(vectorizer.transform(X_train), y_train) )
 
# Accuracy on test set
print (clf.score(vectorizer.transform(X_test), y_test))


Method .as_matrix will be removed in a future version. Use .values instead.



85522 9503 85522 9503
  (0, 1)	1.0
  (0, 62)	1.0
  (0, 965)	1.0
  (0, 4588)	1.0
  (0, 4682)	1.0
  (0, 5740)	1.0
  (1, 2)	1.0
  (1, 69)	1.0
  (1, 1037)	1.0
  (1, 4592)	1.0
  (1, 4754)	1.0
  (1, 5608)	1.0
  (2, 0)	1.0
  (2, 39)	1.0
  (2, 693)	1.0
  (2, 4606)	1.0
  (2, 4768)	1.0
  (2, 6847)	1.0
  (3, 21)	1.0
  (3, 402)	1.0
  (3, 4107)	1.0
  (3, 4601)	1.0
  (3, 4876)	1.0
  (3, 7779)	1.0
  (4, 11)	1.0
  :	:
  (85517, 5829)	1.0
  (85518, 2)	1.0
  (85518, 66)	1.0
  (85518, 1004)	1.0
  (85518, 4596)	1.0
  (85518, 4833)	1.0
  (85518, 7037)	1.0
  (85519, 19)	1.0
  (85519, 359)	1.0
  (85519, 3784)	1.0
  (85519, 4588)	1.0
  (85519, 4750)	1.0
  (85519, 6160)	1.0
  (85520, 9)	1.0
  (85520, 186)	1.0
  (85520, 2198)	1.0
  (85520, 4592)	1.0
  (85520, 4848)	1.0
  (85520, 5213)	1.0
  (85521, 10)	1.0
  (85521, 210)	1.0
  (85521, 2386)	1.0
  (85521, 4601)	1.0
  (85521, 5028)	1.0
  (85521, 6953)	1.0 {'first-letter': 'b', 'first2-letters': 'br', 'first3-letters': 'bre', 'last-letter': 'a', 'last2-letters': '

In [34]:
import pickle
sent_model = "sentiment_analysis_model.pkl"
sent_model_pkl = open(sent_model, "wb")

# Dump the sentiment analysis model
pickle.dump(clf, sent_model_pkl)

sent_model_pkl.close()

In [35]:
# Therefore, we are getting a decent result from the names
print (clf.predict(vectorizer.transform(features(["SMYSLOV", "MARY JANE", "MISS PERKY", "SHARON", "ALONSO", "SECONDARY OFFICER"]))))

['M' 'F' 'M' 'F' 'M' 'M']


In [36]:
# Now since the model is built let's update the data in the dataframe
print("Number of missing genders : " , mcm_df[mcm_df.Gender == '?'].shape[0])
mcm_df.head(8)

Number of missing genders :  6018


Unnamed: 0,Character_Id,Character_Name,Movie_Number,Movie_Title,Gender,Position_in_Credits
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6
5,u5,KAT,m0,10 things i hate about you,f,2
6,u6,MANDELLA,m0,10 things i hate about you,f,7
7,u7,MICHAEL,m0,10 things i hate about you,m,5


In [37]:
# Let's convert all the ? with their appropriate genders by using the above trained model
for items in mcm_df.itertuples():
    if items[5] == '?':
        pred_val = clf.predict(vectorizer.transform(features([items[2]])))
        prediction = ""
        prediction = "".join(pred_val)
        mcm_df.loc[items.Index, 'Gender'] = prediction.lower()

In [38]:
# Now check for the updated genders
print("Number of missing genders : " , mcm_df[mcm_df.Gender == '?'].shape[0])
mcm_df.head(8)

Number of missing genders :  0


Unnamed: 0,Character_Id,Character_Name,Movie_Number,Movie_Title,Gender,Position_in_Credits
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,m,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,f,?
4,u4,JOEY,m0,10 things i hate about you,m,6
5,u5,KAT,m0,10 things i hate about you,f,2
6,u6,MANDELLA,m0,10 things i hate about you,f,7
7,u7,MICHAEL,m0,10 things i hate about you,m,5


In [39]:
mcm_df.to_csv(mcm_csv, sep=',', index = False)
print("Saved to csv..")
# Cleaning done on the mcm dataset

Saved to csv..


# Onto Data Exploration 🙌 🙌 
Using plotly to do visualization of all the data so that we get meaningful info out of it

In [40]:
# Visualization on the year of realese to get the idea of movie years we are working with
data = [go.Histogram(
            x=mtm_df['Year_of_Release'],
            marker = dict(color = 'rgb(17, 157, 100)'),
            hoverlabel = dict(bordercolor = 'rgb(0, 0, 0)')
        )]
layout = go.Layout(
    title='Year of Releases',
    xaxis=dict(
        title='Years'
    ),
    yaxis=dict(
        title='Counts of Movie Releases'
    ),
    bargap=0.2
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-histogram1')


Consider using IPython.display.IFrame instead



In [41]:
# Scatter plot on the same data
trace = go.Scatter(
    x = mtm_df['Year_of_Release'],
    mode = 'markers',
)

layout = go.Layout(
    title='Year of Releases',
    xaxis=dict(
        title='Years'
    ),
    yaxis=dict(
        title='Counts of Movie Releases'
    ),
)
data = [trace]
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-scatter')

In [42]:
# Visualization on the number of IMDB Ratings on these movies
data = [go.Histogram(
            y=mtm_df['IMDB_Rating'],
            marker = dict(color = 'rgb(17, 157, 255)'),
            hoverlabel = dict(bordercolor = 'rgb(0, 0, 0)')
        )]
layout = go.Layout(
    title='IMDB Ratings',
    xaxis=dict(
        title='Number of Movies'
    ),
    yaxis=dict(
        title='Rating Scores'
    ),
    bargap=0.2
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-histogram2')

## More analysis

In [43]:
ba = pd.read_csv("ba_mtm_csv.csv")

In [44]:
gen = ba['Genres'].unique()

In [45]:
# Visualization on the year of realese to get the idea of movie years we are working with
data = [go.Histogram(
            x=ba['Genres'].unique(),
            marker = dict(color = 'rgb(17, 157, 100)'),
            hoverlabel = dict(bordercolor = 'rgb(0, 0, 0)')
        )]
layout = go.Layout(
    title='Year of Releases',
    xaxis=dict(
        title='Years'
    ),
    yaxis=dict(
        title='Counts of Movie Releases'
    ),
    bargap=0.2
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-histogram1')