## Cultural References in Movies

In [None]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import spacy

In [2]:
data_folder = './Data/MovieSummaries/'
movies = pd.read_csv(data_folder+'movie.metadata.tsv', sep='\t', header=None,  names = ['Wiki ID','movie ID','name', 'release date', 'BOR', 'runtime','languages','countries','genres'])
char= pd.read_csv(data_folder+'character.metadata.tsv', sep='\t', header=None,  names = ['Wiki ID','movie ID', 'release date', 'char name','DOB', 'gender','heght','ethnicity','actor name', 'actor age', 'map ID', 'char ID', 'actor ID' ])

names_char = pd.read_csv(data_folder+'name.clusters.txt', sep='\t', header=None, names=['Name', 'char ID'] )


In [12]:

# Define the valid date range
start_date = '1678-01-01'
end_date = '2262-04-11'

# Filter out-of-range dates
movies = movies[(movies['release date'] >= start_date) & (movies['release date'] <= end_date)]
#movies['release date'] = pd.to_datetime(movies['release date' ]).dt.strftime('%Y-%m-%d')
print("The oldest movie is from:",movies['release date'].min())
print("The most recent movie is from:",movies['release date'].max())

The oldest movie is from: 1888
The most recent movie is from: 2016-06-08


## Movie summaries

In [4]:
summaries_path = data_folder+'plot_summaries.txt' 
movie_summaries = pd.DataFrame(columns=['Wiki ID', 'Summary'])

with open(summaries_path, 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split('\t')
        row=pd.DataFrame([{'Wiki ID': parts[0], 'Summary': parts[1]}])
        movie_summaries = pd.concat([movie_summaries,row],axis=0, ignore_index=True)


In [None]:
import nltk

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

text = '''
Vasha Busa, a Wild Boy
'''


char_clean=char.dropna(subset=['char name'])
for i in range(char_clean.shape[0]):
    text= str(char_clean.iloc[i]['char name'])
    n_words=len(text.split())
    if n_words>=2:
        #print("character name:", text)
        nltk_results = ne_chunk(pos_tag(word_tokenize(text)))
        for nltk_result in nltk_results:
            if type(nltk_result) == Tree:
                name = ''
                for nltk_result_leaf in nltk_result.leaves():
                    name += nltk_result_leaf[0] + ' '
                #print ('Type: ', nltk_result.label(), 'Name: ', name)
                #print ('Name: ', name)


## Clean

In [6]:
import nltk

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

In [57]:
movie_summaries=movie_summaries.dropna()
char=char.dropna(subset=['Wiki ID', 'char name'])

In [8]:
char.head()

Unnamed: 0,Wiki ID,movie ID,release date,char name,DOB,gender,heght,ethnicity,actor name,actor age,map ID,char ID,actor ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [62]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)




#### Detect names in plot summary, count the number of mentions and link to actor playing the role

In [140]:
name=''
for i in range(0,1):
    print("hello")
    plot=str(movie_summaries[movie_summaries['Wiki ID']==str(movies.loc[i]['Wiki ID'])]['Summary'])
    #characters=char[char['Wiki ID']==movies.loc[i]['Wiki ID']]
    char_names=char[char['Wiki ID']==movies.loc[i]['Wiki ID']]['char name']
    n_words=len(plot.split())
    names=[]
    name_groups = {}
    if n_words>=2:
        print("Summary of film {0}:".format(i), plot)
        nltk_results = ne_chunk(pos_tag(word_tokenize(plot)))
        for nltk_result in nltk_results:
            if type(nltk_result) == Tree:
                name = ''
                for nltk_result_leaf in nltk_result.leaves():
                    name += nltk_result_leaf[0] + ' '
                names.append(name)

 
    for name in names:
        match_found = False
        name_parts = name.strip().lower().split()
        if len(name_parts) == 2:
            #If full name
            first_name, last_name = name_parts
        elif len(name_parts) == 1:
            #If single name
            first_name, last_name= name_parts[0],""
        else:
            break

        full_name = (first_name, last_name)


        # If name mentionned multiple times, group the mentions
        for group, members in name_groups.items():
            if len(name_parts) == 1:
                if first_name in group[0] or first_name in group[1]:
                    members.append(name)
                    match_found = True
            elif first_name in group[0] or last_name in group[1] or first_name in group[1] or last_name in group[0]:
                members.append(name)
                match_found = True
                break

        if not match_found:
            name_groups[full_name] = [name]


    #Count the number of mentions for each name
    for group, members in name_groups.items():
        print(f"Group: {group}, Members: {members}")
    #     print(len(members))
    
    
    # Match names detected to character of char dataset
    for name in char_names:
        mention=False
        #print(name)
        char_name_parts = name.strip().lower().split()
        #look if each word is in each group
        # for group or for word first?
        for group, members in name_groups.items():
            for word in char_name_parts:
                if word in group:
                    print("Name", name, "has", len(members), "mentions")
                    mention=True
                    #print(char)
                    break
        if mention==False:
            print("Name", name, "has", 0, "mentions")
                    

    

hello
Summary of film 0: 22382    Set in the second half of the 22nd century, the film depicts Mars as a planet that has been 84% terraformed, allowing humans to walk on the surface without wearing pressure suits. The Martian society has become largely matriarchal, with women in most positions of authority. The story concerns a police officer, Melanie Ballard , second in command of a small team alongside Sergeant Jericho  sent to pick up and transport a prisoner named Desolation Williams . Arriving at the remote mining town where Williams is being held, Ballard finds virtually all of the people missing. She learns that the miners had discovered an underground doorway created by an ancient Martian civilization. When the door was opened it released "ghosts," disembodied spirits which possessed the miners. Violence ensues, as the possessed miners commit horrific acts of death and destruction, as well as self-mutilation. With their team leader Helena Bradock  murdered, Ballard must fight o

In [157]:
char['role']=0

In [9]:
movies.head()

Unnamed: 0,Wiki ID,movie ID,name,release date,BOR,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [35]:
movie_summaries[movie_summaries['Wiki ID']=='261236'].head()
print(movie_summaries.shape[0])
print(movies.shape[0])

42306
81741


In [31]:
print(movies.dtypes)
print(movie_summaries.dtypes)

Wiki ID           int64
movie ID         object
name             object
release date     object
BOR             float64
runtime         float64
languages        object
countries        object
genres           object
dtype: object
Wiki ID    object
Summary    object
dtype: object


In [44]:
common_ids = movies['Wiki ID'].isin(movie_summaries['Wiki ID'])

filtered_movies = movies[common_ids]

print(filtered_movies.index)

Index([    0,     3,     4,     6,    12,    13,    14,    15,    17,    18,
       ...
       81725, 81726, 81728, 81729, 81732, 81733, 81736, 81737, 81739, 81740],
      dtype='int64', length=42207)


In [45]:
for i in filtered_movies.index:
    print("New summary")
    plot=movie_summaries[movie_summaries['Wiki ID']==filtered_movies.loc[i]['Wiki ID']]['Summary']
    #plot=str(movie_summaries.loc[i]['Summary'])
    print(plot)


New summary
22382    Set in the second half of the 22nd century, th...
Name: Summary, dtype: object
New summary
11300    A series of murders of rich young women throug...
Name: Summary, dtype: object
New summary
6437    Eva, an upper class housewife, becomes frustra...
Name: Summary, dtype: object
New summary
34184    Every hundred years, the evil Morgana  returns...
Name: Summary, dtype: object
New summary
28213    Adam, a San Francisco-based artist who works a...
Name: Summary, dtype: object
New summary
12253    {{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...
Name: Summary, dtype: object
New summary
9200    Serap, a young actress with a strong, lively p...
Name: Summary, dtype: object
New summary
228     The story starts as one of the robots flies i...
Name: Summary, dtype: object
New summary
16384    The film opens with Mary Poppins  perched in a...
Name: Summary, dtype: object
New summary
7170    Otto Brosowski, a communist miner, writes to t...
Name: Summary, dtype: object
New s

KeyboardInterrupt: 

## Version 2

In [65]:
name=''
for i in range(0,1):
    print("New film")
    plot=str(movie_summaries[movie_summaries['Wiki ID']==movies.loc[i]['Wiki ID']]['Summary'])
    char_names=str(char[char['Wiki ID']==movies.loc[i]['Wiki ID']]['char name'])
    n_words=len(plot.split())
    names=[]
    name_groups = {}
    
    if n_words>=2:
        print("Summary of film {0}:".format(i), plot)
        nltk_results = ne_chunk(pos_tag(word_tokenize(plot)))
        for nltk_result in nltk_results:
            if type(nltk_result) == Tree:
                name = ''
                for nltk_result_leaf in nltk_result.leaves():
                    name += nltk_result_leaf[0] + ' '
                names.append(name)

 
    for name in names:
        match_found = False
        name_parts = name.strip().lower().split()
        if len(name_parts) == 2:
            #If full name
            first_name, last_name = name_parts
        elif len(name_parts) == 1:
            #If single name
            first_name, last_name= name_parts[0],""
        else:
            break

        full_name = (first_name, last_name)


        # If name mentionned multiple times, group the mentions
        for group, members in name_groups.items():
            if len(name_parts) == 1:
                if first_name in group[0] or first_name in group[1]:
                    members.append(name)
                    match_found = True
            elif first_name in group[0] or last_name in group[1] or first_name in group[1] or last_name in group[0]:
                members.append(name)
                match_found = True
                break

        if not match_found:
            name_groups[full_name] = [name]

    
    # Match names detected to character of char dataset
    for name in char_names:
        mention=False
        print(name)
        # char_name_parts = name.strip().lower().split()
        # for group, members in name_groups.items():
        #     for word in char_name_parts:
        #         if word in group:
        #             print("Name", name, "has", len(members), "mentions")
        #             mention=True
        #             # for now assign the number of mentions to each character
        #             char.loc[char['char name'] == name, 'role'] = len(members)
        #             break
        # if mention==False:
        #     char.loc[char['char name'] == name, 'role'] = 0
        #     print("Name", name, "has", 0, "mentions")
        
 
    





    

        

    

New film
Summary of film 0: 22382    Set in the second half of the 22nd century, the film depicts Mars as a planet that has been 84% terraformed, allowing humans to walk on the surface without wearing pressure suits. The Martian society has become largely matriarchal, with women in most positions of authority. The story concerns a police officer, Melanie Ballard , second in command of a small team alongside Sergeant Jericho  sent to pick up and transport a prisoner named Desolation Williams . Arriving at the remote mining town where Williams is being held, Ballard finds virtually all of the people missing. She learns that the miners had discovered an underground doorway created by an ancient Martian civilization. When the door was opened it released "ghosts," disembodied spirits which possessed the miners. Violence ensues, as the possessed miners commit horrific acts of death and destruction, as well as self-mutilation. With their team leader Helena Bradock  murdered, Ballard must figh

Unnamed: 0,Wiki ID,movie ID,release date,char name,DOB,gender,heght,ethnicity,actor name,actor age,map ID,char ID,actor ID,role
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,0.0
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,0.0
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,0.0
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,0.0
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,0.0


In [66]:
char.head()

Unnamed: 0,Wiki ID,movie ID,release date,char name,DOB,gender,heght,ethnicity,actor name,actor age,map ID,char ID,actor ID,role
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,0.0
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,0.0
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,0.0
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,0.0
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,0.0


In [None]:
name=''
#for i in range(1,2):
for i in filtered_movies.index:
    print("New film")
    #plot=str(movie_summaries[movie_summaries['Wiki ID']==str(movies.loc[i]['Wiki ID'])]['Summary'])
    plot=str(movie_summaries[movie_summaries['Wiki ID']==filtered_movies.loc[i]['Wiki ID']]['Summary'])
    char_names=str(char[char['Wiki ID']==filtered_movies.loc[i]['Wiki ID']]['char name'])
    n_words=len(plot.split())
    names=[]
    name_groups = {}
    if n_words>=2:
        print("Summary of film {0}:".format(i), plot)
        nltk_results = ne_chunk(pos_tag(word_tokenize(plot)))
        for nltk_result in nltk_results:
            if type(nltk_result) == Tree:
                name = ''
                for nltk_result_leaf in nltk_result.leaves():
                    name += nltk_result_leaf[0] + ' '
                names.append(name)

 
    for name in names:
        match_found = False
        name_parts = name.strip().lower().split()
        if len(name_parts) == 2:
            #If full name
            first_name, last_name = name_parts
        elif len(name_parts) == 1:
            #If single name
            first_name, last_name= name_parts[0],""
        else:
            break

        full_name = (first_name, last_name)


        # If name mentionned multiple times, group the mentions
        for group, members in name_groups.items():
            if len(name_parts) == 1:
                if first_name in group[0] or first_name in group[1]:
                    members.append(name)
                    match_found = True
            elif first_name in group[0] or last_name in group[1] or first_name in group[1] or last_name in group[0]:
                members.append(name)
                match_found = True
                break

        if not match_found:
            name_groups[full_name] = [name]

    
    # Match names detected to character of char dataset
    for name in char_names:
        mention=False
        char_name_parts = name.strip().lower().split()
        for group, members in name_groups.items():
            for word in char_name_parts:
                if word in group:
                    print("Name", name, "has", len(members), "mentions")
                    mention=True
                    # for now assign the number of mentions to each character
                    char.loc[char['char name'] == name, 'role'] = len(members)
                    break
        if mention==False:
            char.loc[char['char name'] == name, 'role'] = 0
            print("Name", name, "has", 0, "mentions")
        
 
    

            




    

        

    

### Assign a categorical value based on the number of mentions compared to other characters of the same movie
#### 0: small role (not mentionned in summary)
#### 1: secondary role (mentionned in summary)
#### 2: lead role (mentionned considerabely more than others)

In [None]:

mentions=char[char['Wiki ID']==movies.loc[0]['Wiki ID']]['role']


In [166]:
char.head()

Unnamed: 0,Wiki ID,movie ID,release date,char name,DOB,gender,heght,ethnicity,actor name,actor age,map ID,char ID,actor ID,role
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,0
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,8
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,4
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,3
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,0


# Actors Career

In [17]:
actor_data_folder = './Data/Actors/'
names = pd.read_csv(actor_data_folder+'name.tsv', sep='\t')


In [18]:
principals = pd.read_csv(actor_data_folder+'principals.tsv', sep='\t')
