<h1> Statistiques par genre de films </h1>

# Import

In [1]:
import os #get all the files in a folder
import pandas as pd
import numpy as np
from collections import Counter
import operator

import nltk
from nltk.corpus import stopwords
# English stopword list; clean_nlp uses it to filter tokens.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus available locally — confirm
all_stopwords = stopwords.words("english")

# Tokenizer
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+') # tokens are runs of word characters, so punctuation is dropped

# Choices

In [2]:
# Genre to analyse: 'adventure', 'action' or 'romance'
movie_genre = 'adventure'

# Root folder that holds the scripts of every genre
path_to_data = '../data/script/'

# 'clean' sub-folder of the selected genre (trailing slash kept on purpose:
# file names are appended to it directly)
path_to_movies = '{}{}/clean/'.format(path_to_data, movie_genre)

# Open files

In [3]:
# CSV scripts of the selected genre. os.listdir returns entries in arbitrary
# order, so the list is sorted to make the row order reproducible across runs.
files = sorted(f for f in os.listdir(path_to_movies) if f.endswith('.csv'))
print('There are %.0f files in the directory %s.' %(len(files), movie_genre))

# Read every script first and concatenate once: growing a DataFrame with
# DataFrame.append inside a loop re-copies all previous rows at each step,
# and DataFrame.append no longer exists in pandas >= 2.0.
script_frames = [
    pd.read_csv(os.path.join(path_to_movies, filename), sep='|')
    for filename in files
]

# pd.concat raises on an empty list; fall back to an empty frame, which is
# what the original loop produced when no CSV file was found.
# The per-file index is kept (no ignore_index), as with the original append.
df_script_genre = pd.concat(script_frames) if script_frames else pd.DataFrame()

There are 54 files in the directory adventure.


In [4]:
# (rows, columns) of the frame holding all the scripts of the genre
df_script_genre.shape

(87161, 6)

# Clean the text

In [5]:
def clean_text_from_nrt(text):
    """Return `text` with every newline, carriage return and tab replaced by a single space."""
    for control_char in ('\n', '\r', '\t'):
        text = text.replace(control_char, ' ')
    return text

def clean_text_to_lower(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

# Normalise the 'text' column in two passes:
#   1. `\n`, `\r` and `\t` become spaces
#   2. everything is lower-cased
df_script_genre['text'] = (
    df_script_genre['text']
    .apply(clean_text_from_nrt)
    .apply(clean_text_to_lower)
)

In [6]:
# Preview the first 20 rows
df_script_genre.head(20)

Unnamed: 0.1,Unnamed: 0,character,text,type,character_clean,gender
0,0,,2012 written by roland emmerich & harald klo...,unknown,,
1,1,,over black we listen to the immortal music of...,stage direction,,
2,2,,fade up,unknown,,
3,3,,ext. the solar system,location,,
4,4,,"space, infinite and empty. but then, slowly a...",stage direction,,
5,5,,fade to black 2009 fade up,unknown,,
6,6,,ext. country side/india - sunset,location,,
7,7,,mozart's concerto filters from a jeep's stereo...,stage direction,,
8,8,,prof. west watch out!,unknown,,
9,9,,but it's too late. the jeep drives straight th...,stage direction,,


In [7]:
# Number of rows per value of the 'gender' column, most frequent first
count_replicas_gender = Counter(df_script_genre['gender'])
sorted_count_replicas_gender = count_replicas_gender.most_common()
sorted_count_replicas_gender

[(nan, 45081), ('M', 28573), ('F', 9969), ('?', 3538)]

In [8]:
# TODO: pie chart comparing M vs F


# Speech analysis

In [9]:
def clean_nlp(text, stop_words=None, word_tokenizer=None):
    """Tokenize a sequence of strings and drop the stopwords.

    The strings are joined with single spaces, split into tokens by the
    tokenizer, and every token found in the stopword collection is removed.

    Parameters
    ----------
    text : iterable of str
        Pieces of text to process (they are joined with ' ' before tokenizing).
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the module-level ``all_stopwords``.
    word_tokenizer : object with a ``tokenize(str)`` method, optional
        Defaults to the module-level ``tokenizer``.

    Returns
    -------
    list of str
        The remaining tokens, in their original order (duplicates kept).
    """
    if stop_words is None:
        stop_words = all_stopwords
    if word_tokenizer is None:
        word_tokenizer = tokenizer

    # Hash-based lookup: testing membership in a list costs one scan of the
    # whole stopword list per token.
    stop_set = frozenset(stop_words)

    tokens = word_tokenizer.tokenize(' '.join(text))
    return [word for word in tokens if word not in stop_set]

## general

In [10]:
# Keep only the rows whose type is 'speech', renumbered from 0
df_script_genre_speech = (
    df_script_genre
    .loc[df_script_genre['type'] == 'speech']
    .reset_index(drop=True)
)

In [11]:
# Every speech line glued together, then split on single spaces
all_speech = ' '.join(df_script_genre_speech['text']).split(' ')

# Tokenize and drop the stopwords
all_clean_speech = clean_nlp(all_speech)

  app.launch_new_instance()


In [12]:
# Token frequencies over all the speech, most frequent first; show the top 20
most_recurrent_words = Counter(all_clean_speech)
sorted_most_recurrent_words = most_recurrent_words.most_common()
sorted_most_recurrent_words[:20]

[('know', 2101),
 ('get', 2050),
 ('one', 1824),
 ('like', 1691),
 ('back', 1640),
 ('go', 1456),
 ('got', 1270),
 ('see', 1181),
 ('right', 1129),
 ('us', 1112),
 ('going', 1102),
 ('look', 1095),
 ('come', 993),
 ('think', 963),
 ('want', 954),
 ('time', 952),
 ('let', 943),
 ('good', 910),
 ('oh', 899),
 ('looks', 875)]

In [13]:
# Part-of-speech tag the cleaned tokens and keep only those tagged exactly 'NN'
# (other tags, including 'NNS' and 'NNP', are filtered out)
tagged = nltk.pos_tag(all_clean_speech)
all_clean_speech_noun = [token for token, tag in tagged if tag == 'NN']

# Frequencies of the kept tokens, most frequent first; show the top 10
most_recurrent_noun = Counter(all_clean_speech_noun)
sorted_most_recurrent_noun = most_recurrent_noun.most_common()
sorted_most_recurrent_noun[:10]

[('time', 952),
 ('man', 865),
 ('way', 816),
 ('something', 716),
 ('jack', 686),
 ('look', 593),
 ('day', 540),
 ('thing', 500),
 ('life', 455),
 ('get', 429)]

## female speech

In [14]:
# Keep only the speech rows whose gender is 'F', renumbered from 0
df_script_genre_speech_female = (
    df_script_genre_speech
    .loc[df_script_genre_speech['gender'] == 'F']
    .reset_index(drop=True)
)

In [15]:
# Every speech line of the female characters glued together, then split on single spaces
all_speech_female = ' '.join(df_script_genre_speech_female['text']).split(' ')

# Tokenize and drop the stopwords
all_clean_speech_female = clean_nlp(all_speech_female)

# Token frequencies, most frequent first; show the top 20
# (rebinds most_recurrent_words / sorted_most_recurrent_words)
most_recurrent_words = Counter(all_clean_speech_female)
sorted_most_recurrent_words = most_recurrent_words.most_common()
sorted_most_recurrent_words[:20]

  app.launch_new_instance()


[('know', 498),
 ('get', 406),
 ('one', 397),
 ('like', 365),
 ('go', 337),
 ('back', 330),
 ('going', 269),
 ('right', 261),
 ('see', 251),
 ('look', 248),
 ('us', 244),
 ('come', 244),
 ('oh', 244),
 ('got', 241),
 ('want', 221),
 ('think', 219),
 ('time', 206),
 ('good', 200),
 ('let', 199),
 ('jack', 195)]

In [16]:
# Part-of-speech tag the female speech tokens and keep only those tagged exactly 'NN'
tagged = nltk.pos_tag(all_clean_speech_female)
all_clean_speech_noun = [token for token, tag in tagged if tag == 'NN']

# Frequencies of the kept tokens, most frequent first; show the top 10
most_recurrent_noun = Counter(all_clean_speech_noun)
sorted_most_recurrent_noun = most_recurrent_noun.most_common()
sorted_most_recurrent_noun[:10]

[('time', 206),
 ('jack', 178),
 ('something', 178),
 ('way', 174),
 ('look', 155),
 ('man', 136),
 ('life', 119),
 ('night', 110),
 ('day', 109),
 ('thing', 100)]

## male speech

In [17]:
# Keep only the speech rows whose gender is 'M', renumbered from 0
df_script_genre_speech_male = (
    df_script_genre_speech
    .loc[df_script_genre_speech['gender'] == 'M']
    .reset_index(drop=True)
)

In [18]:
# Every speech line of the male characters glued together, then split on single spaces
all_speech_male = ' '.join(df_script_genre_speech_male['text']).split(' ')

# Tokenize and drop the stopwords
all_clean_speech_male = clean_nlp(all_speech_male)

# Token frequencies, most frequent first; show the top 20
# (rebinds most_recurrent_words / sorted_most_recurrent_words)
most_recurrent_words = Counter(all_clean_speech_male)
sorted_most_recurrent_words = most_recurrent_words.most_common()
sorted_most_recurrent_words[:20]

  app.launch_new_instance()


[('get', 1463),
 ('know', 1452),
 ('one', 1209),
 ('like', 1138),
 ('back', 1096),
 ('go', 997),
 ('got', 919),
 ('see', 814),
 ('right', 770),
 ('going', 763),
 ('us', 748),
 ('look', 748),
 ('think', 678),
 ('let', 670),
 ('come', 667),
 ('time', 666),
 ('want', 649),
 ('good', 635),
 ('well', 613),
 ('man', 609)]

In [19]:
# Part-of-speech tag the male speech tokens and keep only those tagged exactly 'NN'
tagged = nltk.pos_tag(all_clean_speech_male)
all_clean_speech_noun = [token for token, tag in tagged if tag == 'NN']

# Frequencies of the kept tokens, most frequent first; show the top 10
most_recurrent_noun = Counter(all_clean_speech_noun)
sorted_most_recurrent_noun = most_recurrent_noun.most_common()
sorted_most_recurrent_noun[:10]

[('time', 666),
 ('man', 609),
 ('way', 553),
 ('something', 474),
 ('jack', 437),
 ('look', 388),
 ('thing', 347),
 ('day', 335),
 ('world', 304),
 ('let', 298)]