In [None]:
import pandas as pd

# Visualization librairies
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
import matplotlib.ticker as mtick
from matplotlib.lines import Line2D

#Calculating libraries
from scipy.stats import bootstrap
import numpy as np
from scipy import stats
import scipy as sp
import matplotlib.dates as md

#statistical librairies
from os import stat

# Others
from functools import partial
from ast import literal_eval

# Import libraries
import requests
from bs4 import BeautifulSoup

In [None]:
character=pd.read_table('./data/character.metadata.tsv',header=None)
movies=pd.read_table('./data/movie.metadata.tsv',header=None)
names=pd.read_csv('./data/name.clusters.txt',sep="\t",header=None)
summaries=pd.read_csv('./data/plot_summaries.txt',sep="\t",header=None)
tvtropes=pd.read_csv('./data/tvtropes.clusters.txt',sep="\t",header=None)

In [None]:
character=character.rename(columns={0:'wikipedia_movie_id',1:'Freebase_movie_ID',3:'character_name',4:'Actor_DOB',5:'Actor_gender',6:'actor_height',7:'Actor_etnicity',8:'Actor_name',9:'Actor_age_at_movie_release',10:'Freebase_character_map'})
movies.rename(columns={0:'wikipedia_movie_id',1:'Freebase_movie_ID',2:'Movie_name',3:'Movie_release',4:'Box_office_revenue',5:'Movie_runtime',6:'Movie_language',7:'Movie_country',8:'Movie_genre'},inplace=True)
names.rename(columns={0:'Character_names',1:'Freebase_ID'},inplace=True)
tvtropes.rename(columns={0:'Character_type',1:'Freebase_ID'},inplace=True)
summaries.rename(columns={0:'wikipedia_movie_id',1:'Plot_summaries'},inplace=True)

In [None]:
tvtropes["dict_ID"]=[literal_eval(i) for i in tvtropes.Freebase_ID]
tvtropes=pd.concat([tvtropes.drop(['dict_ID'], axis=1), tvtropes
                    ['dict_ID'].apply(pd.Series)], axis=1)

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer, LancasterStemmer
from nltk.tokenize import word_tokenize
import re

stemmer = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

def stem_string(string):
    words = word_tokenize(string)
    lemmatized_words = [stemmer.stem(word) for word in words]
    return lemmatized_words

print(stem_string('biographical biography romantic romance'))

def clean_string(string):

    clean_s = string.lower()
    clean_s = clean_s.replace('/', ' ')

    clean_s = re.sub(r'-(?!.*(fi|white))', ' ', clean_s)

    clean_s = re.sub(r'((?<=sci) (?=fi))|((?<=science) (?=fiction))', '-', clean_s)
    # remove everything except alphabets 
    clean_s = re.sub("[^a-zA-Z-]"," ",clean_s) 
    # remove whitespaces 
    clean_s = ' '.join(clean_s.split()) 
    return clean_s

def clean(var):

    out = ''

    if type(var) == list:
        out = var.copy()
        for i,s in enumerate(var):
            out[i] = clean_string(s)
    else :
        if type(var) == dict:
            out = var.copy()
            for key, value in var.items():
                out[clean_string(key)] = clean(value)
        else:
            out = clean_string(var)
    return out

In [None]:
import json

from collections import defaultdict

movies['Movie_release'] = pd.to_datetime(movies['Movie_release'], errors = 'coerce')
sorted_movies = movies.sort_values("Movie_release", ascending=False).dropna(subset=['Movie_release'])

genres_per_year = sorted_movies[['Movie_release', 'Movie_name', 'Movie_genre']]
genres_per_year['Movie_genre'] = genres_per_year.Movie_genre.apply(json.loads).apply(lambda x: list(x.values()))

genres_per_year.head(20)

In [None]:


movies['Movie_release'] = pd.to_datetime(movies['Movie_release'], errors = 'coerce')
sorted_movies = movies.sort_values("Movie_release", ascending=False).dropna(subset=['Movie_release'])

genres_per_year = sorted_movies[['Movie_release', 'Movie_name', 'Movie_genre']]
genres_per_year['Movie_genre'] = genres_per_year.Movie_genre.apply(json.loads).apply(lambda x: clean(list(x.values())))

occur = genres_per_year.copy()

occur.head(20)

In [None]:
URL = 'https://www.studiobinder.com/blog/movie-genres-list/'

# Make the request
r = requests.get(URL) # /ip: Returns the requester's IP Address.

print('Response status code: {0}\n'.format(r.status_code))
print('Response headers: {0}\n'.format(r.headers))

In [None]:
soup = BeautifulSoup(r.text, 'html.parser')
soup.h5.string

publications_wrappers = soup.find_all('div', {"data-css": ["tve-u-16d41491117", "tve-u-16d41ed6dc1", "tve-u-17317362207"]})

print('Total number of items: {0}'.format(len(publications_wrappers)))

genres = dict([])

for p in publications_wrappers[:-1]:    
    sub_g = p.find_all('h5')
    major_genres = p.find_all('h3')

    for g in major_genres :

        m_genre = clean(g.string.rsplit(' ', 1)[0])
        genres[m_genre] = []
        genres[m_genre].append(clean(m_genre))

    for s in sub_g :
        genres[m_genre].append(clean(s.string))

print(genres)

In [52]:
def clean_confusing_genres(all_genres):

    key_list = list(all_genres.keys())
    key_list = [stem_string(key) for key in key_list]


    values_list = []
    for items in list(all_genres.values()) : 
        values_list += items

    for major_genres, sub_genres in all_genres.items() :
        
        temp_list = values_list.copy()
        stemmed_major_genre = stem_string(major_genres)

        for s_g in sub_genres :
            temp_list.remove(s_g)

            for composed_genres in temp_list :
                if s_g in composed_genres :
                    composed_genres.remove(s_g)
        
        temp_list = [x for x in temp_list if x]

        for s_g in sub_genres :

            for other_genres in temp_list :
                if (other_genres[0] in s_g) and (stemmed_major_genre[0] in s_g) :
                    s_g.remove(other_genres[0])

                if "film" in s_g :
                    s_g.remove("film")

                if "fict" in s_g :
                    s_g.remove("fict")

                for m_genres in key_list :
                    if m_genres[0] in s_g :
                        s_g.remove(m_genres[0])
            
        all_genres[major_genres].append(stemmed_major_genre)   
                
        all_genres[major_genres] = [x for x in sub_genres if x]

In [53]:
from utils.genres import additional_wordsets
additional_wordsets = additional_wordsets

def mergeDictionary(dict_1, dict_2):
   dict_3 = {**dict_1, **dict_2}
   for key, value in dict_3.items():
       if key in dict_1 and key in dict_2:
               dict_3[key] = value + dict_1[key]
   return dict_3

def stem_dictionary(dict):
    for key, values in dict.items() :
        dict[key] = [stem_string(v) for v in values]

In [55]:
genre_map = mergeDictionary(genres, clean(additional_wordsets))
stem_dictionary(genre_map)
print(genre_map)
clean_confusing_genres(genre_map)
print('\n\n')
print(genre_map)

{'action': [['swashbuckl'], ['comb'], ['mart'], ['box'], ['sword'], ['act'], ['hero', 'bloodsh'], ['milit', 'act'], ['esp'], ['wux', 'act'], ['disast'], ['adv'], ['superhero']], 'animation': [['cartoon'], ['anim'], ['tradit'], ['stop', 'mot'], ['claym'], ['cutout'], ['comput', 'gen', 'imagery'], ['puppetry'], ['liv', 'act']], 'comedy': [['slapstick'], ['tragicomedy'], ['ston'], ['comedy'], ['act', 'comedy'], ['dark', 'comedy', 'black', 'comedy'], ['rom', 'comedy'], ['buddy', 'comedy'], ['road', 'comedy'], ['slapstick', 'comedy'], ['parody'], ['spoof'], ['satir'], ['sitcom'], ['sketch', 'comedy'], ['mocku'], ['prank']], 'crime': [['spy'], ['crimin'], ['prison'], ['slash'], ['whodunit'], ['crimin'], ['crim'], ['cap'], ['heist'], ['gangst'], ['cop', 'pol'], ['detect'], ['courtroom'], ['proc']], 'drama': [['tragicomedy'], ['american'], ['tragedy'], ['dram'], ['melodram'], ['teen', 'dram'], ['philosoph', 'dram'], ['med', 'dram'], ['leg', 'dram'], ['polit', 'dram'], ['anthropolog', 'dram'], 

In [92]:
global i 
i=0

def extract_genres(df_genres:object)->list:
    """ returns 1 if one pronoun in pronouns is in headline else 0

    Args:
        pronouns (list): list of pronouns
        headline (object): headline of the article

    Returns:
        list: 1 if there is a pronoun from pronouns in headline else 0 
    """
    genre_list = []
    input_genres = df_genres.copy()
    global i
    i +=1

    if(not i%100):
        print(str(i) +" out of " + str(len(genres_per_year.index)))

    merged_stem_strings = [stem_string(clean(g)) for g in input_genres]

    for major_genres, sub_genres in genre_map.items() : 
        for s_g in sub_genres :
            for item in merged_stem_strings :
                if s_g[0] in item :
                    genre_list.append(major_genres)

    if len(genre_list) == 0 :
        genre_list.append("Other")

    genre_list = list(set(genre_list))

    return genre_list

In [93]:
genres_per_year["Major_genres"]=genres_per_year["Movie_genre"].apply(
    partial(extract_genres))

100 out of 74838
200 out of 74838
300 out of 74838
400 out of 74838
500 out of 74838
600 out of 74838
700 out of 74838
800 out of 74838
900 out of 74838
1000 out of 74838
1100 out of 74838
1200 out of 74838
1300 out of 74838
1400 out of 74838
1500 out of 74838
1600 out of 74838
1700 out of 74838
1800 out of 74838
1900 out of 74838
2000 out of 74838
2100 out of 74838
2200 out of 74838
2300 out of 74838
2400 out of 74838
2500 out of 74838
2600 out of 74838
2700 out of 74838
2800 out of 74838
2900 out of 74838
3000 out of 74838
3100 out of 74838
3200 out of 74838
3300 out of 74838
3400 out of 74838
3500 out of 74838
3600 out of 74838
3700 out of 74838
3800 out of 74838
3900 out of 74838
4000 out of 74838
4100 out of 74838
4200 out of 74838
4300 out of 74838
4400 out of 74838
4500 out of 74838
4600 out of 74838
4700 out of 74838
4800 out of 74838
4900 out of 74838
5000 out of 74838
5100 out of 74838
5200 out of 74838
5300 out of 74838
5400 out of 74838
5500 out of 74838
5600 out of 74838
5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres_per_year["Major_genres"]=genres_per_year["Movie_genre"].apply(


In [94]:
pd.set_option('max_colwidth', 100)
genres_per_year.head(20)

Unnamed: 0,Movie_release,Movie_name,Movie_genre,Major_genres
34022,2016-06-08,Jeepers Creepers 4,"[thriller, horror]","[thriller, horror]"
30275,2016-03-18,Kung Fu Panda 3,[],[Other]
64862,2016-01-01,Battle Angel,"[thriller, science-fiction, action, romance film]","[science-fiction, thriller, romance, action]"
53855,2015-01-01,Total Dhamaal,[comedy film],[comedy]
8307,2015-01-01,"I, Robot 2",[science-fiction],[science-fiction]
40166,2015-01-01,Knight Rider: The Movie,"[crime fiction, science-fiction, action]","[crime, action, science-fiction]"
51429,2015-01-01,Avatar 2,"[science-fiction, action, fantasy, adventure]","[science-fiction, action, fantasy]"
70420,2014-12-30,Hosa Prema Purana,[romance film],[romance]
34454,2014-08-15,Appayya,[romance film],[romance]
52234,2014-07-01,Bulbul,[],[Other]
