Add plot summaries from a .txt file to the film dataframe, keeping only the films and characters that have a summary

In [8]:
import pandas as pd
import numpy as np

In [9]:
## load the cleaned film and character metadata; the first CSV column becomes the index
df_film, df_char = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("clean_data/film_data_clean.csv", "clean_data/char_data_clean.csv")
)

In [11]:
## order the character rows by ascending film ID
df_char = df_char.sort_values("film_id", ascending=True)

In [12]:
## example lookup: show every character row belonging to a single film ID
df_char.loc[df_char["film_id"].isin([31186339])]

Unnamed: 0,film_id,film_release_date,char_name,actor_gender,actor_name,actor_age
363240,31186339,2012,Seneca Crane,M,Wes Bentley,33.0
363239,31186339,2012,Cinna,M,Lenny Kravitz,47.0
363241,31186339,2012,Rue,F,Amandla Stenberg,13.0
363248,31186339,2012,Game Tech,M,Eric Hennig,22.0
363242,31186339,2012,Glimmer,F,Leven Rambin,21.0
363243,31186339,2012,Claudius Templesmith,M,Toby Jones,45.0
363244,31186339,2012,Marvel,M,Jack Quaid,19.0
363245,31186339,2012,Mrs. Everdeen,F,Paula Malcomson,42.0
363235,31186339,2012,Caesar Flickerman,M,Stanley Tucci,51.0
363247,31186339,2012,Katniss' Father,M,Phillip Troy Linger,47.0


In [13]:
## read the plot summaries file into a list with one string per line
## (readlines() keeps the trailing newline on each entry)
## the encoding is passed explicitly so decoding does not depend on the platform default
## NOTE(review): assumes the file is UTF-8 encoded -- confirm against the data source
with open("plot_summaries.txt", encoding="utf-8") as f:
    summaries = f.readlines()

In [14]:
## tokenize every summary line: lowercase it, then split on whitespace
## (str.split() with no separator returns [] for a blank or whitespace-only line)
summaries = [summary.lower().split() for summary in summaries]
## build tuples of (film ID, list of strings): the first token is the film ID,
## the remaining tokens are the summary words
## lines that produced no tokens are skipped so tokens[0] cannot raise IndexError
summary_tuple = [(int(tokens[0]), tokens[1:]) for tokens in summaries if tokens]

In [15]:
## collect the film ID (first element) of every (film ID, tokens) tuple
indices = list(map(int, (entry[0] for entry in summary_tuple)))

In [24]:
## syntax to find where film IDs in dataframe overlap with IDs in summaries
## matches = set(char_film_id).intersection(indices)

In [17]:
## keep only the rows whose film_id also appears among the summary film IDs,
## i.e. rows for which both metadata and a summary are available
film_match_df = df_film[df_film["film_id"].isin(indices)]
char_match_df = df_char[df_char["film_id"].isin(indices)]

In [18]:
## order both matched dataframes by ascending film ID
char_match_df = char_match_df.sort_values("film_id")
film_match_df = film_match_df.sort_values("film_id")

In [115]:
## turn the (film ID, tokens) tuples into a film ID -> tokens lookup
## (if a film ID occurs more than once, the last occurrence wins)
summaries = {film_id: tokens for film_id, tokens in summary_tuple}

In [116]:
## look up the summary tokens for each film_id, in the dataframe's row order
## (a film_id missing from the lookup raises KeyError)
summary_list = [summaries[film_id] for film_id in film_match_df["film_id"]]

In [117]:
## attach the summaries as a new "film_summaries" column
## assign() returns a new dataframe instead of mutating the filtered one in place,
## so re-running the cell gives the same result and no slice is written to
film_match_df = film_match_df.assign(film_summaries=summary_list)

In [118]:
## preview the first 10 film rows, including the new film_summaries column
film_match_df.head(n=10)

Unnamed: 0,film_id,film_title,film_release_date,film_language,film_country,film_summaries
61305,330,Actrius,1996,"{""/m/01m69"": ""Catalan language"", ""/m/06nm1"": ""...","{""/m/06mkj"": ""Spain""}","[in, order, to, prepare, the, role, of, an, im..."
14570,3217,Army of Darkness,1992,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[after, being, pulled, through, a, time, porta..."
2634,3333,The Birth of a Nation,1915,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","[the, film, follows, two, juxtaposed, families..."
48191,3746,Blade Runner,1982,"{""/m/03_9r"": ""Japanese Language"", ""/m/012w70"":...","{""/m/09c7w0"": ""United States of America"", ""/m/...","[{{hatnote}}, in, los, angeles,, november, 201..."
48856,3837,Blazing Saddles,1974,"{""/m/0880p"": ""Yiddish Language"", ""/m/02h40lc"":...","{""/m/09c7w0"": ""United States of America""}","[in, the, american, old, west, of, 1874,, cons..."
10005,3947,Blue Velvet,1986,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[jeffrey, beaumont, returns, to, his, logging,..."
13905,4227,Barry Lyndon,1975,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America"", ""/m/...","[:by, what, means, redmond, barry, acquired, t..."
23979,4231,Buffy the Vampire Slayer,1992,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[buffy, summers, is, introduced, as, a, stereo..."
73233,4560,Braveheart,1995,"{""/m/064_8sq"": ""French Language"", ""/m/04h9h"": ...","{""/m/09c7w0"": ""United States of America""}","[in, the, 13th, century,, after, several, year..."
55165,4726,Batman,1989,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America"", ""/m/...","[as, a, child,, bruce, wayne, witnesses, his, ..."


In [119]:
## preview the first 10 character rows of the matched dataframe
char_match_df.head(n=10)

Unnamed: 0,film_id,film_release_date,char_name,actor_gender,actor_name,actor_age
239901,3217,1992,Wiseman,M,Ian Abercrombie,56.0
239898,3217,1992,Possessed Witch,F,Patricia Tallman,35.0
239904,3217,1992,Lord Arthur,M,Marcus Gilbert,34.0
239897,3217,1992,Sheila,F,Embeth Davidtz,27.0
239899,3217,1992,Duke Henry the Red,M,Richard Grove,37.0
239896,3217,1992,Linda,F,Bridget Fonda,28.0
239900,3217,1992,Second Supportive Villager,M,Ted Raimi,26.0
239892,3217,1992,Fake shemp,M,Ivan Raimi,36.0
239893,3217,1992,Ash Williams,M,Bruce Campbell,34.0
239894,3217,1992,Evil Ash,M,Bruce Campbell,34.0


In [122]:
## write the matched film dataframe (index included) to the working directory
## NOTE(review): film_summaries holds Python lists, which are written to the CSV as
## their string form -- confirm that downstream readers parse them back
film_match_df.to_csv(path_or_buf="film_data_clean.csv")

In [123]:
## write the matched character dataframe (index included) to the working directory
char_match_df.to_csv(path_or_buf="char_data_clean.csv")