#### Data preparation

We extract the plot summaries and store each one as a `.txt` file so that it can be run through the new CoreNLP pipeline.

In [103]:
from load_data import *
from coreNLP_analysis import *
from extraction import *


# Load the two tables this notebook works from, using the project's loader
# helpers (made available by the star imports above; presumably defined in
# load_data -- TODO confirm).
# movie_df is later handed to get_plots together with the genre list.
movie_df = load_movie_df()
# plot_df is indexed below by its 'Wikipedia ID' and 'Summary' columns.
plot_df = load_plot_df()

Remove the romance plot summaries which were already processed through the pipeline.

In [108]:
# Genre labels that identify a movie as "romance" for this analysis.
romance_genres = [
    'Romantic comedy',
    'Romance Film',
    'Romantic drama',
    'Romantic fantasy',
    'Romantic thriller',
]

# Plot summaries of the romance movies, selected by the project helper
# get_plots (presumably filtering movie_df on the genres above -- TODO confirm).
rom_com_plots = get_plots(romance_genres, movie_df, plot_df)

# Everything else: keep only the plots whose Wikipedia ID is not a romance one.
split_df = plot_df.loc[
    ~plot_df['Wikipedia ID'].isin(rom_com_plots['Wikipedia ID'])
]

In [109]:
# Report how many distinct movies fall on each side of the split.
# nunique(dropna=False) counts every distinct value, NaN included, which is
# the same number as len(series.unique()).
print('Number of non-romance movie plots: ', split_df['Wikipedia ID'].nunique(dropna=False))
print('Number of romance movie plots: ', rom_com_plots['Wikipedia ID'].nunique(dropna=False))

Number of non-romance movie plots:  35049
Number of romance movie plots:  7254


In [110]:
# Read the description tables for non-romance and romance movies.
# Both files are tab-separated and carry their index in the first column.
descriptions = pd.read_csv(
    'Data/CoreNLP/descriptions.csv',
    sep='\t',
    index_col=0,
)
romance_descriptions = pd.read_csv(
    'Data/CoreNLP/romance_descriptions.csv',
    sep='\t',
    index_col=0,
)

# Number of distinct movie ids present in each table.
print('Number of non-romance output IDs:', descriptions['movie_id'].nunique(dropna=False))
print('Number of romance output IDS: ', romance_descriptions['movie_id'].nunique(dropna=False))

Number of non-romance output IDs: 29198
Number of romance output IDS:  6662


In [111]:
# Distinct movie ids that appear in the plot tables (romance / non-romance).
romance_plots = rom_com_plots['Wikipedia ID'].unique()
non_romance_plots = split_df['Wikipedia ID'].unique()

# Distinct movie ids that appear in the description tables, cast to int.
romance_outputs = romance_descriptions['movie_id'].unique().astype(int)
non_romance_outputs = descriptions['movie_id'].unique().astype(int)

# Combined id arrays: romance ids first, followed by the non-romance ids.
plots = np.concatenate((romance_plots, non_romance_plots))
outputs = np.concatenate((romance_outputs, non_romance_outputs))


In [114]:
# Ids that have a plot summary but no corresponding description, computed
# separately for the romance and the non-romance sets.
missing_romance = np.setdiff1d(romance_plots, romance_outputs)
missing_non_romance = np.setdiff1d(non_romance_plots, non_romance_outputs)

# Report the size of each gap.
print('Number of romance movie plots missing from romance descriptions: ', len(missing_romance))
print('Number of non-romance movie plots missing from non-romance descriptions: ', len(missing_non_romance))

Number of romance movie plots missing from romance descriptions:  592
Number of non-romance movie plots missing from non-romance descriptions:  5851


Split the remaining plot summaries into 4 parts. Change the value of `whoami` to your name (`alex`, `antoine`, `hugo` or `marg`).

In [None]:
# Split the remaining plot summaries into four contiguous quarters, one per
# team member. Integer division gives the same cut points as int(len / k).
n_plots = len(split_df)
alex = split_df.iloc[:n_plots // 4]
antoine = split_df.iloc[n_plots // 4:n_plots // 2]
hugo = split_df.iloc[n_plots // 2:3 * n_plots // 4]
marg = split_df.iloc[3 * n_plots // 4:]

# Change whoami to your name
whoami = antoine

# Create the output directory once, up front, instead of re-checking it on
# every row; exist_ok makes re-running the cell safe.
os.makedirs('Plots/', exist_ok=True)

# Store each plot summary as .txt file, named after its Wikipedia ID
for index, row in whoami.iterrows():
    # Check the summary *before* opening the file: opening in 'w' mode first
    # would leave an empty .txt behind for rows whose summary is not text
    # (for example a missing value).
    if not isinstance(row['Summary'], str):
        continue
    # The with-block closes the file; no explicit close() is needed.
    with open("Plots/" + "{}.txt".format(row['Wikipedia ID']), 'w', encoding='utf8') as f:
        f.write(row['Summary'])