In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_movie_script(title, timeout=10):
    """Fetch a movie script from IMSDb and return it as plain text.

    The title is turned into a script URL by replacing spaces with hyphens
    and percent-encoding the result. The text is taken from the first
    ``<pre>`` tag of the page after every ``<b>`` element inside it has been
    removed.

    Parameters
    ----------
    title : str
        Movie title used to build the URL (spaces are allowed).
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    str
        The script text, or a human-readable error message when the page
        cannot be fetched or parsed. Exceptions raised while building the
        URL, fetching or parsing are caught and reported in that message.
    """
    # Local import: only this function needs URL quoting.
    from urllib.parse import quote

    try:
        # Built inside the try so a non-string title (e.g. NaN coming from a
        # pandas column) is reported as an error string instead of crashing
        # a whole .apply() run.
        slug = quote(title.replace(' ', '-'), safe='')
        url = f"https://imsdb.com/scripts/{slug}.html"

        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return "Script not found or error loading page."

        soup = BeautifulSoup(response.content, 'html.parser')
        script_tag = soup.find('pre')
        if not script_tag:
            return "Script text not found in the expected format."

        # Drop every <b> element before extracting the remaining text.
        for b_tag in script_tag.find_all('b'):
            b_tag.decompose()
        return script_tag.get_text()
    except Exception as e:
        return f"An error occurred: {e}"

# Example usage: fetch one script and show only a short preview, because a
# full screenplay would flood the cell output. `script` still holds the
# complete text for later cells.
script = scrape_movie_script("12 Years a Slave")
print(script[:1000])



In [None]:

# Column labels for the header-less, tab-separated metadata file.
metadata_columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Movie box office revenue',
    'Movie runtime',
    'Movie languages',
    'Movie countries',
    'Movie genres',
]
movie_metadata = pd.read_csv('MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
movie_metadata.columns = metadata_columns

# One scrape call per row: the result (script text or error message) is
# stored next to the movie it belongs to.
movie_metadata['script'] = [
    scrape_movie_script(movie_name) for movie_name in movie_metadata['Movie name']
]

    


In [60]:
# Write every script to scripts.txt: movie name, script text, then a line of
# dashes separating it from the next entry.
separator = '-' * 100
# Explicit UTF-8 so the dump does not depend on the platform default
# encoding, which may be unable to represent characters found in scripts.
with open('scripts.txt', 'w', encoding='utf-8') as f:
    for movie_name, script_text in zip(movie_metadata['Movie name'], movie_metadata['script']):
        f.write(f"{movie_name}\n")
        f.write(f"{script_text}\n\n")
        f.write(f"{separator}\n\n")

# Also store the (movie name, script) pairs as a CSV file.
movie_metadata[['Movie name', 'script']].to_csv('scripts.csv', index=False)




In [96]:
# Reload the (movie name, script) pairs saved to scripts.csv and report how
# many rows were read.
final_movie_metadata = pd.read_csv('scripts.csv')
print(len(final_movie_metadata))


313


In [97]:
#new_df = read_files_and_merge(df_merged, './data/movie_scripts')
import os




folder_path = 'movie_scripts'
script_prefix = 'Script_'
script_suffix = '.txt'

# Maps the movie name taken from the file name to the full text of the file.
scripts = {}
if not os.path.isdir(folder_path):
    # Only report the problem; listing a missing folder would raise, so the
    # loop is skipped and `scripts` stays empty.
    print(f"The folder '{folder_path}' does not exist.")
else:
    for filename in os.listdir(folder_path):
        # Only files named Script_<movie name>.txt are script files.
        if filename.startswith(script_prefix) and filename.endswith(script_suffix):
            movie_name_from_filename = filename[len(script_prefix):-len(script_suffix)]
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                scripts[movie_name_from_filename] = file.read()


      

In [None]:
print(len(final_movie_metadata))

# Lower-case the keys first so the membership test below is case-insensitive.
# Otherwise a scraped entry differing only in case from an already loaded
# script would be added and then overwrite that script once keys are
# lower-cased.
scripts = {name.lower(): text for name, text in scripts.items()}

# Add every scraped script whose movie is not already present in `scripts`.
for movie_name, script_text in zip(final_movie_metadata['Movie name'], final_movie_metadata['script']):
    if not isinstance(movie_name, str):
        # A missing name is read back from CSV as NaN; it cannot be a key.
        continue
    key = movie_name.lower()
    if key not in scripts:
        scripts[key] = script_text
        print(f"Successfully added script for {movie_name}")

print(len(scripts))

In [3]:
# Read the header-less, tab-separated metadata file and label its columns.
movie_metadata = pd.read_csv('MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
movie_metadata.columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Movie box office revenue',
    'Movie runtime',
    'Movie languages',
    'Movie countries',
    'Movie genres',
]

# Store movie names in lower case.
lowercase_names = movie_metadata['Movie name'].str.lower()
movie_metadata['Movie name'] = lowercase_names


In [None]:

# Build the set of known titles once: set membership is constant-time, whereas
# converting the column to a list inside the loop rescans it for every script.
known_movie_names = set(movie_metadata['Movie name'])

# Iterate over a snapshot of the keys because entries are removed in the loop.
for movie in list(scripts.keys()):
    lowered = movie.lower()
    if lowered not in known_movie_names:
        # Pop the actual key (not its lower-cased form) so a key that is not
        # already lower case cannot raise KeyError.
        scripts.pop(movie)
        print(f"Successfully removed script for {lowered}")
print(len(scripts))

In [None]:


import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Imported here so this cell does not depend on a later cell having run.
from sentence_transformers import SentenceTransformer

# Load the 'all-MiniLM-L6-v2' sentence-embedding model. Binding `model` to the
# bare name string would make `model.encode` resolve to `str.encode`, which
# rejects the `convert_to_tensor` argument.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Split the script fetched earlier into sentences.
script_sentences = sent_tokenize(script)
print(len(script_sentences))

# Compute one embedding per sentence of the script.
script_sentence_embeddings = model.encode(script_sentences, convert_to_tensor=True)



In [None]:

# Read the saved per-movie table from disk and preview its first five rows.
movie_metadata_script_bow_bow = pd.read_csv(filepath_or_buffer='movie_metadata_script_bow_bow.csv')
movie_metadata_script_bow_bow.head(n=5)



In [5]:
# Keep only the title and script columns. `.copy()` makes this an independent
# frame, so adding columns to it later neither triggers pandas'
# SettingWithCopyWarning nor depends on view-vs-copy behaviour.
movie_scripts = movie_metadata_script_bow_bow[['movie_name_script', 'script']].copy()

In [None]:


from sentence_transformers import SentenceTransformer
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the embedding model and move it to the selected device.
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# `movie_scripts` is built from the 'movie_name_script' and 'script' columns
# only, so a 'sentences' column does not exist unless another cell added it.
# NOTE(review): presumably it was produced by sentence-tokenising 'script' in
# a cell that is no longer present - confirm this matches the original data.
if 'sentences' not in movie_scripts.columns:
    from nltk.tokenize import sent_tokenize
    movie_scripts = movie_scripts.assign(
        sentences=movie_scripts['script'].fillna('').apply(sent_tokenize)
    )

# Concatenate every 10 sentences into one chunk. `.assign` returns a new frame
# instead of writing a column into what may be a slice of another frame.
movie_scripts = movie_scripts.assign(
    sentences_10=movie_scripts['sentences'].apply(
        lambda x: [' '.join(x[i:i+10]) for i in range(0, len(x), 10)]
    )
)

# Function to process in batches, adjusted for device
def process_in_batches(sentences, batch_size=32):
    """Encode text chunks with the global ``model``, ``batch_size`` at a time.

    Parameters
    ----------
    sentences : list of str
        Texts to encode.
    batch_size : int, optional
        Number of texts passed to ``model.encode`` per call.

    Returns
    -------
    list
        One embedding per input text, in input order, moved to the CPU.
        An empty input yields an empty list.
    """
    embeddings = []
    total = len(sentences)
    for start in range(0, total, batch_size):
        batch = sentences[start:start + batch_size]
        batch_embeddings = model.encode(batch, convert_to_tensor=True)
        # The result is only ever moved to the CPU, so no extra transfer to
        # the model's device is needed first. extend() iterates over the
        # batch, appending each embedding individually.
        embeddings.extend(batch_embeddings.cpu())
        # Report how many texts are done so far, not the batch start index.
        print(f"Processed {min(start + batch_size, total)} of {total} sentences")
    return embeddings









In [None]:
# Encode each movie's list of 10-sentence chunks; the result holds one list of
# embeddings per movie.
chunked_scripts = movie_scripts['sentences_10']
movie_embeddings = chunked_scripts.apply(process_in_batches)

In [31]:
# Load the embeddings saved on disk and take the movie-name column from the
# per-movie table.
movie_embeddings = pd.read_csv('movie_embeddings.csv')
movie_names = movie_metadata_script_bow_bow.loc[:, 'movie_name_script']

In [None]:
# Show the shape explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(movie_embeddings.shape)

# Convert movie_embeddings to a numpy array. np.asarray accepts both a
# DataFrame and an ndarray, so re-running this cell does not fail.
movie_embeddings = np.asarray(movie_embeddings)
movie_embeddings[0]


In [None]:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def _mean_lookahead_similarity(embeddings, lookahead=3):
    """Average cosine similarity between each embedding and its successors.

    For every position that has ``lookahead`` following embeddings, the cosine
    similarity to each of those followers is computed; the mean over all such
    pairs is returned.

    Parameters
    ----------
    embeddings : sequence
        Embeddings of one movie, in script order. Each item is flattened to
        a 1-D float vector.
    lookahead : int, optional
        Number of following embeddings each one is compared with.

    Returns
    -------
    float
        The mean similarity, or NaN when there are fewer than
        ``lookahead + 1`` embeddings (no complete window exists).
    """
    rows = [np.asarray(item, dtype=float).reshape(-1) for item in embeddings]
    n_windows = len(rows) - lookahead
    if n_windows <= 0:
        return float('nan')

    vectors = np.vstack(rows)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # An all-zero embedding gets similarity 0 instead of a division by zero.
    norms[norms == 0] = 1.0
    unit = vectors / norms

    # Row-wise dot products of unit vectors are cosine similarities; one
    # array per offset 1..lookahead, each of length n_windows.
    similarities = [
        np.sum(unit[:n_windows] * unit[offset:offset + n_windows], axis=1)
        for offset in range(1, lookahead + 1)
    ]
    # Every window contributes the same number of pairs, so the overall mean
    # equals the mean of the per-window averages.
    return float(np.mean(similarities))


# NOTE(review): each element of `movie_embeddings` is treated as the sequence
# of embeddings of one movie - confirm that the loaded data has this layout.
movie_complexity_scores = []

# One score per movie, in the same order as `movie_names`.
for movie_idx, movie in enumerate(movie_embeddings):
    print(f"Processing movie {movie_names[movie_idx]}")
    movie_complexity_scores.append(_mean_lookahead_similarity(movie))

    
# movie_complexity_scores contains the complexity scores for each movie



In [None]:

#for each year, compute the average complexity score of the movies and plot it
import matplotlib.pyplot as plt
import seaborn as sns
# One row per movie: its name and its complexity score.
complexity_scores = pd.DataFrame({'movie_name': movie_names, 'complexity_score': movie_complexity_scores})

# Reload the header-less, tab-separated metadata to obtain release dates.
movie_metadata = pd.read_csv('MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
movie_metadata.columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue', 'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres']

# Join on the lower-cased movie name.
# NOTE(review): names are not guaranteed unique in the metadata, so a score
# can be matched to several release dates - verify.
complexity_scores['Movie name'] = complexity_scores['movie_name'].str.lower()
movie_metadata['Movie name'] = movie_metadata['Movie name'].str.lower()
complexity_scores = complexity_scores.merge(movie_metadata[['Movie name', 'Movie release date']], on='Movie name')

# Take the leading four characters as the year. This works for 'YYYY' as well
# as 'YYYY-MM-DD' strings and avoids pd.to_datetime failing on mixed-format or
# out-of-range dates; anything that is not a number becomes NaN and is dropped.
release_year = pd.to_numeric(
    complexity_scores['Movie release date'].astype(str).str[:4], errors='coerce'
)

# For each year, compute the average complexity score. Only the score column
# is averaged: taking the mean of the text columns is an error in recent
# pandas versions.
complexity_scores = (
    complexity_scores
    .assign(year=release_year)
    .dropna(subset=['year'])
    .astype({'year': int})
    .groupby('year')['complexity_score']
    .mean()
    .reset_index()
)

# Plot the complexity score for each year.
plt.figure(figsize=(10, 8))
sns.lineplot(data=complexity_scores, x='year', y='complexity_score')
plt.title("Average complexity score of movies per year")
plt.show()



In [None]:

# Width, in years, of the bins the yearly averages are grouped into.
bin_width = 3

# Round each year down to the start of its bin, then average the scores that
# fall into the same bin.
# NOTE(review): if `complexity_scores` already holds one average per year,
# this is an unweighted mean of yearly means - confirm that is intended.
binned_scores = complexity_scores.assign(
    year=complexity_scores['year'] - complexity_scores['year'] % bin_width
)
complexity_scores = binned_scores.groupby('year')['complexity_score'].mean().reset_index()

plt.figure(figsize=(10, 8))
sns.lineplot(data=complexity_scores, x='year', y='complexity_score')
# The title names the bin width: the x axis no longer shows single years.
plt.title(f"Average complexity score of movies per {bin_width}-year period")
plt.show()




In [None]:
window_size = 5  # Set the window size for moving average

# Collapse to one mean per year first (a no-op when the frame already has one
# row per year) so the rolling window runs over consecutive years in
# chronological order.
yearly_mean = complexity_scores.groupby('year')['complexity_score'].mean().sort_index()
rolling_window = yearly_mean.rolling(window=window_size)
moving_avg = rolling_window.mean()

# Keep the per-row moving-average column, looked up by each row's year.
complexity_scores['moving_avg_complexity'] = complexity_scores['year'].map(moving_avg)

# Moving average and rolling standard deviation of the yearly averages. A
# per-year standard deviation would be NaN whenever a year has a single row,
# so the spread is measured across the same window as the average.
yearly_stats = pd.DataFrame({
    'year': yearly_mean.index,
    'avg_complexity': moving_avg.to_numpy(),
    'std_complexity': rolling_window.std().to_numpy(),
})

# Plot the moving average with a band of +/- one rolling standard deviation.
plt.figure(figsize=(10, 8))
sns.lineplot(data=yearly_stats, x='year', y='avg_complexity')
plt.fill_between(
    yearly_stats['year'],
    yearly_stats['avg_complexity'] - yearly_stats['std_complexity'],
    yearly_stats['avg_complexity'] + yearly_stats['std_complexity'],
    alpha=0.3,
)
plt.title("Average complexity score of movies per year with Moving Average and Confidence Intervals")
plt.show()


In [None]:
# Create DataFrame for complexity scores: one row per movie.
complexity_scores = pd.DataFrame({'movie_name': movie_names, 'complexity_score': movie_complexity_scores})

# Reload the header-less, tab-separated metadata to obtain release dates.
movie_metadata = pd.read_csv('MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
movie_metadata.columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue', 'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres']

# Join on the lower-cased movie name.
# NOTE(review): names are not guaranteed unique in the metadata, so a score
# can be matched to several release dates - verify.
complexity_scores['Movie name'] = complexity_scores['movie_name'].str.lower()
movie_metadata['Movie name'] = movie_metadata['Movie name'].str.lower()
complexity_scores = complexity_scores.merge(movie_metadata[['Movie name', 'Movie release date']], on='Movie name')

# Take the leading four characters as the year. This works for 'YYYY' as well
# as 'YYYY-MM-DD' strings and avoids pd.to_datetime failing on mixed-format or
# out-of-range dates; anything that is not a number becomes NaN.
complexity_scores['year'] = pd.to_numeric(
    complexity_scores['Movie release date'].astype(str).str[:4], errors='coerce'
)

# Aggregate per year BEFORE smoothing: a rolling window over the per-movie
# rows would average movies in merge order, not in chronological order.
window_size = 5  # Set the window size for moving average
per_year = complexity_scores.groupby('year')['complexity_score'].agg(['mean', 'std']).sort_index()
smoothed_mean = per_year['mean'].rolling(window=window_size).mean()
# Years with a single movie have no standard deviation; min_periods=1 lets the
# window use whichever years in it do have one.
smoothed_std = per_year['std'].rolling(window=window_size, min_periods=1).mean()

# Keep the per-movie moving-average column, looked up by each movie's year.
complexity_scores['moving_avg_complexity'] = complexity_scores['year'].map(smoothed_mean)

yearly_stats = pd.DataFrame({
    'year': per_year.index,
    'avg_complexity': smoothed_mean.to_numpy(),
    'std_complexity': smoothed_std.to_numpy(),
})

# Plot the moving average with a band of +/- one (smoothed) within-year
# standard deviation.
plt.figure(figsize=(10, 8))
sns.lineplot(data=yearly_stats, x='year', y='avg_complexity')
plt.fill_between(
    yearly_stats['year'],
    yearly_stats['avg_complexity'] - yearly_stats['std_complexity'],
    yearly_stats['avg_complexity'] + yearly_stats['std_complexity'],
    alpha=0.3,
)
plt.title("Average complexity score of movies per year with Moving Average and Confidence Intervals")
plt.show()