# Loading the dataset chosen and cleaning it
For this project we decided to analyse a movie dataset, which can be found at this link: https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies  
It is updated daily and has over 1 million rows, so it was important to clean and filter it properly before proceeding with the rest of the project.

In [None]:
import pandas as pd
import numpy as np
import random

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


#### After checking the amount of missing values in most columns, we decided to filter the dataframe and drop the columns that would be useless to our analysis.
####  Sorting the dataframe by the revenue, so that the best sellers are on top
## Export the clean dataset
#### Further cleaning and taking care of the most obvious outliers

'''Preprocessing
## Data Loading:

Imported a cleaned dataset (clean_movies.csv) for analysis and modeling.
Feature Engineering:

Extracted release_year from release_date for temporal insights.
Created ROI (Return on Investment): ROI = (revenue − budget) / budget, to measure profitability.
Defined success (Classification Target): Binary label indicating whether a movie’s revenue exceeds the median.
Genre Encoding:

Split the genres column into individual genres using .str.split().
Encoded each genre as a binary feature (1 if the genre is present, 0 otherwise).
Feature and Target Selection:

For Regression:
Features: budget, runtime, and all encoded genres.
Target: revenue.
For Classification:
Features: Same as regression.
Target: success.
Data Splitting:
Split data into training (80%) and testing (20%) sets using train_test_split for both regression and classification tasks.'''

In [None]:
# Load the cleaned dataset
movies_df = pd.read_csv('../data/clean/clean_movies.csv')

# Preprocessing for regression and classification
movies_df['release_year'] = pd.to_datetime(movies_df['release_date']).dt.year
# NOTE: rows with budget == 0 produce inf (or NaN for 0/0) here; ROI is not used as a model feature below.
movies_df['ROI'] = (movies_df['revenue'] - movies_df['budget']) / movies_df['budget']
movies_df['success'] = (movies_df['revenue'] > movies_df['revenue'].median()).astype(int)  # Classification target

# Split the comma-separated genre string into a list of clean genre names.
# Tokens are stripped and empty ones dropped so that "Action" and " Action" are the same genre
# everywhere `genres_list` is used; a missing `genres` value becomes an empty list instead of NaN.
movies_df['genres_list'] = movies_df['genres'].fillna('').apply(
    lambda text: [name.strip() for name in text.split(',') if name.strip()]
)

# Encode genres for modeling.
# Sorting gives a stable column order from run to run (iteration order of a set of strings is not
# stable across interpreter sessions), which keeps the fitted models reproducible.
all_genres = sorted({genre for tokens in movies_df['genres_list'] for genre in tokens})
for genre in all_genres:
    # Exact membership in the token list, not a substring test on the raw string.
    # `genre=genre` binds the current loop value into the lambda.
    movies_df[genre] = movies_df['genres_list'].apply(lambda tokens, genre=genre: genre in tokens)

# Features and targets
X_regression = movies_df[['budget', 'runtime'] + all_genres]
y_regression = movies_df['revenue']

# Classification uses the same feature matrix; only the target differs
X_classification = X_regression
y_classification = movies_df['success']

# Split data: 80% train / 20% test, with a fixed seed so both tasks get the same row partition
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.2, random_state=42)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)


In [None]:
# Fit a random-forest classifier on the training split (fit() returns the estimator itself)
cls_model = RandomForestClassifier(random_state=42).fit(X_train_cls, y_train_cls)

# Score the held-out split: share of test rows whose label is predicted correctly
y_pred_cls = cls_model.predict(X_test_cls)
accuracy = accuracy_score(y_true=y_test_cls, y_pred=y_pred_cls)
print(f"Classification Model Accuracy: {accuracy}")


In [None]:
# Vectorize genres using TF-IDF.
# Each document is already a list of genre names, so a callable analyzer is used to hand the
# tokens straight to the vectorizer. Tokens are stripped (and empty ones dropped) so that
# "Action" and " Action" count as the same feature.
tfidf = TfidfVectorizer(analyzer=lambda genres: [g.strip() for g in genres if g.strip()])
genre_matrix = tfidf.fit_transform(movies_df['genres_list'])

# Find similar movies for a given movie
def recommend_movies(movie_title, n=5):
    """Return the `n` movies whose genre vectors are most similar to `movie_title`.

    Similarity is the cosine similarity between rows of the module-level
    `genre_matrix`. Rows of `genre_matrix` are assumed to line up positionally
    with the rows of `movies_df` (it must be rebuilt if `movies_df` is filtered
    or reloaded).

    Parameters:
        movie_title: exact value to look up in the `title` column; when several
            rows share the title, the first one is used.
        n: number of recommendations to return.

    Returns:
        A DataFrame with the `title` and `genres` columns of the recommended
        movies, most similar first.

    Raises:
        IndexError: if no row has the requested title.
    """
    # Work with row positions, not index labels, because the sparse matrix is positional
    matches = np.flatnonzero((movies_df['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in movies_df")
    position = matches[0]

    cosine_sim = cosine_similarity(genre_matrix[position], genre_matrix).flatten()

    # Rank by descending similarity, then remove the query movie explicitly. Many movies share
    # an identical genre set (similarity 1.0), so the query is not guaranteed to sort first.
    ranked = np.argsort(-cosine_sim, kind='stable')
    similar_indices = ranked[ranked != position][:n]
    return movies_df.iloc[similar_indices][['title', 'genres']]

# Example: print the default number of recommendations for the title 'Avatar'
print(recommend_movies('Avatar'))


## Test for randomization

In [None]:


# Parse `release_date` into datetimes; values that cannot be parsed become NaT
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], errors='coerce')

# Keep only the rows that still have a usable release date
movies_df = movies_df.dropna(subset=['release_date'])

# Derive the calendar year of release
movies_df['release_year'] = movies_df['release_date'].dt.year

# Abort when there is not a single usable release year to work with
if not ('release_year' in movies_df.columns and movies_df['release_year'].notnull().any()):
    raise ValueError("No valid release years available.")

# Top performers: movies whose revenue is strictly above the 90th percentile
top_movies = movies_df.loc[movies_df['revenue'].gt(movies_df['revenue'].quantile(0.9))]

# Ensure `top_movies` is not empty
if not top_movies.empty:
    # Break titles into individual words for analysis.
    # `explode()` turns a title that splits into no words (empty or whitespace-only) into NaN,
    # so NaN is dropped again after exploding; otherwise it could be drawn and rendered as "Nan".
    title_words = top_movies['title'].dropna().str.split().explode().dropna().tolist()

    if not title_words:
        raise ValueError("No title words available for generating new movie titles.")

    def generate_title():
        """Return a title made of three words drawn at random from `title_words`.

        Words are drawn independently, so the same word may appear more than once.
        The result is title-cased.
        """
        part1 = random.choice(title_words)
        part2 = random.choice(title_words)
        part3 = random.choice(title_words)
        return f"{part1} {part2} {part3}".title()

    # Calculate required variables
    # Most frequent genre(s) among the top movies; `mode()` returns every tied value
    top_genres = top_movies['genres'].str.split(', ').explode().mode().tolist()
    # int() truncates the mean towards zero
    average_runtime = int(top_movies['runtime'].mean())
    average_budget = int(top_movies['budget'].mean())
    # First of the most common release years
    popular_release_year = int(top_movies['release_year'].mode()[0])

    # Create the new movie title
    new_title = generate_title()

    # Synthesizing the new movie concept
    new_movie = {
        'title': new_title,
        'genres': ', '.join(top_genres),
        'runtime': f"{average_runtime} minutes",
        'budget': f"${average_budget:,}",
        'release_year': popular_release_year,
        'concept': f"A {', '.join(top_genres)} movie with a runtime of {average_runtime} minutes, "
                   f"produced on a budget of ${average_budget:,}, releasing in {popular_release_year}. "
                   f"This movie blends the most successful elements from top movies."
    }

    # Display the new movie concept with title
    print(new_movie)
else:
    print("No top-performing movies found to analyze.")


## Adding a Random Forest classifier (genre prediction), revenue prediction & title generation

In [None]:
# Ensure valid dates and release years
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], errors='coerce')
# .copy() makes the filtered frame independent, so the column assignments below write to
# movies_df itself and do not trigger chained-assignment warnings
movies_df = movies_df.dropna(subset=['release_date']).copy()
movies_df['release_year'] = movies_df['release_date'].dt.year

# Filter top-performing movies (revenue strictly above the median)
top_movies = movies_df[movies_df['revenue'] > movies_df['revenue'].quantile(0.5)]

# Extract genres as lists of clean names. Splitting on ',' and stripping handles both
# "A,B" and "A, B"; rows with a missing `genres` value stay NaN.
movies_df['genres_list'] = movies_df['genres'].str.split(',').apply(
    lambda parts: [p.strip() for p in parts if p.strip()] if isinstance(parts, list) else parts
)

# Sorted so the genre columns (and therefore the model targets) have the same order on every run;
# iteration order of a set of strings is not stable across interpreter sessions.
all_genres = sorted({genre for tokens in movies_df['genres_list'].dropna() for genre in tokens})
for genre in all_genres:
    # Exact membership in the token list rather than a substring test on the raw string;
    # rows without genres are encoded as False. `genre=genre` binds the current loop value.
    movies_df[genre] = movies_df['genres_list'].apply(
        lambda tokens, genre=genre: isinstance(tokens, list) and genre in tokens
    )

# Predict genres with a classifier: one target column per genre, three numeric inputs
# (missing inputs are replaced by 0)
X_genres = movies_df[['budget', 'runtime', 'release_year']].fillna(0)
y_genres = movies_df[all_genres]

# 80/20 train/test partition with a fixed seed
X_train_genres, X_test_genres, y_train_genres, y_test_genres = train_test_split(
    X_genres,
    y_genres,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator, so construction and training can be chained
genre_model = RandomForestClassifier(random_state=42).fit(X_train_genres, y_train_genres)
y_pred_genres = genre_model.predict(X_test_genres)

# The predictions are thresholded at 0.5 before being compared with the test targets
genre_accuracy = accuracy_score(y_test_genres, y_pred_genres > 0.5)
print(f"Genre Prediction Accuracy: {genre_accuracy}")

# Predict revenue with a regressor from budget, runtime and release year
# (missing inputs are replaced by 0)
X_revenue = movies_df[['budget', 'runtime', 'release_year']].fillna(0)
y_revenue = movies_df['revenue']

# 80/20 train/test partition with a fixed seed
X_train_rev, X_test_rev, y_train_rev, y_test_rev = train_test_split(
    X_revenue,
    y_revenue,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator, so construction and training can be chained
revenue_model = RandomForestRegressor(random_state=42).fit(X_train_rev, y_train_rev)
y_pred_rev = revenue_model.predict(X_test_rev)

# Mean absolute error on the held-out rows, in the same units as `revenue`
revenue_mae = mean_absolute_error(y_true=y_test_rev, y_pred=y_pred_rev)
print(f"Revenue Prediction MAE: {revenue_mae}")

# Generate Titles Using NLP
# Learn a word vocabulary from the titles of the top-performing movies and, in the same pass,
# build the title/word count matrix
top_titles = top_movies['title'].dropna().tolist()
vectorizer = CountVectorizer()
title_matrix = vectorizer.fit_transform(top_titles)

def generate_title_nlp():
    """Build a three-word title from the vocabulary learned by `vectorizer`.

    Each word is drawn independently with `random.choice`, so repeats are
    possible, and each word is title-cased before the words are joined with
    single spaces.
    """
    vocabulary = vectorizer.get_feature_names_out()
    picks = [random.choice(vocabulary).title() for _ in range(3)]
    return " ".join(picks)

# Generate a new movie concept
new_title = generate_title_nlp()

# Inputs of the hypothetical movie, defined once so the values fed to the models and the
# values shown in the description cannot drift apart
planned_budget = 200000000
planned_runtime = 120
planned_release_year = 2025

# One-row frame with the same column names the models were fitted on; predicting from a bare
# list makes scikit-learn warn about missing feature names
concept_features = pd.DataFrame(
    [[planned_budget, planned_runtime, planned_release_year]],
    columns=['budget', 'runtime', 'release_year'],
)
predicted_genres = genre_model.predict(concept_features)[0]
predicted_revenue = revenue_model.predict(concept_features)[0]

# Keep the genres whose predicted flag is set; `all_genres` is in the same order as the
# target columns the genre model was trained on
predicted_genre_list = [genre for genre, flag in zip(all_genres, predicted_genres) if flag > 0.5]

# NOTE(review): the concept text states a budget of "€10" while the `budget` field and the model
# input use $200,000,000 — left untouched as it looks intentional; confirm with the authors.
new_movie = {
    'title': new_title,
    'genres': ', '.join(predicted_genre_list),
    'runtime': f"{planned_runtime} minutes",
    'budget': f"${planned_budget:,}",
    'release_year': planned_release_year,
    'predicted_revenue': f"${int(predicted_revenue):,}",
    'concept': f"A(n) {', '.join(predicted_genre_list)} movie with a runtime of {planned_runtime} minutes, "
               f"produced on a budget of €10, releasing in {planned_release_year} with "
               f" renowned directors - Clara Capacha & Constanza TT. "
               f"Famous leading man:Vasco Freire & leading lady:Friederike 'Fritzi'"
               f" & camerawork by Fabby Foo."
               f" This movie is predicted to generate ${int(predicted_revenue):,} in revenue."
}

print(new_movie)


In [None]:

# Reload the cleaned dataset and build a scaled, genre-encoded version of it
movies_df = pd.read_csv('../data/clean/clean_movies.csv')
numeric_data = ['vote_average', 'revenue', 'runtime', 'budget']
print(movies_df.columns)

# Rescale the numeric columns with min-max scaling
scaler = MinMaxScaler()
movies_df[numeric_data] = scaler.fit_transform(movies_df[numeric_data])

# Encode the `adult` flag as two 0/1 columns.
# NOTE(review): the `~` negation assumes `adult` is a boolean column — confirm the dtype in the CSV.
movies_df['adult_true'] = movies_df['adult'].astype(int)
movies_df['adult_false'] = (~movies_df['adult']).astype(int)
movies_df = movies_df.drop(columns=['adult'])

# Create month and day columns (the date column is parsed once and reused)
release_dates = pd.to_datetime(movies_df['release_date'])
movies_df['release_month'] = release_dates.dt.month
movies_df['release_day'] = release_dates.dt.day
# Drop the original 'release_date' column now that it is no longer needed
movies_df.drop(columns=['release_date'], inplace=True)

# Split the genres column into one indicator column per genre
genres_split = movies_df['genres'].str.get_dummies(sep=',')
# Normalize the *genre* column names (strip spaces, lowercase). These names belong on
# `genres_split`; assigning them to `movies_df.columns` would overwrite the dataset's own
# column names (or fail on a length mismatch).
genres_split.columns = genres_split.columns.str.strip().str.lower()

# After normalization, variants such as " action" and "action" share one name:
# sum the duplicated columns into a single column per genre
genres_split_combined = genres_split.T.groupby(level=0).sum().T

# Final frame: every non-genre column plus one column per genre
movies_df_with_genres = pd.concat([movies_df.drop(columns='genres'), genres_split_combined], axis=1)
# Count of each genre
genre_counts = genres_split_combined.sum().sort_values(ascending=False)

# Show the updated DataFrame
# NOTE(review): this prints `movies_df`, which does not contain the genre columns;
# `movies_df_with_genres` may be what was meant — confirm.
print(movies_df.head())

#movies_df.to_csv('scaled_movies.csv', index=False)