In [155]:
import pandas as pd
import numpy as np

# Load the combined movies CSV that every later cell in this notebook cleans.
movies_df = pd.read_csv(filepath_or_buffer="../data/output_data/combined_movies.csv")

In [156]:
# Discard the leading row of the frame (the first record is empty).
# Not idempotent: every re-run of this cell removes one more row.
movies_df = movies_df.iloc[1:, :]

In [157]:
# Remove the "Poster URL" column from the frame.
movies_df = movies_df.drop(labels="Poster URL", axis="columns")

In [158]:
# Keep only complete rows (no missing value in any column) whose Budget and
# Revenue are both non-zero.
movies_df = (
    movies_df
    .dropna()
    .loc[lambda frame: frame["Budget"].ne(0) & frame["Revenue"].ne(0)]
)

In [159]:
# Runtime is handled as text here: strip the " min" unit, then cast to int.
runtime_text = movies_df["Runtime"].str.replace(" min", "")
movies_df["Runtime"] = runtime_text.astype(int)

In [160]:
# Parse the release date once (unparseable values become NaT because of
# errors="coerce"), then replace the "Released" column with separate
# Year / Month / Day columns derived from it.
release_dates = pd.to_datetime(movies_df["Released"], errors="coerce")
movies_df = (
    movies_df
    .drop(columns=["Released"])
    .assign(
        Year=release_dates.dt.year,
        Month=release_dates.dt.month,
        Day=release_dates.dt.day,
    )
)

In [161]:
# One-hot encode Genre: the text is split on ", " and every distinct value
# becomes its own 0/1 indicator column; the original text column is then removed.
genre_dummies = movies_df["Genre"].str.get_dummies(sep=', ')
movies_df = pd.concat([movies_df, genre_dummies], axis="columns").drop(columns=["Genre"])

In [162]:
# One-hot encode the MPAA rating ("Rated"): one 0/1 indicator column per
# distinct value (split on ", "), after which the source column is removed.
mpaa_dummies = movies_df["Rated"].str.get_dummies(sep=', ')
movies_df = pd.concat([movies_df, mpaa_dummies], axis="columns").drop(columns=["Rated"])

In [None]:
# creating variable that indicates the movie was made by a big production company
#
# `companies` holds the studio names treated as "big". A movie is flagged with 1
# when any of these names occurs as a substring of its "Production Companies"
# text, otherwise 0. Each name is listed exactly once.
companies = [
    "Universal Pictures",
    "Paramount Pictures",
    "Warner Bros. Pictures",
    "Walt Disney Pictures",
    "Walt Disney Feature Animation",
    "Sony Pictures",
    "20th Century Fox",
    "Columbia Pictures",
    "Lionsgate",
    "Metro-Goldwyn-Mayer",
    "Marvel Studios",
    "Pixar Animation Studios",
    "DreamWorks Animation",
    "DreamWorks Pictures",
    "New Line Cinema",
    "Focus Features",
    "Orion Pictures",
    "United Artists",
    "A24",
    "Searchlight Pictures",
]
movies_df["Top_Production_Company"] = movies_df["Production Companies"].apply(
    lambda credited: int(any(company in credited for company in companies))
)

# dropping the original Production Companies column
movies_df = movies_df.drop(columns=["Production Companies"])

In [164]:
# Cast every free-text column to pandas' dedicated "string" dtype in one pass.
text_columns = ["Title", "Director", "Writer", "Actors", "Description", "Tagline", "Overview"]
movies_df = movies_df.astype({column: "string" for column in text_columns})

In [165]:
# converting IMDB votes to numeric: remove the commas from the text, then cast to int
votes_text = movies_df["IMDB Votes"].str.replace(",", "")
movies_df["IMDB Votes"] = votes_text.astype(int)

In [166]:
# converting box office to numeric
# "$" is a regex metacharacter and the default of `regex` in Series.str.replace
# differs between pandas versions, so request literal replacement explicitly.
movies_df["Box Office"] = (
    movies_df["Box Office"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

In [167]:
# One-hot encode Country: split the text on ", " into one 0/1 indicator column
# per distinct value, then remove the original Country column.
country_dummies = movies_df["Country"].str.get_dummies(sep=', ')
movies_df = pd.concat([movies_df, country_dummies], axis="columns").drop(columns=["Country"])

In [168]:
# converting language to one-hot encoded
# The text is split on ", " and every distinct value becomes a 0/1 indicator
# column. The result is kept under its own name so it does not overwrite
# `country_dummies`.
language_dummies = movies_df["Language"].str.get_dummies(sep=', ')
movies_df = pd.concat([movies_df, language_dummies], axis=1)

# dropping the original language column
movies_df = movies_df.drop(columns=["Language"])

In [169]:
# Inspect the dtypes of the first 24 columns, printed in three batches.
for start, stop in ((0, 10), (10, 20), (20, 24)):
    print(movies_df.dtypes[start:stop])

Title          string[python]
Year                    int32
Runtime                 int64
Director       string[python]
Writer         string[python]
Actors         string[python]
IMDB Rating           float64
Metascore             float64
IMDB Votes              int64
Awards                 object
dtype: object
Box Office                     float64
Description             string[python]
Tagline                 string[python]
Overview                string[python]
Budget                           int64
Revenue                          int64
TMDB Rating                    float64
Vote Count                       int64
Production Companies            object
Month                            int32
dtype: object
Day          int32
Action       int64
Adventure    int64
Animation    int64
dtype: object


In [170]:
# Report how many rows (movies) the frame holds at this point.
movies = movies_df.shape[0]
print("Number of movies:", movies)

Number of movies: 2816


In [171]:
# check number of columns
columns = len(movies_df.columns)
print("Number of columns:", columns)

Number of columns: 275


In [172]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
movies_df.to_csv(path_or_buf="../data/output_data/clean_movies.csv", index=False)


Director       string[python]
Writer         string[python]
Awards                 object

Network Analysis:
Actors

Sentiment Analysis:
Description
Tagline    
Overview       