# Netflix Motivations - Data Science Project

Welcome to the Netflix Motivations project! 

**Contributors**: *Christian Rhodes* and *Drew Jepson*

# The Data

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Read in the data and store in a DataFrame
df = pd.read_csv("data/titles.csv")

# Print the first 5 rows of the DataFrame
df.dropna(inplace=True)
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
5,ts22164,Monty Python's Flying Circus,SHOW,A British sketch comedy series with the shows ...,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,tt0063929,8.8,73424.0,17.617,8.306
26,ts45948,Monty Python's Fliegender Zirkus,SHOW,Monty Python's Fliegender Zirkus consisted of ...,1972,TV-MA,43,['comedy'],[],1.0,tt0202477,8.1,2151.0,1.487,7.0
35,ts20681,Seinfeld,SHOW,A stand-up comedian and his three offbeat frie...,1989,TV-PG,24,['comedy'],['US'],9.0,tt0098904,8.9,308824.0,130.213,8.301
44,ts22082,Knight Rider,SHOW,"Michael Long, an undercover police officer, is...",1982,TV-PG,51,"['scifi', 'action', 'crime', 'drama']",['US'],4.0,tt0083437,6.9,34115.0,50.267,7.5
45,ts21715,Thomas & Friends,SHOW,Thomas & Friends is a British children's telev...,1984,TV-Y,10,"['animation', 'family', 'comedy', 'fantasy', '...",['GB'],24.0,tt0086815,6.5,5104.0,42.196,6.5


# Classification Tree

In [12]:
# Convert the IMDB score into categories
bins = [0, 4, 7, 10]
labels = ['low', 'medium', 'high']
df['imdb_category'] = pd.cut(df['imdb_score'], bins=bins, labels=labels)

In [13]:
# Select the features and target variable
X = df[['runtime', 'release_year', 'imdb_votes','tmdb_popularity','seasons']]
y = df['imdb_category']

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
# Create an instance of the DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=3, random_state=42)

In [16]:
# Train the model on the training set
tree.fit(X_train, y_train)

In [17]:
# Predict the target variable for the testing set
y_pred = tree.predict(X_test)

In [18]:
# Calculate the accuracy score of the model
accuracy = accuracy_score(y_test, y_pred)

In [19]:
# Plot the decision tree
plt.figure(figsize=(12, 8))
plot_tree(tree, feature_names=X.columns, class_names=labels, filled=True)
plt.show()

ModuleNotFoundError: No module named 'matplotlib_inline'

In [21]:
genres = df['genres'].str.get_dummies(',')
df = pd.concat([df, genres], axis=1)

# Select the features and target variable
X = df[['runtime', 'tmdb_popularity']]
y = df[list(genres.columns)]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create an instance of the DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=4, random_state=42)

# Train the model on the training set
tree.fit(X_train, y_train)

# Predict the target variable for the testing set
y_pred = tree.predict(X_test)

# Calculate the accuracy score of the model
accuracy = accuracy_score(y_test, y_pred)

# Plot the decision tree
plt.figure(figsize=(12, 8))
plot_tree(tree, feature_names=X.columns, class_names=list(genres.columns), filled=True)
plt.show()


ModuleNotFoundError: No module named 'matplotlib_inline'

In [22]:
X = df[['runtime', 'release_year', 'imdb_votes','tmdb_popularity','seasons']]
y = df['imdb_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
display = confusion_matrix(y_test.astype(int), y_pred.astype(int))
disp=ConfusionMatrixDisplay(display)
disp.plot()
plt.show()
print(f"Mean squared error: {mse}")
print(f"R-squared score: {r2}")

ModuleNotFoundError: No module named 'matplotlib_inline'

In [34]:
df['genreMain'] = df['genres'].apply(lambda x: x.split(',')[0])

genre_mapping = {
    "['documentation']": "documentary",
    "['drama']": "drama",
    "['fantasy']": "fantasy",
    "['war']": "war",
    "['comedy']": "comedy",
    #"['comedy']": "comedy",
    "['thriller']": "thriller",
    "['crime']": "crime",
    "['romance']": "romance",
    "['action']": "action",
    "['western']": "western",
    "['history']": "history",
    "['drama']": "drama",
    "['music']": "music",
    "['romance']": "romance",
    "['horror']": "horror",
    #"['horror']": "horror",
    "['scifi']": "sci-fi",
    "['animation']": "animation",
    #"['family']": "family",
    "['family']": "family",
    #"['reality']": "reality",
    "['reality']": "reality",
    "['documentation']": "documentary",
    '[]': "unknown",
    "['thriller']": "thriller",
    "['western']": "western",
    "['animation']": "animation",
    "['war']": "war",
    "['music']": "music",
    "['fantasy']": "fantasy",
    "['action']": "action",
    "['crime']": "crime",
    "['scifi']": "sci-fi",
    "['sport']": "sport"
}

df['genreMain'] = df['genreMain'].map(genre_mapping)

In [35]:
genres = df['genreMain'].str.get_dummies(',')
df = pd.concat([df, genres], axis=1)
df.dropna(inplace=True)

# Select the features and target variable
X = df[['runtime', 'release_year', 'imdb_votes','tmdb_popularity','seasons']]
y = df[list(genres.columns)]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create an instance of the DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=4, random_state=42)

# Train the model on the training set
tree.fit(X_train, y_train)

# Predict the target variable for the testing set
y_pred = tree.predict(X_test)

# Calculate the accuracy score of the model
accuracy = accuracy_score(y_test, y_pred)

# Plot the decision tree
plt.figure(figsize=(12, 8))
plot_tree(tree, feature_names=X.columns, class_names=list(genres.columns), filled=True)
plt.show()


ModuleNotFoundError: No module named 'matplotlib_inline'

## Linear Regression Model - V2

In [41]:
#first use genre and tmdb score to predict tmdb popularity
X = df[['genreMain', 'tmdb_score']].values
y = df['tmdb_popularity'].values

# Create a LinearRegression object
lr = LinearRegression()

# Fit the model to the data
lr.fit(X, y)

# Make predictions
y_pred = lr.predict(X)

#calculate the accuracy of y_pred
mse = mean_squared_error(y, y_pred)

# Plot the linear regression line
plt.figure(figsize=(12, 8))
plt.scatter(X, y, color='blue')
plt.plot(X, y_pred, color='red')
plt.xlabel('Genre')
plt.ylabel('TMDB Popularity')
plt.show()

ValueError: could not convert string to float: 'family'

In [42]:
#drop NaN values from 'imdb_score' column
df.dropna(subset=['imdb_score'], inplace=True)

#next use genre and imdb score to predict imdb popularity
X = df[['genreMain', 'imdb_score']].values
y = df['imdb_votes'].values

# Create a LinearRegression object
lr = LinearRegression()

# Fit the model to the data
lr.fit(X, y)

# Make predictions
y_pred = lr.predict(X)

#calculate the accuracy of y_pred
mse = mean_squared_error(y, y_pred)

# Plot the linear regression line
plt.figure(figsize=(12, 8))
plt.scatter(X, y, color='blue')
plt.plot(X, y_pred, color='red')
plt.xlabel('Genre')
plt.ylabel('IMDB Votes')
plt.show()

ValueError: could not convert string to float: 'family'

### Let's try to improve the model by minimizing the average loss

We can do this using a regularization technique such as Ridge or Lasso regression. 

In [43]:
from sklearn.linear_model import Ridge

X = df[['genreMain', 'tmdb_score']].values
y = df['tmdb_popularity'].values

# Create a Ridge object with regularization parameter alpha
ridge = Ridge(alpha=0.5)

# Fit the model to the data
ridge.fit(X, y)

# Make predictions
y_pred = ridge.predict(X)


ValueError: could not convert string to float: 'family'

Let's evaluate the performance of the Ridge regression model:

In [44]:
from sklearn.model_selection import cross_val_score

# Calculate the cross-validation score using 5-fold cross-validation
scores = cross_val_score(ridge, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert the scores to positive values and calculate the mean and standard deviation
mse_scores = -scores
mse_mean = mse_scores.mean()
mse_std = mse_scores.std()

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\linear_model\_ridge.py", line 1126, in fit
    X, y = self._validate_data(
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\base.py", line 565, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\utils\validation.py", line 1106, in check_X_y
    X = check_array(
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'documentary'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\linear_model\_ridge.py", line 1126, in fit
    X, y = self._validate_data(
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\base.py", line 565, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\utils\validation.py", line 1106, in check_X_y
    X = check_array(
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "C:\Users\thech\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'family'
