In [None]:
# Basic Data Science Imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# py imports
from acquire import prepare_data
from prepare import prep_readme_data
import model

# NLP Imports
import nltk
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk.sentiment

# Clustering Imports
from sklearn.cluster import KMeans



import warnings
warnings.filterwarnings('ignore')

# Movie Recommendation Tool

Have you ever finished a movie and been so entranced by the concept that you wanted to see more movies like it? Or have you ever gone into a streaming service looking for a specific movie, but couldn't find it? 

Recommendation models are very common nowadays, and are integral to many different types of industries, ranging from Amazon product recommendations to HEB groceries. Recommendation models can also be built in many different ways, and the approach often depends on how the data is structured. 

In this case, I will be using a more classical approach, and use NLP and clustering techniques to create my own movie recommendation tool. The idea is that users will be able to input a movie title, and then a list of similar movies will be output by the tool, along with some relevant information. 

## Background

Using a data set found on Kaggle, I will be looking at movies published in the US for the past 20+ years, and using NLP strategies to:

1. Explore the data, and see if any interesting patterns arise
2. Create a clustering model that can help us identify movies that are similar to each other based on:
    1. Genre
    1. Description
    1. Avg score by viewers
    1. Director
    1. Actors
    
The data can be found [here](https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset)

## Data Wrangle

In [None]:
movie_title = pd.read_csv("IMDb movies.csv")

In [None]:
# Which countries are the biggest markets in terms of movie production?

# nlargest already returns the five highest counts in descending order,
# so sorting the whole series first is unnecessary.
movie_title.groupby("country").title.count().nlargest(5)

In [None]:
# Which publication dates have the most movies?
# Note: this groups on the full `date_published` value, not on a year.

movie_title.groupby("date_published").title.count().nlargest(10)

In [None]:
# What date range do we have?
# NOTE(review): `date_published` is only converted with pd.to_datetime later in
# the notebook, so min/max here compare the raw column values — confirm they
# order chronologically.

movie_title.date_published.min(), movie_title.date_published.max()

In [None]:
# ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt']

# def clean(text: str) -> list:
#     'A simple function to cleanup text data'
#     wnl = nltk.stem.WordNetLemmatizer()
#     stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
#     text = (text.encode('ascii', 'ignore')
#              .decode('utf-8', 'ignore')
#              .lower())
#     words = re.sub(r'[^\w\s]', '', text).split() # tokenization
#     return [wnl.lemmatize(word) for word in words if word not in stopwords]

For the purposes of this project, we will only look to use movies from the US. If the opportunity to scale up the project is possible, then we will add additional countries.

In [None]:
# Keep only US productions. Take an explicit copy so that the column
# assignments in the cleaning cells modify an independent frame instead of a
# slice of `movie_title` (which raises SettingWithCopyWarning and may not stick).
df = movie_title[movie_title.country == "USA"].copy()

In [None]:
# we see that we have a couple of missing values.
df.isnull().sum()

**Steps for Removing Null Values**:

1. The movie description will be a large indicator of similarity, and as such, I want movies that have a description. I will drop any null values in this column
1. I will explore the language column, as I suspect at this moment that these should all be English
1. Similarly to the description, the director would have a big influence, and as such I am thinking of dropping all null values from this column
1. While the meta score value would be really useful - there are too many missing values, and as such I will drop this column. Same for the `reviews_from_critics`
1. Might be able to impute a value for the missing `reviews_from_users`
1. I might be able to find the missing writers from the other IMDb file


In [None]:
# Given that the majority of the movies are exclusively English, and we know that we are only looking at movies
# made in the US, I can speculate that it is very likely these movies were at least partially made in English.
# As such, I will impute the missing values.

df.language.value_counts()

In [None]:
# Missing languages are assumed to be English.
# `assign` returns a new frame instead of writing through attribute access on
# a frame that was produced by filtering, which avoids chained-assignment issues.
df = df.assign(language=df.language.fillna("English"))

In [None]:
# Keep only the movies that have a description, a director and a writer.

df = df.dropna(subset=["description", "director", "writer"])

In [None]:
# Impute missing budgets with the average of the *known* dollar budgets.

# Budgets are stored as strings. Rows whose budget mentions one of these other
# currency codes are dropped.
NON_USD_CODES = ["ESP", "GBP", "CAD", "PYG", "AUD", "EUR", "RUR"]

# "$ 0" is a temporary placeholder that marks a missing budget.
budget_text = df.budget.fillna("$ 0")
is_usd = ~budget_text.str.contains("|".join(NON_USD_CODES))

df = df[is_usd].copy()

# regex=False: "$" must be read as a literal dollar sign, not as the regex
# end-of-string anchor (the default for `regex` differs between pandas versions).
budget_usd = budget_text[is_usd].str.replace("$", "", regex=False).astype(int)

# Average over real budgets only. Including the 0 placeholders would pull the
# mean down in proportion to how many budgets are missing.
avg_budget = budget_usd[budget_usd != 0].mean()

df["budget"] = budget_usd.replace(0, avg_budget)

In [None]:
# We will do something similar for US gross income, using the median.

income_text = df.usa_gross_income

# Median of the known incomes that are quoted in dollars.
# regex=False throughout: "$" is a literal dollar sign, not a regex anchor.
known_usd = income_text[income_text.str.contains("$", regex=False, na=False)]
median_income = known_usd.str.replace("$", "", regex=False).astype(int).median()

# Missing incomes get a "$ 0" placeholder, which is then swapped for the median.
income_filled = income_text.fillna("$ 0")
income_usd = (
    income_filled[income_filled.str.contains("$", regex=False)]
    .str.replace("$", "", regex=False)
    .astype(int)
    .replace(0, median_income)
)

# The assignment aligns on the index, so any row whose income is not quoted in
# dollars ends up as NaN in the new column.
df = df.assign(usa_gross_income=income_usd)

In [None]:
# These columns have too many null values, so they are removed entirely.

sparse_columns = [
    "worlwide_gross_income",
    "metascore",
    "reviews_from_users",
    "reviews_from_critics",
]
df = df.drop(columns=sparse_columns)

In [None]:
# Drop any remaining null values

df = df.dropna()

In [None]:
# Now we have no null values, and still have over 26000 movie titles

df.isnull().sum(), df.shape

## NLP Exploration

In [None]:
# Create a template for Seaborn and Matplotlib
plt.rc('figure', figsize=(13, 7))
plt.rc('font', size=14)

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so use whichever one is installed.
# Falls back to the original name if neither is listed.
darkgrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
        if name in plt.style.available
    ),
    "seaborn-darkgrid",
)
plt.style.use(darkgrid_style)

In [None]:
# NOTE(review): `df_prep` is not referenced again in this notebook, while a later
# cell reads `df.clean_lemmatized`. Confirm whether `prep_readme_data` adds its
# columns to `df` in place, or whether the later cells should read from `df_prep`.
df_prep = prep_readme_data(df, "description")

In [None]:
# Which words show up most often across all movie descriptions?

corpus = ' '.join(df.description)
all_words = model.clean(corpus)

word_counts = pd.Series(all_words).value_counts()
word_counts.head(15).plot.barh()


### Common words by genre?

In [None]:
# What are the top most common genres?

df.genre.value_counts().head(5)

In [None]:
# Build a cleaned word list for each of the top five genres.

def _genre_words(genre_label):
    """Join the descriptions of every movie whose genre equals `genre_label`
    and pass the combined text through `model.clean`."""
    descriptions = df[df.genre == genre_label].description
    return model.clean(' '.join(descriptions))


drama = _genre_words("Drama")
comedy = _genre_words("Comedy")
comedy_drama = _genre_words("Comedy, Drama")
horror = _genre_words("Horror")
drame_romance = _genre_words("Drama, Romance")

### What Are the most common words in the most popular genres?

In [None]:
# One panel per genre on a 2x3 grid, each showing that genre's three most
# frequent words.
genre_panels = [
    ("Drama", drama),
    ("Comedy", comedy),
    ("Comedy Drama", comedy_drama),
    ("Horror", horror),
    ("Drama Romance", drame_romance),
]

for panel_number, (panel_title, words) in enumerate(genre_panels, start=1):
    plt.subplot(2, 3, panel_number)
    pd.Series(words).value_counts().head(3).plot.barh()
    plt.title(panel_title)

plt.tight_layout()
plt.show()


### Common words throughout time?

In [None]:
# Parse the publication dates into datetime values.
df["date_published"] = pd.to_datetime(df["date_published"])

In [None]:
most_common_bigrams = df.set_index('date_published').resample('Y').description.agg([model.most_frequent_bigram, "count"])

In [None]:
# Use .iloc for positional access: the series is indexed by date, and
# integer keys in plain [] on a non-integer index are deprecated in pandas.
most_common_bigrams["most_frequent_bigram"].iloc[0]

In [None]:
fig, ax = plt.subplots(figsize=(25, 9))
most_common_bigrams.plot(ax=ax)
plt.title("What are the most common bigrams over time?")
plt.ylabel("Count")
plt.xlabel("Date Movie was Published")

# Label only the points where the top bigram differs from the previous row.
# Rows are read by position with .iloc, since the frame is indexed by date and
# integer keys in plain [] on such an index are deprecated in pandas.
bigrams = most_common_bigrams["most_frequent_bigram"]
counts = most_common_bigrams["count"]
for i in range(1, most_common_bigrams.shape[0]):
    if bigrams.iloc[i] != bigrams.iloc[i - 1]:
        # +20 lifts the label above the line so the text does not sit on it.
        ax.text(f"{most_common_bigrams.index[i]}", counts.iloc[i] + 20, f"{bigrams.iloc[i]}", rotation=45)

### Common bigrams in high voted movies vs low?

In [None]:
most_common_bigrams = df.groupby("avg_vote").description.agg([model.most_frequent_bigram, "count"])

In [None]:
fig, ax = plt.subplots(figsize=(25, 9))
most_common_bigrams.plot(ax=ax)
plt.title("What are the most common bigrams based on the movie score?")
plt.ylabel("Count")
plt.xlabel("Avg score given by viewers")

# Label each score whose top bigram differs from the previous score's.
# Labels left of 7 lean one way and the rest lean the other.
top_bigram = most_common_bigrams["most_frequent_bigram"]
row_count = most_common_bigrams.shape[0]
for pos in range(1, row_count):
    if top_bigram.iloc[pos] != top_bigram.iloc[pos - 1]:
        score = most_common_bigrams.index[pos]
        angle = -45 if score < 7 else 45
        ax.text(score, most_common_bigrams["count"].iloc[pos], f"{top_bigram.iloc[pos]}", rotation=angle)

### Common words in high grossing movies vs low?

In [None]:
# Cut usa_gross_income into ten bins labelled 1 through 10.
bin_labels = list(range(1, 11))
df["usa_gross_bin"] = pd.cut(df.usa_gross_income, len(bin_labels), labels=bin_labels)

In [None]:
# `usa_gross_bin` is categorical. observed=False is stated explicitly so every
# bin keeps a row even when it has no movies; pandas is changing the default
# for categorical groupers.
most_common_bigrams = (
    df.groupby("usa_gross_bin", observed=False)
    .description.agg([model.most_frequent_bigram, "count"])
)

In [None]:
df[df["usa_gross_bin"] == 10]

In [None]:
fig, ax = plt.subplots(figsize=(25, 9))
most_common_bigrams.plot(ax=ax)
plt.title("What are the most common bigrams based on the movie's gross income?")
plt.ylabel("Count")
# The x axis shows the bin label (1-10), not a dollar amount.
plt.xlabel("USA Gross Income Bin (1 = lowest, 10 = highest)")
for i in most_common_bigrams.index:
    # Bin labels start at 1, so the label for bin i is placed at x = i - 1.
    ax.text(i-1, most_common_bigrams["count"][i],  f"{most_common_bigrams.most_frequent_bigram[i]}", rotation = 45)

In [None]:
# Who are the highest grossing directors of all time?

gross_by_director = df.groupby("director").usa_gross_income.sum()
top_directors = gross_by_director.sort_values(ascending=False).head(5)

top_directors.plot.barh()
plt.title("What are the top grossing directors?")
plt.xlabel("Dollars")
plt.ylabel("Director")
plt.show()

In [None]:
# Which genres have the highest total US gross income?

gross_by_genre = df.groupby("genre").usa_gross_income.sum()
top_genres = gross_by_genre.sort_values(ascending=False).head(5)

top_genres.plot.barh()
plt.title("What is the highest grossing genre?")
plt.xlabel("Dollars")
plt.ylabel("Genre")

In [None]:
df[df.genre == "Animation, Drama, Sci-Fi"]

In [None]:
# what are the highest voted directors?

df.groupby("director").avg_vote.mean().sort_values(ascending=False).head(5).plot.barh()
plt.title("Which are the highest voted directors, on average?")
plt.xlabel("Avg votes")
plt.ylabel("Director")

In [None]:
# What are the highest voted genres?

df.groupby("genre").avg_vote.mean().sort_values(ascending=False).head(5).plot.barh()
# Title and axis labels added so the figure stands alone, like the other charts.
plt.title("Which are the highest voted genres, on average?")
plt.xlabel("Avg votes")
plt.ylabel("Genre")

### Sentiment Analysis

Can sentiment analysis of the movie description help us identify if they are similar movies?

In [None]:
# Store the analyzer's "compound" polarity score for each row's cleaned text.
# Assumes `df` already has a `clean_lemmatized` column — TODO confirm it is
# added by the earlier prep_readme_data call.
sia = nltk.sentiment.SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the "compound" entry of the analyzer's polarity scores for `text`."""
    return sia.polarity_scores(text)["compound"]


df["sentiment"] = df.clean_lemmatized.apply(_compound_score)

In [None]:
df.groupby("genre").sentiment.mean().sort_values(ascending=False).head(5).plot.barh()
plt.title("What are the 5 most positive genres?")
plt.xlabel("Sentiment Score")
plt.ylabel("Genre Name")

In [None]:
df.groupby("genre").sentiment.mean().sort_values().head(5).plot.barh()
plt.title("What are the 5 most negative genres?")
plt.xlabel("Sentiment Score")
plt.ylabel("Genre Name")

In [None]:
# What is the most positive movie description?

df.nlargest(1, "sentiment").description.values

In [None]:
# What is the most negative movie description?

df.nsmallest(1, "sentiment").description.values

It seems that overall, animation movies tend to have more positive descriptions. This makes sense, as most animation movies tend to target a younger audience, or a family audience. It should be noted that we are using a sentiment analysis that was largely developed for social media analysis, and as such, it was trained on a very different corpus. 

## Further questions

1. Does the publish month make a difference in gross_income?
1. Does the publish month make a difference in avg_vote?

# Modeling

For modeling, we will most likely need to do a bag of words, and then use those features as a metric for clustering

The simple recommendation model:

* We will look to do a traditional clustering, by looking at features that are already numerical. These features will be:
    * Year release
    * Run time
    * Avg vote
    * Do a `One Hot Encoder` for genre
  
A more advanced model would look to use NLP practices to actually model based on the description, on top of some of the other features previously mentioned.
 

### Simple Model

Steps:

1. Filter data based on the genre
2. Cluster the data based on `avg_votes`, `usa_gross_income`, `year` and `duration`

In [None]:
df = prepare_data()

In [None]:
df = prep_readme_data(df, "description")

In [None]:
df_num = df[["title", "avg_vote", "usa_gross_income", "year", "duration"]]

In [None]:
df_num = df_num.set_index("title")

In [None]:
df_num

In [None]:
sns.scatterplot(data = df_num, x= "duration", y= "avg_vote", hue="year")
plt.title("Is there a relationship between movie length and avg vote?")
plt.ylabel("Avg Votes")
plt.xlabel("Movie Duration, in minutes")


Interestingly, we see that there is not a very distinct relationship between movie length and avg score. It is interesting, however, that newer movies (post-2000) seem to be, on average, a bit longer. 

In [None]:
sns.scatterplot(data = df_num, x= "usa_gross_income", y= "avg_vote", hue="year")
plt.title("Is there a relationship between gross income and avg vote?")
plt.ylabel("Avg Votes")
plt.xlabel("USA Gross Income, in dollars")

It also seems that there is not a strong relationship between gross income and movie score. This is probably largely influenced by the fact that we had to impute some of the missing income, resulting in some similar results.

In [None]:
# First, we need to scale the data
minmax = MinMaxScaler()
scaled_df = minmax.fit_transform(df_num)

In [None]:
# Create an instance of KMeans with five clusters.
# n_init is pinned because its default changed between scikit-learn releases
# (10 restarts -> "auto"), which would otherwise change the cluster assignments.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=123)
# Use fit_predict to cluster the dataset
predictions = kmeans.fit_predict(scaled_df)

In [None]:
df["cluster"] = predictions

In [None]:
# Turn the integer labels into readable names ("cluster_0", ...). Building them
# from `predictions` keeps this cell idempotent: re-running it no longer
# produces "cluster_cluster_0".
df["cluster"] = "cluster_" + pd.Series(predictions, index=df.index).astype(str)

-----

#### Test 1

In [None]:
# Now we test

test = "Toy Story"

In [None]:
model.simple_movie_recommender(df, test)

------

#### Test 2

In [None]:
test = "Bridesmaids"

In [None]:
model.simple_movie_recommender(df, test)

-------

#### Test 3

In [None]:
test = "A Quiet Place"

In [None]:
model.simple_movie_recommender(df, test)

We see that our function works relatively well. The way the function works is that it looks in the database for the genre and cluster that the `test` title has. It then filters that data using these masks, and returns the top 25 matches of titles that have a similar genre, and at the same time had a similar run-time, gross income, avg votes and release year (based on the cluster). 

The tool works fairly well, but certainly requires further testing. Some of the abilities I want to implement:

1. If a movie title is duplicated (i.e "Parent Trap"), then it would assume that it is the most recent title. 
1. At the moment, the tool only looks at the first genre. For example, if a movie is categorized as "Action, Adventure, Comedy", the matches are based on containing only "Action". By implementing a series of conditionals, it might be possible to further improve the model's accuracy.

In order to improve the accuracy of the model, I am hoping we can use more advanced NLP techniques to actually group recommendations based on the movie descriptions.

### Complex Model with TF-IDF

Steps:

1. Filter data based on genre
1. Cluster the data based on the TFIDF of the description.

In [None]:
df = prepare_data()

In [None]:
df = prep_readme_data(df, "description")

In [None]:
df = model.simple_cluster(df, 5)

In [None]:
df.info()

In [None]:
# Concatenate genre, director and the cleaned description into one text field.
# NOTE(review): `combined_data` is not referenced again in this notebook (the
# vectorizer is fit on `clean_lemmatized`) — confirm whether
# model.complex_movie_recommendation reads it.
df["combined_data"] = df.genre + " " + df.director + " " + df.clean_lemmatized

In [None]:
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(df.clean_lemmatized)

In [None]:
# Create an instance of KMeans to find five clusters.
# n_init is pinned because its default changed between scikit-learn releases
# (10 restarts -> "auto"), which would otherwise change the cluster assignments.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=123)
# Use fit_predict to cluster the dataset
predictions = kmeans.fit_predict(tfidfs)

In [None]:
df["cluster_description"] = predictions

In [None]:
# Build the readable labels straight from `predictions` so that re-running this
# cell does not stack prefixes ("cluster_cluster_0").
df["cluster_description"] = "cluster_" + pd.Series(predictions, index=df.index).astype(str)

------

#### Test 1

In [None]:
test = "Toy Story"

In [None]:
model.complex_movie_recommendation(df, test)

-------

**Test 2**

In [None]:
test = "Bridesmaids"

In [None]:
model.complex_movie_recommendation(df, test)

-------

**Test 3**

In [None]:
test = "A Quiet Place"

In [None]:
model.complex_movie_recommendation(df, test)

The logic behind the code is working as I expect it - however, there needs to be further testing with the size of the clusters to better arrive at an optimal recommendation

### Complex Model with Count Vectorizer

In [None]:
df = prepare_data()

In [None]:
df = prep_readme_data(df, "description")

In [None]:
df = model.simple_cluster(df, 5)

In [None]:
df.info()

In [None]:
# Concatenate genre, director and the cleaned description into one text field.
# NOTE(review): `combined_data` is not referenced again in this notebook (the
# vectorizer is fit on `clean_lemmatized`) — confirm whether
# model.complex_movie_recommendation reads it.
df["combined_data"] = df.genre + " " + df.director + " " + df.clean_lemmatized

In [None]:
# Keep the fitted vectorizer under its own name instead of overwriting it with
# the matrix it produces. `cv` still holds the document-term matrix that the
# clustering cell consumes.
count_vectorizer = CountVectorizer()
cv = count_vectorizer.fit_transform(df.clean_lemmatized)

In [None]:
# Create an instance of KMeans to find five clusters.
# n_init is pinned because its default changed between scikit-learn releases
# (10 restarts -> "auto"), which would otherwise change the cluster assignments.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=123)
# Use fit_predict to cluster the dataset
predictions = kmeans.fit_predict(cv)

In [None]:
df["cluster_description"] = predictions

In [None]:
# Build the readable labels straight from `predictions` so that re-running this
# cell does not stack prefixes ("cluster_cluster_0").
df["cluster_description"] = "cluster_" + pd.Series(predictions, index=df.index).astype(str)

------

#### Test 1

In [None]:
test = "Toy Story"

In [None]:
model.complex_movie_recommendation(df, test)

None of the other `Toy Story` movies are recommended. This means that the actual count vectorizer is not being as effective.

-------

**Test 2**

In [None]:
test = "Bridesmaids"

In [None]:
model.complex_movie_recommendation(df, test)

This model struggles the most with Romantic Comedies, as it returns the smallest list of recommendations, and after some research, there are other movies that I would prefer to see recommended over these. 

-------

**Test 3**

In [None]:
test = "A Quiet Place"

In [None]:
model.complex_movie_recommendation(df, test)

### Complex Model with Bag of Ngrams

In [None]:
df = prepare_data()

In [None]:
df = prep_readme_data(df, "description")

In [None]:
df = model.simple_cluster(df, 5)

In [None]:
df.info()

In [None]:
# Concatenate genre, director and the cleaned description into one text field.
# NOTE(review): `combined_data` is not referenced again in this notebook (the
# vectorizer is fit on `clean_lemmatized`) — confirm whether
# model.complex_movie_recommendation reads it.
df["combined_data"] = df.genre + " " + df.director + " " + df.clean_lemmatized

In [None]:
cv = CountVectorizer(ngram_range = (1,2))

In [None]:
# Build the unigram+bigram vectorizer here rather than calling fit_transform on
# `cv` itself: the original rebound `cv` from vectorizer to matrix, so running
# the cell a second time failed. `cv` still ends up as the document-term matrix.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
cv = ngram_vectorizer.fit_transform(df.clean_lemmatized)

In [None]:
# Create an instance of KMeans to find five clusters.
# n_init is pinned because its default changed between scikit-learn releases
# (10 restarts -> "auto"), which would otherwise change the cluster assignments.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=123)
# Use fit_predict to cluster the dataset
predictions = kmeans.fit_predict(cv)


In [None]:
df["cluster_description"] = predictions

In [None]:
# Build the readable labels straight from `predictions` so that re-running this
# cell does not stack prefixes ("cluster_cluster_0").
df["cluster_description"] = "cluster_" + pd.Series(predictions, index=df.index).astype(str)

------

#### Test 1

In [None]:
test = "Toy Story"

In [None]:
model.complex_movie_recommendation(df, test)

Interestingly, this is the first model that recommends all three other `Toy Story` movies. All previous models would recommend one or two of the sequels, but never all three.

-------

**Test 2**

In [None]:
test = "Bridesmaids"

In [None]:
model.complex_movie_recommendation(df, test)

-------

**Test 3**

In [None]:
test = "A Quiet Place"

In [None]:
model.complex_movie_recommendation(df, test)

# Conclusion

We have 4 working movie recommendation models. Given the nature of this project, there is no way we can actually test for accuracy, given that we are using unsupervised machine learning. 

After some preliminary testing, I found that the bag of ngrams complex model is likely the best of the four models. The reason for this is that it was the only one that recommended all other `Toy Story` movies. It also had the most similar recommendations for `A Quiet Place`, which was the test that the other models struggled with the most. Unfortunately, I don't think this model performs as well as the simple model when it comes to `Bridesmaids`. The simple model's recommendations for this particular title are slightly better, as they seem more relevant, and there seems to be more uniformity in the movie release year. That being said, this is the genre where I have the least amount of experience, and so it is harder to judge the model's effectiveness. 

This will likely be the end of the project, as further testing and tweaking would require users that would be willing to use the tool, and give their own input on the model's selection. 

I will create a command line app that will serve as a prototype, but the objective of the project was to review NLP and clustering basics, while exploring an IMDb database, and in that sense, the project was a success. 