# Import needed libraries

In [1]:
import pandas as pd 
import numpy as np
import re
pd.set_option('display.max_colwidth', 200)
# pd.set_option('max_colwidth', 200)
pd.set_option('display.max_columns', 100)

In [2]:
df = Movies_df = pd.read_csv('Movie_Final_Dataframe.csv') 

# DATA CLEANING 

#### Getting a quick overview of the Movie dataset Scraped from IMDB. This will help understand what data cleaning techniques are needed.

#### Check dataframe head

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,MovieTitle,MovieDate,MovieRunTime,MovieGenre,MovieRating,MovieScore,MovieDescription,MovieDirector,MovieStars,MovieVotes,MovieGross
0,0,The Last of Us,2023–,50.0,"['Action', ' Adventure', ' Drama']",9.1,,"After a global pandemic destroys civilization, a hardened survivor takes charge of a 14-year-old girl who may be humanity's last hope.",,"['Stars:Pedro Pascal', 'Bella Ramsey', 'Anna Torv', 'Gabriel Luna']",242743,
1,1,Ant-Man and the Wasp: Quantumania,2023,124.0,"['Action', ' Adventure', ' Comedy']",6.5,48.0,"Scott Lang and Hope Van Dyne, along with Hank Pym and Janet Van Dyne, explore the Quantum Realm, where they interact with strange creatures and embark on an adventure that goes beyond the limits o...",Peyton Reed,"['Paul Rudd', 'Evangeline Lilly', 'Michael Douglas', 'Michelle Pfeiffer']",75124,
2,2,Cocaine Bear,2023,95.0,"['Comedy', ' Thriller']",6.4,54.0,"An oddball group of cops, criminals, tourists and teens converge on a Georgia forest where a huge black bear goes on a murderous rampage after unintentionally ingesting cocaine.",Elizabeth Banks,"['Keri Russell', 'Alden Ehrenreich', ""O'Shea Jackson Jr."", 'Ray Liotta']",12369,
3,3,You,2018–,45.0,"['Crime', ' Drama', ' Romance']",7.7,,"A dangerously charming, intensely obsessive young man goes to extreme measures to insert himself into the lives of those he is transfixed by.",,"['Stars:Penn Badgley', 'Victoria Pedretti', 'Ambyr Childers', 'Elizabeth Lail']",260722,
4,4,Outer Banks,2020–,50.0,"['Action', ' Crime', ' Drama']",7.6,,"On an island of haves and have-nots, teen John B enlists his three best friends to hunt for a legendary treasure linked to his father's disappearance.",,"['Stars:Chase Stokes', 'Madelyn Cline', 'Madison Bailey', 'J.D.']",59143,


Observation: From the above head info, it is seen that the  data is not clean as we have unwanted
             brackets and characters attached to our values. Therefore, we will need to clean our dataset. 

### Check DataFrame Shape 

In [4]:
df.shape

total_rows = df.shape[0]
total_columns = df.shape[1]
print('total_rows:', total_rows)
print('total_columns:', total_columns)

total_rows: 5100
total_columns: 12


Observation: The Movie dataframe contains five thousand, one hundred movies (Item) and twelve features (content information) 

#### 





### Check for duplicates in the data

In [5]:
num_of_duplicates = df.duplicated().sum()

print("num_of_duplicates:",  num_of_duplicates)

num_of_duplicates: 0


Observation: No duplicate record exist in our dataframe

### Check for Null values in the data

In [6]:
null_values = df.isnull().sum()
null_values

Unnamed: 0             0
MovieTitle             0
MovieDate             26
MovieRunTime         203
MovieGenre             2
MovieRating          162
MovieScore          3914
MovieDescription       0
MovieDirector       1536
MovieStars             0
MovieVotes           162
MovieGross          2550
dtype: int64

Observation: Some of the features values are missing/null, however, we will not pre-processed them due to other valueable data points in other features.

#### Check for data type 

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        5100 non-null   int64  
 1   MovieTitle        5100 non-null   object 
 2   MovieDate         5074 non-null   object 
 3   MovieRunTime      4897 non-null   float64
 4   MovieGenre        5098 non-null   object 
 5   MovieRating       4938 non-null   float64
 6   MovieScore        1186 non-null   float64
 7   MovieDescription  5100 non-null   object 
 8   MovieDirector     3564 non-null   object 
 9   MovieStars        5100 non-null   object 
 10  MovieVotes        4938 non-null   object 
 11  MovieGross        2550 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 478.2+ KB


#### Data(Features) Cleaning and Renaming 

In [8]:
# SINCE THE MOVIES WERE SCRAPPED ORDERLY FROM THE FIRST (1) TO THE LAST(5100) movies, WE CAN COMPUTE THE MOVIEID FEATURES AS SHOWN BELOW

df['Movie_id'] = [i for i in range(1, df.shape[0]+1)]
df['Movie_id']

0          1
1          2
2          3
3          4
4          5
        ... 
5095    5096
5096    5097
5097    5098
5098    5099
5099    5100
Name: Movie_id, Length: 5100, dtype: int64

In [9]:
# THE MOVIETITLE FEATURE IS OKAY, SO WE WILL JUST RENAME IT PROPERLY 
df['Name'] = df['MovieTitle']
df['Name']

0                          The Last of Us
1       Ant-Man and the Wasp: Quantumania
2                            Cocaine Bear
3                                     You
4                             Outer Banks
                      ...                
5095     Scooby-Doo 2: Monsters Unleashed
5096              Tales of the Unexpected
5097                       House of Payne
5098                       The Love Punch
5099                   My Sister's Keeper
Name: Name, Length: 5100, dtype: object

In [10]:
# CLEANING THE MOVIEDATE FEATURE USING REGULAR EXPRESSION (Regex)
# MovieDaate has characters such as I , II, and brackets, attached to its value; which are not needed.
df['Year'] = df['MovieDate'].str.replace('[IV)(]', '', regex=True)
df['Year']

0          2023– 
1            2023
2            2023
3          2018– 
4          2020– 
          ...    
5095         2004
5096    1979–1988
5097       2006– 
5098         2013
5099         2009
Name: Year, Length: 5100, dtype: object

In [11]:
# THE MOVIERUNTIME FEATURE IS OKAY, SO WE WILL JUST RENAME IT PROPERLY
df['Time'] = df['MovieRunTime']
df['Time']

0        50.0
1       124.0
2        95.0
3        45.0
4        50.0
        ...  
5095     93.0
5096     25.0
5097     30.0
5098     94.0
5099    109.0
Name: Time, Length: 5100, dtype: float64

In [12]:
# CLEANING THE MOVIEGENRE FEATURE USING REGULAR EXPRESSION (Regex)
# Moviegenre has characters such as quotation marks attached to its value; which are not needed.
df['Genre'] = df['MovieGenre'].str.replace("""[""'\[\],]""", '', regex=True)
df.Genre

0        Action  Adventure  Drama
1       Action  Adventure  Comedy
2                Comedy  Thriller
3           Crime  Drama  Romance
4            Action  Crime  Drama
                  ...            
5095    Adventure  Comedy  Family
5096        Comedy  Drama  Horror
5097        Comedy  Drama  Family
5098              Comedy  Romance
5099                Drama  Family
Name: Genre, Length: 5100, dtype: object

In [13]:
# THE MOVIERATINGS FEATURE IS OKAY, SO WE WILL JUST RENAME IT PROPERLY
df['Rating'] = df['MovieRating']
df['Rating']

0       9.1
1       6.5
2       6.4
3       7.7
4       7.6
       ... 
5095    5.2
5096    7.6
5097    3.5
5098    5.7
5099    7.3
Name: Rating, Length: 5100, dtype: float64

In [14]:
# THE MOVIESCORE FEATURE IS OKAY, SO WE WILL JUST RENAME IT PROPERLY
df['Score'] = df['MovieScore']
df['Score']

0        NaN
1       48.0
2       54.0
3        NaN
4        NaN
        ... 
5095     NaN
5096     NaN
5097     NaN
5098    44.0
5099    51.0
Name: Score, Length: 5100, dtype: float64

In [15]:
# CLEANING AND RENAMING THE MOVIEDESCRIPTION FEATURE USING REGULAR EXPRESSION (Regex)
df['Description'] = df['MovieDescription'].str.replace("""[""'\[\],]""", '', regex=True)
df['Description'][0]

'After a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope.'

In [16]:
# CLEANING AND RENAMING THE MOVIEDIRECTOR FEATURE, USING REGULAR EXPRESSION (Regex)
# df[df['MovieDirector'].str.match("(^Directors.*)")==True]
df['Directors_cast']= df['MovieDirector'].str.replace("(Directors:)", '', regex=True)
df.Directors_cast

0                   NaN
1           Peyton Reed
2       Elizabeth Banks
3                   NaN
4                   NaN
             ...       
5095       Raja Gosnell
5096                NaN
5097                NaN
5098       Joel Hopkins
5099    Nick Cassavetes
Name: Directors_cast, Length: 5100, dtype: object

In [17]:
# CLEANING AND RENAMING THE MOVIESTARS FEATURE, USING REGULAR EXPRESSION (Regex)
df['Stars'] = df['MovieStars'].str.replace("""[""\['\]]""", '', regex=True)
df['Stars'] =  df['Stars'].str.replace("(^Stars:)", '', regex=True)
df['Stars']

0                                Pedro Pascal, Bella Ramsey, Anna Torv, Gabriel Luna
1                    Paul Rudd, Evangeline Lilly, Michael Douglas, Michelle Pfeiffer
2                      Keri Russell, Alden Ehrenreich, OShea Jackson Jr., Ray Liotta
3                    Penn Badgley, Victoria Pedretti, Ambyr Childers, Elizabeth Lail
4                                  Chase Stokes, Madelyn Cline, Madison Bailey, J.D.
                                            ...                                     
5095    Freddie Prinze Jr., Sarah Michelle Gellar, Matthew Lillard, Linda Cardellini
5096                         Roald Dahl, Andrew Ray, Forbes Collins, Richard Johnson
5097                              Cassi Davis, LaVan Davis, Allen Payne, Lance Gross
5098                       Pierce Brosnan, Emma Thompson, Timothy Spall, Celia Imrie
5099                       Cameron Diaz, Abigail Breslin, Alec Baldwin, Walter Raney
Name: Stars, Length: 5100, dtype: object

In [18]:
# THE MOVIEVOTES FEATURE IS OKAY, SO WE WILL JUST RENAME IT PROPERLY
df['Votes'] = df['MovieVotes']
df['Votes']

0       242,743
1        75,124
2        12,369
3       260,722
4        59,143
         ...   
5095     60,118
5096      2,210
5097      6,157
5098     15,145
5099     94,723
Name: Votes, Length: 5100, dtype: object

In [19]:
# THE MOVIEGROSS FEATURE IS OKAY, SO WE WILL JUST RENAME IT PROPERLY
df['Total'] = df['MovieGross']
df['Total']

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
         ...   
5095    $84.22M
5096        NaN
5097        NaN
5098     $0.30M
5099    $49.20M
Name: Total, Length: 5100, dtype: object

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,MovieTitle,MovieDate,MovieRunTime,MovieGenre,MovieRating,MovieScore,MovieDescription,MovieDirector,MovieStars,MovieVotes,MovieGross,Movie_id,Name,Year,Time,Genre,Rating,Score,Description,Directors_cast,Stars,Votes,Total
0,0,The Last of Us,2023–,50.0,"['Action', ' Adventure', ' Drama']",9.1,,"After a global pandemic destroys civilization, a hardened survivor takes charge of a 14-year-old girl who may be humanity's last hope.",,"['Stars:Pedro Pascal', 'Bella Ramsey', 'Anna Torv', 'Gabriel Luna']",242743,,1,The Last of Us,2023–,50.0,Action Adventure Drama,9.1,,After a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope.,,"Pedro Pascal, Bella Ramsey, Anna Torv, Gabriel Luna",242743,
1,1,Ant-Man and the Wasp: Quantumania,2023,124.0,"['Action', ' Adventure', ' Comedy']",6.5,48.0,"Scott Lang and Hope Van Dyne, along with Hank Pym and Janet Van Dyne, explore the Quantum Realm, where they interact with strange creatures and embark on an adventure that goes beyond the limits o...",Peyton Reed,"['Paul Rudd', 'Evangeline Lilly', 'Michael Douglas', 'Michelle Pfeiffer']",75124,,2,Ant-Man and the Wasp: Quantumania,2023,124.0,Action Adventure Comedy,6.5,48.0,Scott Lang and Hope Van Dyne along with Hank Pym and Janet Van Dyne explore the Quantum Realm where they interact with strange creatures and embark on an adventure that goes beyond the limits of w...,Peyton Reed,"Paul Rudd, Evangeline Lilly, Michael Douglas, Michelle Pfeiffer",75124,
2,2,Cocaine Bear,2023,95.0,"['Comedy', ' Thriller']",6.4,54.0,"An oddball group of cops, criminals, tourists and teens converge on a Georgia forest where a huge black bear goes on a murderous rampage after unintentionally ingesting cocaine.",Elizabeth Banks,"['Keri Russell', 'Alden Ehrenreich', ""O'Shea Jackson Jr."", 'Ray Liotta']",12369,,3,Cocaine Bear,2023,95.0,Comedy Thriller,6.4,54.0,An oddball group of cops criminals tourists and teens converge on a Georgia forest where a huge black bear goes on a murderous rampage after unintentionally ingesting cocaine.,Elizabeth Banks,"Keri Russell, Alden Ehrenreich, OShea Jackson Jr., Ray Liotta",12369,
3,3,You,2018–,45.0,"['Crime', ' Drama', ' Romance']",7.7,,"A dangerously charming, intensely obsessive young man goes to extreme measures to insert himself into the lives of those he is transfixed by.",,"['Stars:Penn Badgley', 'Victoria Pedretti', 'Ambyr Childers', 'Elizabeth Lail']",260722,,4,You,2018–,45.0,Crime Drama Romance,7.7,,A dangerously charming intensely obsessive young man goes to extreme measures to insert himself into the lives of those he is transfixed by.,,"Penn Badgley, Victoria Pedretti, Ambyr Childers, Elizabeth Lail",260722,
4,4,Outer Banks,2020–,50.0,"['Action', ' Crime', ' Drama']",7.6,,"On an island of haves and have-nots, teen John B enlists his three best friends to hunt for a legendary treasure linked to his father's disappearance.",,"['Stars:Chase Stokes', 'Madelyn Cline', 'Madison Bailey', 'J.D.']",59143,,5,Outer Banks,2020–,50.0,Action Crime Drama,7.6,,On an island of haves and have-nots teen John B enlists his three best friends to hunt for a legendary treasure linked to his fathers disappearance.,,"Chase Stokes, Madelyn Cline, Madison Bailey, J.D.",59143,


##### We have cleaned our dataset as much as we can using Regular Expression ( Regex).
Its time to remove the old features and define/indentify the important features to use to build our TECHSQUAD Reccomender System.

In [21]:
# Lets remove old features from our dataset
df.drop("Unnamed: 0", axis=1, inplace=True)
df.drop("MovieTitle", axis=1, inplace=True)
df.drop("MovieDate", axis=1, inplace=True)
df.drop("MovieRunTime", axis=1, inplace=True)
df.drop("MovieGenre", axis=1, inplace=True)
df.drop("MovieRating", axis=1, inplace=True)
df.drop("MovieScore", axis=1, inplace=True)
df.drop("MovieDescription", axis=1, inplace=True)
df.drop("MovieDirector", axis=1, inplace=True)
df.drop("MovieStars", axis=1, inplace=True)
df.drop("MovieVotes", axis=1, inplace=True)
df.drop("MovieGross", axis=1, inplace=True)

In [22]:
df.head()

Unnamed: 0,Movie_id,Name,Year,Time,Genre,Rating,Score,Description,Directors_cast,Stars,Votes,Total
0,1,The Last of Us,2023–,50.0,Action Adventure Drama,9.1,,After a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope.,,"Pedro Pascal, Bella Ramsey, Anna Torv, Gabriel Luna",242743,
1,2,Ant-Man and the Wasp: Quantumania,2023,124.0,Action Adventure Comedy,6.5,48.0,Scott Lang and Hope Van Dyne along with Hank Pym and Janet Van Dyne explore the Quantum Realm where they interact with strange creatures and embark on an adventure that goes beyond the limits of w...,Peyton Reed,"Paul Rudd, Evangeline Lilly, Michael Douglas, Michelle Pfeiffer",75124,
2,3,Cocaine Bear,2023,95.0,Comedy Thriller,6.4,54.0,An oddball group of cops criminals tourists and teens converge on a Georgia forest where a huge black bear goes on a murderous rampage after unintentionally ingesting cocaine.,Elizabeth Banks,"Keri Russell, Alden Ehrenreich, OShea Jackson Jr., Ray Liotta",12369,
3,4,You,2018–,45.0,Crime Drama Romance,7.7,,A dangerously charming intensely obsessive young man goes to extreme measures to insert himself into the lives of those he is transfixed by.,,"Penn Badgley, Victoria Pedretti, Ambyr Childers, Elizabeth Lail",260722,
4,5,Outer Banks,2020–,50.0,Action Crime Drama,7.6,,On an island of haves and have-nots teen John B enlists his three best friends to hunt for a legendary treasure linked to his fathers disappearance.,,"Chase Stokes, Madelyn Cline, Madison Bailey, J.D.",59143,


In [23]:
# Exporting Our Clean_processed dataset 
# df.to_csv('Clean_processed_dataset.csv', header=True, index=False, encoding='utf-8')

# 






# FEATURE ENGINEERING AND TEXT PRE-PROCESSING 

# FEATURE PREPROCESSING

Feature engineering is one of the most important art in machine learning which creates a huge difference between a good model and a bad model. And because TECHSQUAD Recommender Sytem is a Machine learning application; In this Phase, We would
1. Indentify the Important Features to build a model for our System.
2. Ensure that the features are pre-processed to suit our purpose, and to ensure a good model is achieved.

#### Lets Indentify the Important features from our Moive dataset.

Now, for our TECHSQUAD System to recommend a movie item based on users past interest,
1. It needs to compute a similarity algorithm between the user past interest and the database list of items.
2. In performing this similatiry computation, several item (movie) content information could be used; the more the better.
3. Hence, for our system, we would preprocess and use the important features listed below, and group them into a new feature ['Tags']. 

List of important features to consider are 'Genre', 'Description', 'Directors_cast',  'Stars'.

In [24]:
df.head()

Unnamed: 0,Movie_id,Name,Year,Time,Genre,Rating,Score,Description,Directors_cast,Stars,Votes,Total
0,1,The Last of Us,2023–,50.0,Action Adventure Drama,9.1,,After a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope.,,"Pedro Pascal, Bella Ramsey, Anna Torv, Gabriel Luna",242743,
1,2,Ant-Man and the Wasp: Quantumania,2023,124.0,Action Adventure Comedy,6.5,48.0,Scott Lang and Hope Van Dyne along with Hank Pym and Janet Van Dyne explore the Quantum Realm where they interact with strange creatures and embark on an adventure that goes beyond the limits of w...,Peyton Reed,"Paul Rudd, Evangeline Lilly, Michael Douglas, Michelle Pfeiffer",75124,
2,3,Cocaine Bear,2023,95.0,Comedy Thriller,6.4,54.0,An oddball group of cops criminals tourists and teens converge on a Georgia forest where a huge black bear goes on a murderous rampage after unintentionally ingesting cocaine.,Elizabeth Banks,"Keri Russell, Alden Ehrenreich, OShea Jackson Jr., Ray Liotta",12369,
3,4,You,2018–,45.0,Crime Drama Romance,7.7,,A dangerously charming intensely obsessive young man goes to extreme measures to insert himself into the lives of those he is transfixed by.,,"Penn Badgley, Victoria Pedretti, Ambyr Childers, Elizabeth Lail",260722,
4,5,Outer Banks,2020–,50.0,Action Crime Drama,7.6,,On an island of haves and have-nots teen John B enlists his three best friends to hunt for a legendary treasure linked to his fathers disappearance.,,"Chase Stokes, Madelyn Cline, Madison Bailey, J.D.",59143,


#     


The next step would be to convert these important features names and keyword instances into lowercase and strip all the spaces were necessary between them. This is done so that our vectorizer doesn't count a part of the name belonging to an instance as the same for another instance.

For example, we could have two directors or Stars instances sharing the same firstname or lastname, i.e Morgan of "Morgan Freeman" and "Morgan Davies".


In [25]:
# df['Directors_cast'][23]
# df['Directors_cast'][34]
# Removing Spaces between Directors_cast features

df['Directors_cast'].str.replace(' ', '')

0                  NaN
1           PeytonReed
2       ElizabethBanks
3                  NaN
4                  NaN
             ...      
5095       RajaGosnell
5096               NaN
5097               NaN
5098       JoelHopkins
5099    NickCassavetes
Name: Directors_cast, Length: 5100, dtype: object

In [26]:
# Removing Spaces between Directors_cast features
df['Stars'].str.replace(' ', '').str.replace(',', ' ')
#bdf['Stars'].str.replace(',', '')

0                              PedroPascal BellaRamsey AnnaTorv GabrielLuna
1                  PaulRudd EvangelineLilly MichaelDouglas MichellePfeiffer
2                     KeriRussell AldenEhrenreich OSheaJacksonJr. RayLiotta
3                  PennBadgley VictoriaPedretti AmbyrChilders ElizabethLail
4                               ChaseStokes MadelynCline MadisonBailey J.D.
                                       ...                                 
5095    FreddiePrinzeJr. SarahMichelleGellar MatthewLillard LindaCardellini
5096                       RoaldDahl AndrewRay ForbesCollins RichardJohnson
5097                            CassiDavis LaVanDavis AllenPayne LanceGross
5098                     PierceBrosnan EmmaThompson TimothySpall CeliaImrie
5099                     CameronDiaz AbigailBreslin AlecBaldwin WalterRaney
Name: Stars, Length: 5100, dtype: object

In [27]:
# Concatenating all of our Important features to a new feature 

df['Tags'] = df['Genre'].fillna('') + ' ' + df['Description'].fillna('') + ' ' +\
             df['Directors_cast'].str.replace(' ', '').fillna('') + ' ' + df['Stars'].str.replace(' ', '').str.replace(',', ' ').fillna('')
df['Tags'][0]

'Action  Adventure  Drama After a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope.  PedroPascal BellaRamsey AnnaTorv GabrielLuna'

In [28]:
df.head()

Unnamed: 0,Movie_id,Name,Year,Time,Genre,Rating,Score,Description,Directors_cast,Stars,Votes,Total,Tags
0,1,The Last of Us,2023–,50.0,Action Adventure Drama,9.1,,After a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope.,,"Pedro Pascal, Bella Ramsey, Anna Torv, Gabriel Luna",242743,,Action Adventure Drama After a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope. PedroPascal BellaRamsey AnnaTorv Gabr...
1,2,Ant-Man and the Wasp: Quantumania,2023,124.0,Action Adventure Comedy,6.5,48.0,Scott Lang and Hope Van Dyne along with Hank Pym and Janet Van Dyne explore the Quantum Realm where they interact with strange creatures and embark on an adventure that goes beyond the limits of w...,Peyton Reed,"Paul Rudd, Evangeline Lilly, Michael Douglas, Michelle Pfeiffer",75124,,Action Adventure Comedy Scott Lang and Hope Van Dyne along with Hank Pym and Janet Van Dyne explore the Quantum Realm where they interact with strange creatures and embark on an adventure that g...
2,3,Cocaine Bear,2023,95.0,Comedy Thriller,6.4,54.0,An oddball group of cops criminals tourists and teens converge on a Georgia forest where a huge black bear goes on a murderous rampage after unintentionally ingesting cocaine.,Elizabeth Banks,"Keri Russell, Alden Ehrenreich, OShea Jackson Jr., Ray Liotta",12369,,Comedy Thriller An oddball group of cops criminals tourists and teens converge on a Georgia forest where a huge black bear goes on a murderous rampage after unintentionally ingesting cocaine. Eli...
3,4,You,2018–,45.0,Crime Drama Romance,7.7,,A dangerously charming intensely obsessive young man goes to extreme measures to insert himself into the lives of those he is transfixed by.,,"Penn Badgley, Victoria Pedretti, Ambyr Childers, Elizabeth Lail",260722,,Crime Drama Romance A dangerously charming intensely obsessive young man goes to extreme measures to insert himself into the lives of those he is transfixed by. PennBadgley VictoriaPedretti Amb...
4,5,Outer Banks,2020–,50.0,Action Crime Drama,7.6,,On an island of haves and have-nots teen John B enlists his three best friends to hunt for a legendary treasure linked to his fathers disappearance.,,"Chase Stokes, Madelyn Cline, Madison Bailey, J.D.",59143,,Action Crime Drama On an island of haves and have-nots teen John B enlists his three best friends to hunt for a legendary treasure linked to his fathers disappearance. ChaseStokes MadelynCline ...


In [29]:
# Converting our Tags feature to lowercase. 
df['Tags'] = df['Tags'].apply(lambda x: x.lower())
df['Tags'][0]

'action  adventure  drama after a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope.  pedropascal bellaramsey annatorv gabrielluna'

#                                                              
# TEXT PREPROCESSING
Next, We will apply apply natural language processing techniques to pre-process our Tags feature.
The essence of doing this is because our Tags feature is a Textual data, hence, we need to perform the below steps:
1. Remove Stop-words like ‘the’, ‘me’, ‘I’ etc, which are very frequent and don’t help in prediction.
2. Punctuations symbol such as . , ; ? which are not useful in prediction, hence need to be removed.
3. Stemming. This is a method of normalization of words in Natural Language Processing. It is a technique in which a set of words in a sentence are converted into a sequence to shorten its lookup. In this method, the words having the same meaning but have some variations according to the context or sentence are normalized. The alternatives to Stemming is Lemmatization, which is an algorithm that collects all inflected forms of a word in order to break them down to their root dictionary form or lemma. Words are broken down into a part of speech by way of the rules of grammar.

The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form.


### Punctuations Pre-processing 

In [30]:
# Removing Punctuations from Tags
import string

In [31]:
# define a Helper function to remove punctuations from Tags feature
def nopunct(text):
    y = []
    for char in text:
        if char not in string.punctuation():
            y.append(char)
    return " ".join(y)    

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_punctuation_02(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

 Before: df['Tags'][0] 
 
'action  adventure  drama after a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope.  pedropascal bellaramsey annatorv gabrielluna'

In [32]:
df['Tags'] = df['Tags'].apply(remove_punctuation_02)
df['Tags'][0]

'action  adventure  drama after a global pandemic destroys civilization a hardened survivor takes charge of a 14yearold girl who may be humanitys last hope  pedropascal bellaramsey annatorv gabrielluna'

####  

### Stop Words Preprocessing 

In [33]:
# Removing stopwords from Tags
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
# define a Helper function to remove Stopwords from Tags feature
def remove_stopwords(text):
    y = []
    for words in text.split():
        if words not in stopwords.words('english'):
            y.append(words)
    return " ".join(y)

Before: df['Tags'][0]

'action adventure drama after a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope. pedropascal bellaramsey annatorv gabrielluna'

In [35]:
df['Tags'] = df['Tags'].apply(remove_stopwords)
df['Tags'][0]

'action adventure drama global pandemic destroys civilization hardened survivor takes charge 14yearold girl may humanitys last hope pedropascal bellaramsey annatorv gabrielluna'

In [36]:
df.head()

Unnamed: 0,Movie_id,Name,Year,Time,Genre,Rating,Score,Description,Directors_cast,Stars,Votes,Total,Tags
0,1,The Last of Us,2023–,50.0,Action Adventure Drama,9.1,,After a global pandemic destroys civilization a hardened survivor takes charge of a 14-year-old girl who may be humanitys last hope.,,"Pedro Pascal, Bella Ramsey, Anna Torv, Gabriel Luna",242743,,action adventure drama global pandemic destroys civilization hardened survivor takes charge 14yearold girl may humanitys last hope pedropascal bellaramsey annatorv gabrielluna
1,2,Ant-Man and the Wasp: Quantumania,2023,124.0,Action Adventure Comedy,6.5,48.0,Scott Lang and Hope Van Dyne along with Hank Pym and Janet Van Dyne explore the Quantum Realm where they interact with strange creatures and embark on an adventure that goes beyond the limits of w...,Peyton Reed,"Paul Rudd, Evangeline Lilly, Michael Douglas, Michelle Pfeiffer",75124,,action adventure comedy scott lang hope van dyne along hank pym janet van dyne explore quantum realm interact strange creatures embark adventure goes beyond limits thought possible peytonreed paul...
2,3,Cocaine Bear,2023,95.0,Comedy Thriller,6.4,54.0,An oddball group of cops criminals tourists and teens converge on a Georgia forest where a huge black bear goes on a murderous rampage after unintentionally ingesting cocaine.,Elizabeth Banks,"Keri Russell, Alden Ehrenreich, OShea Jackson Jr., Ray Liotta",12369,,comedy thriller oddball group cops criminals tourists teens converge georgia forest huge black bear goes murderous rampage unintentionally ingesting cocaine elizabethbanks kerirussell aldenehrenre...
3,4,You,2018–,45.0,Crime Drama Romance,7.7,,A dangerously charming intensely obsessive young man goes to extreme measures to insert himself into the lives of those he is transfixed by.,,"Penn Badgley, Victoria Pedretti, Ambyr Childers, Elizabeth Lail",260722,,crime drama romance dangerously charming intensely obsessive young man goes extreme measures insert lives transfixed pennbadgley victoriapedretti ambyrchilders elizabethlail
4,5,Outer Banks,2020–,50.0,Action Crime Drama,7.6,,On an island of haves and have-nots teen John B enlists his three best friends to hunt for a legendary treasure linked to his fathers disappearance.,,"Chase Stokes, Madelyn Cline, Madison Bailey, J.D.",59143,,action crime drama island haves havenots teen john b enlists three best friends hunt legendary treasure linked fathers disappearance chasestokes madelyncline madisonbailey jd


### 

### Stemming Pre Processing

In [37]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [38]:
# define a Helper function to Stem words in Tags feature

def stem(text):
    y = []
    for word in text.split():
        y.append(ps.stem(word))
    return " ".join(y)

Before: df['Tags'][0]

'action adventure drama global pandemic destroys civilization hardened survivor takes charge 14yearold girl may humanitys last hope pedropascal bellaramsey annatorv gabrielluna'

In [39]:
df['Tags'] = df['Tags'].apply(stem)
df['Tags'][0]

'action adventur drama global pandem destroy civil harden survivor take charg 14yearold girl may human last hope pedropasc bellaramsey annatorv gabrielluna'

# 

# BUILDING THE MODEL

In this phase, we are going to define a model for our Recommender system by performing the following steps
1. Vectorization: Vectorization is a process of converting the text data (Tags feature) into a machine-readable form. The words are represented as vectors. 
2. Similarity Matching Computation: This is an algorithm that measures how similar or related two objects are.
3. Build a Recommendation function.

### Vectorization

There are several approaches to achieveing this, which includes 
1. Bags-of-Words Transformation 
2. Term Frequency Inverse Document Frequency (TF-IDF) Transformation

For our model, we have decided to use TF-IDF Transformation

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
  
# Vectorizing pre-processed Tags using TF-IDF
vectorizer = TfidfVectorizer()
Movie_tags_vec = vectorizer.fit_transform((df["Tags"]).apply(lambda x: np.str_(x))).toarray()

In [41]:
Movie_tags_vec.shape

(5100, 23045)

In [42]:
Movie_tags_vec

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
vectorizer.get_feature_names()

['00',
 '007',
 '10',
 '100',
 '1000',
 '100000',
 '1000000',
 '100th',
 '101996',
 '108yearold',
 '109',
 '10episod',
 '10th',
 '10year',
 '10yearold',
 '1100mile',
 '112',
 '1190',
 '11yearold',
 '12',
 '120',
 '120k',
 '12foot',
 '12hour',
 '12person',
 '12th',
 '12year',
 '12yearold',
 '13',
 '1300',
 '13112015',
 '13th',
 '13yearold',
 '14',
 '1400',
 '1408',
 '14th',
 '14yearold',
 '15',
 '150',
 '15th',
 '15yearold',
 '16',
 '160',
 '1600',
 '161',
 '1625',
 '1630',
 '164',
 '1667',
 '16th',
 '16year',
 '16yearold',
 '17',
 '1717',
 '172',
 '1743',
 '17th',
 '17thcenturi',
 '17yearold',
 '18',
 '1800',
 '1812',
 '1820',
 '1825',
 '1839',
 '1840',
 '1862',
 '1863',
 '1870',
 '1871',
 '1872',
 '1873',
 '1877',
 '1880',
 '1885',
 '1889',
 '1890',
 '1892',
 '1897',
 '1899',
 '18month',
 '18th',
 '18thcenturi',
 '18yearold',
 '19',
 '1900',
 '1909',
 '1910',
 '1917',
 '1918',
 '1920',
 '1922',
 '1924',
 '1928',
 '1929',
 '1930',
 '1931',
 '1932',
 '1935',
 '1936',
 '1937',
 '1938',
 

# 

### Similarity Matching

There are several approaches to achieveing this, which includes 
1. Euclidean Distance
2. Cosine Similarity
  
For our model, we have decided to use Cosine similarity. Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product of the vectors divided by the product of their lengths.

In [44]:
# Finding cosine similarity between vectors

from sklearn.metrics.pairwise import cosine_similarity
Movie_tags_similarity = cosine_similarity(Movie_tags_vec)
Movie_tags_similarity

array([[1.        , 0.04123206, 0.        , ..., 0.00396478, 0.        ,
        0.00287676],
       [0.04123206, 1.        , 0.02132173, ..., 0.00467232, 0.00447805,
        0.        ],
       [0.        , 0.02132173, 1.        , ..., 0.0057134 , 0.00547584,
        0.        ],
       ...,
       [0.00396478, 0.00467232, 0.0057134 , ..., 1.        , 0.00796849,
        0.02861963],
       [0.        , 0.00447805, 0.00547584, ..., 0.00796849, 1.        ,
        0.        ],
       [0.00287676, 0.        , 0.        , ..., 0.02861963, 0.        ,
        1.        ]])

In [45]:
cosine_similarity(Movie_tags_vec).shape

(5100, 5100)

In [46]:
Movie_tags_similarity[0]

array([1.        , 0.04123206, 0.        , ..., 0.00396478, 0.        ,
       0.00287676])

In [51]:
# Export array to CSV file
# Movie_tags_similarity

np.savetxt("Movie_tags_similarity.csv", Movie_tags_similarity, delimiter=",")

In [53]:
#  Export array to pickle file
import pickle

pickle.dump(Movie_tags_similarity, open('Movie_tags_similarity.pkl', 'wb'))

# 

### Recommendation function (5 movies )

In [50]:
# Defining a function to recommend five (5) movies only
def recommend(movie):
    Movie_id = df[df['Name']== movie]['Movie_id'].values[0]
    distances = Movie_tags_similarity[Movie_id - 1]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])
    movie_list = movie_list[1:6]
    for i in movie_list:
        print(df.iloc[i[0]].Name)
        
def recommend_02(movie):
    Movie_index = df[df['Name']== movie].index[0]
    distances = Movie_tags_similarity[Movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])
    movies_list = movies_list[1:6]
    
    for i in movies_list:
        print(df.iloc[i[0]].Name)

In [48]:
recommend('Aliens')

Alien Resurrection
Alien³
The Terminator
The Abyss
Avatar 3
