# Build a recommendation system based on:
* Content based filtering and

In [1]:
import pandas as pd           ### for data pre-processing 
import numpy as np            ### for numerical programming
import matplotlib.pyplot as plt   ## for plots/charts..
import seaborn as sns

In [2]:
data = pd.read_csv(r"C:/Users/ecridin/Documents/RecSys/TMB/Datasets/clean_data.csv")

In [3]:
data.shape

(4803, 19)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4803 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  release_date          4802 non-null   object 
 11  revenue               4803 non-null   int64  
 12  runtime               4801 non-null   float64
 13  spoken_languages      4803 non-null   object 
 14  tagline               3959 non-null   object 
 15  vote_average         

In [5]:
data["overview"][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [6]:
data["overview"].isnull().sum()

3

In [7]:
data["overview"] = data["overview"].fillna('')

In [8]:
data["overview"].isnull().sum()

0

In [9]:
# step 1 - lower the sentence 
# step 2 - apply tokenization
# step 3 - histogram - calculate the frequency 
# step 4 - filter words - remove less frequent words
# step 5 - create a BOW (bag-of-words) document matrix --> pass to ML
# step 6 - tf-idf

In [10]:
# tf-idf
# term frequency - inversed document frequency 

In [11]:
# tf * idf --> ML
# assign importance to each word of my data 

In [12]:
# doc1 = "It is going to rain today" # Doc 1
# doc2 = "Today I am not going outside" # Doc 2

In [13]:
# how tf-idf works
# step 1 - lower the sentence 

# no of occurences of a word in a document / no of words in that document
# tf values of "it" 1/6
# tf values of "is" 1/6

# words per Document Doc 1  Doc2   idf value
# going               1/6   1/6    log(2/2)
# to                  1/6   0      log(2/1)
# today               1/6   1/6    log(2/2)
# i                   0     1/6    log(2/1)
# am                  0     1/6    log(2/1)
# not                 0     1/6    log(2/1)
# it                  1/6   0      log(2/1)
# is                  1/6   0      log(2/1)
# rain                1/6   0      log(2/1)        
# outside             0     1/6    log(2/1)

# idf  = log(total number of documents you have in your data/
#            nr of documents containing that particular word)

# going --> log(2/2) --> log1 =0

## step 2 - apply tokenization
## step 3 - histogram - calculate the frequency 
## step 4 - filter words - remove less frequent words
## step 5 - create a BOW (bag-of-words) document matrix --> pass to ML
## step 6 - tf-idf

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# TF-IDF vectorizer for the overview text:
#   min_df=3              -> ignore terms that appear in fewer than 3 documents
#   max_features=None     -> keep every term that survives the other filters
#   ngram_range=(1,3)     -> use unigrams, bigrams and trigrams as features
#   stop_words="english"  -> drop scikit-learn's built-in English stop-word list
tf_vect = TfidfVectorizer(min_df=3, max_features=None,ngram_range=(1,3),stop_words="english")

In [16]:
tf_vect_matrix = tf_vect.fit_transform(data["overview"])

In [17]:
tf_vect_matrix

<4803x9919 sparse matrix of type '<class 'numpy.float64'>'
	with 121480 stored elements in Compressed Sparse Row format>

In [18]:
tf_vect_matrix.shape

(4803, 9919)

In [19]:
data["overview"][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [20]:
tf_vect_matrix[0].toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [21]:
tf_vect_matrix[0].T.toarray()

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [22]:
X = tf_vect_matrix.toarray()

In [23]:
X_df = pd.DataFrame(X)

In [24]:
X_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9909,9910,9911,9912,9913,9914,9915,9916,9917,9918
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
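# cosine similarity(x1, x2) = 0.7
# means the two vectors point in largely the same direction (1 = same direction,
# 0 = no terms in common); it is a similarity score, not a probability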

In [26]:
from sklearn.metrics.pairwise import sigmoid_kernel

In [27]:
# sigmoid_kernel computes tanh(gamma * <x, y> + coef0) for every pair of rows.
# With the defaults (gamma = 1 / n_features, coef0 = 1) and non-negative TF-IDF
# vectors, every score is at or just above tanh(1) ~= 0.7616, so only the
# ranking of the scores is informative, not their absolute value.
sig = sigmoid_kernel(tf_vect_matrix,tf_vect_matrix)

In [28]:
sig[0]

array([0.76163649, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159416,
       0.76159416])

In [29]:
sig[1]

array([0.76159416, 0.76163649, 0.76159416, ..., 0.76159519, 0.76159416,
       0.76159416])

In [30]:
index = data.index

In [31]:
index

RangeIndex(start=0, stop=4803, step=1)

In [32]:
index[4802]

4802

In [33]:
title =data["original_title"]

In [34]:
title[4802]

'My Date with Drew'

In [35]:
pd.Series(data.index,index = data["original_title"])

original_title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [36]:
indices = pd.Series(data.index,index = data["original_title"])

In [37]:
indices["Avatar"]

0

In [38]:
sig[indices["Avatar"]]

array([0.76163649, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159416,
       0.76159416])

In [39]:
len(list(sig[indices["Avatar"]]))

4803

In [40]:
enumerate(list(sig[indices["Avatar"]]))

<enumerate at 0x2690a4b0080>

In [41]:
list(enumerate(list(sig[indices["Avatar"]])))

[(0, 0.7616364930962501),
 (1, 0.7615941559557649),
 (2, 0.7615941559557649),
 (3, 0.761595125164868),
 (4, 0.7615941559557649),
 (5, 0.761595309447695),
 (6, 0.7615941559557649),
 (7, 0.761595908255829),
 (8, 0.7615941559557649),
 (9, 0.7615941559557649),
 (10, 0.7615941559557649),
 (11, 0.7615950846887262),
 (12, 0.7615941559557649),
 (13, 0.7615941559557649),
 (14, 0.7615941559557649),
 (15, 0.7615941559557649),
 (16, 0.7615941559557649),
 (17, 0.7615941559557649),
 (18, 0.7615941559557649),
 (19, 0.7615941559557649),
 (20, 0.7615941559557649),
 (21, 0.7615941559557649),
 (22, 0.7615941559557649),
 (23, 0.7615941559557649),
 (24, 0.7615941559557649),
 (25, 0.7615941559557649),
 (26, 0.76159572705),
 (27, 0.7615962614863476),
 (28, 0.7615941559557649),
 (29, 0.7615952950243241),
 (30, 0.7615941559557649),
 (31, 0.7615968611459101),
 (32, 0.7615941559557649),
 (33, 0.7615941559557649),
 (34, 0.7615941559557649),
 (35, 0.7615941559557649),
 (36, 0.761597988130195),
 (37, 0.761594155955

In [42]:
sorted(list(enumerate(list(sig[indices["Avatar"]]))))

[(0, 0.7616364930962501),
 (1, 0.7615941559557649),
 (2, 0.7615941559557649),
 (3, 0.761595125164868),
 (4, 0.7615941559557649),
 (5, 0.761595309447695),
 (6, 0.7615941559557649),
 (7, 0.761595908255829),
 (8, 0.7615941559557649),
 (9, 0.7615941559557649),
 (10, 0.7615941559557649),
 (11, 0.7615950846887262),
 (12, 0.7615941559557649),
 (13, 0.7615941559557649),
 (14, 0.7615941559557649),
 (15, 0.7615941559557649),
 (16, 0.7615941559557649),
 (17, 0.7615941559557649),
 (18, 0.7615941559557649),
 (19, 0.7615941559557649),
 (20, 0.7615941559557649),
 (21, 0.7615941559557649),
 (22, 0.7615941559557649),
 (23, 0.7615941559557649),
 (24, 0.7615941559557649),
 (25, 0.7615941559557649),
 (26, 0.76159572705),
 (27, 0.7615962614863476),
 (28, 0.7615941559557649),
 (29, 0.7615952950243241),
 (30, 0.7615941559557649),
 (31, 0.7615968611459101),
 (32, 0.7615941559557649),
 (33, 0.7615941559557649),
 (34, 0.7615941559557649),
 (35, 0.7615941559557649),
 (36, 0.761597988130195),
 (37, 0.761594155955

In [43]:
sorted(list(enumerate(list(sig[indices["Avatar"]]))), key = lambda x:x[1])

[(1, 0.7615941559557649),
 (2, 0.7615941559557649),
 (4, 0.7615941559557649),
 (6, 0.7615941559557649),
 (8, 0.7615941559557649),
 (9, 0.7615941559557649),
 (10, 0.7615941559557649),
 (12, 0.7615941559557649),
 (13, 0.7615941559557649),
 (14, 0.7615941559557649),
 (15, 0.7615941559557649),
 (16, 0.7615941559557649),
 (17, 0.7615941559557649),
 (18, 0.7615941559557649),
 (19, 0.7615941559557649),
 (20, 0.7615941559557649),
 (21, 0.7615941559557649),
 (22, 0.7615941559557649),
 (23, 0.7615941559557649),
 (24, 0.7615941559557649),
 (25, 0.7615941559557649),
 (28, 0.7615941559557649),
 (30, 0.7615941559557649),
 (32, 0.7615941559557649),
 (33, 0.7615941559557649),
 (34, 0.7615941559557649),
 (35, 0.7615941559557649),
 (37, 0.7615941559557649),
 (38, 0.7615941559557649),
 (39, 0.7615941559557649),
 (40, 0.7615941559557649),
 (41, 0.7615941559557649),
 (42, 0.7615941559557649),
 (44, 0.7615941559557649),
 (46, 0.7615941559557649),
 (48, 0.7615941559557649),
 (49, 0.7615941559557649),
 (50, 0

In [44]:
sorted(list(enumerate(list(sig[indices["Avatar"]]))), key = lambda x:x[1],reverse=True)

[(0, 0.7616364930962501),
 (1341, 0.7616030155858681),
 (634, 0.7616028561141562),
 (3604, 0.761601930611584),
 (2130, 0.7616015339622925),
 (775, 0.7616011086528327),
 (529, 0.7615996114069044),
 (151, 0.7615991171152051),
 (311, 0.7615990624497703),
 (847, 0.7615987706430225),
 (570, 0.7615986450599548),
 (942, 0.7615984376900236),
 (36, 0.761597988130195),
 (1610, 0.7615979793934843),
 (3070, 0.7615978406764746),
 (1033, 0.7615978182403835),
 (2628, 0.7615977834088159),
 (1784, 0.7615977150705628),
 (2578, 0.7615976778191441),
 (150, 0.7615976453752453),
 (3724, 0.7615975951237102),
 (1013, 0.761597590729192),
 (4211, 0.7615975631290406),
 (1213, 0.7615975380289366),
 (1345, 0.7615974549075267),
 (312, 0.7615974086679764),
 (4039, 0.7615973645677722),
 (2967, 0.7615973512232982),
 (614, 0.7615972949789032),
 (281, 0.7615972537743877),
 (174, 0.7615972462403858),
 (3493, 0.7615971922075142),
 (3624, 0.7615971821325882),
 (972, 0.7615971791001622),
 (1274, 0.7615971587672579),
 (1959,

In [45]:
sigma_score = sorted(list(enumerate(list(sig[indices["Avatar"]]))), key = lambda x:x[1],reverse=True)

In [46]:
sigma_score[1:11]

[(1341, 0.7616030155858681),
 (634, 0.7616028561141562),
 (3604, 0.761601930611584),
 (2130, 0.7616015339622925),
 (775, 0.7616011086528327),
 (529, 0.7615996114069044),
 (151, 0.7615991171152051),
 (311, 0.7615990624497703),
 (847, 0.7615987706430225),
 (570, 0.7615986450599548)]

In [47]:
[index[0] for index in sigma_score[1:11]]

[1341, 634, 3604, 2130, 775, 529, 151, 311, 847, 570]

In [48]:
ind = [index[0] for index in sigma_score[1:11]]

In [49]:
data["original_title"].iloc[ind]

1341                Obitaemyy Ostrov
634                       The Matrix
3604                       Apollo 18
2130                    The American
775                        Supernova
529                 Tears of the Sun
151                          Beowulf
311     The Adventures of Pluto Nash
847                         Semi-Pro
570                           Ransom
Name: original_title, dtype: object

In [50]:
def give_rec(title, model, top_n=10, titles=None):
    """Return the titles most similar to ``title`` according to ``model``.

    Parameters
    ----------
    title : str
        Title to look up. If it occurs more than once, the first occurrence
        is used.
    model : 2-D array-like
        Pairwise similarity matrix. Row/column ``i`` must describe the movie
        at position ``i`` of ``titles`` (positional, not label-based).
    top_n : int, default 10
        Number of recommendations to return.
    titles : pandas.Series, optional
        Titles aligned positionally with ``model``. Defaults to the notebook's
        current ``data["original_title"]``.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first, keeping their index labels.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``titles``.
    """
    if titles is None:
        titles = data["original_title"]

    # Resolve the row position from the same Series that is used for the final
    # lookup, so the position stays correct after rows were dropped from the
    # frame (index labels and positions no longer coincide then).
    matches = np.flatnonzero(titles.to_numpy() == title)
    if matches.size == 0:
        raise KeyError(title)
    idx = int(matches[0])

    # Drop the queried movie explicitly instead of assuming it sorts first:
    # another movie with an identical score would otherwise push it into the
    # results. sorted() is stable, so ties keep ascending positional order.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(model[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    movie_indices = [pos for pos, _ in ranked[:top_n]]
    return titles.iloc[movie_indices]

In [51]:
give_rec("Avatar",sig)

1341                Obitaemyy Ostrov
634                       The Matrix
3604                       Apollo 18
2130                    The American
775                        Supernova
529                 Tears of the Sun
151                          Beowulf
311     The Adventures of Pluto Nash
847                         Semi-Pro
570                           Ransom
Name: original_title, dtype: object

# Improve the model

In [78]:
data = pd.read_csv(r"C:/Users/ecridin/Documents/RecSys/TMB/Datasets/clean_data.csv")
data.columns

Index(['Unnamed: 0', 'budget', 'genres', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'tagline',
       'vote_average', 'vote_count', 'cast', 'crew'],
      dtype='object')

In [79]:
data["genres"][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [80]:
data["genres"][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [81]:
#'"[abbbac]"'.replace('"',"X")

In [82]:
data["genres"][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [83]:
eval(data["genres"][0])

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [84]:
eval(data["genres"][0])[0]

{'id': 28, 'name': 'Action'}

In [85]:
from ast import literal_eval

In [86]:
literal_eval(data["genres"][0])

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [87]:
features = ["cast","crew","keywords","genres"]

In [88]:
data["genres"][1]

'[{"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}]'

In [89]:
#data["genres"]=data["genres"].apply(literal_eval)

In [90]:
#data["genres"][1]

In [91]:
for feature in features :
    data[feature]=data[feature].apply(literal_eval)

In [92]:
data.columns

Index(['Unnamed: 0', 'budget', 'genres', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'tagline',
       'vote_average', 'vote_count', 'cast', 'crew'],
      dtype='object')

In [93]:
#data["genres"][1]

In [94]:
data[features]

Unnamed: 0,cast,crew,keywords,genres
0,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...","[{'id': 1463, 'name': 'culture clash'}, {'id':...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam..."
1,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...","[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...","[{'id': 470, 'name': 'spy'}, {'id': 818, 'name...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam..."
3,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...","[{'id': 849, 'name': 'dc comics'}, {'id': 853,...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam..."
4,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam..."
...,...,...,...,...
4798,"[{'cast_id': 1, 'character': 'El Mariachi', 'c...","[{'credit_id': '52fe44eec3a36847f80b280b', 'de...","[{'id': 5616, 'name': 'united states–mexico ba...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam..."
4799,"[{'cast_id': 1, 'character': 'Buzzy', 'credit_...","[{'credit_id': '52fe487dc3a368484e0fb013', 'de...",[],"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '..."
4800,"[{'cast_id': 8, 'character': 'Oliver O’Toole',...","[{'credit_id': '52fe4df3c3a36847f8275ecf', 'de...","[{'id': 248, 'name': 'date'}, {'id': 699, 'nam...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4801,"[{'cast_id': 3, 'character': 'Sam', 'credit_id...","[{'credit_id': '52fe4ad9c3a368484e16a36b', 'de...",[],[]


In [None]:
data.columns

In [None]:
# search the crew list and extract the director as a new feature

In [96]:
type(data["crew"][0])
    

list

In [97]:
len(data["crew"][0])

153

In [99]:
data["crew"][0][:2]

[{'credit_id': '52fe48009251416c750aca23',
  'department': 'Editing',
  'gender': 0,
  'id': 1721,
  'job': 'Editor',
  'name': 'Stephen E. Rivkin'},
 {'credit_id': '539c47ecc3a36810e3001f87',
  'department': 'Art',
  'gender': 2,
  'id': 496,
  'job': 'Production Design',
  'name': 'Rick Carter'}]

In [107]:

for i in data["crew"][0]:
    print(i["job"])

        

Editor
Production Design
Sound Designer
Supervising Sound Editor
Casting
Original Music Composer
Director
Writer
Editor
Producer
Screenplay
Art Direction
Visual Effects Producer
Casting
Supervising Art Director
Music Editor
Sound Effects Editor
Foley
Foley
Costume Design
Producer
Art Direction
Set Decoration
Supervising Art Director
Set Designer
Executive Producer
Costume Design
Director of Photography
Set Designer
Stunts
Makeup Artist
Hairstylist
Camera Operator
Art Direction
Visual Effects Supervisor
Visual Effects Editor
Editor
Set Designer
Director of Photography
Stunts
Visual Effects Supervisor
Dialect Coach
Art Direction
Art Direction
Art Direction
Motion Capture Artist
Stunt Coordinator
Visual Effects Supervisor
Supervising Art Director
Supervising Art Director
Casting
Production Design
Costume Design
Steadicam Operator
Makeup Department Head
Visual Effects Producer
Visual Effects Supervisor
Post Production Supervisor
Visual Effects Supervisor
Makeup Artist
Costume Supervisor
Vi

In [112]:
len(data["crew"][0])

153

In [None]:
# data["crew"][0] is a list of 153 dictionaries
# access a value of each dictionary with i["key"]

In [111]:
for i in data["crew"][0]:
    if i["job"]=="Director":
        print(i["name"].strip())

James Cameron


In [113]:
def get_director(x):
    """Return the name of the first crew entry whose job is "Director".

    ``x`` is an iterable of dicts with "job" and "name" keys. When no entry
    has the job "Director", ``np.nan`` is returned.
    """
    director_names = (member["name"] for member in x if member["job"] == "Director")
    # next() stops at the first match; np.nan is the fallback when there is none.
    return next(director_names, np.nan)
    

In [114]:
get_director(data["crew"][0])

'James Cameron'

In [115]:
data["director"] = data["crew"].apply(get_director)

In [118]:
data.columns.tolist()[-3:]

['cast', 'crew', 'director']

In [119]:
data[data.columns.tolist()[-3:]].head()

Unnamed: 0,cast,crew,director
0,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...",James Cameron
1,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",Gore Verbinski
2,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...",Sam Mendes
3,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...",Christopher Nolan
4,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...",Andrew Stanton


In [120]:
data["director"].isnull().sum()

30

In [121]:
data["cast"][0]

[{'cast_id': 242,
  'character': 'Jake Sully',
  'credit_id': '5602a8a7c3a3685532001c9a',
  'gender': 2,
  'id': 65731,
  'name': 'Sam Worthington',
  'order': 0},
 {'cast_id': 3,
  'character': 'Neytiri',
  'credit_id': '52fe48009251416c750ac9cb',
  'gender': 1,
  'id': 8691,
  'name': 'Zoe Saldana',
  'order': 1},
 {'cast_id': 25,
  'character': 'Dr. Grace Augustine',
  'credit_id': '52fe48009251416c750aca39',
  'gender': 1,
  'id': 10205,
  'name': 'Sigourney Weaver',
  'order': 2},
 {'cast_id': 4,
  'character': 'Col. Quaritch',
  'credit_id': '52fe48009251416c750ac9cf',
  'gender': 2,
  'id': 32747,
  'name': 'Stephen Lang',
  'order': 3},
 {'cast_id': 5,
  'character': 'Trudy Chacon',
  'credit_id': '52fe48009251416c750ac9d3',
  'gender': 1,
  'id': 17647,
  'name': 'Michelle Rodriguez',
  'order': 4},
 {'cast_id': 8,
  'character': 'Selfridge',
  'credit_id': '52fe48009251416c750ac9e1',
  'gender': 2,
  'id': 1771,
  'name': 'Giovanni Ribisi',
  'order': 5},
 {'cast_id': 7,
  'c

In [139]:
def get_list(x, limit=3):
    """Return the cleaned names of the first ``limit`` entries of ``x``.

    Parameters
    ----------
    x : list of dict
        Entries with a ``"name"`` key, e.g. one parsed cell of the cast, crew,
        keywords or genres columns. Any value that is not a list (NaN, None,
        str, ...) yields an empty list.
    limit : int, default 3
        Maximum number of names to return.

    Returns
    -------
    list of str
        Names stripped of surrounding whitespace and lower-cased, in their
        original order.
    """
    # Check the type before iterating: a guard inside the comprehension is
    # only evaluated once iteration has started, so a non-iterable value
    # would raise TypeError instead of giving [].
    if not isinstance(x, list):
        return []
    return [entry["name"].strip().lower() for entry in x[:limit]]
        

In [149]:
get_list(data["crew"][0])

['stephen e. rivkin', 'rick carter', 'christopher boyes']

In [150]:
# def get_list2(x):
#     names = []
#     for dict in x:
#         if type(x) == list :
#             name = dict["name"]
#             name = name.strip()
#             name = name.lower()
#             names.append(name)
#     if len(names) >=3:
#         return names[:3]
#     else:
#         return names
    

In [151]:
# get_list2(data["cast"][4])

In [152]:
df2 = data.copy()

In [153]:
for feature in features:
    data[feature] = data[feature].apply(get_list)

In [154]:
data[features]

Unnamed: 0,cast,crew,keywords,genres
0,"[sam worthington, zoe saldana, sigourney weaver]","[stephen e. rivkin, rick carter, christopher b...","[culture clash, future, space war]","[action, adventure, fantasy]"
1,"[johnny depp, orlando bloom, keira knightley]","[dariusz wolski, gore verbinski, jerry bruckhe...","[ocean, drug abuse, exotic island]","[adventure, fantasy, action]"
2,"[daniel craig, christoph waltz, léa seydoux]","[thomas newman, sam mendes, anna pinnock]","[spy, based on novel, secret agent]","[action, adventure, crime]"
3,"[christian bale, michael caine, gary oldman]","[hans zimmer, charles roven, christopher nolan]","[dc comics, crime fighter, terrorist]","[action, crime, drama]"
4,"[taylor kitsch, lynn collins, samantha morton]","[andrew stanton, andrew stanton, john lasseter]","[based on novel, mars, medallion]","[action, adventure, science fiction]"
...,...,...,...,...
4798,"[carlos gallardo, jaime de hoyos, peter marqua...","[robert rodriguez, robert rodriguez, robert ro...","[united states–mexico barrier, legs, arms]","[action, crime, thriller]"
4799,"[edward burns, kerry bishé, marsha dietlein]","[edward burns, edward burns, edward burns]",[],"[comedy, romance]"
4800,"[eric mabius, kristin booth, crystal lowe]","[carla hetland, harvey kahn, adam sliwinski]","[date, love at first sight, narration]","[comedy, drama, romance]"
4801,"[daniel henney, eliza coupe, bill paxton]","[daniel hsia, daniel hsia]",[],[]


In [155]:
df2[features]

Unnamed: 0,cast,crew,keywords,genres
0,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...","[{'id': 1463, 'name': 'culture clash'}, {'id':...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam..."
1,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...","[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...","[{'id': 470, 'name': 'spy'}, {'id': 818, 'name...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam..."
3,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...","[{'id': 849, 'name': 'dc comics'}, {'id': 853,...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam..."
4,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam..."
...,...,...,...,...
4798,"[{'cast_id': 1, 'character': 'El Mariachi', 'c...","[{'credit_id': '52fe44eec3a36847f80b280b', 'de...","[{'id': 5616, 'name': 'united states–mexico ba...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam..."
4799,"[{'cast_id': 1, 'character': 'Buzzy', 'credit_...","[{'credit_id': '52fe487dc3a368484e0fb013', 'de...",[],"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '..."
4800,"[{'cast_id': 8, 'character': 'Oliver O’Toole',...","[{'credit_id': '52fe4df3c3a36847f8275ecf', 'de...","[{'id': 248, 'name': 'date'}, {'id': 699, 'nam...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4801,"[{'cast_id': 3, 'character': 'Sam', 'credit_id...","[{'credit_id': '52fe4ad9c3a368484e16a36b', 'de...",[],[]


In [157]:
data.isnull().sum()

Unnamed: 0                0
budget                    0
genres                    0
id                        0
keywords                  0
original_language         0
original_title            0
overview                  3
popularity                0
production_companies      0
release_date              1
revenue                   0
runtime                   2
spoken_languages          0
tagline                 844
vote_average              0
vote_count                0
cast                      0
crew                      0
director                 30
dtype: int64

In [158]:
data.dropna(subset=["director"],inplace=True)

In [159]:
data.isna().sum()

Unnamed: 0                0
budget                    0
genres                    0
id                        0
keywords                  0
original_language         0
original_title            0
overview                  3
popularity                0
production_companies      0
release_date              0
revenue                   0
runtime                   2
spoken_languages          0
tagline                 822
vote_average              0
vote_count                0
cast                      0
crew                      0
director                  0
dtype: int64

In [160]:
# create metadata
data["cast"][0]

['sam worthington', 'zoe saldana', 'sigourney weaver']

In [164]:
" ".join(data["cast"][0])

'sam worthington zoe saldana sigourney weaver'

In [165]:
type(" ".join(data["cast"][0]))

str

In [167]:
def create_feature(row):
    """Build one space-separated text string from a row's metadata.

    The row's keywords, cast, director and genres are concatenated in that
    order. ``keywords``, ``cast`` and ``genres`` are lists of strings;
    ``director`` is a single string.
    """
    segments = [
        " ".join(row["keywords"]),
        " ".join(row["cast"]),
        row["director"],
        " ".join(row["genres"]),
    ]
    # An empty list contributes an empty segment, so separators are kept as-is.
    return " ".join(segments)

In [168]:
# DataFrame.apply defaults to axis=0 (the function receives each column); axis=1 passes each row instead
data["important_feature"] = data.apply(create_feature,axis = 1)

In [170]:
data["important_feature"].head()

0    culture clash future space war sam worthington...
1    ocean drug abuse exotic island johnny depp orl...
2    spy based on novel secret agent daniel craig c...
3    dc comics crime fighter terrorist christian ba...
4    based on novel mars medallion taylor kitsch ly...
Name: important_feature, dtype: object

In [171]:
data["keywords"][0]

['culture clash', 'future', 'space war']

In [172]:
data["cast"][0]

['sam worthington', 'zoe saldana', 'sigourney weaver']

In [173]:
data["director"][0]

'James Cameron'

In [174]:
data["genres"][0]

['action', 'adventure', 'fantasy']

In [175]:
from sklearn.feature_extraction.text import CountVectorizer

In [176]:
count = CountVectorizer(stop_words="english")

In [177]:
count_matrix = count.fit_transform(data["important_feature"])

In [178]:
count_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [180]:
#https://www.linkedin.com/pulse/count-vectorizers-vs-tfidf-natural-language-processing-sheel-saket/

# """
# Count Vectors can be helpful in understanding the type of text by the frequency of words in it. 
# But its major disadvantages are:

# Its inability in identifying more important and less important words for analysis.
# It will just consider words that are abundant in a corpus as the most statistically significant word.
# It also doesn't identify the relationships between words such as linguistic similarity between words.
# """

In [None]:
# TF-IDF means Term Frequency - Inverse Document Frequency.
# This is a statistic that is based on the frequency of a word in the corpus but it also provides 
# a numerical representation of how important a word is for statistical analysis.

# TF-IDF is better than Count Vectorizers because it not only focuses on the frequency 
# of words present in the corpus but also provides the importance of the words. We can then 
# remove the words that are less important for analysis, hence making the model building less 
# complex by reducing the input dimensions.


# Even though TFIDF can provide a good understanding about the importance of words but just like Count Vectors,
# its disadvantage is:

# It fails to provide linguistic information about the words such as the real meaning of the words,
# similarity with other words etc.
# To train a model on the actual linguistic relationship of the words,
# there are two other word embedding techniques widely used in NLP, 
# they are "word2vec" and "Glove". I will discuss about these two in another article.

In [181]:
from sklearn.metrics.pairwise import cosine_similarity

In [182]:
cosine_sim2 = cosine_similarity(count_matrix,count_matrix)

In [184]:
cosine_sim2[0]

array([1.    , 0.1875, 0.1875, ..., 0.    , 0.    , 0.    ])

In [185]:
data["original_title"][6]

'Tangled'

In [187]:
give_rec(data["original_title"][6],cosine_sim2)

578     Alvin and the Chipmunks: The Squeakquel
1108                                  Pinocchio
1481                         The House of Magic
1857                            Rugrats Go Wild
42                                  Toy Story 3
390                          Hotel Transylvania
565                                     Shrek 2
899                                       Shrek
1695                                    Aladdin
3281                        The Secret of Kells
Name: original_title, dtype: object

In [188]:
give_rec(data["original_title"][0],cosine_sim2)

2403                          Aliens
94           Guardians of the Galaxy
5                       Spider-Man 3
30                      Spider-Man 2
206              Clash of the Titans
131                          G-Force
773                          Flyboys
1804    Snow White: A Tale of Terror
37        Oz: The Great and Powerful
47           Star Trek Into Darkness
Name: original_title, dtype: object

In [191]:
data["original_title"][data["original_title"].str.lower().str.contains("star wars")]

229     Star Wars: Episode III - Revenge of the Sith
230     Star Wars: Episode II - Attack of the Clones
233        Star Wars: Episode I - The Phantom Menace
2912                                       Star Wars
3208                Star Wars: Clone Wars (Volume 1)
Name: original_title, dtype: object

In [192]:
give_rec(data["original_title"][229],cosine_sim2)

230     Star Wars: Episode II - Attack of the Clones
233        Star Wars: Episode I - The Phantom Menace
2912                                       Star Wars
1962                                  Jane Got a Gun
2995                      Mad Max Beyond Thunderdome
127                               Mad Max: Fury Road
271                                       The Island
419                                           Jumper
1324                                Virgin Territory
1303                          Star Trek: Generations
Name: original_title, dtype: object

In [193]:
data["original_title"][data["original_title"].str.lower().str.contains("godfather")]

867     The Godfather: Part III
2728         The Last Godfather
2731     The Godfather: Part II
3337              The Godfather
Name: original_title, dtype: object

In [None]:
give_rec(data["original_title"][229],cosine_sim2)

In [None]:
# The Godfather

In [194]:
give_rec(data["original_title"][3337],cosine_sim2)

867     The Godfather: Part III
2731     The Godfather: Part II
1525             Apocalypse Now
2792        Glengarry Glen Ross
1209              The Rainmaker
3012              The Outsiders
4209           The Conversation
2649          The Son of No One
1018            The Cotton Club
4124         This Thing of Ours
Name: original_title, dtype: object

In [195]:
give_rec(data["original_title"][3337],sig)

2731                             The Godfather: Part II
1873                                         Blood Ties
3730                                              Cargo
867                             The Godfather: Part III
3623                                               Made
3963                                 They Came Together
2464                             The Master of Disguise
2973    For Greater Glory - The True Story of Cristiada
2389                                        Renaissance
1475                                  Playing for Keeps
Name: original_title, dtype: object

In [None]:
# add actors , production company 