In [129]:
import pandas as pd

In [130]:
# Load the movie dataset from the CSV file in the working directory.
df = pd.read_csv('imdb_top_1000.csv')
# Preview the first rows to check the columns were parsed as expected.
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


## Question
- Create new metadata with Genre, Overview, Director and Star 1-4
- Preprocess the text using preprocessing parts 1-3
- Create recommendation system with cosine distance
- Try out the recommendation system: is it good or bad?

In [131]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (pandas,
# scikit-learn, ...) that may point at real problems; consider narrowing it
# to specific categories.
warnings.filterwarnings('ignore')

In [132]:
# Count the missing values in each column.
df.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [133]:
# Print column dtypes, non-null counts and memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


# 1 Create new metadata with Genre, Overview, Director and Star 1-4

In [134]:
# Start the metadata frame from the title column only.
# The explicit .copy() makes df_meta an independent frame, so the columns
# that later cells add to it are not chained assignments on a slice of df
# (the situation pandas reports as SettingWithCopyWarning).
df_meta = df[['Series_Title']].copy()
df_meta.head()

Unnamed: 0,Series_Title
0,The Shawshank Redemption
1,The Godfather
2,The Dark Knight
3,The Godfather: Part II
4,12 Angry Men


In [135]:
def combine_meta(row):
    """Build the combined metadata text for one movie.

    Concatenates the Genre, Overview, Director and Star1-Star4 values of
    `row`, in that order, separated by single spaces.
    """
    fields = ('Genre', 'Overview', 'Director', 'Star1', 'Star2', 'Star3', 'Star4')
    return ' '.join(row[field] for field in fields)

In [136]:
# Build the combined text for every movie (combine_meta is called once per
# row of df) and store it on df_meta as the 'metadata' column.
combined_text = df.apply(combine_meta, axis='columns')
df_meta['metadata'] = combined_text
df_meta.head()

Unnamed: 0,Series_Title,metadata
0,The Shawshank Redemption,Drama Two imprisoned men bond over a number of...
1,The Godfather,"Crime, Drama An organized crime dynasty's agin..."
2,The Dark Knight,"Action, Crime, Drama When the menace known as ..."
3,The Godfather: Part II,"Crime, Drama The early life and career of Vito..."
4,12 Angry Men,"Crime, Drama A jury holdout attempts to preven..."


# Preprocessing

In [137]:
def to_lower(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered

In [138]:
# Mapping from an English contraction to its expanded form.
contractions_dict = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I had",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had",
"it'd've": "it would have",
"it'll": "it will",
# Fixed typo: the expansion used to read "iit will have".
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there had",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they had",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

def remove_contraction(text, contractions=None):
    """Expand English contractions in `text`.

    The text is split on whitespace; every token that is a known
    contraction is replaced by its expansion and all other tokens are kept
    unchanged. The tokens are then re-joined with single spaces.

    A token is looked up exactly first and case-insensitively second, so
    capitalised keys of the mapping (e.g. "I'm") still match text that was
    lower-cased beforehand. When a token only matches case-insensitively
    and is itself entirely lower case, the expansion is lower-cased as well
    so the output keeps the casing of the input.

    Parameters
    ----------
    text : str
        Text to process.
    contractions : dict, optional
        Mapping from contraction to expansion. Defaults to the module-level
        `contractions_dict`.

    Returns
    -------
    str
        The text with contractions expanded.
    """
    if contractions is None:
        contractions = contractions_dict

    # Case-insensitive view of the mapping. setdefault keeps the first entry
    # if two keys differ only by case.
    folded = {}
    for key, expansion in contractions.items():
        folded.setdefault(key.lower(), expansion)

    expanded = []
    for word in text.split():
        if word in contractions:
            # Exact match: same result as a plain dictionary lookup.
            expanded.append(contractions[word])
            continue

        replacement = folded.get(word.lower())
        if replacement is None:
            expanded.append(word)
        elif word.islower():
            expanded.append(replacement.lower())
        else:
            expanded.append(replacement)

    return ' '.join(expanded)

In [139]:
def remove_number(text):
    """Return `text` with every numeric character removed.

    A character counts as numeric when `str.isnumeric()` is true for it;
    all other characters, including whitespace, are kept in order.
    """
    kept = []
    for ch in text:
        if ch.isnumeric():
            continue
        kept.append(ch)
    return ''.join(kept)

In [140]:
from string import punctuation

def remove_punctuation(text):
    """Return `text` without any character listed in `string.punctuation`.

    Characters are deleted, not replaced, so the text on either side of a
    removed character is joined together.
    """
    non_punct = filter(lambda ch: ch not in punctuation, text)
    return ''.join(non_punct)

In [141]:
import nltk
from nltk.corpus import stopwords

# English stop-word list from NLTK.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available
# locally; confirm it has been downloaded in this environment.
stop_words = stopwords.words('english')
# Take 'not' off the list so negations are kept in the text.
stop_words.remove('not')

def remove_stopwords(text):
    """Return `text` without the words found in `stop_words`.

    The text is split on whitespace, every word that appears in the
    module-level `stop_words` list is dropped, and the remaining words are
    re-joined with single spaces. The comparison is exact and therefore
    case-sensitive.
    """
    # Build a set once per call: membership tests are then constant-time
    # instead of scanning the whole list for every word.
    blocked = set(stop_words)
    hasil = ' '.join([word for word in text.split() if word not in blocked])
    return hasil

In [142]:
def remove_whitespace(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    words = text.split()
    return ' '.join(words)

---

In [143]:

# Run the text-cleaning steps in order, feeding the output of each step into
# the next one, then store the final result back on df_meta.
cleaned_metadata = df_meta['metadata']
for clean_step in (
    to_lower,
    remove_contraction,
    remove_number,
    remove_punctuation,
    remove_stopwords,
    remove_whitespace,
):
    cleaned_metadata = cleaned_metadata.apply(clean_step)
df_meta['metadata'] = cleaned_metadata

df_meta.head()

Unnamed: 0,Series_Title,metadata
0,The Shawshank Redemption,drama two imprisoned men bond number years fin...
1,The Godfather,crime drama organized crime dynastys aging pat...
2,The Dark Knight,action crime drama menace known joker wreaks h...
3,The Godfather: Part II,crime drama early life career vito corleone ne...
4,12 Angry Men,crime drama jury holdout attempts prevent misc...


In [144]:
# Show the full cleaned metadata string stored under index label 0.
df_meta['metadata'][0]

'drama two imprisoned men bond number years finding solace eventual redemption acts common decency frank darabont tim robbins morgan freeman bob gunton william sadler'

In [145]:
from nltk.stem import SnowballStemmer, WordNetLemmatizer 

# One shared lemmatizer instance, reused for every call to lemmatize().
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Return `text` with every token replaced by its lemma.

    The text is split into sentences and each sentence into word tokens
    with NLTK; each token is passed to `lemmatizer.lemmatize` without a
    part-of-speech argument, so the lemmatizer's default applies. The
    lemmas are joined with single spaces.
    """
    lemmas = [
        lemmatizer.lemmatize(token)              # reduce the word to its base form
        for sentence in nltk.sent_tokenize(text)   # split the text into sentences
        for token in nltk.word_tokenize(sentence)  # split the sentence into words
    ]
    return ' '.join(lemmas)

In [146]:
# Lemmatize every metadata string element-wise and overwrite the column.
lemmatized_metadata = df_meta['metadata'].map(lemmatize)
df_meta['metadata'] = lemmatized_metadata
df_meta.head()

Unnamed: 0,Series_Title,metadata
0,The Shawshank Redemption,drama two imprisoned men bond number year find...
1,The Godfather,crime drama organized crime dynasty aging patr...
2,The Dark Knight,action crime drama menace known joker wreaks h...
3,The Godfather: Part II,crime drama early life career vito corleone ne...
4,12 Angry Men,crime drama jury holdout attempt prevent misca...


In [147]:
# Show the full lemmatized metadata string stored under index label 0.
df_meta['metadata'][0]

'drama two imprisoned men bond number year finding solace eventual redemption act common decency frank darabont tim robbins morgan freeman bob gunton william sadler'

In [148]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

In [149]:
# Bag-of-words vectoriser: tokens come from NLTK's word_tokenize and
# scikit-learn's built-in English stop-word list is applied.
# NOTE(review): the name `tf` suggests TF-IDF, but this is a CountVectorizer.
tf = CountVectorizer(stop_words='english', tokenizer=word_tokenize) 
# Learn the vocabulary from all metadata strings and build the
# document-term matrix (one row per movie).
bow = tf.fit_transform(df_meta['metadata']) 

In [150]:
# Index label of the movie used as the query ("already watched") title.
idx = 0

# Look up that movie's metadata and vectorise it with the fitted vectoriser;
# the one-element list yields a matrix with a single row.
content = df_meta.loc[idx, 'metadata']
watched = tf.transform([content])
watched

<1x8987 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [151]:
from sklearn.metrics.pairwise import cosine_distances

# Cosine distance from the query movie to every row of the document-term
# matrix; the result has one row (the query) and one column per movie.
dist = cosine_distances(watched, bow)
dist

array([[1.11022302e-16, 9.59106959e-01, 9.61930651e-01, 9.64240073e-01,
        9.56521739e-01, 9.21188959e-01, 9.18213918e-01, 9.60594480e-01,
        1.00000000e+00, 9.54498424e-01, 9.59871382e-01, 9.62549706e-01,
        9.58297117e-01, 9.62549706e-01, 1.00000000e+00, 9.66174495e-01,
        1.00000000e+00, 9.56521739e-01, 9.65247598e-01, 9.60594480e-01,
        9.16594234e-01, 9.55544578e-01, 9.53374760e-01, 1.00000000e+00,
        9.59871382e-01, 8.81783439e-01, 9.63702300e-01, 8.77320877e-01,
        9.63702300e-01, 1.00000000e+00, 9.63702300e-01, 9.56521739e-01,
        9.14874347e-01, 9.65247598e-01, 9.61279845e-01, 9.56521739e-01,
        9.57437173e-01, 9.57437173e-01, 9.14874347e-01, 9.57437173e-01,
        9.55544578e-01, 1.00000000e+00, 9.63139511e-01, 9.57437173e-01,
        1.00000000e+00, 9.60594480e-01, 9.58297117e-01, 9.64240073e-01,
        1.00000000e+00, 1.00000000e+00, 9.60594480e-01, 9.59871382e-01,
        9.63702300e-01, 9.59106959e-01, 9.61930651e-01, 9.625497

In [152]:
# Number of recommendations to return.
N_RECOMMENDATIONS = 10

# Rank all movies from closest to farthest, then drop the query movie
# explicitly rather than assuming it sorts into slot 0: another movie with
# identical metadata also has distance 0 and could take that slot, which
# would leave the query movie inside the recommendations.
# This assumes row positions in `bow` equal the index labels of df_meta
# (true for the default RangeIndex).
ranked = dist[0].argsort()
rec_idx = ranked[ranked != idx][:N_RECOMMENDATIONS]
rec_idx

array([505, 817, 167,  68, 234, 877, 311,  27, 350, 922], dtype=int64)

In [153]:
# Show the recommended movies, in ranking order.
df_meta.loc[rec_idx]

Unnamed: 0,Series_Title,metadata
505,Mystic River,crime drama mystery life three men childhood f...
817,Short Cuts,comedy drama daytoday life several suburban lo...
167,Unforgiven,drama western retired old west gunslinger will...
68,Oldeuboi,action drama mystery kidnapped imprisoned fift...
234,Million Dollar Baby,drama sport determined woman work hardened box...
877,Dark Waters,biography drama history corporate defense atto...
311,The Shop Around the Corner,comedy drama romance two employee gift shop ba...
27,Se7en,crime drama mystery two detective rookie veter...
350,Dev.D,drama romance breaking childhood sweetheart yo...
922,Gone Baby Gone,crime drama mystery two boston area detective ...


---

In [154]:
# Copy each movie's poster URL from the raw frame onto df_meta; pandas
# aligns the values on the row index.
df_meta['poster_link'] = df['Poster_Link']
df_meta.head()

Unnamed: 0,Series_Title,metadata,poster_link
0,The Shawshank Redemption,drama two imprisoned men bond number year find...,https://m.media-amazon.com/images/M/MV5BMDFkYT...
1,The Godfather,crime drama organized crime dynasty aging patr...,https://m.media-amazon.com/images/M/MV5BM2MyNj...
2,The Dark Knight,action crime drama menace known joker wreaks h...,https://m.media-amazon.com/images/M/MV5BMTMxNT...
3,The Godfather: Part II,crime drama early life career vito corleone ne...,https://m.media-amazon.com/images/M/MV5BMWMwMG...
4,12 Angry Men,crime drama jury holdout attempt prevent misca...,https://m.media-amazon.com/images/M/MV5BMWU4N2...


In [155]:
# Write the cleaned frame to disk without the row index column.
df_meta.to_csv('imdb_clean.csv', index=False)

In [156]:
import pickle

# Persist the fitted vectoriser and the document-term matrix.
# The context managers make sure each file is flushed and closed as soon as
# the dump finishes, instead of leaving the handle open until it happens to
# be garbage-collected.
with open('tf.pkl', 'wb') as tf_file:
    pickle.dump(tf, tf_file)
with open('bow.pkl', 'wb') as bow_file:
    pickle.dump(bow, bow_file)