In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

def stemSentence_porter(sentence):
    """Return ``sentence`` with every token replaced by its Porter stem.

    The text is tokenised with ``word_tokenize`` and each token is run through a
    ``PorterStemmer`` created for this call. Every stem is followed by one space,
    so a non-empty result always ends with a trailing space; an input that yields
    no tokens gives the empty string.
    """
    stemmer = PorterStemmer()
    return "".join(stemmer.stem(token) + " " for token in word_tokenize(sentence))

In [2]:
# Load the movie-plot CSV and discard the 'Wiki Page' column in one step,
# so that column never reaches the frames built from `df` below.
df = pd.read_csv("../data/wiki_movie_plots_deduped.csv").drop(columns=['Wiki Page'])

In [3]:
# Count rows with unusable metadata: a missing Cast, or the placeholder strings
# 'Unknown' (Director) and 'unknown' (Genre).
print('Check Null and Unknown Values')
missing_cast = df['Cast'].isna().sum()
print(f"Amount of null value {missing_cast} in Cast")
unknown_director = df['Director'].eq('Unknown').sum()
print(f"Amount of null value {unknown_director} in Director")
unknown_genre = df['Genre'].eq('unknown').sum()
print(f"Amount of null value {unknown_genre} in Genre")

Check Null and Unknown Values
Amount of null value 1422 in Cast
Amount of null value 1124 in Director
Amount of null value 6083 in Genre


In [4]:
# Keep only rows with a known director, a non-missing cast and a known genre,
# then renumber the index from 0 so row labels equal row positions.
has_director = df['Director'] != 'Unknown'
has_cast = df['Cast'].notnull()
has_genre = df['Genre'] != 'unknown'
filtered_df = df[has_director & has_cast & has_genre].reset_index(drop=True)

In [5]:
# Display the filtered frame (the bare last expression of a cell is rendered as its output).
filtered_df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Plot
0,1907,Daniel Boone,American,Wallace McCutcheon and Ediwin S. Porter,"William Craven, Florence Lawrence",biographical,Boone's daughter befriends an Indian maiden as...
1,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,The plot is that of a black woman going to the...
2,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,On a beautiful summer day a father and mother ...
3,1908,The Black Viper,American,D. W. Griffith,D. W. Griffith,drama,A thug accosts a girl as she leaves her workpl...
4,1908,A Calamitous Elopement,American,D.W. Griffith,"Harry Solter, Linda Arvidson",comedy,A young couple decides to elope after being ca...
...,...,...,...,...,...,...,...
27827,2013,Particle (film),Turkish,Erdem Tepegöz,"Jale Arıkan, Rüçhan Caliskur, Özay Fecht, Remz...",drama film,"Zeynep lost her job at weaving factory, and he..."
27828,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,"Two musicians, Salih and Gürkan, described the..."
27829,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,"Zafer, a sailor living with his mother Döndü i..."
27830,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,The film centres around a young woman named Am...


In [6]:
# Lower-case each metadata column and split it into a list of tokens.
# Genre and Director are split on single spaces; Cast is split on commas only,
# so any whitespace that followed a comma stays attached to the next name.
# The cell expects these columns to still hold plain strings (it calls `.lower()`).
for column, separator in (('Genre', ' '), ('Cast', ','), ('Director', ' ')):
    filtered_df[column] = filtered_df[column].map(
        lambda text, sep=separator: text.lower().split(sep)
    )

In [7]:
# Inspect the column dtypes after tokenisation.
filtered_df.dtypes

Release Year         int64
Title               object
Origin/Ethnicity    object
Director            object
Cast                object
Genre               object
Plot                object
dtype: object

In [8]:
# Build one text document per movie by concatenating every feature column.
#
# Columns whose cells are token lists are flattened with spaces; all other cells
# are converted with str(). 'bag_of_words' itself is excluded from the inputs so
# the column never feeds into its own construction and the cell can be re-run safely.
LIST_COLUMNS = ('Director', 'Cast', 'Genre')
source_columns = [col for col in filtered_df.columns if col != 'bag_of_words']


def _cell_text(column, value):
    """Return the text contribution of one cell: joined tokens for list columns, str() otherwise."""
    if column in LIST_COLUMNS:
        return ' '.join(value)
    return str(value)


# Walk the rows as plain tuples and assign the whole column once; this avoids a
# per-row `.loc` write, which is very slow on a frame of this size.
filtered_df['bag_of_words'] = [
    ' '.join(_cell_text(column, value) for column, value in zip(source_columns, values))
    for values in filtered_df[source_columns].itertuples(index=False, name=None)
]

In [9]:
# Display the frame again to check the new 'bag_of_words' column.
filtered_df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Plot,bag_of_words
0,1907,Daniel Boone,American,"[wallace, mccutcheon, and, ediwin, s., porter]","[william craven, florence lawrence]",[biographical],Boone's daughter befriends an Indian maiden as...,1907 Daniel Boone American wallace mccutcheon ...
1,1907,Laughing Gas,American,"[edwin, stanton, porter]","[bertha regustus, edward boulden]",[comedy],The plot is that of a black woman going to the...,1907 Laughing Gas American edwin stanton porte...
2,1908,The Adventures of Dollie,American,"[d., w., griffith]","[arthur v. johnson, linda arvidson]",[drama],On a beautiful summer day a father and mother ...,1908 The Adventures of Dollie American d. w. g...
3,1908,The Black Viper,American,"[d., w., griffith]",[d. w. griffith],[drama],A thug accosts a girl as she leaves her workpl...,1908 The Black Viper American d. w. griffith d...
4,1908,A Calamitous Elopement,American,"[d.w., griffith]","[harry solter, linda arvidson]",[comedy],A young couple decides to elope after being ca...,1908 A Calamitous Elopement American d.w. grif...
...,...,...,...,...,...,...,...,...
27827,2013,Particle (film),Turkish,"[erdem, tepegöz]","[jale arıkan, rüçhan caliskur, özay fecht, ...","[drama, film]","Zeynep lost her job at weaving factory, and he...",2013 Particle (film) Turkish erdem tepegöz jal...
27828,2017,Çalgı Çengi İkimiz,Turkish,"[selçuk, aydemir]","[ahmet kural, murat cemcir]",[comedy],"Two musicians, Salih and Gürkan, described the...",2017 Çalgı Çengi İkimiz Turkish selçuk aydemir...
27829,2017,Olanlar Oldu,Turkish,"[hakan, algül]","[ata demirer, tuvana türkay, ülkü duru]",[comedy],"Zafer, a sailor living with his mother Döndü i...",2017 Olanlar Oldu Turkish hakan algül ata demi...
27830,2017,Non-Transferable,Turkish,"[brendan, bradley]","[youtubers shanna malcolm, shira lazar, sara...","[romantic, comedy]",The film centres around a young woman named Am...,2017 Non-Transferable Turkish brendan bradley ...


In [15]:
# Replace every document with its Porter-stemmed version (applied element by element).
filtered_df['bag_of_words'] = filtered_df['bag_of_words'].map(stemSentence_porter)

In [16]:
# Number of top-scoring candidates pulled per movie before the movie itself is
# removed; equals the 99 entries selected by the previous `[:-100:-1]` slice.
N_CANDIDATES = 99

# Vectorise the documents as TF-IDF over word 1- to 3-grams.
# min_df=1 keeps every term (the same effect as the former min_df=0); recent
# scikit-learn releases reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(filtered_df['bag_of_words'])

# TfidfVectorizer L2-normalises rows by default, so the linear kernel (dot product)
# of the matrix with itself is the pairwise cosine similarity.
# NOTE(review): this is a dense (n_movies x n_movies) array and needs a lot of memory.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Positional list of titles, looked up once instead of once per candidate.
titles = filtered_df['Title'].tolist()

# Maps a title to its ranked list of (score, other_title) pairs.
# NOTE(review): keyed by title, so if two movies share a title the later row
# overwrites the earlier one — confirm whether titles are unique.
results = {}

for idx, title in enumerate(titles):
    scores = cosine_similarities[idx]
    # Highest scores first, limited to the top N_CANDIDATES.
    ranked = scores.argsort()[::-1][:N_CANDIDATES]
    # Drop the movie itself by index rather than by position: another movie with
    # an identical score may sort ahead of it.
    results[title] = [(scores[i], titles[i]) for i in ranked if i != idx]

print('done!')


done!


In [17]:
def item(title):
    """Return the display name for ``title``.

    Looks ``title`` up in ``filtered_df['Title']``, takes the first matching value
    and returns the part before the first ``' - '`` separator.

    Raises:
        IndexError: if no row of ``filtered_df`` has this exact title.
    """
    matching_titles = filtered_df.loc[filtered_df['Title'] == title, 'Title'].tolist()
    first_match = matching_titles[0]
    return first_match.split(' - ')[0]

# Just reads the results out of the dictionary.
def recommend(title, num):
    """Print the first ``num`` precomputed recommendations for ``title``.

    Args:
        title: key into the module-level ``results`` dictionary.
        num: maximum number of recommendations to print.

    Prints a header line, a separator, then one line per recommendation with its
    display name (via ``item``) and similarity score. Returns ``None``.
    """
    print(f"Recommending {num!s} products similar to {item(title)}...")
    print("-------")
    for score, rec_title in results[title][:num]:
        print(f"Recommended: {item(rec_title)} (score:{score!s})")

In [25]:
# Print up to five stored recommendations for "Avatar".
recommend(title="Avatar", num=5)

Recommending 5 products similar to Avatar...
-------
Recommended:  Never Back Down (score:0.15322104049087207)
Recommended: The Law and Jake Wade (score:0.14933914270913226)
Recommended: Revolver (score:0.1453985043114403)
Recommended: I'll Be Home For Christmas (score:0.14252786680183469)
Recommended: U Turn (score:0.13583663304988317)


In [23]:
# Print every title whose origin is 'American' as one Python list.
is_american = filtered_df['Origin/Ethnicity'] == 'American'
print(filtered_df.loc[is_american, 'Title'].tolist())

