In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
non_spoiler = pd.read_excel("Non_Spoiler_list_sample.xlsx")
plot = pd.read_excel("Plot_synopsis.xlsx")
spoiler = pd.read_excel("Spoiler_list_sample.xlsx")

In [3]:
non_spoiler.drop("Unnamed: 0",axis=1,inplace=True)

In [4]:
plot.drop("Unnamed: 0",axis=1,inplace=True)

In [5]:
spoiler.drop("Unnamed: 0",axis=1,inplace=True)

In [6]:
non_spoiler = non_spoiler.rename(columns={0:'Reviews', 1:'Title'})

In [7]:
spoiler = spoiler.rename(columns={0:'Reviews', 1:'Title'})

In [8]:
plot = plot.rename(columns={"titles":'Title', "plot_Synopsis":'Synopsis'})

In [9]:
# Text Cleaning

In [10]:
import re,html
def clean(text):
    #tags like <tag>
    text = re.sub(r'<[^<>]*>', ' ',text)
    #Markdown Urls
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)',r'\1',text)
    #Remove Punctuation
    text = re.sub(r'([!?,])\1+', r'\1', text)
    #Remove all URL's
    text = re.sub(r'http.*', ' ', text)
    #Remove @
    text = re.sub(r'@\w*', ' ', text)
    #text or code in brackets
    text = re.sub(r'\[[^\[\]]*\]',' ',text)
    # remove b"
    text = text.replace('b\"',' ') 
    # remove b'
    text = text.replace("b\'",' ') 
    # remove \\n
    text = text.replace('\\n',' ')
    #Remove &amp
    text = text.replace('&amp',' ') 
    # remove UTF-8 code like \\xe2
    text = re.sub(r'(\\x(.){2})', ' ',text) 
    #Standalone sequences for specials
    text = re.sub(r'(?:^|\s)[;.\'\"&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ',text)
    #stand alone sequence of hyphens 
    text= re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ',text)
    # Sequence of white spaces
    text = re.sub(r'\s+',' ',text)
    return text.strip()

In [11]:
non_spoiler["Reviews"] = non_spoiler["Reviews"].map(clean)

In [12]:
spoiler["Reviews"] = spoiler["Reviews"].map(clean)

In [13]:
plot["Synopsis"] = plot["Synopsis"].map(clean)

In [14]:
spoiler.head()

Unnamed: 0,Reviews,Title
0,The Shawshank Redemption is written and direct...,/title/tt0111161/
1,"In its Oscar year, Shawshank Redemption (writt...",/title/tt0111161/
2,"Based on a novella by Stephen King, this is be...",/title/tt0111161/
3,The Shawshank Redemption is without a doubt on...,/title/tt0111161/
4,None of the usual otherworld creatures that po...,/title/tt0111161/


In [15]:
plot.head()

Unnamed: 0,Title,Synopsis
0,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank..."
1,/title/tt0068646/,"In late summer 1945, guests are gathered for t..."
2,/title/tt0468569/,The movie begins with a gang of men with clown...
3,/title/tt0071562/,The Godfather Part II presents two parallel st...
4,/title/tt0050083/,"In a New York City courthouse, an eighteen-yea..."


In [16]:
non_spoiler.head()

Unnamed: 0,Reviews,Title
0,It is no wonder that the film has such a high ...,/title/tt0111161/
1,I'm trying to save you money; this is the last...,/title/tt0111161/
2,This movie is not your ordinary Hollywood flic...,/title/tt0111161/
3,One of the finest films made in recent years. ...,/title/tt0111161/
4,Misery and Stand By Me were the best adaptatio...,/title/tt0111161/


In [17]:
spoilers_final = pd.merge(spoiler, plot, on=['Title','Title'])

In [18]:
spoilers_final.tail()

Unnamed: 0,Reviews,Title,Synopsis
1202,"Why don't we live in Westport!, My wife is the...",/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...
1203,What I learned from American beauty: 1. Smokin...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...
1204,"Of the 250+ films I've seen and rated on IMDb,...",/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...
1205,American Beauty is the greatest movie ever mad...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...
1206,And there were a lot of great ones this year. ...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...


In [19]:
spoilers_final.shape

(1207, 3)

In [20]:
non_spoiler_final = pd.merge(non_spoiler, plot, on=['Title','Title'])

In [21]:
non_spoiler_final.tail()

Unnamed: 0,Reviews,Title,Synopsis
3785,"""American Beauty"" is tour de force cinema. Sam...",/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...
3786,Probably the best film of 1999. This dark come...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...
3787,The intention is so clear that everything else...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...
3788,This movie was a joy to watch. I didn't know w...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...
3789,"When I first saw this movie in theaters, I fou...",/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...


In [22]:
non_spoiler_final.shape

(3790, 3)

In [23]:
#cosine similarity

In [24]:
#spoiler

In [25]:
tfidf = TfidfVectorizer()

In [26]:
text_matrix1 = tfidf.fit_transform(spoilers_final['Reviews'].values.astype('U'))
text_matrix2 = tfidf.transform(spoilers_final['Synopsis'].values.astype('U'))


In [27]:
similarity_matrix = cosine_similarity(text_matrix1, text_matrix2)


In [28]:
spoilers_final['Cosine_Similarity'] = similarity_matrix.diagonal()


In [29]:
spoilers_final['Cosine_Similarity'].argmax()

404

In [30]:
spoilers_final.iloc[404]

Reviews              Even though it is nowadays considered one of t...
Title                                                /title/tt0038650/
Synopsis             This movie is about a divine intervention by a...
Cosine_Similarity                                             0.768337
Name: 404, dtype: object

In [31]:
spoilers_final['Cosine_Similarity'].describe()

count    1207.000000
mean        0.376269
std         0.148885
min         0.048705
25%         0.266566
50%         0.365304
75%         0.475660
max         0.768337
Name: Cosine_Similarity, dtype: float64

In [33]:
spoilers_final['Cosine_Similarity'].argmin()

976

In [34]:
for i in spoilers_final.iloc[976]:
    print(i)

Great direction, screenplay, performances, soundtrack, cinematography, costume design, production design, all the things you can expect from a Tarantino film. Also like others Tarantino films, only those who really have a extensive knowledge of films are going to really appreciate some things, like the Franco Nero scene.
/title/tt1853728/
In 1858, Django ( Jamie Foxx ), a slave, is chained to a bunch of other slaves and being marched to his new owner's estate in Texas by the Speck brothers. At nightfall, a German man in a dentist cart pulls up and hails the Speck brothers. He introduces himself as Dr. King Schultz ( Christoph Waltz ). Schultz is clearly more intelligent and enlightened than the Specks. He says he is looking for a slave who can identify a band of wanted fugitives known as the Brittle brothers. Django announces that he knows the Brittle brothers and can identify them. Schultz offers to buy Django, but his polite and educated manner rubs the ill-mannered Specks the wrong 

In [35]:
spoilers_final.iloc[976]

Reviews              Great direction, screenplay, performances, sou...
Title                                                /title/tt1853728/
Synopsis             In 1858, Django ( Jamie Foxx ), a slave, is ch...
Cosine_Similarity                                             0.048705
Name: 976, dtype: object

In [36]:
len(spoilers_final[spoilers_final['Cosine_Similarity']<0.10])


12

In [57]:
spoilers_final

Unnamed: 0,Reviews,Title,Synopsis,Cosine_Similarity,doc_similarity
0,The Shawshank Redemption is written and direct...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.452051,0.436470
1,"In its Oscar year, Shawshank Redemption (writt...",/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.437792,0.355432
2,"Based on a novella by Stephen King, this is be...",/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.205656,0.320277
3,The Shawshank Redemption is without a doubt on...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.431378,0.313466
4,None of the usual otherworld creatures that po...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.352788,0.464498
...,...,...,...,...,...
1202,"Why don't we live in Westport!, My wife is the...",/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.112705,0.349289
1203,What I learned from American beauty: 1. Smokin...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.119354,0.337446
1204,"Of the 250+ films I've seen and rated on IMDb,...",/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.267724,0.286844
1205,American Beauty is the greatest movie ever mad...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.379869,0.416507


In [37]:
#non spoiler

In [38]:
text_matrix3 = tfidf.fit_transform(non_spoiler_final['Reviews'].values.astype('U'))
text_matrix4 = tfidf.transform(non_spoiler_final['Synopsis'].values.astype('U'))

In [39]:
similarity_matrix = cosine_similarity(text_matrix3, text_matrix4)


In [40]:
non_spoiler_final['Cosine_Similarity'] = similarity_matrix.diagonal()


In [41]:
non_spoiler_final['Cosine_Similarity'].describe()

count    3790.000000
mean        0.253450
std         0.134210
min         0.000000
25%         0.145328
50%         0.230399
75%         0.343198
max         0.727958
Name: Cosine_Similarity, dtype: float64

In [42]:
non_spoiler_final['Cosine_Similarity'].argmax()

1559

In [43]:
for i in non_spoiler_final.iloc[1559]:
    print(i)
    break
    

Normally when people tell you about a sequel that was better than the original or just as good, Terminator 2 is always guaranteed to be in their list; why? Because this is THE action movie of all action movies, next to Die Hard, this is the movie that isn't just about the action as well, but has an incredible story and message behind it that will always stay with you. Terminator 2 like the first Terminator film has memorable lines, moments, and incredible effects. This is the film that made you believe in "liquid metal" machines. Robert Patrick's performance is flawless, to be honest I found him a million times more terrifying than Arnold in the first Terminator, because Robert looks like this normal average guy, but he's not like Arnold where he gets shot and you can clearly see he's a terminator, Robert goes back to human looking and won't stop. Not only that you don't know how to stop him. Linda Hamilton returns and gives a great performance as Sarah Conor who is no longer a meek li

In [44]:
#doc2vec

In [45]:
#spoilers data

In [46]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [47]:
texts = spoilers_final["Reviews"].tolist() + spoilers_final["Synopsis"].tolist()
preprocessed_texts = [text.lower().split() for text in texts]

In [48]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(preprocessed_texts)]
model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, epochs=50)

In [49]:
similarities = []
for i in range(len(spoilers_final)):
    review_vec = model.infer_vector(preprocessed_texts[i])
    synopsis_vec = model.infer_vector(preprocessed_texts[i + len(spoilers_final)])
    similarity = model.docvecs.similarity_unseen_docs(model, preprocessed_texts[i], preprocessed_texts[i + len(spoilers_final)], alpha=1, min_alpha=0.0001, steps=5)
    similarities.append(similarity)

# Add similarity scores as a new column to the DataFrame
spoilers_final['doc_similarity'] = similarities

In [58]:
spoilers_final.head()

Unnamed: 0,Reviews,Title,Synopsis,Cosine_Similarity,doc_similarity
0,The Shawshank Redemption is written and direct...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.452051,0.43647
1,"In its Oscar year, Shawshank Redemption (writt...",/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.437792,0.355432
2,"Based on a novella by Stephen King, this is be...",/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.205656,0.320277
3,The Shawshank Redemption is without a doubt on...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.431378,0.313466
4,None of the usual otherworld creatures that po...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.352788,0.464498


In [59]:
spoilers_final.doc_similarity.argmax()

129

In [60]:
spoilers_final.iloc[129]

Reviews              The Lord of the Rings: The Return of the King ...
Title                                                /title/tt0167260/
Synopsis             In the opening scene: a flashback shows two ho...
Cosine_Similarity                                             0.697823
doc_similarity                                                0.684038
Name: 129, dtype: object

In [61]:
spoilers_final[spoilers_final.doc_similarity>0.40]

Unnamed: 0,Reviews,Title,Synopsis,Cosine_Similarity,doc_similarity
0,The Shawshank Redemption is written and direct...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.452051,0.436470
4,None of the usual otherworld creatures that po...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.352788,0.464498
9,The Shawshank Redemption is written and direct...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.452051,0.453468
12,The Shawshank Redemption is without a doubt on...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.431378,0.434219
13,None of the usual otherworld creatures that po...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.352788,0.569499
...,...,...,...,...,...
1180,As much as I enjoy this movie (and as an anima...,/title/tt4633694/,This animated film starts with Peter Parker (v...,0.512315,0.433225
1184,The thing I hate about superhero movies is the...,/title/tt4633694/,This animated film starts with Peter Parker (v...,0.516704,0.427425
1189,American Beauty is the greatest movie ever mad...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.379869,0.444918
1197,American Beauty is the greatest movie ever mad...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.379869,0.468264


In [62]:
#non spoiler data

In [63]:
texts = non_spoiler_final["Reviews"].tolist() + non_spoiler_final["Synopsis"].tolist()
preprocessed_texts = [text.lower().split() for text in texts]

In [64]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(preprocessed_texts)]
model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, epochs=50)

In [65]:
similarities = []
for i in range(len(non_spoiler_final)):
    review_vec = model.infer_vector(preprocessed_texts[i])
    synopsis_vec = model.infer_vector(preprocessed_texts[i + len(non_spoiler_final)])
    similarity = model.docvecs.similarity_unseen_docs(model, preprocessed_texts[i], preprocessed_texts[i + len(non_spoiler_final)], alpha=1, min_alpha=0.0001, steps=5)
    similarities.append(similarity)

# Add similarity scores as a new column to the DataFrame
non_spoiler_final['doc_similarity'] = similarities

In [66]:
non_spoiler_final.head()

Unnamed: 0,Reviews,Title,Synopsis,Cosine_Similarity,doc_similarity
0,It is no wonder that the film has such a high ...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.139889,0.202139
1,I'm trying to save you money; this is the last...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.284965,0.372835
2,This movie is not your ordinary Hollywood flic...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.458681,0.321482
3,One of the finest films made in recent years. ...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.196977,0.108079
4,Misery and Stand By Me were the best adaptatio...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.30557,0.310521


In [67]:
non_spoiler_final[non_spoiler_final["doc_similarity"]>0.40]

Unnamed: 0,Reviews,Title,Synopsis,Cosine_Similarity,doc_similarity
13,"""The Shawshank Redemption"" should have won Bes...",/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.462606,0.423698
47,The story of this film takes place over a twen...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.680031,0.464823
48,It is now past 1 PM and I just finished watchi...,/title/tt0068646/,"In late summer 1945, guests are gathered for t...",0.379326,0.418883
76,My nephew who is all of 17 told me in no uncer...,/title/tt0068646/,"In late summer 1945, guests are gathered for t...",0.415246,0.417905
140,I used to leave a theatre after seeing a highl...,/title/tt0468569/,The movie begins with a gang of men with clown...,0.470816,0.405665
...,...,...,...,...,...
3649,Series note: It is strongly advised that you w...,/title/tt0090605/,"After the opening credits, we see a spacecraft...",0.476635,0.442713
3656,What more can be said that hasn't been said be...,/title/tt0090605/,"After the opening credits, we see a spacecraft...",0.355129,0.426704
3670,Series note: It is strongly advised that you w...,/title/tt0090605/,"After the opening credits, we see a spacecraft...",0.476635,0.463593
3674,I first saw this in the late 80s on a vhs. Rev...,/title/tt0090605/,"After the opening credits, we see a spacecraft...",0.089462,0.489831


In [68]:
non_spoiler_final.doc_similarity.argmax()

3079

In [69]:
non_spoiler_final.iloc[2212]

Reviews              After a bloodbath and fire on a moored ship on...
Title                                                /title/tt0114814/
Synopsis             On the deck of a ship in San Pedro, California...
Cosine_Similarity                                              0.58157
doc_similarity                                                0.406586
Name: 2212, dtype: object

In [70]:
non_spoiler_final[non_spoiler_final.doc_similarity>0.25]

Unnamed: 0,Reviews,Title,Synopsis,Cosine_Similarity,doc_similarity
1,I'm trying to save you money; this is the last...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.284965,0.372835
2,This movie is not your ordinary Hollywood flic...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.458681,0.321482
4,Misery and Stand By Me were the best adaptatio...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.305570,0.310521
7,The best movie in history and the best ending ...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.088396,0.258090
11,"Can Hollywood, usually creating things for ent...",/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.399542,0.257781
...,...,...,...,...,...
3778,I first saw this in theaters back in 1999. I l...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.563875,0.255717
3780,"Lester Burnham (Kevin Spacey), 42, has spent y...",/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.348117,0.297677
3783,"This is by far one of the best, if not THE, mo...",/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.502960,0.302877
3786,Probably the best film of 1999. This dark come...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.259557,0.387862


In [75]:
#outliers

In [77]:
#spoiler

In [73]:
spoilers_final[spoilers_final.doc_similarity<0.10]

Unnamed: 0,Reviews,Title,Synopsis,Cosine_Similarity,doc_similarity
116,"To all the ppl on the 1st four review pages, u...",/title/tt0108052/,The relocation of Polish Jews from surrounding...,0.221232,0.068011
344,Spoilers herein. Competent filmmakers are rare...,/title/tt0099685/,The film opens with three men driving in their...,0.220938,0.065175
854,There's two things that stand out to me always...,/title/tt0064116/,In the desert Southwest of America during the ...,0.261914,0.088917
1022,"""Wall-E"" along with ""Up"" must be one of the be...",/title/tt0910970/,A Dystopia in the Future Approximately seven h...,0.362451,0.09295
1158,It's hard to imagine how Dame Agatha could hav...,/title/tt0051201/,"A few years after War War II, in London, Leona...",0.147858,0.073731


In [78]:
#non spoiler

In [76]:
non_spoiler_final.doc_similarity.mean()

0.24081486463546753

In [80]:
non_spoiler_final[non_spoiler_final.doc_similarity<0.25]

Unnamed: 0,Reviews,Title,Synopsis,Cosine_Similarity,doc_similarity
0,It is no wonder that the film has such a high ...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.139889,0.202139
3,One of the finest films made in recent years. ...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.196977,0.108079
5,I've lost count of the number of times I have ...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.204272,0.187180
6,Two imprisoned men (Tim Robbins and Morgan Fre...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.109267,0.200362
8,The Shawshank Redemption has great performance...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank...",0.089680,0.183197
...,...,...,...,...,...
3782,This film is one of a kind. After seeing this ...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.316232,0.100068
3784,I have come to see the movie with a certain pr...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.144238,0.138452
3785,"""American Beauty"" is tour de force cinema. Sam...",/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.368562,0.209948
3787,The intention is so clear that everything else...,/title/tt0169547/,Lester Burnham (Kevin Spacey) is a 42-year-old...,0.165112,0.005265


In [81]:
non_spoiler_final.shape

(3790, 5)

In [85]:
import glob as g

def read_datasets(filepath):
    columns=['Index','Review','Title']
    df_main = pd.DataFrame()
    filenames = g.glob(filepath + '/*')
    for name in filenames:
        print(name)
        df = pd.read_excel(name)
        df_main = pd.concat([df_main,df])
    df_main.columns = columns
    return df_main

In [87]:
df_non_spoilers = read_datasets("D:/Lambton/Sem 3/Capstone project/Datasets/non spoiler")

D:/Lambton/Sem 3/Capstone project/Datasets/non spoiler\Non_Spoiler_list_sect1.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/non spoiler\Non_Spoiler_list_sect2.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/non spoiler\Non_Spoiler_list_sect3.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/non spoiler\Non_Spoiler_list_sect4.xlsx


In [88]:
df_non_spoilers

Unnamed: 0,Index,Review,Title
0,0,"<div class=""text show-more__control"">It is no ...",/title/tt0111161/
1,1,"<div class=""text show-more__control"">I'm tryin...",/title/tt0111161/
2,2,"<div class=""text show-more__control"">This movi...",/title/tt0111161/
3,3,"<div class=""text show-more__control"">One of th...",/title/tt0111161/
4,4,"<div class=""text show-more__control"">Misery an...",/title/tt0111161/
...,...,...,...
2921,2921,"<div class=""text show-more__control"">Like most...",/title/tt0099348/
2922,2922,"<div class=""text show-more__control"">And he de...",/title/tt0099348/
2923,2923,"<div class=""text show-more__control"">What the ...",/title/tt0099348/
2924,2924,"<div class=""text show-more__control"">Let me pr...",/title/tt0099348/


In [89]:
df_spoilers = read_datasets("D:/Lambton/Sem 3/Capstone project/Datasets/spoilers")

D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect1.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect11.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect12.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect13.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect14.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect15.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect16.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect17.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect18.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect19.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect2.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect20.xlsx
D:/Lambton/Sem 3/Capstone project/Datasets/spoilers\Spoiler_list_sect21.xlsx
D

In [90]:
df_spoilers

Unnamed: 0,Index,Review,Title
0,0,"<div class=""text show-more__control"">The Shaws...",/title/tt0111161/
1,1,"<div class=""text show-more__control"">In its Os...",/title/tt0111161/
2,2,"<div class=""text show-more__control"">Based on ...",/title/tt0111161/
3,3,"<div class=""text show-more__control"">The Shaws...",/title/tt0111161/
4,4,"<div class=""text show-more__control"">None of t...",/title/tt0111161/
...,...,...,...
770,770,"<div class=""text show-more__control"">It's hard...",/title/tt0099348/
771,771,"<div class=""text show-more__control"">""Dances W...",/title/tt0099348/
772,772,"<div class=""text show-more__control"">""Avatar"" ...",/title/tt0099348/
773,773,"<div class=""text show-more__control"">""Dances w...",/title/tt0099348/


In [91]:
df_spoilers.drop("Index",axis=1,inplace=True)

In [92]:
df_non_spoilers.drop("Index",axis=1,inplace=True)

In [93]:
plot

Unnamed: 0,Title,Synopsis
0,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank..."
1,/title/tt0068646/,"In late summer 1945, guests are gathered for t..."
2,/title/tt0468569/,The movie begins with a gang of men with clown...
3,/title/tt0071562/,The Godfather Part II presents two parallel st...
4,/title/tt0050083/,"In a New York City courthouse, an eighteen-yea..."
...,...,...
245,/title/tt0071411/,Dersu Uzala is named after the character playe...
246,/title/tt1454029/,"In civil-rights era Jackson, Mississippi, 23-y..."
247,/title/tt0103639/,A street peddler (Robin Williams) is guiding t...
248,/title/tt0083987/,The story begins with the assassination of Moh...


In [101]:
#data cleaning

In [95]:
df_spoilers["Review"] = df_spoilers["Review"].map(clean)

In [96]:
df_non_spoilers["Review"] = df_non_spoilers["Review"].map(clean)

In [97]:
#plot["Synopsis"] = plot["Synopsis"].map(clean)

In [98]:
#merging files

In [99]:
spoilers_final = pd.merge(df_spoilers, plot, on=['Title','Title'])

In [100]:
non_spoiler_final = pd.merge(df_non_spoilers, plot, on=['Title','Title'])

In [106]:
df_spoilers.shape

(5816, 2)

In [104]:
spoilers_final.shape

(4291, 3)

In [105]:
non_spoiler_final.shape

(14179, 3)

In [117]:
spoilers_final.isna().sum()

Review      0
Title       0
Synopsis    0
dtype: int64

In [118]:
df_spoilers.isna().sum()

Review       0
Title     1450
dtype: int64

In [119]:
df_spoilers.shape

(5816, 2)

In [126]:
df_spoilers[df_spoilers.Title.isna()]


Unnamed: 0,Review,Title
0,The Shawshank Redemption is written and direct...,
1,"In its Oscar year, Shawshank Redemption (writt...",
2,"Based on a novella by Stephen King, this is be...",
3,The Shawshank Redemption is without a doubt on...,
4,None of the usual otherworld creatures that po...,
...,...,...
80,"I'm not at all an emotional person,but this mo...",
81,"Oh my, hold everything. You want to be prepare...",
82,I don't think you find a better example of a d...,
83,"Well, I just came back from seeing this in Shi...",


In [129]:
spoilers_final[spoilers_final.Title.isna()]


Unnamed: 0,Review,Title,Synopsis


In [130]:
spoilers_final

Unnamed: 0,Review,Title,Synopsis
0,The Shawshank Redemption is written and direct...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank..."
1,"In its Oscar year, Shawshank Redemption (writt...",/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank..."
2,"Based on a novella by Stephen King, this is be...",/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank..."
3,The Shawshank Redemption is without a doubt on...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank..."
4,None of the usual otherworld creatures that po...,/title/tt0111161/,"In 1947, Andy Dufresne ( Tim Robbins ), a bank..."
...,...,...,...
4286,It's hard for me to believe that fourteen year...,/title/tt0099348/,"During a US Civil War battle in Tennessee, Uni..."
4287,"""Dances With Wolves"" is another great period p...",/title/tt0099348/,"During a US Civil War battle in Tennessee, Uni..."
4288,"""Avatar"" vs ""Dances With Wolves"". Round One. F...",/title/tt0099348/,"During a US Civil War battle in Tennessee, Uni..."
4289,"""Dances with Wolves"" is hated by quite a lot o...",/title/tt0099348/,"During a US Civil War battle in Tennessee, Uni..."


In [4]:
# importing the required modules
import glob
import pandas as pd
 
# specifying the path to csv files
path = "D:/Lambton/Sem 3/Capstone project/Datasets/spoilers"
 
# csv files in the path
file_list = glob.glob(path + "/*.xlsx")
 
# list of excel files we want to merge.
# pd.read_excel(file_path) reads the excel
# data into pandas dataframe.
excl_list = []
 
for file in file_list:
    excl_list.append(pd.read_excel(file))
 
# create a new dataframe to store the
# merged excel file.
excl_merged = pd.DataFrame()
 
for excl_file in excl_list:
     
    # appends the data into the excl_merged
    # dataframe.
    excl_merged = excl_merged.append(
      excl_file, ignore_index=True)
 
# exports the dataframe into excel file with
# specified name.
excl_merged.to_excel('Spoilers_final.xlsx', index=False)

In [3]:
# importing the required modules
import glob
import pandas as pd
 
# specifying the path to csv files
path = "D:/Lambton/Sem 3/Capstone project/Datasets/non spoiler"
 
# csv files in the path
file_list = glob.glob(path + "/*.xlsx")
 
# list of excel files we want to merge.
# pd.read_excel(file_path) reads the excel
# data into pandas dataframe.
excl_list = []
 
for file in file_list:
    excl_list.append(pd.read_excel(file))
 
# create a new dataframe to store the
# merged excel file.
excl_merged = pd.DataFrame()
 
for excl_file in excl_list:
     
    # appends the data into the excl_merged
    # dataframe.
    excl_merged = excl_merged.append(
      excl_file, ignore_index=True)
 
# exports the dataframe into excel file with
# specified name.
excl_merged.to_excel('Non_Spoilers_final.xlsx', index=False)

In [133]:
Spoiler_final = pd.read_excel("S_final.xlsx")

In [137]:
Spoiler_final = Spoiler_final.rename(columns={0:'Reviews', 1:'Title'})

In [139]:
Spoiler_final

1450