## Dataset

In [1]:
# https://www.kaggle.com/datasets/andrezaza/clapper-massive-rotten-tomatoes-movies-and-reviews
import pandas as pd
df_movie_names = pd.read_csv('rotten_tomatoes_movies.csv')

In [2]:
df_movie_names.head(1)

Unnamed: 0,id,title,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix
0,space-zombie-bingo,Space Zombie Bingo!,50.0,,,,,2018-08-25,75.0,"Comedy, Horror, Sci-fi",English,George Ormrod,"George Ormrod,John Sabotta",,,


In [3]:
len(df_movie_names)

143258

In [4]:
for column in df_movie_names.columns:
    if(column != 'id' and column != 'title' and column !='genre'):
        del df_movie_names[column]

In [5]:
df_movie_names.set_index('id', inplace=True)
df_movie_names.head(1)

Unnamed: 0_level_0,title,genre
id,Unnamed: 1_level_1,Unnamed: 2_level_1
space-zombie-bingo,Space Zombie Bingo!,"Comedy, Horror, Sci-fi"


In [6]:
df_movie = pd.read_csv('rotten_tomatoes_movie_reviews.csv')
df_movie.head(5)

Unnamed: 0,id,reviewId,creationDate,criticName,isTopCritic,originalScore,reviewState,publicatioName,reviewText,scoreSentiment,reviewUrl
0,beavers,1145982,2003-05-23,Ivan M. Lincoln,False,3.5/4,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,http://www.deseretnews.com/article/700003233/B...
1,blood_mask,1636744,2007-06-02,The Foywonder,False,1/5,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,http://www.dreadcentral.com/index.php?name=Rev...
2,city_hunter_shinjuku_private_eyes,2590987,2019-05-28,Reuben Baron,False,,fresh,CBR,The choreography is so precise and lifelike at...,POSITIVE,https://www.cbr.com/city-hunter-shinjuku-priva...
3,city_hunter_shinjuku_private_eyes,2558908,2019-02-14,Matt Schley,False,2.5/5,rotten,Japan Times,The film's out-of-touch attempts at humor may ...,NEGATIVE,https://www.japantimes.co.jp/culture/2019/02/0...
4,dangerous_men_2015,2504681,2018-08-29,Pat Padua,False,,fresh,DCist,Its clumsy determination is endearing and some...,POSITIVE,http://dcist.com/2015/11/out_of_frame_dangerou...


Filtering based on positive reviews, because we are attempting to match descriptors of a movie to an input. Negative reviews may associate terms with the movie that are inaccurate; like "not very fast-paced" might asscociate "fast-paced" with a slow movie. 

In [7]:
bool_positive = df_movie['scoreSentiment'] == 'POSITIVE'
df_movie = df_movie[bool_positive]
df_movie.scoreSentiment

0          POSITIVE
2          POSITIVE
4          POSITIVE
5          POSITIVE
6          POSITIVE
             ...   
1444957    POSITIVE
1444958    POSITIVE
1444959    POSITIVE
1444960    POSITIVE
1444962    POSITIVE
Name: scoreSentiment, Length: 963799, dtype: object

In [8]:
for column in df_movie.columns:
    if(column != 'id' and column != 'reviewText'):
        del df_movie[column]

In [9]:
# putting the proper name of the movies 
dict_movie = df_movie_names.to_dict()['title']
dict_genre = df_movie_names.to_dict()['genre']
movieids = df_movie.id.tolist()

In [10]:
def to_title(m):
    new = None
    try:
        new = dict_movie[m]
    except:
        pass
    return new

def to_genre(m):
    new = None
    try:
        new = dict_genre[m]
    except:
        pass
    return new

df_movie['title'] = pd.Series(list(map(to_title, movieids)))
df_movie['genre'] = pd.Series(list(map(to_genre, movieids)))

In [11]:
df_movie.head()

Unnamed: 0,id,reviewText,title,genre
0,beavers,Timed to be just long enough for most youngste...,Beavers,Documentary
2,city_hunter_shinjuku_private_eyes,The choreography is so precise and lifelike at...,Dangerous Men,
4,dangerous_men_2015,Its clumsy determination is endearing and some...,Dangerous Men,
5,dangerous_men_2015,"With every new minute, there's another head-sc...",Dangerous Men,
6,dangerous_men_2015,"Emotionless reaction shots, zero characterizat...",Dangerous Men,


In [12]:
del df_movie['id']

In [13]:
df_movie.head(1)

Unnamed: 0,reviewText,title,genre
0,Timed to be just long enough for most youngste...,Beavers,Documentary


In [14]:
# read data 
file_path = 'Hydra-Movie-Scrape.csv'
df_hydra = pd.read_csv(file_path)
df_hydra.head(1)

Unnamed: 0,Title,Year,Summary,Short Summary,Genres,IMDB ID,Runtime,YouTube Trailer,Rating,Movie Poster,Director,Writers,Cast
0,Patton Oswalt: Annihilation,2017,"Patton Oswald, despite a personal tragedy, pro...","Patton Oswalt, despite a personal tragedy, pro...",Uncategorized,tt7026230,66,4hZi5QaMBFc,7.4,https://hydramovies.com/wp-content/uploads/201...,Bobcat Goldthwait,Patton Oswalt,Patton Oswalt


In [15]:
len(df_hydra)

3940

In [19]:
prev_titles = df_movie.title.tolist()
bool_in_rt_set = df_hydra['Title'].apply(lambda t: t in prev_titles)
df_hydra = df_hydra[bool_in_rt_set]

In [21]:
for column in df_hydra.columns:
    if(column != 'Title' and column != 'Genres' and column !='Short Summary'):
        del df_hydra[column]

Unnamed: 0,Title,Short Summary,Genres


In [22]:
df_hydra.head(1)

Unnamed: 0,Title,Short Summary,Genres
0,Patton Oswalt: Annihilation,"Patton Oswalt, despite a personal tragedy, pro...",Uncategorized


Making a compund dataset

In [26]:
df_movie = df_movie.rename(columns={'reviewText': 'summary'})
df_hydra = df_hydra.rename(columns={'Title': 'title', 'Short Summary': 'summary', "Genres":'genre'})

In [30]:
df_movies = pd.concat([df_movie, df_hydra], axis=0)
df_movies.dropna(inplace=True)

In [33]:
import re

# Sample text
text = "The quick brown fox jumps over the lazy dog."

# Regular expression pattern
pattern = r'\b\w{5}\b'  # Match words with exactly 5 characters

# Search for the first match
re.search(pattern, text).group()

'quick'

In [43]:
import re
df_movies['genre'] = df_movies['genre'].apply(lambda g: re.search('(\w*)[|,]*[$]?', g).group(1))

  df_movies['genre'] = df_movies['genre'].apply(lambda g: re.search('(\w*)[|,]*[$]?', g).group(1))


In [44]:
df_movies

Unnamed: 0,summary,title,genre
0,Timed to be just long enough for most youngste...,Beavers,Documentary
12,"Ridiculous, artless, and wildly entertaining, ...",Small Town Wisconsin,Comedy
13,To sit through it feels like honoring the drea...,Small Town Wisconsin,Comedy
14,Small Town Wisconsin could hit some home truth...,Small Town Wisconsin,Comedy
15,This low-key drama has lovely interludes and s...,Small Town Wisconsin,Comedy
...,...,...,...
3930,The life of a children's book author is turned...,The Child in Time,Drama
3931,Molly Mahoney is the awkward and insecure mana...,Mr. Magorium's Wonder Emporium,Comedy
3934,An employee at a weapons factory discovers tha...,Office Uprising,Action
3935,A security expert must infiltrate a burning sk...,Skyscraper,Action


In [28]:
# https://spotintelligence.com/2023/01/11/lstm-in-nlp-tasks/

## LSTM with the above data

In [149]:
#sequences of data
df_movies = df_movies.sample(n=30000)

texts = df_movies['summary'].tolist()
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=80)
print('Shape of data tensor:', X.shape)

genre = df_movies['genre'].tolist()

Y = pd.get_dummies(df_movies['title']).values
print('Shape of label tensor:', Y.shape)

labels = list(set(df_movies['title'].tolist()))

Shape of data tensor: (30000, 80)
Shape of label tensor: (30000, 13056)


In [150]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(27000, 80) (27000, 13056)
(3000, 80) (3000, 13056)


In [151]:
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 80

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(13056, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 25
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1)

Epoch 1/25
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 116ms/step - accuracy: 7.6597e-04 - loss: 9.4651 - val_accuracy: 7.4074e-04 - val_loss: 9.4850
Epoch 2/25
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 104ms/step - accuracy: 0.0014 - loss: 9.2354 - val_accuracy: 7.4074e-04 - val_loss: 9.7066
Epoch 3/25
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 105ms/step - accuracy: 0.0015 - loss: 8.9894 - val_accuracy: 0.0000e+00 - val_loss: 10.0648
Epoch 4/25
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 106ms/step - accuracy: 0.0013 - loss: 8.6922 - val_accuracy: 3.7037e-04 - val_loss: 10.3714
Epoch 5/25
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 103ms/step - accuracy: 0.0031 - loss: 8.3073 - val_accuracy: 3.7037e-04 - val_loss: 10.7478
Epoch 6/25
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 108ms/step - accuracy: 0.0058 - loss: 7.8707 - val_accuracy: 0.0011 - 

In [156]:
def predict_movie(summary):
    sequences = tokenizer.texts_to_sequences([summary])
    X_new = pad_sequences(sequences, maxlen=80)
    predictions = model.predict(X_new) 
    return labels[np.argmax(predictions[0])]
    
predict_movie("Romantic Film that has enemies to lovers")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step


'Blume in Love'