In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import gensim
print(gensim.__version__)


2024-04-22 00:28:06.828039: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


4.3.2


In [2]:
# Load the dataset
data = pd.read_csv('cmu_dataset_v3.csv')

# Preprocess the text
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize, remove stopwords, and lemmatize
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and (token.isalnum() or token == '.')]
    return ' '.join(filtered_tokens)

data['processed_plot'] = data['plot'].apply(preprocess_text)

# Preprocess the genres
data['genres'] = data['genre'].apply(lambda x: x.split('|'))
print(data['processed_plot'])
# Open a text file in write mode
with open("12345.txt", "w") as file:
    # Write the content of 'processed_plot' column to the file
    for plot_summary in data['processed_plot']:
        file.write(plot_summary + "\n")

mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genres'])

[nltk_data] Downloading package stopwords to /Users/johan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/johan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0       mavis arden movie star get romantically involv...
1       jimmy robert boy ca seem say anybody result en...
2       elmer becomes fireman particularly good one . ...
3       three dog one cat naturally suspicious . first...
4       couple bungling idiot abandoned deep jungle af...
                              ...                        
5995    rock widow try come term death husband artist ...
5996    blind mountain follows young woman bai xuemei ...
5997    two unsuccessful year pursuing art career pari...
5998    tale follows history mary boleyn sister anne b...
5999    film follows two officer istanbul sent new yor...
Name: processed_plot, Length: 6000, dtype: object


In [3]:
print(data['processed_plot'][0])

mavis arden movie star get romantically involved politician . make plan meet next tour stop roll royce break left stranded middle rural town . manager arranges stay local boarding house . immediately set eye young mechanic fixing car bud norton played randolph scott . west comedic rendition lovely arthur burke song saying moon trying seduce scott .


In [4]:
# Load the pre-trained word2vec embeddings
# word2vec_model = KeyedVectors.load_word2vec_format('word2vec_model_from_cmu_utf8.bin', binary=True) # Adjust the path accordingly
# word2vec_model = KeyedVectors.load_word2vec_format('word2vec_model_from_cmu.bin', binary=True, encoding='latin1')
word2vec_model = KeyedVectors.load('word2vec_model_from_cmu_utf8.bin')



# Tokenize and pad the text sequences
# Create a Tokenizer and configure it to retain dots
tokenizer = Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n')  # Remove all punctuations except dots
tokenizer.fit_on_texts(data['processed_plot'])
sequences = tokenizer.texts_to_sequences(data['processed_plot'])
word_index = tokenizer.word_index
padded_sequences = pad_sequences(sequences, maxlen=300)
print(sequences[0])
print(padded_sequences[0])

# Create the embedding matrix
embedding_dim = word2vec_model.vector_size  # Adjust the dimensionality according to your word2vec embeddings
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

[24869, 8506, 129, 562, 5, 3768, 506, 2100, 1, 20, 82, 37, 52, 1669, 99, 1412, 6176, 109, 124, 2895, 742, 2101, 83, 1, 819, 1512, 176, 100, 2305, 12, 1, 368, 86, 297, 50, 2828, 9247, 42, 2217, 3825, 1093, 3826, 567, 1, 1050, 8855, 12266, 2896, 1782, 3529, 588, 409, 1287, 163, 2255, 567, 1]
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0    

In [5]:
reverted_texts = tokenizer.sequences_to_texts(padded_sequences)
print(reverted_texts[0])

mavis arden movie star get romantically involved politician . make plan meet next tour stop roll royce break left stranded middle rural town . manager arranges stay local boarding house . immediately set eye young mechanic fixing car bud norton played randolph scott . west comedic rendition lovely arthur burke song saying moon trying seduce scott .


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.initializers import Constant

# Define the LSTM model
model = Sequential()
model.add(Embedding(len(word_index) + 1, embedding_dim, embeddings_initializer=Constant(embedding_matrix), trainable=False))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(mlb.classes_), activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, genres_encoded, test_size=0.2, random_state=42)


# Count the number of elements in each variable
num_X_train = len(X_train)
num_X_test = len(X_test)
num_y_train = len(y_train)
num_y_test = len(y_test)

# Print the counts
print(f"Number of elements in X_train: {num_X_train}")
print(f"Number of elements in X_test: {num_X_test}")
print(f"Number of elements in y_train: {num_y_train}")
print(f"Number of elements in y_test: {num_y_test}")


# Train the model
model.fit(X_train, y_train, batch_size=64, epochs=3, validation_data=(X_test, y_test))

Number of elements in X_train: 4800
Number of elements in X_test: 1200
Number of elements in y_train: 4800
Number of elements in y_test: 1200
Epoch 1/3
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 579ms/step - accuracy: 0.3164 - loss: 0.5795 - val_accuracy: 0.4883 - val_loss: 0.4836
Epoch 2/3
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 679ms/step - accuracy: 0.5106 - loss: 0.4697 - val_accuracy: 0.5742 - val_loss: 0.4239
Epoch 3/3
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 663ms/step - accuracy: 0.6247 - loss: 0.4020 - val_accuracy: 0.6350 - val_loss: 0.3911


<keras.src.callbacks.history.History at 0x7fb3cd61aa90>

In [8]:
# Convert sequences back to text
X_train_reverted = tokenizer.sequences_to_texts(X_test)

sentencecount = 0
plotcount = 0
sentence_level_predictions = []

# Assuming X_train_reverted is a list of plot summaries
for plot_summary in X_train_reverted:
    # Split the plot summary into sentences based on "."
    sentences = [sentence.strip() for sentence in plot_summary.split(".") if sentence.strip()]
    num_sentences = len(sentences)
    # print("Number of sentences:", num_sentences)
    # print(plot_summary)
    # scores for each value in eg [0.24 0.65 0.22 0.12] (then devide it by number of sentences)
    score1 = 0
    score2 = 0
    score3 = 0
    score4 = 0
    
    for sentence in sentences:
        # print("Sentence loop beginning")
        # print(sentence)
        sequence = tokenizer.texts_to_sequences([sentence])
        padded_sequence = pad_sequences(sequence, maxlen=300)
        # print(padded_sequence[0])
        prediction = model.predict(padded_sequence)
        # print(prediction)
        #print(prediction[0][0])
        #print(prediction[0][1])
        #print(prediction[0][2])
        #print(prediction[0][3])
        score1 += prediction[0][0]
        score2 += prediction[0][1]
        score3 += prediction[0][2]
        score4 += prediction[0][3]
    plotcount += 1
    #print("Plots processed:", plotcount)
    processed_score1 = score1 / num_sentences
    processed_score2 = score2 / num_sentences
    processed_score3 = score3 / num_sentences
    processed_score4 = score4 / num_sentences
    processed_values = [processed_score1,processed_score2,processed_score3,processed_score4]
    #print('Processed values:')
    #print(processed_values)
    sentence_level_predictions.append(processed_values)
    print(plotcount)
#print("Total sentences processed:", sentencecount)





[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 229ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

In [10]:
with open("sentence_level_predictions.txt", "w") as file:
    for prediction in sentence_level_predictions:
        line = " ".join(map(str, prediction))
        file.write(line + "\n")

In [16]:
# Make predictions on the test set
y_pred_classes_sentence = (sentence_level_predictions > 0.5).astype(int) #0.5 här är att den uppskattade sannolikheten måste vara över 0.5. Annars blir den None (gör stor inpact på precision/recall men är oftast en trade off mellan dom båda)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_classes_sentence)
precision = precision_score(y_test, y_pred_classes_sentence, average='micro')
recall = recall_score(y_test, y_pred_classes_sentence, average='micro')
f1 = f1_score(y_test, y_pred_classes_sentence, average='micro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')
print(y_train)
reverted_labels = mlb.inverse_transform(y_train)
print(reverted_labels)

TypeError: '>' not supported between instances of 'list' and 'float'

In [31]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize an empty list to store the predicted classes
y_pred_classes_sentence = []

# Apply the threshold to each prediction
for prediction_values in sentence_level_predictions:
    # Convert the prediction values to a NumPy array
    prediction_values = np.array(prediction_values)
    
    # Apply the threshold and convert to int
    prediction_class = (prediction_values > 0.39).astype(int)
    
    # Append the prediction class to the list
    y_pred_classes_sentence.append(prediction_class)

# Convert the list of lists to a NumPy array
y_pred_classes_sentence = np.array(y_pred_classes_sentence)

# Now you can calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_classes_sentence)
precision = precision_score(y_test, y_pred_classes_sentence, average='micro')
recall = recall_score(y_test, y_pred_classes_sentence, average='micro')
f1 = f1_score(y_test, y_pred_classes_sentence, average='micro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')


Accuracy: 0.48833333333333334
Precision: 0.6568537258509659
Recall: 0.595
F1-score: 0.6243987756886751


In [None]:
text = "clark young mathematics major university think found best deal student housing group squatter live abandoned hospital secretly . quirky resident let community provided follow rule including telling anyone living arrangement . seems wonderful discovers reason hospital abandoned series murder 1940s strange shrieking killer never captured discovery someone living hospital using occult mean bring back demonic shrieker ."
sentences = text.split(".")
for sentence in sentences:
    print(sentence.strip())  # Remove leading and trailing whitespaces
    print()  # Print an empty line between sentences
num_sentences = len(sentences)
print("Number of sentences:", num_sentences)

sentences = [sentence.strip() for sentence in text.split(".") if sentence.strip()]
num_sentences = len(sentences)
print("Number of sentences:", num_sentences)

In [None]:
# Convert sequences back to text
reverted_texts = tokenizer.sequences_to_texts(X_test)
print(reverted_texts[0])
print(X_test[0])

In [None]:
# Assuming X_test contains preprocessed and padded plot summaries
# Extract plot summary number 10 (index 9) from X_test
plot_summary_10 = X_test[9]
# Reshape the plot summary to match the input shape expected by the model
plot_summary_10 = plot_summary_10.reshape(1, -1)
# Make prediction on the extracted plot summary
prediction_10 = model.predict(plot_summary_10)

print(plot_summary_10)
print(prediction_10)


In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.39).astype(int) #0.5 här är att den uppskattade sannolikheten måste vara över 0.5. Annars blir den None (gör stor inpact på precision/recall men är oftast en trade off mellan dom båda)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes, average='micro')
recall = recall_score(y_test, y_pred_classes, average='micro')
f1 = f1_score(y_test, y_pred_classes, average='micro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')
print(y_train)
reverted_labels = mlb.inverse_transform(y_train)
print(reverted_labels)

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 128ms/step
Accuracy: 0.5816666666666667
Precision: 0.639141205615194
Recall: 0.645
F1-score: 0.6420572376607216
[[0 0 0 1]
 [0 1 0 0]
 [0 0 1 0]
 ...
 [0 1 0 0]
 [0 1 0 0]
 [1 0 0 0]]
[('Thriller',), ('Drama',), ('Horror',), ('Horror',), ('Horror',), ('Horror',), ('Drama',), ('Thriller',), ('Comedy',), ('Drama',), ('Horror',), ('Comedy',), ('Comedy',), ('Drama',), ('Horror',), ('Comedy',), ('Thriller',), ('Drama',), ('Drama',), ('Horror',), ('Drama',), ('Comedy',), ('Thriller',), ('Thriller',), ('Comedy',), ('Drama',), ('Horror',), ('Comedy',), ('Comedy',), ('Thriller',), ('Thriller',), ('Drama',), ('Comedy',), ('Horror',), ('Drama',), ('Horror',), ('Thriller',), ('Drama',), ('Drama',), ('Horror',), ('Horror',), ('Drama',), ('Horror',), ('Comedy',), ('Comedy',), ('Thriller',), ('Thriller',), ('Horror',), ('Thriller',), ('Thriller',), ('Thriller',), ('Comedy',), ('Comedy',), ('Horror',), ('Drama',), ('Comedy',), ('Comedy',)

In [None]:
print(y_pred)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate evaluation metrics for each genre
genre_scores = {}
for i, genre in enumerate(mlb.classes_):
    genre_accuracy = accuracy_score(y_test[:, i], y_pred_classes[:, i])
    genre_precision = precision_score(y_test[:, i], y_pred_classes[:, i])
    genre_recall = recall_score(y_test[:, i], y_pred_classes[:, i])
    genre_f1 = f1_score(y_test[:, i], y_pred_classes[:, i])
    
    genre_scores[genre] = {'Accuracy': genre_accuracy,
                           'Precision': genre_precision,
                           'Recall': genre_recall,
                           'F1-score': genre_f1}

# Print scores for each genre
for genre, scores in genre_scores.items():
    print(f'Genre: {genre}')
    print(f'Accuracy: {scores["Accuracy"]}')
    print(f'Precision: {scores["Precision"]}')
    print(f'Recall: {scores["Recall"]}')
    print(f'F1-score: {scores["F1-score"]}')
    print()


In [None]:
# Count the number of times each genre is predicted
genre_counts = {}
for i, genre in enumerate(mlb.classes_):
    genre_count = np.sum(y_pred_classes[:, i])
    genre_counts[genre] = genre_count

# Print the counts for each genre
for genre, count in genre_counts.items():
    print(f"Predicted {genre}: {count} times")


In [14]:
# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

# Initialize a list to store predicted genres
predicted_genres = []

# Iterate over each prediction and determine the genre
for prediction in y_pred_classes:
    # Assuming mlb.classes_ contains the genre labels
    genres = [mlb.classes_[i] for i, pred in enumerate(prediction) if pred == 1]
    # If no genres are predicted, append None
    if not genres:
        genres = [None]
    # Append the predicted genres for this plot summary
    predicted_genres.append(genres)

# Print the predicted genres for each plot summary
for idx, genres in enumerate(predicted_genres):
    if genres[0] is not None:
        print(f"Predicted genre for plot summary {idx + 1}: {', '.join(genres)}")
    else:
        print(f"Predicted genre for plot summary {idx + 1}: None")

# Write the predicted genres to a text file
with open("predictions.txt", "w") as file:
    # Write the predicted genres for each plot summary
    for idx, genres in enumerate(predicted_genres):
        if genres[0] is not None:
            file.write(f"Predicted genre for plot summary {idx + 1}: {', '.join(genres)}\n")
        else:
            file.write(f"Predicted genre for plot summary {idx + 1}: None\n")



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 126ms/step
Predicted genre for plot summary 1: Horror
Predicted genre for plot summary 2: Thriller
Predicted genre for plot summary 3: Comedy
Predicted genre for plot summary 4: Horror
Predicted genre for plot summary 5: Drama
Predicted genre for plot summary 6: Comedy
Predicted genre for plot summary 7: Comedy
Predicted genre for plot summary 8: Drama
Predicted genre for plot summary 9: Horror
Predicted genre for plot summary 10: None
Predicted genre for plot summary 11: None
Predicted genre for plot summary 12: None
Predicted genre for plot summary 13: Drama
Predicted genre for plot summary 14: Comedy
Predicted genre for plot summary 15: None
Predicted genre for plot summary 16: Horror
Predicted genre for plot summary 17: Horror
Predicted genre for plot summary 18: None
Predicted genre for plot summary 19: Drama
Predicted genre for plot summary 20: None
Predicted genre for plot summary 21: None
Predicted genre for plot s

In [15]:
with open("y_pred.txt", "w") as file:
    for prediction in y_pred:
        line = " ".join(map(str, prediction))
        file.write(line + "\n")