# Preprocessed Movie Dataset Comparison Using Cosine Similarity

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [None]:
# Toy movie catalogue: one (title, description) record per film, pivoted
# below into the column-oriented mapping that the DataFrame is built from.
movie_records = [
    ('Movie A', 'A story about adventure and friendship.'),
    ('Movie B', 'A comedy film with lots of laughs.'),
    ('Movie C', 'A romantic drama with a heartfelt story.'),
    ('Movie D', 'An action-packed thriller with suspense.'),
]
data = {
    'Title': [title for title, _ in movie_records],
    'Description': [description for _, description in movie_records],
}

In [None]:
# Load the sample records into a table: one row per movie, with the
# 'Title' and 'Description' columns taken from the keys of `data`.
df = pd.DataFrame(data=data)

In [None]:
# One-time NLTK setup: fetch the tokenizer models and the stop-word list,
# then build the shared objects that preprocess_text relies on.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' rather than
# the older 'punkt' package; without it word_tokenize raises LookupError.
# Requesting both keeps the notebook working on old and new installations.
nltk.download('punkt_tab')
nltk.download('stopwords')
# A set gives fast membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [None]:
def preprocess_text(text):
    """Reduce free text to a space-separated string of word stems.

    The text is split with ``word_tokenize``. Tokens that are not purely
    alphabetic (per ``str.isalpha``) are discarded, as are tokens whose
    lowercase form appears in the module-level ``stop_words`` set. Each
    surviving token is lowercased and stemmed with the module-level
    ``stemmer``.

    Args:
        text: The raw string to normalize.

    Returns:
        The stems joined by single spaces, in their original order; an
        empty string when no token survives the filtering.
    """
    stems = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))
    return ' '.join(stems)

In [None]:
# Normalize every description (one value at a time) and keep the result in a
# separate column so the original text stays available.
df['Preprocessed_Description'] = df['Description'].map(preprocess_text)

In [None]:
# Learn the vocabulary from the preprocessed descriptions and encode each one
# as a TF-IDF weighted vector (one matrix row per movie, in df row order).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    raw_documents=df['Preprocessed_Description'],
)

In [None]:
# Pairwise cosine similarity between all movie vectors. With a single
# argument the rows are compared against themselves, giving a square matrix.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Label both axes of the similarity matrix with the movie titles so a score
# can be looked up as cosine_sim_df.loc[title_a, title_b].
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=df['Title'],
    columns=df['Title'],
)

In [None]:
# Show the labelled similarity matrix under a heading (heading and table are
# written on separate lines).
print("Cosine Similarity Matrix:", cosine_sim_df, sep="\n")