In [31]:
!pip install convokit
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to /root/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [32]:
corpus.print_summary_stats()


Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [33]:
pip install scikit-surprise



In [34]:
import pandas as pd
from surprise import Dataset,Reader,SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [35]:
# Display basic statistics
print("Number of conversations:", len(corpus.conversations))
print("Number of users:", len(corpus.speakers))
print("Number of utterances:", len(corpus.utterances))

Number of conversations: 83097
Number of users: 9035
Number of utterances: 304713


In [36]:
# Display information about conversations
for convo_id in corpus.get_conversation_ids():
    convo = corpus.get_conversation(convo_id)
    print("Conversation ID:", convo_id)
    print("Metadata:", convo.meta)
    print("Number of utterances in conversation:", len(convo.get_utterance_ids()))
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Conversation ID: L405501
Metadata: ConvoKitMeta({'movie_idx': 'm143', 'movie_name': 'mystery men', 'release_year': '1999', 'rating': '5.90', 'votes': '31817', 'genre': "['action', 'comedy', 'fantasy']"})
Number of utterances in conversation: 5

Conversation ID: L405498
Metadata: ConvoKitMeta({'movie_idx': 'm143', 'movie_name': 'mystery men', 'release_year': '1999', 'rating': '5.90', 'votes': '31817', 'genre': "['action', 'comedy', 'fantasy']"})
Number of utterances in conversation: 3

Conversation ID: L405493
Metadata: ConvoKitMeta({'movie_idx': 'm143', 'movie_name': 'mystery men', 'release_year': '1999', 'rating': '5.90', 'votes': '31817', 'genre': "['action', 'comedy', 'fantasy']"})
Number of utterances in conversation: 5

Conversation ID: L405486
Metadata: ConvoKitMeta({'movie_idx': 'm143', 'movie_name': 'mystery men', 'release_year': '1999', 'rating': '5.90', 'votes': '31817', 'genre': "['action', 'comedy', 'fantasy'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Conversation ID: L655326
Metadata: ConvoKitMeta({'movie_idx': 'm606', 'movie_name': 'wild things', 'release_year': '1998', 'rating': '6.60', 'votes': '40523', 'genre': "['crime', 'mystery', 'thriller']"})
Number of utterances in conversation: 2

Conversation ID: L655322
Metadata: ConvoKitMeta({'movie_idx': 'm606', 'movie_name': 'wild things', 'release_year': '1998', 'rating': '6.60', 'votes': '40523', 'genre': "['crime', 'mystery', 'thriller']"})
Number of utterances in conversation: 4

Conversation ID: L655320
Metadata: ConvoKitMeta({'movie_idx': 'm606', 'movie_name': 'wild things', 'release_year': '1998', 'rating': '6.60', 'votes': '40523', 'genre': "['crime', 'mystery', 'thriller']"})
Number of utterances in conversation: 2

Conversation ID: L654917
Metadata: ConvoKitMeta({'movie_idx': 'm606', 'movie_name': 'wild things', 'release_year': '1998', 'rating': '6.60', 'votes': '40523', 'genre': "['crime', 'mystery', 'thrill

In [37]:
import pandas as pd

# Create empty lists to store data
conversation_ids = []
movie_indices = []
movie_names = []
release_years = []
ratings = []
votes = []
genres = []
num_utterances = []

# Loop through conversations and extract data
for convo_id in corpus.get_conversation_ids():
    convo = corpus.get_conversation(convo_id)

    # Extract metadata from ConvoKitMeta object
    metadata = convo.meta

    # Append data to respective lists
    conversation_ids.append(convo_id)
    movie_indices.append(metadata['movie_idx'])
    movie_names.append(metadata['movie_name'])
    release_years.append(metadata['release_year'])
    ratings.append(metadata['rating'])
    votes.append(metadata['votes'])
    genres.append(metadata['genre'])
    num_utterances.append(len(convo.get_utterance_ids()))

# Create a DataFrame from the lists
data = {
    'Conversation ID': conversation_ids,
    'Movie Index': movie_indices,
    'Movie Name': movie_names,
    'Release Year': release_years,
    'Rating': ratings,
    'Votes': votes,
    'Genres': genres,
    'Number of Utterances': num_utterances
}

df = pd.DataFrame(data)



In [110]:
reader = Reader(rating_scale=(1, 10))


In [158]:
data=df[['Conversation ID','Movie Name','Rating']]

In [159]:
data = Dataset.load_from_df(data, reader)


In [160]:
trainset, testset = train_test_split(data, test_size=0.7,random_state=42)


In [161]:
model=SVD()

In [162]:
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7d953033ada0>

In [163]:
predictions=model.test(testset)

In [164]:
rmse=accuracy.rmse(predictions)
print(f'RMSE:{rmse:.2f}')

RMSE: 0.1451
RMSE:0.15


In [165]:
top_n = 3
user_recommendations = []

In [166]:
item_column_name = 'Movie Name'
all_item_ids = df[item_column_name].unique()

In [167]:
user_id='L236416'
for item_id in all_item_ids:
    predicted_rating = model.predict(user_id, item_id).est
    user_recommendations.append((item_id, predicted_rating))

In [168]:
user_recommendations.sort(key=lambda x: x[1], reverse=True)


In [169]:
top_n = 3
print(f'Top {top_n} recommendations for user {user_id} :')
for item_id, predicted_rating in user_recommendations[:top_n]:
  print(f'Item {item_id}: Predicted Rating ={predicted_rating:.2f}')


Top 3 recommendations for user L236416 :
Item neuromancer: Predicted Rating =9.05
Item the godfather: Predicted Rating =8.92
Item the matrix: Predicted Rating =8.73
