In [4]:
import pandas as pd
import requests
from json import dumps, loads, load
from kafka import KafkaConsumer
from tqdm import tqdm
import os

## Read data from Kafka

In [5]:
# Broker address and topic that carries the movie log stream.
KAFKA_SERVER = 'localhost:9092'
TOPIC = 'movielog16'

# Consumer options gathered in one place so they are easy to tune.
# NOTE(review): no group_id is passed; per the kafka-python docs offset commits
# are disabled without one, so enable_auto_commit likely has no effect — confirm.
consumer_settings = {
    "bootstrap_servers": KAFKA_SERVER,
    "auto_offset_reset": 'earliest',
    "enable_auto_commit": True,
}
consumer = KafkaConsumer(TOPIC, **consumer_settings)

print(f"Connected to Kafka topic: {TOPIC}")

Connected to Kafka topic: movielog16


In [6]:
# Milestone 1: stop reading once this many ratings have been collected.
#
# We are choosing the rating stream as
# 1. It provides explicit feedback (i.e., direct user preferences with a rating from 1 to 5).
# 2. Most recommendation models (like collaborative filtering) work better with explicit ratings.
# 3. Allows for better personalization since we know exactly how much a user liked a movie.
MAX_RATINGS = 10000

# Raw rating log lines are appended here, one per line.
RATINGS_LOG_PATH = "../data/kafka_ratings_log.csv"

watch_history = []            # (timestamp, user_id, movie_id)
ratings = []                  # (timestamp, user_id, movie_id, rating)
recommendation_requests = []  # (timestamp, user_id, result text)


print("Reading messages from Kafka...")
# Append with a regular file handle instead of `os.system("echo ... >> file")`:
# the log text is untrusted and must never be interpolated into a shell command,
# and spawning one shell per rating is needlessly slow.
with open(RATINGS_LOG_PATH, "a", encoding="utf-8") as ratings_log:
    for record in tqdm(consumer):
        # Undecodable bytes are replaced rather than aborting the whole run.
        message = record.value.decode("utf-8", errors="replace")

        # Split off only timestamp and user_id; the remainder (which may itself
        # contain commas, e.g. a recommendation result list) stays intact.
        parts = message.split(",", 2)
        if len(parts) < 3:
            continue  # Skip malformed logs

        timestamp, user_id, action = parts

        # recommendation request event
        if "recommendation request" in action:
            # Everything after "result: " (the recommended items and whatever
            # trails them on the log line).
            rec_result = action.split("result: ")[-1]
            recommendation_requests.append((timestamp, user_id, rec_result))

        # movie watch history event
        elif "GET /data/m/" in action:
            movie_id = action.split("/")[3]
            watch_history.append((timestamp, user_id, movie_id))

        # rating event
        elif "GET /rate/" in action:
            rating_info = action.split("=")
            if len(rating_info) == 2:
                movie_id = rating_info[0].split("/")[-1]
                try:
                    rating = int(rating_info[1])
                except ValueError:
                    # Non-numeric rating: skip the row instead of crashing
                    # several minutes into the stream.
                    pass
                else:
                    ratings.append((timestamp, user_id, movie_id, rating))
            # Keep the raw line for every rating event, parsed or not.
            ratings_log.write(message.rstrip("\n") + "\n")

        # Stop at exactly MAX_RATINGS collected ratings.
        if len(ratings) >= MAX_RATINGS:
            break

Reading messages from Kafka...


1463145it [01:29, 16360.00it/s]


## Ratings Data

In [7]:
# Turn the collected rating tuples into a DataFrame and persist it as CSV
# (the default integer index is written as the first, unnamed column).
rating_columns = ["timestamp", "user_id", "movie_id", "rating"]
df_ratings = pd.DataFrame(data=ratings, columns=rating_columns)
df_ratings.to_csv(path_or_buf='../data/ratings.csv')

In [8]:
# Preview the first rows to sanity-check the parsed ratings.
df_ratings.head()

Unnamed: 0,timestamp,user_id,movie_id,rating
0,2025-02-15T01:49:06,52141,criminal+2004,4
1,2025-02-15T01:49:09,15942,pride+and+glory+2008,3
2,2025-02-15T01:49:09,30410,beer+league+2006,3
3,2025-02-15T01:49:11,12318,the+boy+in+blue+1986,3
4,2025-02-15T01:49:12,86351,1114+2003,3


## Get Movie Details from API

In [9]:
MOVIE_API_URL = "http://128.2.204.215:8080/movie/"

def get_movie_details(movie_id, timeout=10):
    """Fetch the metadata of one movie from the movie API.

    Args:
        movie_id: Movie identifier appended to ``MOVIE_API_URL``
            (e.g. ``"the+thin+red+line+1998"``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The decoded JSON response, or ``None`` when the request fails, the
        server answers with a non-200 status, or the body is not valid JSON.
        Failures are reported with ``print`` rather than raised.
    """
    try:
        response = requests.get(f"{MOVIE_API_URL}{movie_id}", timeout=timeout)
        if response.status_code == 200:
            return response.json()  # Returns JSON object with movie details
        print(f"Error {response.status_code} fetching movie: {movie_id}")
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection/timeout problems; ValueError: invalid JSON body.
        print(f"Request failed: {e}")
    return None

In [10]:
# Example of movie details: look up one known movie id and print the raw
# response returned by get_movie_details (None if the lookup failed).
sample_movie_id = "the+thin+red+line+1998"
movie_data = get_movie_details(sample_movie_id)
print(movie_data)

{'id': 'the+thin+red+line+1998', 'tmdb_id': 8741, 'imdb_id': 'tt0120863', 'title': 'The Thin Red Line', 'original_title': 'The Thin Red Line', 'adult': 'False', 'belongs_to_collection': {}, 'budget': '52000000', 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 36, 'name': 'History'}, {'id': 10752, 'name': 'War'}], 'homepage': 'null', 'original_language': 'en', 'overview': 'Based on the graphic novel by James Jones, The Thin Red Line tells the story of a group of men, an Army Rifle company called C-for-Charlie, who change, suffer, and ultimately make essential discoveries about themselves during the fierce World War II battle of Guadalcanal. It follows their journey, from the surprise of an unopposed landing, through the bloody and exhausting battles that follow, to the ultimate departure of those who survived. A powerful frontline cast - including Sean Penn, Nick Nolte, Woody Harrelson and George Clooney - explodes into action in this hauntingly realistic view of military and moral chaos

## Get user details from API

In [11]:
USER_API_URL = "http://128.2.204.215:8080/user/"

def get_user_details(user_id, timeout=10):
    """Fetch user metadata from the API.

    Args:
        user_id: User identifier appended to ``USER_API_URL``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The decoded JSON response, or ``None`` when the request fails, the
        server answers with a non-200 status, or the body is not valid JSON.
        Failures are reported with ``print`` rather than raised.
    """
    try:
        response = requests.get(f"{USER_API_URL}{user_id}", timeout=timeout)
        if response.status_code == 200:
            return response.json()  # Returns JSON object with user details
        print(f"Error {response.status_code} fetching user: {user_id}")
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection/timeout problems; ValueError: invalid JSON body.
        print(f"Request failed: {e}")
    return None

In [12]:
# Example of user detail: look up one known user id and print the raw
# response returned by get_user_details (None if the lookup failed).
sample_user_id = "102833"
user_data = get_user_details(sample_user_id)
print(user_data)

{'user_id': 102833, 'age': 27, 'occupation': 'self-employed', 'gender': 'M'}


In [None]:
# Build the watch-history frame from the tuples collected while reading Kafka.
# (Previously `df_watch` was referenced without ever being defined, so this
# cell failed with NameError on a fresh kernel.)
df_watch = pd.DataFrame(watch_history, columns=["timestamp", "user_id", "movie_id"])

# movie details
unique_movies = df_watch["movie_id"].unique()
movie_metadata = {}
for movie in tqdm(unique_movies):
    details = get_movie_details(movie)
    # Failed lookups return None; keep them out so from_dict gets dicts only.
    if details is not None:
        movie_metadata[movie] = details
df_movies = pd.DataFrame.from_dict(movie_metadata, orient='index')

# user details
unique_users = df_watch["user_id"].unique()
user_metadata = {}
for user in tqdm(unique_users):
    details = get_user_details(user)
    if details is not None:
        user_metadata[user] = details
df_users = pd.DataFrame.from_dict(user_metadata, orient='index')

# Save to CSV
df_movies.to_csv("movies_metadata.csv", index=False)
df_users.to_csv("users_metadata.csv", index=False)

In [59]:
# Preview the first rows of the fetched movie metadata.
df_movies.head()

Unnamed: 0,id,tmdb_id,imdb_id,title,original_title,adult,belongs_to_collection,budget,genres,homepage,...,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,vote_average,vote_count
the+thin+red+line+1998,the+thin+red+line+1998,8741,tt0120863,The Thin Red Line,The Thin Red Line,False,{},52000000,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,...,/jGDK6eM6vb8VEUmR0ZoaghcCG8f.jpg,"[{'name': 'Fox 2000 Pictures', 'id': 711}, {'n...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1998-12-25,98126565,170,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,7.2,793
donnie+brasco+1997,donnie+brasco+1997,9366,tt0119008,Donnie Brasco,Donnie Brasco,False,{},35000000,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,...,/xtKLvpOfARi1XVm8u2FTdhY5Piq.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1997-02-27,41954997,127,"[{'iso_639_1': 'ja', 'name': '日本語'}, {'iso_639...",Released,7.4,1175
cliffhanger+1993,cliffhanger+1993,9350,tt0106582,Cliffhanger,Cliffhanger,False,{},70000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,...,/8XefYka77ypAnPJvaVlfUGBBs4a.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",1993-05-28,255000211,112,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,6.1,600
halloween+h20+1998,halloween+h20+1998,11675,tt0120694,Halloween: H20,Halloween: H20,False,"{'id': 91361, 'name': 'Halloween Collection', ...",17000000,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,...,/crycJDHRvpQyZeQyRYtIZKIpNzK.jpg,"[{'name': 'Dimension Films', 'id': 7405}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1998-08-05,55041738,86,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,5.7,306
jurassic+park+1993,jurassic+park+1993,329,tt0107290,Jurassic Park,Jurassic Park,False,"{'id': 328, 'name': 'Jurassic Park Collection'...",63000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...",http://www.jurassicpark.com/,...,/c414cDeQ9b6qLPLeKmiJuLDUREJ.jpg,"[{'name': 'Universal Pictures', 'id': 33}, {'n...","[{'iso_3166_1': 'US', 'name': 'United States o...",1993-06-11,920100000,127,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,7.6,4956
