In [None]:
import pandas as pd
import random
import datetime
import time
import getpass

# Items

In [None]:
movies1 = pd.read_csv('/kaggle/input/aws-movies-datasets/response1.csv')
movies2 = pd.read_csv('/kaggle/input/aws-movies-datasets/response2.csv')
movies3 = pd.read_csv('/kaggle/input/aws-movies-datasets/response3.csv')
movies4 = pd.read_csv('/kaggle/input/aws-movies-datasets/response4.csv')
movies5 = pd.read_csv('/kaggle/input/aws-movies-datasets/response5.csv')

In [None]:
movies = pd.concat([movies1, movies2, movies3, movies4, movies5])

In [None]:
movies.head()

In [None]:
movies.info()

In [None]:
movies.dropna(subset=["actors", "director"], inplace=True)

In [None]:
pip install requests

In [None]:
import requests

def get_movie_info(movie_title, api_key):
    base_url = "https://api.themoviedb.org/3"
    search_url = f"{base_url}/search/movie?api_key={api_key}&query={movie_title}"
    
    response = requests.get(search_url)
    
    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")
    
    data = response.json()
    results = data.get("results")
    
    if not results:
        print(f"No results found for '{movie_title}'")
        return None
    
    movie_id = results[0].get("id")
    details_url = f"{base_url}/movie/{movie_id}?api_key={api_key}"
    details_response = requests.get(details_url)
    
    if details_response.status_code != 200:
        raise Exception(f"Error: {details_response.status_code}")
    
    details_data = details_response.json()
    
    release_date = results[0].get("release_date")
    if not release_date:
        print(f"No release date found for '{movie_title}'")
        return None
    release_year = int(release_date.split("-")[0])
    
    rating = details_data.get("vote_average")
    if not rating:
        print(f"No rating found for '{movie_title}'")
        return None
    
    return {"release_year": release_year, "rating": rating}

In [None]:
tmdb_api = getpass.getpass('API: ')

In [None]:
for i, row in movies.iterrows():
    movie_title = row["title"]
    movie_info = get_movie_info(movie_title, tmdb_api)
    
    # fill in the 'YEAR' and 'eventValue' columns with the retrieved info
    if movie_info is not None:
        movies.at[i, "RELEASE_YEAR"] = int(movie_info["release_year"])
        movies.at[i, "eventValue"] = movie_info["rating"]
        
        # convert the release year to a timestamp and add it to 'CREATION_TIMESTAMP'
        timestamp = datetime.datetime(int(movie_info["release_year"]), 1, 1)
        movies.at[i, "CREATION_TIMESTAMP"] = int(timestamp.timestamp())
    else:
        movies.at[i, "RELEASE_YEAR"] = None
        movies.at[i, "eventValue"] = None
        movies.at[i, "CREATION_TIMESTAMP"] = None

In [None]:
movies.info()

In [None]:
movies["RELEASE_YEAR"].fillna(2023, inplace=True)
movies["eventValue"].fillna(6.5, inplace=True)
movies["CREATION_TIMESTAMP"].fillna(1672531200, inplace=True)

In [None]:
movies["RELEASE_YEAR"] = movies["RELEASE_YEAR"].astype(int)

In [None]:
movies.rename(columns = {'id':'ITEM_ID', 'actors':'ACTORS', 'director':'DIRECTOR', 
                         'title':'TITLE', 'genre':'GENRES'}, inplace = True)

In [None]:
movies.info()

In [None]:
movies["CREATION_TIMESTAMP"] = movies["CREATION_TIMESTAMP"].astype("int64")

In [18]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243 entries, 0 to 45
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ITEM_ID             243 non-null    int64  
 1   ACTORS              243 non-null    object 
 2   DIRECTOR            243 non-null    object 
 3   GENRES              243 non-null    object 
 4   TITLE               243 non-null    object 
 5   RELEASE_YEAR        243 non-null    int64  
 6   eventValue          243 non-null    float64
 7   CREATION_TIMESTAMP  243 non-null    int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 25.2+ KB


In [81]:
movies.head(10)

Unnamed: 0,ITEM_ID,ACTORS,DIRECTOR,GENRES,TITLE,RELEASE_YEAR,eventValue,CREATION_TIMESTAMP
0,920962,"Brad Pitt, Logan Lerman, Michael Peña, Shia La...",David Ayer,"War, Drama, Action",Fury,2022,7.272,1640995000.0
1,920961,"Clint Eastwood, Marianne Koch, Gian Maria Volo...",Sergio Leone,Western,A Fistful of Dollars,2017,7.624,1483229000.0
2,920960,"Josh Brolin, Javier Bardem, Tommy Lee Jones, K...","Joel Coen, Ethan Coen","Crime, Drama, Thriller",No Country for Old Men,2023,6.308,1672531000.0
3,920959,"Ben Cross, Ian Charleson, Cheryl Campbell, Ali...",Hugh Hudson,"Drama, History",Chariots of Fire,2011,6.055,1293840000.0
4,920958,"Steve McQueen, James Garner, Richard Attenboro...",John Sturges,"Adventure, Drama, History, Thriller, War",The Great Escape,2023,6.308,1672531000.0
5,920957,"Mickey Rourke, Evan Rachel Wood, Marisa Tomei,...",Darren Aronofsky,"Drama, Romance",The Wrestler,2017,6.85,1483229000.0
6,920956,"Alan Ladd, Jean Arthur, Van Heflin, Brandon De...",George Stevens,"Drama, Western",Shane,2006,6.914,1136074000.0
7,920955,"Clint Eastwood, Eli Wallach, Lee Van Cleef, Al...",Sergio Leone,Western,"The Good, the Bad and the Ugly",2014,7.676,1388534000.0
8,920954,"William Holden, Alec Guinness, Jack Hawkins, S...",David Lean,"Drama, History, War",The Bridge on the River Kwai,2017,7.6,1483229000.0
9,920953,"Marlon Brando, Robert Duvall, Martin Sheen, Fr...",Francis Ford Coppola,"Drama, War",Apocalypse Now,2011,6.989,1293840000.0


In [17]:
movies.to_csv('items.csv')

# Users

In [2]:
genre_combinations = [
    ["Action", "Drama", "Thriller"],
    ["Fantasy"],
    ["Science Fiction"],
    ["Comedy"],
    ["Romance", "Music"],
    ["Animation", "Family", "Comedy"],
    ["Horror"],
    ["War"],
    ["Documentary"]
]

In [14]:
user_data = {'USER_ID': [], 'AGE': [], 'SEX': [], 'FAVORITE_GENRES': [], 'YEAR_PREFERENCE': [], 'RATING_PREFERENCE': []}

In [15]:
for i in range(1, 201):
    user_id = i
    age = random.randint(16, 60)
    sex = random.choice(['M', 'F'])
    preferences = random.choice(genre_combinations)
    
    # Set year preference
    year_preference = None
    if i % 5 == 0 and i % 10 >= 3:
        year_preference = 'old'
    elif i % 5 == 1 and i % 10 >= 3:
        year_preference = 'new'
        
    # Set rating preference
    rating_preference = None
    if i % 10 < 3 and year_preference is None:
        rating_preference = 'high'
    
    user_data['USER_ID'].append(user_id)
    user_data['AGE'].append(age)
    user_data['SEX'].append(sex)
    user_data['FAVORITE_GENRES'].append(preferences)
    user_data['YEAR_PREFERENCE'].append(year_preference)
    user_data['RATING_PREFERENCE'].append(rating_preference)

In [16]:
users = pd.DataFrame(user_data)

In [17]:
users['FAVORITE_GENRES'] = users['FAVORITE_GENRES'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [18]:
users.head(7)

Unnamed: 0,USER_ID,AGE,SEX,FAVORITE_GENRES,YEAR_PREFERENCE,RATING_PREFERENCE
0,1,32,F,Fantasy,,high
1,2,43,F,Comedy,,high
2,3,53,M,Fantasy,,
3,4,26,M,"Romance, Music",,
4,5,53,F,"Action, Drama, Thriller",old,
5,6,58,M,"Romance, Music",new,
6,7,42,M,"Action, Drama, Thriller",,


# Interactions

In [67]:
interactions = []

for _, user in users.iterrows():
    num_interactions = 0
    
    # Determine user's preferred movie release year
    if user["YEAR_PREFERENCE"] == "old":
        release_year_cutoff = 2005
    elif user["YEAR_PREFERENCE"] == "new":
        release_year_cutoff = 2020
    else:
        release_year_cutoff = None
    
    # Determine user's preferred minimum movie rating
    if user["RATING_PREFERENCE"] == "high":
        min_rating = 7.8
    else:
        min_rating = 0
    
    # Create a filtered DataFrame based on user preferences
    filtered_movies = movies.copy()
    if release_year_cutoff is not None:
        filtered_movies = filtered_movies[filtered_movies["RELEASE_YEAR"] < release_year_cutoff]
    if min_rating > 0:
        filtered_movies = filtered_movies[filtered_movies["eventValue"] >= min_rating]
    
    # Keep track of selected movies for each user
    selected_movies = set()
    
    while num_interactions < 10:
        # Select a random movie from the filtered DataFrame
        if not filtered_movies.empty:
            movie = filtered_movies.sample().iloc[0]
        else:
            break

        # Check if the movie has already been selected
        if movie["ITEM_ID"] in selected_movies:
            filtered_movies.drop(movie.name, inplace=True)
            continue
        else:
            selected_movies.add(movie["ITEM_ID"])
        
        # Assign rating based on user preferences
        if user["YEAR_PREFERENCE"] in ["new", "old"] or user["RATING_PREFERENCE"] == "high":
            if user["RATING_PREFERENCE"] == "high" and movie["eventValue"] >= min_rating:
                rating = 5
            elif user["YEAR_PREFERENCE"] == "new" and movie["RELEASE_YEAR"] >= 2020:
                rating = 5
            elif user["YEAR_PREFERENCE"] == "old" and movie["RELEASE_YEAR"] < 2005:
                rating = 5
            else:
                filtered_movies.drop(movie.name, inplace=True)
                continue
        else:
            favorite_genres = set(user["FAVORITE_GENRES"].split(', '))
            movie_genres = set(movie["GENRES"].split(', '))
    
            # Check if all favorite genres are included in the movie genres
            if favorite_genres.issubset(movie_genres):
                if movie["eventValue"] >= 7.0:
                    rating = 5
                elif movie["eventValue"] <= 6.0:
                    rating = 3
                else:
                    rating = 4
            else:
                filtered_movies.drop(movie.name, inplace=True)
                continue
        
        # Append interaction to interactions list and increment num_interactions
        interactions.append({
            "USER_ID": user["USER_ID"],
            "ITEM_ID": movie["ITEM_ID"],
            "EVENT_TYPE": 'watch',
            "eventValue": rating,
            "TIMESTAMP": int(time.time())
        })
        num_interactions += 1

In [68]:
interactions = pd.DataFrame(interactions)

In [73]:
movie = movies.loc[movies["ITEM_ID"] == 920863]
movie.head()

Unnamed: 0,ITEM_ID,ACTORS,DIRECTOR,GENRES,TITLE,RELEASE_YEAR,eventValue,CREATION_TIMESTAMP
48,920863,"Al Gore, Billy West, Ronald Reagan, George W. ...",Davis Guggenheim,Documentary,An Inconvenient Truth,2023,6.5,1672531000.0


In [79]:
interactions.head()

Unnamed: 0,USER_ID,ITEM_ID,EVENT_TYPE,eventValue,TIMESTAMP
0,1,918382,watch,5,1681289127
1,1,920916,watch,5,1681289127
2,1,918356,watch,5,1681289127
3,1,920923,watch,5,1681289127
4,1,918307,watch,5,1681289127


In [69]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1282 entries, 0 to 1281
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   USER_ID     1282 non-null   int64 
 1   ITEM_ID     1282 non-null   int64 
 2   EVENT_TYPE  1282 non-null   object
 3   eventValue  1282 non-null   int64 
 4   TIMESTAMP   1282 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 50.2+ KB


In [74]:
interactions.to_csv('interactions.csv')

In [75]:
users = users.drop(columns=['FAVORITE_GENRES', 'YEAR_PREFERENCE', 'RATING_PREFERENCE'])

In [76]:
users.head()

Unnamed: 0,USER_ID,AGE,SEX
0,1,29,M
1,2,27,F
2,3,31,M
3,4,20,M
4,5,35,F


In [77]:
users.to_csv('users.csv')