In [None]:
import pandas as pd

Load Dataset

In [None]:
# Location of the raw film CSV, relative to the notebook's working directory.
file_path = 'film_complete.csv'
df = pd.read_csv(file_path)

# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,movie_name,year,runtime,genre,rating,director,star,votes
0,Knock at the Cabin,2023,100 min,Horror,6.4,M. Night Shyamalan,"Dave Bautista, \nJonathan Groff, \nRupert Grin...",17991.0
1,Knock at the Cabin,2023,100 min,Mystery,6.4,M. Night Shyamalan,"Dave Bautista, \nJonathan Groff, \nRupert Grin...",17991.0
2,Knock at the Cabin,2023,100 min,Thriller,6.4,M. Night Shyamalan,"Dave Bautista, \nJonathan Groff, \nRupert Grin...",17991.0
3,The Menu,2022,107 min,Horror,7.2,Mark Mylod,"Ralph Fiennes, \nAnya Taylor-Joy, \nNicholas H...",232052.0
4,The Menu,2022,107 min,Thriller,7.2,Mark Mylod,"Ralph Fiennes, \nAnya Taylor-Joy, \nNicholas H...",232052.0


In [None]:
# Count rows per genre to see how the dataset is distributed across genres
# (the preview above shows one row per film/genre pair, so a film can be counted
# under several genres).
genre_counts = df['genre'].value_counts()
print(genre_counts)

genre
Horror        14042
Thriller       3946
Drama          2950
Comedy         2431
Mystery        2343
Action         1514
Sci-Fi         1354
Fantasy        1292
Crime           982
Adventure       763
Romance         273
Animation       161
Family           96
Musical          58
Western          54
Music            48
History          39
Biography        30
War              25
Film-Noir        12
Sport             8
Reality-TV        1
Name: count, dtype: int64


Preprocessing

In [None]:
# Strip the ' min' unit suffix and any thousands separators so that runtime
# holds bare digit strings (converted to numbers later in the notebook).
df['runtime'] = df['runtime'].str.replace(' min', '').str.replace(',', '')

# Drop rows whose year field is the literal 'I' instead of a year.
# NOTE(review): presumably a title-disambiguation marker left by the scraper — confirm.
df = df.loc[df['year'].ne('I')]

In [None]:
# Eyeball the distinct values of each field that will be converted to numbers.
for column in ('year', 'runtime', 'rating', 'votes'):
    print(df[column].unique())

['2023' '2022' '2000' '2021' '2019' '2018' '2016' '2017' '1996' '1979'
 '2015' '2011' '1954' '1980' '2010' '2014' '2020' '2013' '1997' '2002'
 '1987' '2009' '1982' '2007' '1993' '1973' '1960' '2008' '1981' '2004'
 '1992' '2005' '1999' '1994' '2001' '1998' '1975' '1988' '2012' '2003'
 '1986' '1976' '2006' '1968' '1978' '1984' '1989' '1974' '1983' '1977'
 '1990' '1985' '1995' '1963' '1962' '1922' '1972' '1932' '1920' '1933'
 '1966' '1951' '1991' '1931' '1956' '1921' '1955' '1958' '1967' '1970'
 '1965' '1959' '1964' '1953' '1971' '1957' '1935' '1942' '1961' '1945'
 '1928' '1925' '1943' '1941' '1949' '1969' '1948' '1944' '1936' '1939'
 '1946' '1926' '1934' '1923' '1911' '1940' '1927' '1952' '1924' '1913'
 '1914' '1947' '1919' '1930' '1950' '1937' '1918' '1917' '1929' '1938'
 '1916' '1915']
['100' '107' '102' '97' '117' '128' '105' '84' '108' '103' '115' '130'
 '88' '131' '114' '148' '127' '136' '138' '83' '104' '111' '94' '126' '93'
 '116' '95' '132' '79' '146' '90' '135' '101' '99' '121' 

In [None]:
def reduce_genre_random(df, max_count=1500, random_state=42):
    """Cap the number of rows per genre by random down-sampling.

    Rows are grouped by the ``'genre'`` column. Any genre with more than
    ``max_count`` rows is reduced to exactly ``max_count`` randomly chosen rows;
    smaller genres are kept untouched and in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``'genre'`` column. It is not modified.
    max_count : int, default 1500
        Maximum number of rows to keep for any single genre.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the result is reproducible.

    Returns
    -------
    pandas.DataFrame
        The capped rows, ordered by genre (groupby's sorted key order). The
        original index labels are preserved, so the index is generally not a
        contiguous range. Rows whose genre is missing are dropped, because
        ``groupby`` skips NaN keys by default.
    """
    pieces = []
    for _, group in df.groupby('genre'):
        if len(group) > max_count:
            group = group.sample(n=max_count, random_state=random_state)
        pieces.append(group)

    # pd.concat refuses an empty list; return an empty frame with the same
    # columns and dtypes instead.
    if not pieces:
        return df.iloc[0:0].copy()

    # Concatenate once: appending inside the loop copies the accumulated frame
    # on every iteration and needs an empty seed frame, which distorts dtypes.
    return pd.concat(pieces)

# Cap every genre at 4000 rows so the largest genre no longer dominates the data.
df = reduce_genre_random(df, max_count=4000)

# Show the per-genre row counts after capping.
print(df['genre'].value_counts())

genre
Horror        4000
Thriller      3946
Drama         2950
Comedy        2430
Mystery       2343
Action        1514
Sci-Fi        1354
Fantasy       1292
Crime          982
Adventure      763
Romance        273
Animation      161
Family          96
Musical         58
Western         54
Music           48
History         39
Biography       30
War             25
Film-Noir       12
Sport            8
Reality-TV       1
Name: count, dtype: int64


In [None]:
# Blank out missing values first so joining the text fields never produces NaN.
df = df.fillna('')
df['combined_features'] = df['genre'].str.cat([df['director'], df['star']], sep=' ')

# These fields are text (or were blanked above); anything that does not parse
# as a number becomes NaN.
for numeric_column in ('year', 'runtime', 'rating', 'votes'):
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the combined text; row i of tfidf_matrix describes the
# i-th row of df (by position, not by index label).
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

In [None]:
# Persist the preprocessed frame without its index column.
# NOTE(review): '/content' is a hard-coded absolute path (presumably the Colab
# working directory) — confirm it exists before running elsewhere.
df.to_csv('/content/film_dataset.csv', index=False)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarity_score(user_input, tfidf_matrix):
    """Return cosine similarities between the query and each row of ``tfidf_matrix``.

    ``user_input`` must provide a ``'combined_features'`` column. It is vectorised
    with the module-level ``tfidf`` vectorizer, which therefore has to be defined
    (and fitted) before this function is called.

    Returns a 1-D array with one score per row of ``tfidf_matrix``; only the first
    query row is used.
    """
    query_vectors = tfidf.transform(user_input['combined_features'])
    # Row 0 of the pairwise matrix holds the scores for the first query row.
    return cosine_similarity(query_vectors, tfidf_matrix)[0]


In [None]:
def _rank_by_similarity(user_input, tfidf_matrix, df, positions, limit):
    """Score the rows of ``df`` at integer ``positions`` and return the best ``limit`` as records."""
    if limit <= 0 or len(positions) == 0:
        return []

    candidates = df.iloc[positions].reset_index(drop=True)
    # Slice the matrix with the very same positions used for the frame, so that
    # score i belongs to candidate row i.
    candidates['similarity_score'] = get_similarity_score(user_input, tfidf_matrix[positions])
    ranked = candidates.sort_values(by='similarity_score', ascending=False, kind='stable')
    return ranked.head(limit).to_dict(orient='records')


def get_recommendations(user_input, tfidf_matrix, df, top_n=8):
    """Recommend up to ``top_n`` films for a single-row query.

    Films must match the query genre (case-insensitive pattern match). Films
    that also satisfy every numeric constraint (year >= query, runtime <= query,
    rating >= query, votes >= query) are ranked by similarity and returned
    first. If fewer than ``top_n`` such films exist, the list is topped up with
    the most similar remaining films of the same genre.

    Parameters
    ----------
    user_input : pandas.DataFrame
        Query frame; its first row supplies 'genre', 'year', 'runtime',
        'rating' and 'votes'. It is also passed to ``get_similarity_score``,
        which reads its 'combined_features' column.
    tfidf_matrix : 2-D matrix supporting integer-array row indexing
        Feature matrix whose row i describes ``df.iloc[i]`` (positional
        alignment, independent of ``df``'s index labels).
    df : pandas.DataFrame
        Film catalogue with 'genre', 'year', 'runtime', 'rating', 'votes'.
    top_n : int, default 8
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        One record per recommended film, each including 'similarity_score'.
        May contain fewer than ``top_n`` entries (or none) when the genre has
        too few films.

    Raises
    ------
    ValueError
        If ``tfidf_matrix`` and ``df`` do not have the same number of rows.
    """
    if tfidf_matrix.shape[0] != len(df):
        raise ValueError(
            "tfidf_matrix has %d rows but df has %d; they must be aligned row for row"
            % (tfidf_matrix.shape[0], len(df))
        )

    # Read the query by position so a non-default index on user_input still works.
    genre_mask = df['genre'].str.contains(user_input['genre'].iloc[0], case=False, na=False)
    # Comparisons against NaN evaluate to False, so rows with missing numeric
    # fields never count as strict matches.
    strict_mask = (
        genre_mask
        & (df['year'] >= user_input['year'].iloc[0])
        & (df['runtime'] <= user_input['runtime'].iloc[0])
        & (df['rating'] >= user_input['rating'].iloc[0])
        & (df['votes'] >= user_input['votes'].iloc[0])
    )

    # Work with integer positions: df's index labels are neither contiguous nor
    # ordered after sampling, while tfidf_matrix can only be addressed by position.
    strict_positions = strict_mask.to_numpy().nonzero()[0]
    recommendations = _rank_by_similarity(user_input, tfidf_matrix, df, strict_positions, top_n)

    shortfall = top_n - len(recommendations)
    if shortfall > 0:
        # Same genre, but failing at least one numeric constraint.
        fallback_positions = (genre_mask & ~strict_mask).to_numpy().nonzero()[0]
        recommendations += _rank_by_similarity(
            user_input, tfidf_matrix, df, fallback_positions, shortfall
        )

    return recommendations

# A single example query: Action films from 2000 onwards, at most 200 minutes,
# rated at least 6.0 with at least 1000 votes; no director or star preference.
dummy_data = pd.DataFrame([
    {
        'year': 2000,
        'runtime': 200,
        'genre': 'Action',
        'rating': 6.0,
        'director': '',
        'star': '',
        'votes': 1000,
    }
])
# Build the query text the same way as the catalogue's combined_features column.
dummy_data['combined_features'] = dummy_data['genre'].str.cat(
    [dummy_data['director'], dummy_data['star']], sep=' '
)

recommendations = get_recommendations(user_input=dummy_data, tfidf_matrix=tfidf_matrix, df=df)
print(recommendations)

ValueError: If using all scalar values, you must pass an index