# Cocktail Recommender System

## 1. Load and Analyze the Data

In [1]:
import pandas as pd

# Load the cocktails dataset
cocktails_file_path = 'cocktails.csv'
cocktails_df = pd.read_csv(cocktails_file_path)

# Display the first few rows of the dataset
cocktails_df.head()


Unnamed: 0,Alcohol,Name,Category,Making,Base Wine,Base Wine Amount,Liquor,Liquor Amount,Juice,Juice Amount,...,Soda Amount,Others,Taste,Type of Glass,Salty,Savory,Sour,Bitter,Sweet,Spicy
0,5,Tequila Sunset,Short,Blend,Tequila,30,-,-,Lemon,30,...,-,Ice,Mild,Champagne Saucer,0,17,83,17,50,0
1,7,Chi-Chi,Long,Shake,Vodka,30,-,-,Pineapple,80,...,-,Coconut Milk,Mild,Armagnac Glasss,0,17,67,0,50,0
2,10,Sex on the Beach,Long,Build,Vodka,15,Melon_20/Raspberry_10,30,Pineapple,80,...,-,-,Mild,Highball,0,0,83,0,50,0
3,10,Horse’s Neck,Long,Build,Brandy,45,-,-,-,-,...,200,Lemon Piece,Mild,Old Fashioned,17,17,83,0,17,0
4,11,EL Diablo,Long,Build,Tequila,30,-,-,-,-,...,250,Orange Piece,Mild,Highball,17,0,50,17,50,0


## 2. Generate Synthetic User Ratings Data

In [3]:
import numpy as np

def generate_synthetic_ratings(num_records):
    user_ids = np.random.randint(1, 1001, num_records)
    cocktail_names = np.random.choice(cocktails_df['Name'].unique(), num_records)
    user_ratings = np.random.randint(1, 6, num_records)
    synthetic_data = pd.DataFrame({
        'user_id': user_ids,
        'cocktail_name': cocktail_names,
        'user_rating': user_ratings
    })
    return synthetic_data

# Generate synthetic ratings data
num_records = 1000  # Change this number to generate different sizes of datasets
ratings_df = generate_synthetic_ratings(num_records)

# Display the first few rows of the synthetic ratings dataset
ratings_df.head()


Unnamed: 0,user_id,cocktail_name,user_rating
0,840,Bloody Mary,1
1,293,Cosmopolitan,1
2,346,Blue Hawaii,2
3,705,Black Russian,1
4,309,Cosmopolitan,2


## 3. Perform Exploratory Data Analysis (EDA)

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

# EDA on cocktails data
def eda_cocktails(data):
    print("Cocktails Data Info:")
    print(data.info())
    print("\nCocktails Data Description:")
    print(data.describe())

    # Plotting the distribution of cocktail ingredients
    plt.figure(figsize=(12, 6))
    data['ingredient'].value_counts().head(20).plot(kind='bar')
    plt.title('Top 20 Ingredients in Cocktails')
    plt.xlabel('Ingredients')
    plt.ylabel('Frequency')
    plt.show()

# EDA on synthetic ratings data
def eda_ratings(data):
    print("Ratings Data Info:")
    print(data.info())
    print("\nRatings Data Description:")
    print(data.describe())

    # Plotting the distribution of ratings
    plt.figure(figsize=(8, 4))
    sns.countplot(data['user_rating'])
    plt.title('Distribution of User Ratings')
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.show()

# Perform EDA
eda_cocktails(cocktails_df)
eda_ratings(ratings_df)


Cocktails Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Alcohol           48 non-null     int64 
 1   Name              48 non-null     object
 2   Category          48 non-null     object
 3   Making            48 non-null     object
 4   Base Wine         48 non-null     object
 5   Base Wine Amount  48 non-null     int64 
 6   Liquor            48 non-null     object
 7   Liquor Amount     48 non-null     object
 8   Juice             48 non-null     object
 9   Juice Amount      48 non-null     object
 10  Spice             48 non-null     object
 11  Spice Amount      48 non-null     object
 12  Soda              48 non-null     object
 13  Soda Amount       48 non-null     object
 14  Others            48 non-null     object
 15  Taste             48 non-null     object
 16  Type of Glass     48 non-null     object
 1

KeyError: 'ingredient'

<Figure size 1200x600 with 0 Axes>

## 4. Feature Engineering and Selection

In [5]:
from sklearn.preprocessing import LabelEncoder

# Feature Engineering: Encoding categorical features
le = LabelEncoder()
cocktails_df['encoded_ingredient'] = le.fit_transform(cocktails_df['ingredient'])

# Display the transformed data
cocktails_df.head()


KeyError: 'ingredient'

## 5. Build and Evaluate Models

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Content-Based Filtering
def content_based_filtering(data, cocktail_name, top_n=10):
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(data['ingredients'])
    
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
    indices = pd.Series(data.index, index=data['cocktail_name']).drop_duplicates()
    idx = indices[cocktail_name]

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    
    cocktail_indices = [i[0] for i in sim_scores]
    return data['cocktail_name'].iloc[cocktail_indices]

# Example usage
recommended_cocktails = content_based_filtering(cocktails_df, 'Mojito')
print(recommended_cocktails)


KeyError: 'ingredients'

In [7]:
# Collaborative Filtering

In [8]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Prepare the data for Surprise library
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'cocktail_name', 'user_rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2)

# Collaborative Filtering using SVD
svd = SVD()
svd.fit(trainset)

# Predictions and evaluation
predictions = svd.test(testset)
accuracy.rmse(predictions)


RMSE: 1.3066


1.3065681042101565

## 6. Evaluation and Selection of the Best Model