In [1]:
import pandas as pd
# RMSE for collaborative filtering
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
import pandas as pd

movies = {
    "Totoro": {
        "content_path": "totoro_content_based.csv",
        "collaborative_path": "totoro_collaborative.csv",
        "output_path": "totoro_combined_comparison.csv"
    },
    "Howl": {
        "content_path": "howl_content_based.csv",
        "collaborative_path": "howl_collaborative.csv",
        "output_path": "howl_combined_comparison.csv"
    },
    "Kiki": {
        "content_path": "kiki_content_based.csv",
        "collaborative_path": "kiki_collaborative.csv",
        "output_path": "kiki_combined_comparison.csv"
    }
}

def process_movie(content_path, collaborative_path, output_path):
    content_df = pd.read_csv(content_path)
    collaborative_df = pd.read_csv(collaborative_path)

    combined_df = pd.concat(
        [
            content_df.rename(columns={
                "Recommended Movies": "Content-Based: Name",
                "Cosine Similarity": "Content-Based: Score"
            })[['Content-Based: Name', 'Content-Based: Score']],
            collaborative_df.rename(columns={
                "Movie Name": "Collaborative: Name",
                "Similarity Score": "Collaborative: Score"
            })[['Collaborative: Name', 'Collaborative: Score']]
        ],
        axis=1
    )
    

    combined_df['Content-Based: Score'] = combined_df['Content-Based: Score'].round(3)
    combined_df['Collaborative: Score'] = combined_df['Collaborative: Score'].round(3)

    combined_df.to_csv(output_path, index=False)
    print(f"Combined comparison for {output_path.split('_')[0]} saved!")

for movie, paths in movies.items():
    process_movie(paths["content_path"], paths["collaborative_path"], paths["output_path"])


Combined comparison for totoro saved!
Combined comparison for howl saved!
Combined comparison for kiki saved!


In [3]:
# Precision Recall

def calculate_precision_recall(combined_csv_path):
    df = pd.read_csv(combined_csv_path)
    
    # Normalize names for consistency (e.g., "Laputa: Castle in the Sky" → "Castle in the Sky")
    df['Content-Based: Name'] = df['Content-Based: Name'].str.strip().str.replace("Laputa: Castle in the Sky", "Castle in the Sky", regex=False)
    df['Collaborative: Name'] = df['Collaborative: Name'].str.strip().str.replace("Laputa: Castle in the Sky", "Castle in the Sky", regex=False)
    
    content_based = set(df['Content-Based: Name'])
    collaborative = set(df['Collaborative: Name'])
    
    overlap = content_based.intersection(collaborative)
    
    precision = len(overlap) / len(content_based) if len(content_based) > 0 else 0
    recall = len(overlap) / len(collaborative) if len(collaborative) > 0 else 0
    
    return precision, recall, list(overlap)

movies = [
    {"file": "totoro_combined_comparison.csv", "name": "My Neighbor Totoro"},
    {"file": "howl_combined_comparison.csv", "name": "Howl's Moving Castle"},
    {"file": "kiki_combined_comparison.csv", "name": "Kiki's Delivery Service"}
]

results = []

for movie in movies:
    precision, recall, overlap = calculate_precision_recall(f"{movie['file']}")
    results.append({
        "Movie": movie["name"],
        "Precision (Content-Based)": round(precision, 2),
        "Recall (Content-Based)": round(recall, 2),
        "Overlap (Relevant Movies)": ", ".join(overlap)  # Include overlapping movies
    })

results_df = pd.DataFrame(results)

print(results_df)

results_df.to_csv("calculated_precision_recall.csv", index=False)



                     Movie  Precision (Content-Based)  Recall (Content-Based)  \
0       My Neighbor Totoro                        0.4                     0.4   
1     Howl's Moving Castle                        0.4                     0.4   
2  Kiki's Delivery Service                        0.2                     0.2   

                Overlap (Relevant Movies)  
0  Spirited Away, Kiki's Delivery Service  
1    Princess Mononoke, Castle in the Sky  
2                      My Neighbor Totoro  


In [1]:
# RMSE for collaborative filtering
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
import pandas as pd

ratings_file = 'prepared_ratings.csv' 
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file(ratings_file, reader=reader)

trainset, testset = train_test_split(data, test_size=0.2)

model = SVD()
model.fit(trainset)

predictions = model.test(testset)

model_rmse = rmse(predictions, verbose=True)

results = [{
    'User': pred.uid,
    'Item': pred.iid,
    'Actual Rating': pred.r_ui,
    'Predicted Rating': round(pred.est, 2)
} for pred in predictions]

results_df = pd.DataFrame(results)

results_df.to_csv("collaborative_filtering_predictions.csv", index=False)

print(f"RMSE: {model_rmse}")
print(results_df.head())

RMSE: 0.8871
RMSE: 0.8870778187204674
  User   Item  Actual Rating  Predicted Rating
0  523  31658            5.0              4.36
1  103  31658            4.5              4.05
2  414   5618            5.0              4.46
3  434   5618            5.0              4.20
4  606   6350            4.0              4.30
