In [2]:
import pandas as pd
import numpy as np
import os
import re

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the movies table; the cells below read its pipe-delimited 'genres' column.
movies = pd.read_csv(filepath_or_buffer='data/movies.csv')

In [4]:
def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list.

    Only one level of nesting is removed; the items keep their original order.
    """
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

In [5]:
# Every distinct genre string found in the table, e.g. several labels joined by '|'.
raw_genres = movies['genres'].unique()
# Break each combined string on '|' and keep the individual labels as a set.
genres = {label for combined in raw_genres for label in combined.split('|')}
print('Genres', genres)

Genres {'Romance', 'Horror', 'Action', 'Western', '(no genres listed)', 'Animation', 'Thriller', 'Mystery', 'Crime', 'Comedy', 'IMAX', 'Documentary', 'Children', 'Fantasy', 'Sci-Fi', 'Adventure', 'War', 'Musical', 'Drama', 'Film-Noir'}


In [6]:
# Genres are stored as one pipe-delimited string per row in the 'genres'
# column (the cell above splits it on '|'), so a positional slice such as
# iloc[:, 4:] does not address per-genre indicator columns.
# NOTE(review): assumes movies.csv has no separate one-hot genre columns - confirm.
# Expand the string into one 0/1 column per genre, then sum each column to get
# the number of movies tagged with that genre.
genre_counts = movies['genres'].str.get_dummies(sep='|').sum()
genre_counts_df = pd.DataFrame({'Genre': genre_counts.index, 'Count': genre_counts.values})

## Splitting the movies DataFrame by genre

In [7]:
def extractGenre(data, genre, column="genres"):
    """Return the rows of ``data`` whose ``column`` text contains ``genre``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding a string column named ``column``.
    genre : str
        Text to look for. It is matched as a literal, case-sensitive
        substring of the column value (not as a regular expression).
    column : str, default "genres"
        Name of the column to search.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their index labels, column order and
        dtypes. When nothing matches, the result is an empty frame that
        still carries the columns of ``data``.
    """
    # A boolean mask is aligned with the rows themselves, so the lookup also
    # works when the index is not 0..n-1 (e.g. after filtering or re-indexing).
    # regex=False keeps characters such as "(" or "-" literal; na=False makes
    # missing values count as "no match" instead of raising.
    mask = data[column].str.contains(genre, regex=False, na=False)
    # Copy so callers can modify the result without touching ``data``.
    return data.loc[mask].copy()

In [13]:
# One DataFrame per genre label, holding the movies tagged with that genre.
genre_dataframes = {genre: extractGenre(movies, genre) for genre in genres}

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first write.
os.makedirs("genre_data", exist_ok=True)

for genre, dataframe in genre_dataframes.items():
    # Also publish each frame as a notebook-level name because a later cell
    # looks them up through globals(). Labels such as "Sci-Fi" produce names
    # that are not valid identifiers, so they are only reachable that way.
    globals()[f"{genre.lower()}_movies"] = dataframe
    dataframe.to_csv(f"genre_data/{genre.lower()}_movies.csv", index=False)

# Splitting users up by genre

In [11]:
# Load the ratings table; later cells group it by userId and join it on movieId.
ratings = pd.read_csv(filepath_or_buffer="data/ratings.csv")

In [47]:
# Number of rating rows per user, indexed by userId, in a single
# 'reviewCount' column (named aggregation counts the movieId values).
ratings_summary = ratings.groupby("userId").agg(reviewCount=("movieId", "count"))
ratings_summary


Unnamed: 0_level_0,reviewCount
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [22]:
# Drop the placeholder label so it is not treated as a real genre below.
# discard() is a no-op when the label is already gone, so re-running this
# cell does not raise KeyError the way remove() would.
genres.discard("(no genres listed)")

In [34]:
def summarize_by_genre(genre_df, data=None):
    """Summarize each user's ratings for the movies listed in ``genre_df``.

    Parameters
    ----------
    genre_df : pandas.DataFrame
        Movies of interest; must contain a ``movieId`` column.
    data : pandas.DataFrame, optional
        Ratings with ``userId``, ``movieId`` and ``rating`` columns. When
        omitted, the notebook-level ``ratings`` frame is looked up at call
        time, so a reloaded ``ratings`` is used without re-running this cell.

    Returns
    -------
    pandas.DataFrame
        One row per user that rated at least one movie in ``genre_df``, with
        columns ``userId``, ``rating`` (mean rating over those movies) and
        ``movieId``. Note that ``movieId`` holds the COUNT of matching rating
        rows, not an identifier; the name is kept so existing consumers of
        this output keep working.
    """
    # Resolve the default lazily: a ``data=ratings`` default would freeze
    # whichever object ``ratings`` referred to when the function was defined.
    if data is None:
        data = ratings

    # The inner join keeps only ratings whose movie appears in genre_df.
    merged_data = data.merge(genre_df, how="inner", on="movieId")

    # Per user: average rating and number of rated rows for these movies.
    summary_df = merged_data.groupby('userId').agg({
        'rating': 'mean',
        'movieId': 'count'
    }).reset_index()
    return summary_df

In [50]:
genre_by_user = {}

for genre in genres:
    # Read the per-genre movies frame from the genre_dataframes dict rather
    # than from a dynamically built globals() name, so the dependency on the
    # earlier split step is explicit.
    summary_df = summarize_by_genre(genre_dataframes[genre], ratings)
    # Attach each user's overall review count next to the per-genre summary.
    summary_df = summary_df.merge(ratings_summary, how="inner", on="userId")
    genre_by_user[genre] = summary_df

In [53]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("genre_data", exist_ok=True)

for genre, dataframe in genre_by_user.items():
    # Publish each per-genre user summary under a notebook-level name and
    # write it next to the per-genre movie files.
    globals()[f"{genre.lower()}_users"] = dataframe
    dataframe.to_csv(f"genre_data/{genre.lower()}_users.csv", index=False)