In [108]:
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import resample


In [109]:
# Location of the Goodreads genre dataset, relative to this notebook.
csv_file_path = '../data/goodreads_genre_data.csv'

# Load the raw data; later cells derive the working columns from this frame.
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [110]:
def _parse_genres(raw):
    """Turn a stringified genre list read from the CSV into a Python object.

    Values that are not strings (for example a list left by an earlier run of
    this cell, or NaN for a missing entry) are returned unchanged, so the cell
    can be re-run without literal_eval raising on already-parsed data.
    """
    return literal_eval(raw) if isinstance(raw, str) else raw


def _first_genre(genres):
    """Return the first entry of a genre list, or None when there is none."""
    if isinstance(genres, (list, tuple)) and genres:
        return genres[0]
    return None


# Parse the 'Genres' column, then keep each row's first listed genre as its label.
df['Genres'] = df['Genres'].apply(_parse_genres)
df['genre'] = df['Genres'].apply(_first_genre)

In [111]:
# Keep only the two columns used downstream. The explicit copy makes df an
# independent frame, so the column assignments in the following cells modify
# it directly instead of triggering SettingWithCopyWarning on a slice.
df = df[['genre', 'Description']].copy()

In [112]:
# Normalise the 'genre' labels to lowercase so they match the lowercase
# names used by the genre mapping and filter below.
lowercased_genre = df['genre'].str.lower()
df['genre'] = lowercased_genre

In [113]:
# Missing descriptions become empty strings so that every row holds text.
description_filled = df['Description'].fillna(value='')
df['Description'] = description_filled

In [114]:
# Collapse fine-grained labels into broader genres.
# Each key is the label to keep; its list holds the (lowercase) labels that
# are folded into that key.
genre_mapping = {
    'religion': ['prayer', 'theology', 'catholic', 'christianity', 'islam', 'spirituality', 'christian', 'religion'],
    'self help': ['self help', 'personal development', 'productivity'],
    'cookbooks': ['cooking', 'vegetarian', 'food', 'cookbook'],
    'biography': ['biography memoir', 'autobiography', 'memoir', 'biography'],
    'romance': ['love inspired', 'romance'],
    'history': ['historical', 'history', 'historical fiction'],
    'fantasy': ['fantasy', 'epic fantasy'],
    'health': ['health', 'medicine', 'medical', 'mental health'],
    'mystery': ['crime', 'mystery thriller', 'thriller', 'mystery'],
    'classics': ['classics', 'literature']
}

# Invert the mapping once so every row is relabelled in a single pass; a label
# that has just been rewritten can then never be remapped again by a later group.
genre_lookup = {
    old_genre: new_genre
    for new_genre, old_genres in genre_mapping.items()
    for old_genre in old_genres
}

# Labels without an entry in the lookup (including missing values) are kept as they are.
df['genre'] = df['genre'].map(lambda label: genre_lookup.get(label, label))

In [115]:
# Genres that take part in the comparison; rows with any other label are dropped.
desired_genres = [
    'mystery', 'history', 'biography', 'romance', 'cookbooks', 'science fiction',
    'fantasy', 'classics', 'health', 'religion', 'self help', 'true crime',
    'politics', 'business', 'poetry', 'westerns', 'fiction', 'nonfiction',
]

# Boolean mask of the rows whose genre is one of the desired genres.
is_desired_genre = df['genre'].isin(desired_genres)
df = df.loc[is_desired_genre]

In [116]:
# Size of the smallest genre present in df; used as the per-genre sample size
# when the genres are balanced.
rows_per_genre = df.groupby('genre').size()
min_sample_size = rows_per_genre.min()

In [117]:
# Down-sample each genre (without replacement) to min_sample_size rows so that
# every genre contributes the same number of descriptions.
resampled_dfs = []
for genre in desired_genres:
    genre_df = df[df['genre'] == genre]
    # A desired genre may have no rows in the data. resample() raises when
    # asked to draw from an empty frame with replace=False, so skip it.
    if genre_df.empty:
        continue
    # Fixed random_state keeps the drawn rows identical between runs.
    resampled_df = resample(genre_df, replace=False, n_samples=min_sample_size, random_state=42)
    resampled_dfs.append(resampled_df)

In [118]:
# Stack the per-genre samples row-wise into one balanced DataFrame
# (original row labels are kept).
df_resampled = pd.concat(resampled_dfs, axis=0, ignore_index=False)

In [119]:
# Distinct labels of the 'genre' column, in order of first appearance.
unique_genres_resampled = pd.unique(df_resampled['genre'])

In [120]:
# Accumulator for the results: maps each genre name to its similarity score.
genre_similarities_resampled = dict()

In [122]:
# Score every resampled genre against science fiction.
#
# For each genre, its descriptions and the science-fiction descriptions are
# vectorised together with TF-IDF, and the genre's score is the mean cosine
# similarity over every (genre description, sci-fi description) pair.
# A single entry of the similarity matrix would only compare one pair of
# books, so the score would depend on which two rows happen to come first.

# The science-fiction descriptions are the same for every genre; fetch them once.
sci_fi_desc = df_resampled[df_resampled['genre'] == 'science fiction']['Description'].values
if len(sci_fi_desc) == 0:
    raise ValueError("no 'science fiction' descriptions in df_resampled to compare against")

for genre in unique_genres_resampled:
    # Descriptions belonging to the current genre
    genre_desc = df_resampled[df_resampled['genre'] == genre]['Description'].values

    # Genre descriptions first, then sci-fi, so the two groups can be split
    # again by position after vectorisation.
    all_descriptions = list(genre_desc) + list(sci_fi_desc)

    # Fit TF-IDF on the combined texts so both groups share one vocabulary
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_descriptions)

    # Rows [0, n_genre) are the genre, the remaining rows are science fiction
    n_genre = len(genre_desc)
    genre_rows = tfidf_matrix[:n_genre]
    sci_fi_rows = tfidf_matrix[n_genre:]

    # Only the genre-vs-sci-fi block is needed, not the full square matrix
    similarity_scores = cosine_similarity(genre_rows, sci_fi_rows)

    # Average over all cross-genre pairs. For science fiction itself this
    # includes each description paired with its own copy.
    genre_similarity = similarity_scores.mean()

    # Store the similarity score in the dictionary
    genre_similarities_resampled[genre] = genre_similarity

In [123]:
# Tabulate the scores: one row per genre, in the dictionary's insertion order.
similarity_rows = [(genre_name, score) for genre_name, score in genre_similarities_resampled.items()]
similarities_df_resampled = pd.DataFrame(similarity_rows, columns=['Genre', 'Similarity to Sci-Fi'])

In [127]:
# Write the similarity table next to the input data, without the index column.
similarities_df_resampled.to_csv(path_or_buf='../data/genre_similarities.csv', index=False)