In [32]:
import pandas as pd
import numpy as np
import pandas.api.types as pdtypes

import warnings
warnings.filterwarnings("ignore")

In [33]:
df_games = pd.read_parquet('../data/df_games.parquet')
df_reviews = pd.read_parquet('../data/processed/df_reviews.parquet')
df_items = pd.read_parquet('../data/processed/df_items.parquet')

#### def developer_stats(dev)
Number of items and percentage of free content per year by developer company

In [34]:
def developer(developer):
    # Filter the dataframe by developer
    df_dev = df_games[df_games['developer'] == developer]
    
    # Calculate the quantity of items released by year
    items_by_year = df_dev.groupby(df_dev['release_year'])['id'].count()
    
    # Calculate the percentage of free content
    free_content_by_year = (df_dev[df_dev['price'] == 0].groupby(df_dev['release_year'])['id'].count() / items_by_year * 100).fillna(0)
    
    # Create a list of dictionaries with the results
    result = [{'Year': int(year), 'Items Released': int(count), '% of Free Content': percent} for year, count, percent in zip(items_by_year.index, items_by_year.values, free_content_by_year.values)]
    
    # Convert the list to a dictionary
    output = {f'Year: {item["Year"]}': {'Items Released': item['Items Released'], '% of Free Content': item['% of Free Content']} for item in result}
    
    return output

In [35]:
developer('Valve')

{'Year: 1998': {'Items Released': 1, '% of Free Content': 0.0},
 'Year: 1999': {'Items Released': 1, '% of Free Content': 0.0},
 'Year: 2000': {'Items Released': 2, '% of Free Content': 0.0},
 'Year: 2001': {'Items Released': 1, '% of Free Content': 0.0},
 'Year: 2003': {'Items Released': 1, '% of Free Content': 0.0},
 'Year: 2004': {'Items Released': 5, '% of Free Content': 0.0},
 'Year: 2006': {'Items Released': 2, '% of Free Content': 0.0},
 'Year: 2007': {'Items Released': 4, '% of Free Content': 50.0},
 'Year: 2008': {'Items Released': 1, '% of Free Content': 0.0},
 'Year: 2009': {'Items Released': 1, '% of Free Content': 0.0},
 'Year: 2010': {'Items Released': 2, '% of Free Content': 50.0},
 'Year: 2011': {'Items Released': 2, '% of Free Content': 0.0},
 'Year: 2012': {'Items Released': 1, '% of Free Content': 0.0},
 'Year: 2016': {'Items Released': 1, '% of Free Content': 100.0},
 'Year: 2017': {'Items Released': 3, '% of Free Content': 0.0}}

#### def userdata(user_id)
per user returns a dictionary containing: {user id: amount of money spent, percentage of recommendation, number of items}

In [16]:
def userdata(user_id):

    df_items['item_id'] = df_items['item_id'].astype(float)
    user_items = df_items.loc[df_items['user_id'] == user_id]
    money_spent = user_items.merge(df_games, left_on='item_id', right_on='id')['price'].sum()
    number_of_items = float(user_items['items_count'].unique()[0])

    # Filter and count only True values (recommendations)
    user_recommendations = user_items.merge(df_reviews, left_on='item_id', right_on='item_id')['recommend']
    recommend_rate = user_recommendations.where(user_recommendations == True).count() 

    # Calculate total items to avoid division by zero
    total_items = user_items['items_count'].sum()

    # Calculate recommendation rate (avoiding division by zero)
    recommend_rate = recommend_rate / total_items if total_items > 0 else 0

    user_data = {
        'user id': user_id,
        'money spent': round(money_spent, 2),
        'number of items': round(number_of_items,3),
        'recommend rate': recommend_rate
    }

    return user_data



In [17]:
userdata('thugnificent')

{'user id': 'thugnificent',
 'money spent': 98499.16,
 'number of items': 6700.0,
 'recommend rate': 0.000719380708398307}

In [18]:
userdata('Derp-e')

{'user id': 'Derp-e',
 'money spent': 369.75,
 'number of items': 18.0,
 'recommend rate': 21.108024691358025}

#### def UserForGenre (genre)
Returns the user with the most hours played in a given genre, playtime divided by year: {'genre': 'user_id', "Hours played": [{year: yyyy, hours: n},{year: yyyy, hours: n},{year: yyyy, hours: n}]}

In [19]:
def UserForGenre(genero:str):

  # Merge DataFrames based on a common column (e.g., 'item_name')
  df_merged = df_items.merge(df_games, left_on='item_name', right_on='title')

  # Filter data for the given genre
  genre_data = df_merged[df_merged['genres'] == genero]

  # Calculate total playtime per user per year (assuming playtime_forever in minutes)
  user_year_playtime = (
      genre_data
      .groupby(['user_id', genre_data['release_year']])['playtime_forever']
      .sum()
      .apply(lambda x: x / 60)  # Convert minutes to hours
      .reset_index()
  )

  # Group by user ID and sum playtime across years
  user_playtime_total = user_year_playtime.groupby('user_id')['playtime_forever'].sum()

  # Find user with the most playtime
  top_user_id = user_playtime_total.idxmax()

  # Filter data for the top user
  top_user_data = user_year_playtime[user_year_playtime['user_id'] == top_user_id]

  # Prepare playtime details
  playtime_details = [
      {'year': row["release_year"], 'hours': round(row["playtime_forever"], 2)}
      for _, row in top_user_data.iterrows()
  ]

  # Return user details dictionary
  return {
      "genre": genero,
      "user_id": top_user_id,
      "Hours played": playtime_details
  }


In [20]:
UserForGenre('Action')

{'genre': 'Action',
 'user_id': 'shinomegami',
 'Hours played': [{'year': 1988, 'hours': 2.27},
  {'year': 1991, 'hours': 0.0},
  {'year': 1992, 'hours': 0.0},
  {'year': 1993, 'hours': 0.0},
  {'year': 1994, 'hours': 0.0},
  {'year': 1995, 'hours': 0.03},
  {'year': 1996, 'hours': 19.35},
  {'year': 1997, 'hours': 1.65},
  {'year': 1998, 'hours': 61.93},
  {'year': 1999, 'hours': 5.05},
  {'year': 2000, 'hours': 159.53},
  {'year': 2001, 'hours': 40.48},
  {'year': 2002, 'hours': 0.0},
  {'year': 2003, 'hours': 8879.27},
  {'year': 2004, 'hours': 2.0},
  {'year': 2005, 'hours': 5.72},
  {'year': 2006, 'hours': 34.05},
  {'year': 2007, 'hours': 1139.23},
  {'year': 2008, 'hours': 3.0},
  {'year': 2009, 'hours': 3496.75},
  {'year': 2010, 'hours': 156.75},
  {'year': 2011, 'hours': 1484.37},
  {'year': 2012, 'hours': 300.65},
  {'year': 2013, 'hours': 2439.15},
  {'year': 2014, 'hours': 1519.92},
  {'year': 2015, 'hours': 1690.82},
  {'year': 2016, 'hours': 709.38},
  {'year': 2017, 'ho

#### def best_developer_year(year)
returns top 3 most recommended developers in a given year. [{First place: X},{Second place: Y},{Third place: Z}]

In [5]:
def best_developer_year(year):
    df_merged = df_reviews.merge(df_games, left_on='item_id', right_on='id')
    df_merged['posted'] = pd.to_datetime(df_merged['posted'])
  # Filter data for the given year
    df_year = df_merged[df_merged['posted'].dt.year == year]
    top_devs = (df_year.groupby('developer')['recommend'].count().reset_index().sort_values(by='recommend', ascending=False).head(3))

    rankings = ["1st place", "2nd place", "3rd place"]
    top_devs_dict = dict(zip(rankings, top_devs.to_dict('records')))

    return top_devs_dict


In [6]:
best_developer_year(2015)

{'1st place': {'developer': 'Valve', 'recommend': 3500},
 '2nd place': {'developer': 'Facepunch Studios', 'recommend': 2050},
 '3rd place': {'developer': 'Smartly Dressed Games', 'recommend': 1590}}

In [117]:
best_developer_year(2001)

{}

In [21]:
def best_developer_year2(year):
    df_merged = df_reviews.merge(df_games, left_on='item_id', right_on='id')
  # Filter data for the given year
    df_year = df_merged[df_merged['release_year'] == year]
    top_devs = (df_year.groupby('developer')['recommend'].count().reset_index().sort_values(by='recommend', ascending=False).head(3))

    rankings = ["1st place", "2nd place", "3rd place"]
    top_devs_dict = dict(zip(rankings, top_devs.to_dict('records')))

    return top_devs_dict

In [22]:
best_developer_year2(2001)

{'1st place': {'developer': 'Firaxis Games', 'recommend': 4},
 '2nd place': {'developer': 'FireFly Studios', 'recommend': 4},
 '3rd place': {'developer': 'Remedy Entertainment', 'recommend': 4}}

#### def developer_reviews_analysis(developer)
per developer, returns a dictionary containing the total count of reviews for each type: {'developer': [Negative = n, positive = p]}

In [4]:
def developer_reviews_analysis(developer):
  """
  Analyzes developer reviews and returns a dictionary with review counts.

  Args:
      developer: The name of the developer to analyze.

  Returns:
      A dictionary with the developer name as the key and a list containing 
      the count of negative and positive reviews as values.
  """

  # Merge reviews and games on item_id and developer
  # Specify how to handle differing column names if needed
  merged_df = df_games.merge(df_reviews, left_on='id', right_on='item_id', how='inner')

  # Count reviews by sentiment category
  review_counts = merged_df['sentiment_category'].value_counts().to_dict()

  # Convert category counts to a list with desired format
  review_list = {
      'Negative' : review_counts.get(0, 0),  # Use integer 0 for negative category
      'Positive' : review_counts.get(2, 0),  # Use integer 2 for positive category
  }

  # Create the dictionary with developer name and review counts
  return {developer: review_list}


In [5]:
developer_reviews_analysis('Re-Logic')

{'Re-Logic': {'Negative': 15941, 'Positive': 65280}}