In [3]:
# import libraries
import pandas as pd
import numpy as np
import json


import pyarrow as pa
import pyarrow.parquet as pq
import warnings
warnings.filterwarnings("ignore")

In [5]:
df_games = pd.read_parquet('../data/df_games.parquet')
df_reviews = pd.read_parquet('../data/processed/df_reviews.parquet')
df_items = pd.read_parquet('../data/processed/df_items.parquet')


## best_developer_reviews

In [6]:
df_recs = pd.read_csv('./data/processed/df_recommendation.csv')
df_recs.head()

Unnamed: 0,user_id,rating,item_name
0,76561197970982479,5,Killing Floor
1,76561197970982479,5,Zeno Clash
2,76561197970982479,5,Metro 2033
3,js41637,5,Barbie™ Dreamhouse Party™
4,js41637,5,Euro Truck Simulator 2


In [7]:
df_developer = df_reviews.merge(df_games, left_on='item_id', right_on='id')
df_developer.head()

Unnamed: 0,user_id,posted,item_id,recommend,review,sentiment_category,genres,title,release_year,price,id,developer
0,76561197970982479,2011-11-05,1250.0,True,Simple yet with great replayability. In my opi...,2,Action,Killing Floor,2009,19.99,1250.0,Tripwire Interactive
1,76561197970982479,2011-07-15,22200.0,True,It's unique and worth a playthrough.,2,Action,Zeno Clash,2009,9.99,22200.0,ACE Team
2,76561197970982479,2011-07-15,22200.0,True,It's unique and worth a playthrough.,2,Indie,Zeno Clash,2009,9.99,22200.0,ACE Team
3,js41637,2013-09-08,227300.0,True,For a simple (it's actually not all that simpl...,2,Indie,Euro Truck Simulator 2,2013,19.99,227300.0,SCS Software
4,js41637,2013-09-08,227300.0,True,For a simple (it's actually not all that simpl...,2,Simulation,Euro Truck Simulator 2,2013,19.99,227300.0,SCS Software


In [8]:
df_developer.drop(columns=['item_id', 'id','review', 'genres', 'title', 'posted','price'],axis=1 ,inplace=True)

df_developer.head()

Unnamed: 0,user_id,recommend,sentiment_category,release_year,developer
0,76561197970982479,True,2,2009,Tripwire Interactive
1,76561197970982479,True,2,2009,ACE Team
2,76561197970982479,True,2,2009,ACE Team
3,js41637,True,2,2013,SCS Software
4,js41637,True,2,2013,SCS Software


In [9]:
df_developer = df_developer.merge(df_recs, left_on='user_id', right_on='user_id')
df_developer.head()

Unnamed: 0,user_id,recommend,sentiment_category,release_year,developer,rating,item_name
0,76561197970982479,True,2,2009,Tripwire Interactive,5,Killing Floor
1,76561197970982479,True,2,2009,Tripwire Interactive,5,Zeno Clash
2,76561197970982479,True,2,2009,Tripwire Interactive,5,Metro 2033
3,76561197970982479,True,2,2009,ACE Team,5,Killing Floor
4,76561197970982479,True,2,2009,ACE Team,5,Zeno Clash


In [10]:
df_developer.drop(columns=['item_name'],axis=1 ,inplace=True)
df_developer.head()

Unnamed: 0,user_id,recommend,sentiment_category,release_year,developer,rating
0,76561197970982479,True,2,2009,Tripwire Interactive,5
1,76561197970982479,True,2,2009,Tripwire Interactive,5
2,76561197970982479,True,2,2009,Tripwire Interactive,5
3,76561197970982479,True,2,2009,ACE Team,5
4,76561197970982479,True,2,2009,ACE Team,5


In [11]:
df_developer.duplicated().sum()

293581

In [12]:
df_developer.drop_duplicates(inplace=True)

In [13]:
def best_developer_year(year: int):

    # Filter the data for the given year
    df_year = df_developer[df_developer['release_year'] == year]

    # Calculate the number of recommendations for each developer
    dev_recommend_counts = df_year['developer'].value_counts()

    # Get the top 3 developers based on the number of recommendations
    top_devs_index = dev_recommend_counts.nlargest(3).index

    # Create a dictionary with the rankings and the corresponding developers
    rankings = ["1st place", "2nd place", "3rd place"]
    top_devs_dict = {rank: {'developer': dev, 'recommend': dev_recommend_counts[dev]} for rank, dev in zip(rankings, top_devs_index)}

    return top_devs_dict

In [14]:
best_developer_year(2015)

{'1st place': {'developer': 'Rockstar North', 'recommend': 518},
 '2nd place': {'developer': 'Kyle Seeley', 'recommend': 302},
 '3rd place': {'developer': 'Bethesda Game Studios', 'recommend': 285}}

In [163]:
def best_developer_year_by_rating(year: int):
  """
  This function finds the top 3 developers in a given year based on their average rating in the df_developer dataframe, presenting the scores as "x out of 5".

  Args:
      year (int): The year for which to find the top developers.

  Returns:
      dict: A dictionary containing the top 3 developers with their rankings and average ratings presented as "x out of 5".
  """

  # Filter data for the given year
  df_year = df_developer[df_developer['release_year'] == year]

  # Calculate average rating for each developer
  top_devs = (df_year.groupby('developer')['rating'].mean().reset_index().sort_values(by='rating', ascending=False).head(10))

  # Create rankings list
  rankings = ["1st place", "2nd place", "3rd place"]

  # Format average rating as "x out of 5"
  top_devs['rating_score'] = top_devs['rating'].apply(lambda x: f"{x:.1f} out of 5")

  # Create dictionary with rankings and formatted rating scores
  top_devs_dict = dict(zip(rankings, top_devs[['developer', 'rating_score']].to_dict('records')))

  return top_devs_dict


In [164]:
best_developer_year_by_rating(2015)

{'1st place': {'developer': 'Lionheart Creations',
  'rating_score': '5.0 out of 5'},
 '2nd place': {'developer': 'GabberGames.com', 'rating_score': '5.0 out of 5'},
 '3rd place': {'developer': 'Gatling Goat Studios',
  'rating_score': '5.0 out of 5'}}

In [204]:
pq.write_table(pa.Table.from_pandas(df_developer), './data/df_developer.parquet')

## developer_reviews_analysis

In [15]:
def developer_reviews_analysis(developer):
  """
  Analyzes developer reviews and returns a dictionary with review counts.

  Args:
      developer: The name of the developer to analyze.

  Returns:
      A dictionary with the developer name as the key and a list containing 
      the count of negative and positive reviews as values.
  """

  # Merge reviews and games on item_id and developer
  # Specify how to handle differing column names if needed


  # Count reviews by sentiment category
  review_counts = df_developer['sentiment_category'].value_counts().to_dict()

  # Convert category counts to a list with desired format
  review_list = {
      'Negative' : review_counts.get(0, 0),  # Use integer 0 for negative category
      'Positive' : review_counts.get(2, 0),  # Use integer 2 for positive category
  }

  # Create the dictionary with developer name and review counts
  return {developer: review_list}

In [16]:
developer_reviews_analysis('Re-Logic')

{'Re-Logic': {'Negative': 13949, 'Positive': 46465}}

## userforgenre

In [56]:
df_genre = df_items.merge(df_games, left_on='item_name', right_on='title')
df_genre

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever,genres,title,release_year,price,id,developer
0,76561197970982479,277,10,Counter-Strike,6.0,Action,Counter-Strike,2000,9.99,10.0,Valve
1,js41637,888,10,Counter-Strike,0.0,Action,Counter-Strike,2000,9.99,10.0,Valve
2,Riot-Punch,328,10,Counter-Strike,0.0,Action,Counter-Strike,2000,9.99,10.0,Valve
3,doctr,541,10,Counter-Strike,93.0,Action,Counter-Strike,2000,9.99,10.0,Valve
4,corrupted_soul,115,10,Counter-Strike,108.0,Action,Counter-Strike,2000,9.99,10.0,Valve
...,...,...,...,...,...,...,...,...,...,...,...
7631002,76561198071808318,10,444770,Mimpi Dreams,46.0,Indie,Mimpi Dreams,2016,7.99,444770.0,Silicon Jelly
7631003,76561198107283457,33,354280,ChaosTower,164.0,Action,ChaosTower,2016,5.99,354280.0,CashcowGames
7631004,76561198107283457,33,354280,ChaosTower,164.0,Casual,ChaosTower,2016,5.99,354280.0,CashcowGames
7631005,76561198107283457,33,354280,ChaosTower,164.0,Indie,ChaosTower,2016,5.99,354280.0,CashcowGames


In [57]:
df_genre.drop(columns=['items_count','title','id','item_name','developer','price'],axis=1 ,inplace=True)

df_genre

Unnamed: 0,user_id,item_id,playtime_forever,genres,release_year
0,76561197970982479,10,6.0,Action,2000
1,js41637,10,0.0,Action,2000
2,Riot-Punch,10,0.0,Action,2000
3,doctr,10,93.0,Action,2000
4,corrupted_soul,10,108.0,Action,2000
...,...,...,...,...,...
7631002,76561198071808318,444770,46.0,Indie,2016
7631003,76561198107283457,354280,164.0,Action,2016
7631004,76561198107283457,354280,164.0,Casual,2016
7631005,76561198107283457,354280,164.0,Indie,2016


In [58]:
print(df_genre.duplicated().sum())
df_genre.drop_duplicates(inplace=True)

10


In [59]:


# Group by user_id and count the occurrences of item_id
user_id_counts = df_genre.groupby('user_id')['item_id'].count().reset_index(name='count')
# Filter the dataframe to include only user_id's with count greater than a threshold (e.g., 1000)
filtered_user_ids = user_id_counts[user_id_counts['count'] > 300]['user_id']

# Filter df_genres based on the filtered user_ids
df_genre = df_genre[df_genre['user_id'].isin(filtered_user_ids)]


In [60]:
df_genre

Unnamed: 0,user_id,item_id,playtime_forever,genres,release_year
0,76561197970982479,10,6.0,Action,2000
1,js41637,10,0.0,Action,2000
2,Riot-Punch,10,0.0,Action,2000
3,doctr,10,93.0,Action,2000
5,cadmusthreepointoh,10,0.0,Action,2000
...,...,...,...,...,...
7630993,AQuestionableCharacter,443700,5.0,Strategy,2016
7630994,AQuestionableCharacter,443700,5.0,Early Access,2016
7630995,thekushcorner,433400,13.0,Adventure,2016
7630996,thekushcorner,433400,13.0,Indie,2016


In [61]:
df_genre['playtime_hours'] = df_genre['playtime_forever'].apply(lambda x: x/60 if x != 0 else 0)
df_genre

Unnamed: 0,user_id,item_id,playtime_forever,genres,release_year,playtime_hours
0,76561197970982479,10,6.0,Action,2000,0.100000
1,js41637,10,0.0,Action,2000,0.000000
2,Riot-Punch,10,0.0,Action,2000,0.000000
3,doctr,10,93.0,Action,2000,1.550000
5,cadmusthreepointoh,10,0.0,Action,2000,0.000000
...,...,...,...,...,...,...
7630993,AQuestionableCharacter,443700,5.0,Strategy,2016,0.083333
7630994,AQuestionableCharacter,443700,5.0,Early Access,2016,0.083333
7630995,thekushcorner,433400,13.0,Adventure,2016,0.216667
7630996,thekushcorner,433400,13.0,Indie,2016,0.216667


In [29]:
#Cuento filas
cant_filas= len(df_genres)

#Calculo la mitad
mitad_filas= cant_filas // 3
 
#Selecciono la mitad superior
df_genres= df_genre.iloc[:mitad_filas]

df_genres


Unnamed: 0,user_id,item_id,playtime_forever,genres,release_year
0,76561197970982479,10,6.0,Action,2000
1,js41637,10,0.0,Action,2000
2,Riot-Punch,10,0.0,Action,2000
3,doctr,10,93.0,Action,2000
4,corrupted_soul,10,108.0,Action,2000
...,...,...,...,...,...
2543660,76561198082169104,252490,842.0,Early Access,2013
2543661,76561198109551324,252490,51.0,Action,2013
2543662,76561198109551324,252490,51.0,Adventure,2013
2543663,76561198109551324,252490,51.0,Indie,2013


In [51]:
def UserForGenre1(genre:str):

  # Filter data for the given genre
  genre_data = df_genre[df_genre['genres'] == genre]

  # Calculate total playtime per user per year (assuming playtime_forever in minutes)
  user_year_playtime = (
      genre_data
      .groupby(['user_id', genre_data['release_year']])['playtime_forever']
      .sum()
      .apply(lambda x: x / 60)  # Convert minutes to hours
      .reset_index()
  )

  # Group by user ID and sum playtime across years
  user_playtime_total = df_genre.groupby('user_id')['playtime_forever'].sum()

  # Find user with the most playtime
  top_user_id = user_playtime_total.idxmax()

  # Filter data for the top user
  top_user_data = user_year_playtime[user_year_playtime['user_id'] == top_user_id]

  # Prepare playtime details
  playtime_details = [
      {'year': row["release_year"], 'hours': round(row["playtime_forever"], 2)}
      for _, row in top_user_data.iterrows()
  ]

  # Return user details dictionary
  return {
      "genre": genre,
      "user_id": top_user_id,
      "Hours played": playtime_details
  }


In [62]:
def UserForGenre(genre:str):
  # Filter data for the given genre
  genre_data = df_genre[df_genre['genres'].apply(lambda x: genre in x)]

  # Calculate total playtime per user per year (assuming playtime_forever in hours)
  user_year_playtime = (
      genre_data
      .groupby(['user_id', genre_data['release_year']])['playtime_hours']
      .sum()
      .reset_index()
  )

  # Group by user ID and sum playtime across years
  user_playtime_total = df_genre.groupby('user_id')['playtime_hours'].sum()

  # Find user with the most playtime
  top_user_id = user_playtime_total.idxmax()

  # Filter data for the top user
  top_user_data = user_year_playtime[user_year_playtime['user_id'] == top_user_id]

  # Prepare playtime details
  playtime_details = [
      {'year': row["release_year"], 'hours': round(row["playtime_hours"], 2)}
      for _, row in top_user_data.iterrows()
  ]

  # Return user details dictionary
  return {
      "genre": genre,
      "user_id": top_user_id,
      "Hours played": playtime_details
  }

In [63]:
UserForGenre('Adventure')

{'genre': 'Adventure',
 'user_id': 'REBAS_AS_F-T',
 'Hours played': [{'year': 2002, 'hours': 0.0},
  {'year': 2003, 'hours': 31.05},
  {'year': 2005, 'hours': 18.55},
  {'year': 2006, 'hours': 0.0},
  {'year': 2007, 'hours': 0.0},
  {'year': 2008, 'hours': 20.68},
  {'year': 2009, 'hours': 281.88},
  {'year': 2010, 'hours': 267.82},
  {'year': 2011, 'hours': 916.9},
  {'year': 2012, 'hours': 1797.15},
  {'year': 2013, 'hours': 3428.13},
  {'year': 2014, 'hours': 2582.68},
  {'year': 2015, 'hours': 9590.17},
  {'year': 2016, 'hours': 11928.88},
  {'year': 2017, 'hours': 543.67}]}

In [64]:
pq.write_table(pa.Table.from_pandas(df_genre), '../data/df_genre.parquet')

## userdata

In [90]:
df_items['item_id'] = pd.to_numeric(df_items['item_id'])
df_userdata = df_items.merge(df_games, left_on='item_id', right_on='id')
df_userdata.head()

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever,genres,title,release_year,price,id,developer
0,76561197970982479,277,10,Counter-Strike,6.0,Action,Counter-Strike,2000,9.99,10.0,Valve
1,76561197970982479,277,20,Team Fortress Classic,0.0,Action,Team Fortress Classic,1999,4.99,20.0,Valve
2,76561197970982479,277,30,Day of Defeat,7.0,Action,Day of Defeat,2003,4.99,30.0,Valve
3,76561197970982479,277,40,Deathmatch Classic,0.0,Action,Deathmatch Classic,2001,4.99,40.0,Valve
4,76561197970982479,277,50,Half-Life: Opposing Force,0.0,Action,Half-Life: Opposing Force,1999,4.99,50.0,Gearbox Software


In [91]:
df_userdata.drop(columns=['playtime_forever','title','id','item_name', 'release_year','developer','genres'],axis=1 ,inplace=True)

df_userdata.head()

Unnamed: 0,user_id,items_count,item_id,price
0,76561197970982479,277,10,9.99
1,76561197970982479,277,20,4.99
2,76561197970982479,277,30,4.99
3,76561197970982479,277,40,4.99
4,76561197970982479,277,50,4.99


In [92]:
df_userdata = df_userdata.merge(df_reviews, left_on='user_id', right_on='user_id')
df_userdata.head()

Unnamed: 0,user_id,items_count,item_id_x,price,posted,item_id_y,recommend,review,sentiment_category
0,76561197970982479,277,10,9.99,2011-11-05,1250.0,True,Simple yet with great replayability. In my opi...,2
1,76561197970982479,277,10,9.99,2011-07-15,22200.0,True,It's unique and worth a playthrough.,2
2,76561197970982479,277,10,9.99,2011-04-21,43110.0,True,Great atmosphere. The gunplay can be a bit chu...,2
3,76561197970982479,277,20,4.99,2011-11-05,1250.0,True,Simple yet with great replayability. In my opi...,2
4,76561197970982479,277,20,4.99,2011-07-15,22200.0,True,It's unique and worth a playthrough.,2


In [93]:
df_userdata.drop(columns=['posted','item_id_y','review', 'sentiment_category'],axis=1 ,inplace=True)
df_userdata.head()

Unnamed: 0,user_id,items_count,item_id_x,price,recommend
0,76561197970982479,277,10,9.99,True
1,76561197970982479,277,10,9.99,True
2,76561197970982479,277,10,9.99,True
3,76561197970982479,277,20,4.99,True
4,76561197970982479,277,20,4.99,True


In [94]:
# change name of column 'item_id_x' in df_userdata  to item_id
df_userdata.rename(columns={"item_id_x": "item_id"}, inplace=True)
df_userdata.head()

Unnamed: 0,user_id,items_count,item_id,price,recommend
0,76561197970982479,277,10,9.99,True
1,76561197970982479,277,10,9.99,True
2,76561197970982479,277,10,9.99,True
3,76561197970982479,277,20,4.99,True
4,76561197970982479,277,20,4.99,True


In [95]:
df_userdata.drop(columns=['item_id'],axis=1 ,inplace=True)
df_userdata.head()

Unnamed: 0,user_id,items_count,price,recommend
0,76561197970982479,277,9.99,True
1,76561197970982479,277,9.99,True
2,76561197970982479,277,9.99,True
3,76561197970982479,277,4.99,True
4,76561197970982479,277,4.99,True


In [96]:
df_userdata.head()

Unnamed: 0,user_id,items_count,price,recommend
0,76561197970982479,277,9.99,True
1,76561197970982479,277,9.99,True
2,76561197970982479,277,9.99,True
3,76561197970982479,277,4.99,True
4,76561197970982479,277,4.99,True


In [201]:
df_userdata.duplicated().sum()


11606039

In [202]:
df_userdata.drop_duplicates(inplace=True)

In [203]:
pq.write_table(pa.Table.from_pandas(df_userdata), './data/df_userdata.parquet')

In [224]:
@profile
def userdata(user_id: str):

    # Filter user_items by user_id
    user_items = df_userdata.loc[df_userdata['user_id'] == user_id]

    # Calculate money spent
    money_spent = user_items['price'].sum()

    # Get the number of items
    number_of_items = float(user_items['items_count'].unique()[0])

    # Filter and count only True values (recommendations)
    user_recommendations = user_items['recommend']
    recommend_rate = user_recommendations.where(user_recommendations == True).count() 

    # Calculate total items to avoid division by zero
    total_items = user_items['items_count'].sum()

    # Calculate recommendation rate (avoiding division by zero)
    recommend_rate = recommend_rate / total_items if total_items > 0 else 0

    user_data = {
        'user id': user_id,
        'money spent': round(money_spent, 2),
        'number of items': number_of_items,
        'recommend rate': round(recommend_rate,3)
    }
    return user_data

In [225]:
userdata('76561197970982479')

ERROR: Could not find file /var/folders/63/bjbs5j7d0r1c76dg_t4j0d6m0000gn/T/ipykernel_75707/3848202824.py


{'user id': '76561197970982479',
 'money spent': 294.32,
 'number of items': 277.0,
 'recommend rate': 0.004}

In [222]:
userdata('Derp-e')

{'user id': 'Derp-e',
 'money spent': 89.95,
 'number of items': 18.0,
 'recommend rate': 0.056}