# EDA & Data Loading

___
## Games Dataset


In [26]:
# Import libraries

# Data Manipulation
import pandas as pd
import numpy as np
import project_data_utils

# NLP libraries
from textblob import TextBlob
from textblob import Word
import spacy_fastlang
import spacy
from spacy import displacy
import nltk

from nltk.probability import FreqDist
import os
from collections import Counter
import statistics

from tqdm import tqdm, tqdm_notebook

# Data vizualization Libraries
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [2]:
# Load list of games
df_games = pd.read_csv("data/metacritic_games_full.csv" ,index_col=0)
df_games['user_score'] = pd.to_numeric(df_games['user_score'],errors='coerce')
df_games['metacritic_user_differenc'] = df_games['metascore'] - df_games['user_score']
# Create metascore and user discrepancy and tranform to numeric
df_games.sort_values(by='metacritic_user_differenc')
df_games

Unnamed: 0,title,platform,release_date,other_platforms,metascore,user_score,developer,publisher,genre,players,rating,summary,url,metacritic_user_differenc
0,Persona 5 Royal,pc,"Oct 21, 2022","['PlayStation 4', 'PlayStation 5', 'Switch', '...",97.0,8.9,Atlus,Sega,"['Role-Playing', ', ...",No Online Multiplayer,,Prepare for an all-new RPG experience in Perso...,https://metacritic.com/game/pc/persona-5-royal,88.1
1,Elden Ring,xbox-series-x,"Feb 25, 2022","['PC', 'PlayStation 4', 'PlayStation 5', 'Xbox...",96.0,7.7,From Software,Bandai Namco Games,"['Role-Playing', ', ...",Up to 4,M,A New World Created By Hidetaka Miyazaki And G...,https://metacritic.com/game/xbox-series-x/elde...,88.3
2,Elden Ring,playstation-5,"Feb 25, 2022","['PC', 'PlayStation 4', 'Xbox One', 'Xbox Seri...",96.0,7.9,From Software,Bandai Namco Games,"['Role-Playing', ', ...",Up to 4,M,A New World Created By Hidetaka Miyazaki And G...,https://metacritic.com/game/playstation-5/elde...,88.1
3,Portal Companion Collection,switch,"Jun 28, 2022",,96.0,8.3,Valve Software,Valve Software,"['Miscellaneous', ', ...",,,"Including Portal and Portal 2, the Companion C...",https://metacritic.com/game/switch/portal-comp...,87.7
4,Persona 5 Royal,xbox-series-x,"Oct 21, 2022","['PC', 'PlayStation 4', 'PlayStation 5', 'Swit...",95.0,8.4,Atlus,Sega,"['Role-Playing', ', ...",No Online Multiplayer,M,Prepare for an all-new RPG experience in Perso...,https://metacritic.com/game/xbox-series-x/pers...,86.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19634,Resident Evil: Survivor,playstation,"Aug 30, 2000",,39.0,6.2,Capcom,Capcom,"['Action Adventure', ', ...",1 Player,M,At last a first-person shooting gamer based in...,https://metacritic.com/game/playstation/reside...,32.8
19635,ECW Anarchy Rulz,dreamcast,"Dec 30, 2000",,38.0,6.8,Acclaim Studios Salt Lake City,Acclaim,"['Action', ', ...",,M,Anarchy in the USA!\r *All new control system!...,https://metacritic.com/game/dreamcast/ecw-anar...,31.2
19636,Duke Nukem: Land of the Babes,playstation,"Sep 19, 2000",,37.0,6.9,n-Space,GT Interactive,"['Action', ', ...",1-2,M,I am basing this review on the four votes tha...,https://metacritic.com/game/playstation/duke-n...,30.1
19637,Mortal Kombat: Special Forces,playstation,"Jun 30, 2000",,28.0,2.8,Midway,Midway,"['Action', ', ...",1 Player,M,Игра унылейшая. В свое время купил её думая чт...,https://metacritic.com/game/playstation/mortal...,25.2


In [3]:
# Check dtypes
df_games.dtypes

title                         object
platform                      object
release_date                  object
other_platforms               object
metascore                    float64
user_score                   float64
developer                     object
publisher                     object
genre                         object
players                       object
rating                        object
summary                       object
url                           object
metacritic_user_differenc    float64
dtype: object

## User Reviews Dataset

In [4]:
df_user_rev = pd.read_csv("data/metacritic_user_reviews_full.csv")
df_user_rev_full = df_user_rev # full copy
df_user_rev.head()

Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page
0,0,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,10,Trix122,"\n100+ h of main story , good gameplay and pro...","Oct 25, 2022",user,https://metacritic.com/game/pc/persona-5-royal...
1,1,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,7,Runwin,El juego tiene una cantidad de relleno abrumad...,"Oct 26, 2022",user,https://metacritic.com/game/pc/persona-5-royal...
2,2,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,10,Godskrieg,\nAbsolute god of recent JRPGs.\rIf you like t...,"Oct 24, 2022",user,https://metacritic.com/game/pc/persona-5-royal...
3,3,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,10,jackspade152,\nSimply amazing. I finished this Game on ps4 ...,"Oct 25, 2022",user,https://metacritic.com/game/pc/persona-5-royal...
4,4,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,10,MLGANDREWPLAYS,People have been waiting for a long time for t...,"Oct 25, 2022",user,https://metacritic.com/game/pc/persona-5-royal...


### Title aggreagted data
Remove platforma granularity.

In [5]:
# Group by title so we remove the platform granularity
df_user_rev.groupby(['title']).size().sort_values()

# Group by and perform and a series of operations
df_user_rev = df_user_rev.groupby(['title']).agg({'url':'count', 'user_score':'mean', 'reviewer':'count'}).sort_values(by='url')
df_user_rev = df_user_rev.rename(columns={'url':'num_user_reviews', 'user_score': 'avg_user_score', 'reviewer':'num_user_reviewers'})
df_user_rev.sort_values(by='num_user_reviews', ascending=False)

Unnamed: 0_level_0,num_user_reviews,avg_user_score,num_user_reviewers
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cyberpunk 2077,12058,5.056394,12058
Warcraft III: Reforged,10100,0.070198,10100
The Last of Us Part II,10100,4.112376,10100
Ghost of Tsushima,9505,9.289637,9505
Death Stranding,8764,7.219763,8764
...,...,...,...
Trulon: The Shadow Engine,1,6.000000,1
Tumblestone,1,0.000000,1
Fallen Legion: Revenants,1,9.000000,1
Oblitus,1,4.000000,1


## Critic Reviews Dataset

In [6]:
df_critic_rev = pd.read_csv("data/metacritic_critic_reviews_full.csv")
df_critic_rev_full = df_critic_rev #full copy
df_critic_rev.head()

Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page
0,0,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,99,Game Rant,From its captivating music and memorable story...,"Oct 17, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...
1,1,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,97,God is a Geek,Persona 5 Royal is quite frankly one of the be...,"Oct 17, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...
2,2,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,95,The Mako Reactor,Persona 5 Royal was already the best version o...,"Oct 17, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...
3,3,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,95,Hooked Gamers,Persona 5 Royal is chock full of amazing music...,"Oct 28, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...
4,4,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,94,Hobby Consolas,"If you love RPGs and didn't play it on PS4, do...","Oct 17, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...


In [7]:
df_critic_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433384 entries, 0 to 433383
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   433384 non-null  int64 
 1   url          433384 non-null  object
 2   title        433384 non-null  object
 3   platform     433384 non-null  object
 4   user_score   433384 non-null  int64 
 5   reviewer     433384 non-null  object
 6   review       433362 non-null  object
 7   date         244119 non-null  object
 8   review_type  433384 non-null  object
 9   url_page     433384 non-null  object
dtypes: int64(2), object(8)
memory usage: 33.1+ MB


### Title Aggregated Data
Remove platforma granularity.

In [8]:
# Group by title so we remove the platform granularity
df_critic_rev.groupby(['title']).size().sort_values()

df_critic_rev = df_critic_rev.groupby(['title']).agg({'url':'count', 'user_score':'mean', 'reviewer':'count'}).sort_values(by='url')
df_critic_rev = df_critic_rev.rename(columns={'url':'num_critic_reviews', 'user_score': 'avg_critic_score', 'reviewer':'num_critic_reviewers'})

In [9]:
df_critic_rev

Unnamed: 0_level_0,num_critic_reviews,avg_critic_score,num_critic_reviewers
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EyePet: Move Edition,1,91.000000,1
Kingdom Hearts 358/2 Days,1,90.000000,1
BioShock 2: Minerva's Den,1,91.000000,1
DCS: Black Shark,1,94.000000,1
Tournament of Legends,1,80.000000,1
...,...,...,...
Madden NFL 07,224,80.156250,224
X-Men: The Official Game,224,52.843750,224
Marvel: Ultimate Alliance,229,80.371179,229
Resident Evil 5,231,81.155844,231


## Join Games with User and Critic Reviews


In [10]:
# join on user_rev
df = df_games.merge(df_user_rev, on='title', how='left', indicator=True)
df = df.rename(columns={'_merge': 'merge_indicator_user_revs'})

In [11]:
# join on critic_rev
df = df.merge(df_critic_rev, on='title', how='left', indicator=True)
df = df.rename(columns={'_merge': 'merge_indicator_critic_revs'})
df

Unnamed: 0,title,platform,release_date,other_platforms,metascore,user_score,developer,publisher,genre,players,...,url,metacritic_user_differenc,num_user_reviews,avg_user_score,num_user_reviewers,merge_indicator_user_revs,num_critic_reviews,avg_critic_score,num_critic_reviewers,merge_indicator_critic_revs
0,Persona 5 Royal,pc,"Oct 21, 2022","['PlayStation 4', 'PlayStation 5', 'Switch', '...",97.0,8.9,Atlus,Sega,"['Role-Playing', ', ...",No Online Multiplayer,...,https://metacritic.com/game/pc/persona-5-royal,88.1,973.0,8.861254,973.0,both,70.0,91.314286,70.0,both
1,Elden Ring,xbox-series-x,"Feb 25, 2022","['PC', 'PlayStation 4', 'PlayStation 5', 'Xbox...",96.0,7.7,From Software,Bandai Namco Games,"['Role-Playing', ', ...",Up to 4,...,https://metacritic.com/game/xbox-series-x/elde...,88.3,7070.0,7.438755,7070.0,both,89.0,90.988764,89.0,both
2,Elden Ring,playstation-5,"Feb 25, 2022","['PC', 'PlayStation 4', 'Xbox One', 'Xbox Seri...",96.0,7.9,From Software,Bandai Namco Games,"['Role-Playing', ', ...",Up to 4,...,https://metacritic.com/game/playstation-5/elde...,88.1,7070.0,7.438755,7070.0,both,89.0,90.988764,89.0,both
3,Portal Companion Collection,switch,"Jun 28, 2022",,96.0,8.3,Valve Software,Valve Software,"['Miscellaneous', ', ...",,...,https://metacritic.com/game/switch/portal-comp...,87.7,28.0,7.250000,28.0,both,5.0,93.000000,5.0,both
4,Persona 5 Royal,xbox-series-x,"Oct 21, 2022","['PC', 'PlayStation 4', 'PlayStation 5', 'Swit...",95.0,8.4,Atlus,Sega,"['Role-Playing', ', ...",No Online Multiplayer,...,https://metacritic.com/game/xbox-series-x/pers...,86.6,973.0,8.861254,973.0,both,70.0,91.314286,70.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19634,Resident Evil: Survivor,playstation,"Aug 30, 2000",,39.0,6.2,Capcom,Capcom,"['Action Adventure', ', ...",1 Player,...,https://metacritic.com/game/playstation/reside...,32.8,,,,left_only,10.0,47.700000,10.0,both
19635,ECW Anarchy Rulz,dreamcast,"Dec 30, 2000",,38.0,6.8,Acclaim Studios Salt Lake City,Acclaim,"['Action', ', ...",,...,https://metacritic.com/game/dreamcast/ecw-anar...,31.2,,,,left_only,15.0,40.133333,15.0,both
19636,Duke Nukem: Land of the Babes,playstation,"Sep 19, 2000",,37.0,6.9,n-Space,GT Interactive,"['Action', ', ...",1-2,...,https://metacritic.com/game/playstation/duke-n...,30.1,,,,left_only,7.0,43.285714,7.0,both
19637,Mortal Kombat: Special Forces,playstation,"Jun 30, 2000",,28.0,2.8,Midway,Midway,"['Action', ', ...",1 Player,...,https://metacritic.com/game/playstation/mortal...,25.2,,,,left_only,7.0,31.000000,7.0,both


## Define List of Games to be Analysized


In [12]:
print('before droppping rows with null reviews: ', df.shape)
print('after droppping rows with null reviews: ', df.dropna(subset=['num_user_reviews']).shape)

# drop games without user reviews
df = df.dropna(subset=['num_user_reviews'])
df.shape

before droppping rows with null reviews:  (19639, 22)
after droppping rows with null reviews:  (10751, 22)


(10751, 22)

In [16]:
# Get only one platform.
# !Change to include averages
df = df.groupby('title').first()
df = df.sort_values(by='num_user_reviews', ascending=False).head(1)

In [17]:
# Get the top N games and pass to a list
lst_games = df.index.tolist()
print(lst_games)
print("Number of games to be analyzed: ", len(lst_games))

['Cyberpunk 2077']
Number of games to be analyzed:  1


# NLP

## NLP Pre-process

### Select Workig dataframe for NLP task

In [18]:
# Select only a subset of the reviews
df_user_rev = df_user_rev_full[df_user_rev_full['title'].isin(lst_games)]
print('Number of reviews: ' ,df_user_rev.shape)

Number of reviews:  (12058, 10)


In [19]:
df_user_rev

Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page
33592,33592,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,0,supportgoodgame,\nThis is not fun open world game like GTA 4/5...,"Feb 17, 2022",user,https://metacritic.com/game/playstation-5/cybe...
33593,33593,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,1,Jennifer_,\nOne of the worst AAA games of all time. Bori...,"Feb 16, 2022",user,https://metacritic.com/game/playstation-5/cybe...
33594,33594,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,0,JustHonestGamer,Cyberpunk 2077 was in development about 8 year...,"Feb 17, 2022",user,https://metacritic.com/game/playstation-5/cybe...
33595,33595,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,0,lonelydaysover,\nThis patch didn't fix pretty much anything t...,"Feb 17, 2022",user,https://metacritic.com/game/playstation-5/cybe...
33596,33596,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,0,OnlyOneHere,This game is still a huge steaming pile of gar...,"Feb 16, 2022",user,https://metacritic.com/game/playstation-5/cybe...
...,...,...,...,...,...,...,...,...,...,...
199103,199103,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,1,Vincents,\nUnfinished. Too many bugs. Need to do ALL si...,"Dec 29, 2020",user,https://metacritic.com/game/playstation-4/cybe...
199104,199104,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,1,YSA2015,لعبة خايسة مستحيل تستمتع باللعبة علي اي منصة غ...,"Dec 30, 2020",user,https://metacritic.com/game/playstation-4/cybe...
199105,199105,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,4,Crazyclown05,This has little to no effort put into AI. The ...,"Dec 31, 2020",user,https://metacritic.com/game/playstation-4/cybe...
199106,199106,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,yaindmachine,**** this goddamn studio for lying so god damn...,"Jan 1, 2021",user,https://metacritic.com/game/playstation-4/cybe...


#### Distribution of Number of Words per Review

In [20]:
# Create word Count 
df_user_rev['word_count'] = df_user_rev['review'].apply(lambda x: len(str(x).split()))

df_user_rev['word_count'].iplot(
    kind='hist',
    bins=300,
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Review Text Length Distribution',
    )

#### [TODO] Statistical Summary of the NLP's Data Frame

### Clean Text

In [21]:
# Function to clean the text
def preprocess(text):
    text = text.replace('\n', "")
    text = text.replace('\r', "")
    return text

In [23]:
tqdm.pandas()
df_user_rev['review'] = df_user_rev['review'].progress_apply(lambda x: preprocess(x))

100%|██████████| 12058/12058 [00:00<00:00, 416264.74it/s]


In [24]:
#inspect the text
print('3 random review \n')
cl = df_user_rev.sample(3)['review'].values
for c in cl:
    print(c)

3 random review 

Игра для ПК. Консоли идут в сторону и кукурекают на 0 баллов
just meh..................................................................
So I played the game on a fairly budget pc (ryzen 5 1600 /rx 580 8gb) with optimised 60fps settings (high textures and everything else on medium low with cas res scale to 80% and a fps lock) and I didn't experience so many immersion breaking bugs or performance hiccups (it really felt like the farcry experience and that's not so much a compliment knowing how buggy and fun inthe same time ubisoft games are). The only really annoying bug was the lod  low polygon charachters some times after loading the game because I've installed it on a hddd (like the consoles so even with the really slow iw hdd setting.. It still has issues with mechanical drives even if mine is faster than a console ). Still I had and I'm still having plentiful of fun. I actually thought I woukd hate the combat but i feels fun. Driving might be  needing some improvem

### Detect and Remove non Enlgish Reviews

Samping the data we noticed that the dataset contained reviews in multiple languagues. In this section, we'll investitage this.

In [25]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("language_detector")
def detectLanguage(text):
    text = ' '.join(text.split()[:5]) # In order to save memory, analyse only the first 5 workds
    doc = nlp(text)
    return doc._.language

# print(detectLanguage("I am happy"))



In [26]:
# Create languae_review colum
print(f'Number of rows before removing non engligh words: {df_user_rev.shape[0]} \n')
df_user_rev['language_review'] = df_user_rev['review'].progress_map(lambda text: detectLanguage(text))

Number of rows before removing non engligh words: 12058 



100%|██████████| 12058/12058 [01:36<00:00, 124.31it/s]


In [29]:
# Inspect number of reviews per language
df_user_rev.groupby(['language_review']).size().sort_values().tail()

# df_user_rev[df_user_rev['language_review'] == 'es']['review'].sample(5).values

language_review
pl     191
es     323
pt     345
ru    1337
en    9356
dtype: int64

12058

In [34]:
print("Before filtering out non-english reviews: ", len(df_user_rev['language_review']))
# select languages with only English Reviews
df_user_rev = df_user_rev[df_user_rev['language_review'] == 'en']
print("Only english reviews: ", len(df_user_rev['language_review']))
df_user_rev

Before filtering out non-english reviews:  12058
Only english reviews:  9356


Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page,word_count,language_review
33592,33592,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,0,supportgoodgame,This is not fun open world game like GTA 4/5 o...,"Feb 17, 2022",user,https://metacritic.com/game/playstation-5/cybe...,41,en
33593,33593,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,1,Jennifer_,One of the worst AAA games of all time. Boring...,"Feb 16, 2022",user,https://metacritic.com/game/playstation-5/cybe...,20,en
33594,33594,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,0,JustHonestGamer,Cyberpunk 2077 was in development about 8 year...,"Feb 17, 2022",user,https://metacritic.com/game/playstation-5/cybe...,210,en
33595,33595,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,0,lonelydaysover,This patch didn't fix pretty much anything tha...,"Feb 17, 2022",user,https://metacritic.com/game/playstation-5/cybe...,49,en
33596,33596,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,0,OnlyOneHere,This game is still a huge steaming pile of gar...,"Feb 16, 2022",user,https://metacritic.com/game/playstation-5/cybe...,445,en
...,...,...,...,...,...,...,...,...,...,...,...,...
199101,199101,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,kbaylake,This is the worst game ever runs worse than a ...,"Dec 28, 2020",user,https://metacritic.com/game/playstation-4/cybe...,31,en
199103,199103,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,1,Vincents,Unfinished. Too many bugs. Need to do ALL side...,"Dec 29, 2020",user,https://metacritic.com/game/playstation-4/cybe...,47,en
199105,199105,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,4,Crazyclown05,This has little to no effort put into AI. The ...,"Dec 31, 2020",user,https://metacritic.com/game/playstation-4/cybe...,424,en
199106,199106,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,yaindmachine,**** this goddamn studio for lying so god damn...,"Jan 1, 2021",user,https://metacritic.com/game/playstation-4/cybe...,79,en


### Word inflection (lemmatization) 

In [35]:
# Download packages in order to lemmatize
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

[nltk_data] Downloading package wordnet to /Users/gio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/gio/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [36]:
# Function to lemmatize sentence
def lemmatize_spacy(sentence):
    # Parse the sentence using the loaded 'en' model object `nlp`
    doc = nlp(sentence)
    # Extract the lemma for each token and join
    return " ".join([token.lemma_ for token in doc])

In [37]:
# Lemmatize words
df_user_rev['lemmatized_user_rev'] = df_user_rev['review'].progress_map(lambda text: lemmatize_spacy(text))

100%|██████████| 9356/9356 [01:52<00:00, 83.01it/s] 


In [51]:
# inspect some reviews
print('3 Lemmatized Reviews: \n')
cl = df_user_rev.loc[df_user_rev.polarity == 1, ['lemmatized_user_rev']].sample(3).values
for c in cl:
    print(c[0])

3 Lemmatized Reviews: 

awesome gameplay , awesome graphic , awesome story . run like a dream on the Series X
good hhjjjjjjjjjjjjjjjjjjjjjjjjjiuijujjjjjijiuujjuuuujjjjjijjjjjjjjjjjjjjuujjjjjjjjjjjj
one of the good game of all time


### Blob Polarity Analys

In [38]:
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#Create a function to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [39]:
df_user_rev['polarity'] = df_user_rev['review'].map(lambda text: TextBlob(text).sentiment.polarity)

Positive Reviews

In [40]:
print('3 random reviews with the highest positive sentiment polarity: \n')
cl = df_user_rev.loc[df_user_rev.polarity == 1, ['review']].sample(3).values
for c in cl:
    print(c[0])

3 random reviews with the highest positive sentiment polarity: 

series x, good!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
The beginning was a bit "meh" but when you dive into it, it is one of the best games
One of the Greatest Games Ever Made. The storyline is masterpiece, wish it were longer, can't wait for expansions.


Neutral Reviews

In [42]:
print('3 random neutral reviews: \n')
cl = df_user_rev.loc[df_user_rev.polarity == 0, ['review']].sample(3).values
for c in cl:
    print(c[0])

3 random neutral reviews: 

When only money matters this company a lie to us  bug bug bug bug bug bug bug
Trash! Freeze, framedrop, bug, low budget , prev-prevgenNo refund on Playstation.
Well yes there are Bugs, but we had worser games. And Version 1.04 ist in the way!


Negative Reviews

In [43]:
print('3 random negative reviews: \n')
cl = df_user_rev.loc[df_user_rev.polarity == -1, ['review']].sample(3).values
for c in cl:
    print(c[0])

3 random negative reviews: 

Absolutely disgusting, they knew what they were doing and they released it as an alpha build so they could get the december holiday cash grab no matter the repercussions
Ps4 error window every 30 minutes. Terrible product, it needs problably a couple of years to be playable
Awful performance, awful appearance and a lot of bugs and crashes.Refund 2077


Distribution of Polarity Score

In [44]:
#The distribution of review sentiment polarity score

df_user_rev['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')

### Distribution of Review Ratings

In [45]:
df_user_rev['user_score'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Review Rating Distribution')

## [TODO] Blob Subjectivity Analysis
Subjectivity quantifies the amount of personal opinion and factual information contained in the text. The higher subjectivity means that the text contains personal opinion rather than factual information

## N-Grams Distribution

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

### 1-Gram Analysis

In [52]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
    
common_words = get_top_n_words(df_user_rev['lemmatized_user_rev'], 40)
# for word, freq in common_words:
#     print(word, freq)
df2 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in review after removing stop words')


### 2-Grams Analysis

In [53]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_bigram(df_user_rev['lemmatized_user_rev'], 20)

# for word, freq in common_words:
#     print(word, freq)
df4 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in review after removing stop words')


### 3-Grams Analysis

In [54]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_trigram(df_user_rev['lemmatized_user_rev'], 20)

# for word, freq in common_words:
#     print(word, freq)
df6 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in review after removing stop words')

## Part-Of-Speech Tagging (POS) Analysis

In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gio/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
blob = TextBlob(str(df_user_rev['lemmatized_user_rev']))
pos_df = pd.DataFrame(blob.tags, columns = ['word' , 'pos'])
pos_df = pos_df.pos.value_counts()[:20]
pos_df.iplot(
    kind='bar',
    xTitle='POS',
    yTitle='count', 
    title='Top 20 Part-of-speech tagging for review corpus')

## Slicing Based on occurency of "gameplay"

In [80]:
# Uncomment if you like to visualize how the regex is working

# df_test = df_user_rev[df_user_rev['title'] == "Cyberpunk 2077" ].head(1)

# print(df_test['lemmatized_user_rev'].values)

# df_test[df_test["lemmatized_user_rev"].str.contains(r"\bdate simulators\b")]


['this be not fun open world game like GTA 4/5 or Watch Dogs , no sir . this be date simulator for people who have never speak with woman before . lifeless city , boring story and soulless character . not recommend this game to anyone .']


Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page,word_count,language_review,lemmatized_user_rev,polarity


In [82]:
df_user_rev = df_user_rev[df_user_rev["lemmatized_user_rev"].str.contains(r"\bgameplay\b")]
df_user_rev

Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page,word_count,language_review,lemmatized_user_rev,polarity
33598,33598,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,8,Awanka,This review is for the 1.5 next gen version th...,"Apr 29, 2022",user,https://metacritic.com/game/playstation-5/cybe...,386,en,this review be for the 1.5 next gen version th...,0.127308
33603,33603,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,7,HaloFever,Cyberpunk 2077 is a living example of failure ...,"Apr 29, 2022",user,https://metacritic.com/game/playstation-5/cybe...,779,en,cyberpunk 2077 be a live example of failure in...,-0.008383
33604,33604,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,8,80sChild,"After the ""Next Gen"" update the game is finall...","Mar 11, 2022",user,https://metacritic.com/game/playstation-5/cybe...,152,en,"after the "" Next Gen "" update the game be fina...",0.168182
33608,33608,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,10,Samyar91,"Well..just I have to say,I’m glad that I didn’...","Jun 14, 2022",user,https://metacritic.com/game/playstation-5/cybe...,162,en,"well .. just I have to say , I’m glad that I d...",0.028646
33610,33610,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,9,Entertainer,"Aw man, just finished C2077 on PS5... goosebum...","Mar 15, 2022",user,https://metacritic.com/game/playstation-5/cybe...,139,en,"aw man , just finish C2077 on PS5 ... goosebum...",0.210083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199055,199055,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,shadow_arc,Literally doesn't work. Shambles of a release ...,"Jan 1, 2021",user,https://metacritic.com/game/playstation-4/cybe...,31,en,literally do not work . shamble of a release a...,0.800000
199064,199064,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,1,monti86,"Looks like a scam from CDPR, they said that it...","Dec 11, 2020",user,https://metacritic.com/game/playstation-4/cybe...,24,en,"look like a scam from CDPR , they say that it ...",-1.000000
199067,199067,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,ViliusRepublic,"Gameplay and story is good, it's getting bette...","Dec 11, 2020",user,https://metacritic.com/game/playstation-4/cybe...,16,en,"gameplay and story be good , it be get well ev...",0.566667
199088,199088,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,4,juanrga912,DO NOT play this game on the base PS4. I playe...,"Dec 16, 2020",user,https://metacritic.com/game/playstation-4/cybe...,347,en,do not play this game on the base PS4 . I play...,-0.018639


## Game Aesthetics Analysis

### Adjectives pre-nominal modifiers to "gameplay" 
e.g. "smooth gameplay"

In [46]:
my_dict = {}

my_dict['giovanin'] = 2

my_dict['gabriel'] = 2

my_dict['gabriel'] += 2

my_dict


{'giovanin': 2, 'gabriel': 4}

In [None]:
{'gameplay' : {'smooth' : 2}, {'fast' : 2}}

In [50]:
# pass a text and the noun to be a
def get_adj_modifiers(text, word):
    doc = nlp(text) #pass in to a spacy doc object
    for chunk in doc.noun_chunks:
        adj = []
        noun = ""
        for tok in chunk:
            if tok.pos_ == "NOUN":
                if tok.text not in noun_adj_pairs: # if noun (e.g. not in dictionary keys)
                    noun_adj_pairs[tok.text] = 1 #create key for the NOUN
                else:
                     noun_adj_pairs[tok.pos] += 1
            if tok.pos_ == "ADJ":
                adj.append(tok.text)
        # if noun:
        #     noun_adj_pairs.update({noun:adj})
    return noun_adj_pairs

# Manual testing

#Load spacy loader #pass in to a spacy doc object
nlp = spacy.load('en_core_web_sm')

# get pair of {NOUN : {ADJ:count}}
noun_adj_pairs = {} 

In [51]:
phrases_text_adj = ["fast gameplay. bad character", "smooth gameplay"]

print(get_adj_modifiers(phrases_text_adj[0], 'gampeplay'))
# print(get_adj_modifiers(phrases_text_adj[1], 'gampeplay'))

{'gameplay': 1, 'character': 1}


Experiementing Adjectives extracting
* Extract all the adjectives which wereused as a pre-nominal modifier to “gameplay”
* Adjectival complement of “gameplay” (e.g. “gameplay was smooth”)

In [109]:
df_user_rev[df_user_rev["lemmatized_user_rev"].str.contains(r"\bsmooth gameplay\b")]

print(df_user_rev[df_user_rev['Unnamed: 0'] == 121017]['review'].values)

["I don't believe that anyone could give this game a score like 0-3, it deserves objectively 8-9. I play on PC. Played for 100+ hours. No crashes, no bugs (I started playing on 1.06 version). There was one bug with the missing sound from NPC, but it was fixed by restarting the game. the game is REALLY IMMERSIVE, I can't stop playing it even after I finished the main quest and all the main side quests (affecting the storyline). The city is really HUGE. I'm playing on higest settings (i7-10500, gtx1660), so the graphics feels awesome. The big thing I'm missing is that I cannot see myself from 3rd person view (what's the point of customizing appearance). And yes, the driving is horrible. But it's worth to go on foot anyway to see around and build your athletics stat. I downloaded it from torrent first to try it out, but then decided to buy it, because it's absolutely worth it. There will be some bugs, of course, but I play Elder Scrolls Online which is 6 years old, and there are still a l

In [23]:
print(get_adj_modifiers("gameplay was smooth", 'gameplay'))

{'gameplay': []}


In [30]:
# check all tags
doc = nlp("Gameplay was easy and smooth. The characters were not well designed")
for token in doc:
    print(token.text, token.pos_, token.tag_)

displacy.render(doc, style="dep")

Gameplay NOUN NN
was AUX VBD
easy ADJ JJ
and CCONJ CC
smooth ADJ JJ
. PUNCT .
The DET DT
characters NOUN NNS
were AUX VBD
not PART RB
well ADV RB
designed VERB VBN


In [16]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("gameplay was smooth")
for chunk in doc.noun_chunks:
    print(chunk.text)

gameplay


In [4]:
spacy.explain("nmod")

'modifier of nominal'

In [92]:
print('3 random reviews: \n')
cl = df_user_rev.loc[df_user_rev.polarity > -1, ['review']].sample(3).values
for c in cl:    
    print(c[0])

3 random reviews: 

Buggy, and render problems, but story, personal story path options, character development, gameplay, and game mechanics are all very good and outweigh the bugs and rendering issues for me. Those things may take me out of the experience for a short period of time but not too long later I'm sucked right back into the experience, I'm very much enjoying the game. Only issues for me is the bugs and rendering.
After multiple patches and 18 months later, the game still randomly crashes my xbox one x (5+ times and counting). I agree that the game looks good, but the atmosphere is bland. Everything is like window dressing, a background of non interactive buildings and people with nothing to say. Furthermore, the game is bloated, like Ubisoft proportions bloated. You've got 28/29 main missions, 67 side missions, 17 cyberpsycho's, 70 gigs (merc missions) and at least 80 hustles (minor skirmishes). I know people want value for money, but quantity is not the same as quality. Esp

## Text Network Analysis
* visualize many-to-many relationships and their strength
* discover complex structures in the data (such as customers using the same email address to receive specific discounts)

https://towardsdatascience.com/text-network-analysis-generate-beautiful-network-visualisations-a373dbe183ca

### Chord Diagram
https://datavizcatalogue.com/methods/chord_diagram.html

## Gameplay Aesthetics from Game Reviews  
How do players understand, perceive, and talk about gameplay?

1. Extracting all sentences in which the word “gameplay” (and variations such as “gameplays”) appeared.
2. identified the POS of the words and parsed the sentences using a POS tagger and a parser
3. Extract all the adjectives which wereused as a pre-nominal modifier to “gameplay” (e.g. “smooth gameplay”) or as an adjectival complement of “gameplay” (e.g. “gameplay was smooth”)

In [None]:
# 1 Extracting all sentences in which the word “gameplay” (and variations such as “gameplays”) appeared.


# Join DataFrames