# EDA & Data Loading

___
## Games Dataset


In [2]:
# Import libraries

# Data Manipulation
import pandas as pd
import numpy as np
import project_data_utils
import os
from collections import Counter
import statistics

# NLP libraries
from textblob import TextBlob
from textblob import Word
import spacy_fastlang
import spacy
from spacy import displacy
import nltk
import re

from nltk.probability import FreqDist
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher

# Monitoring libraries
from tqdm import tqdm, tqdm_notebook

# Data vizualization Libraries
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [3]:
# Load list of games
df_games = pd.read_csv("data/metacritic_games_full.csv" ,index_col=0)
df_games['user_score'] = pd.to_numeric(df_games['user_score'],errors='coerce')
df_games['metacritic_user_differenc'] = df_games['metascore'] - df_games['user_score']
# Create metascore and user discrepancy and tranform to numeric
df_games.sort_values(by='metacritic_user_differenc')
df_games

Unnamed: 0,title,platform,release_date,other_platforms,metascore,user_score,developer,publisher,genre,players,rating,summary,url,metacritic_user_differenc
0,Persona 5 Royal,pc,"Oct 21, 2022","['PlayStation 4', 'PlayStation 5', 'Switch', '...",97.0,8.9,Atlus,Sega,"['Role-Playing', ', ...",No Online Multiplayer,,Prepare for an all-new RPG experience in Perso...,https://metacritic.com/game/pc/persona-5-royal,88.1
1,Elden Ring,xbox-series-x,"Feb 25, 2022","['PC', 'PlayStation 4', 'PlayStation 5', 'Xbox...",96.0,7.7,From Software,Bandai Namco Games,"['Role-Playing', ', ...",Up to 4,M,A New World Created By Hidetaka Miyazaki And G...,https://metacritic.com/game/xbox-series-x/elde...,88.3
2,Elden Ring,playstation-5,"Feb 25, 2022","['PC', 'PlayStation 4', 'Xbox One', 'Xbox Seri...",96.0,7.9,From Software,Bandai Namco Games,"['Role-Playing', ', ...",Up to 4,M,A New World Created By Hidetaka Miyazaki And G...,https://metacritic.com/game/playstation-5/elde...,88.1
3,Portal Companion Collection,switch,"Jun 28, 2022",,96.0,8.3,Valve Software,Valve Software,"['Miscellaneous', ', ...",,,"Including Portal and Portal 2, the Companion C...",https://metacritic.com/game/switch/portal-comp...,87.7
4,Persona 5 Royal,xbox-series-x,"Oct 21, 2022","['PC', 'PlayStation 4', 'PlayStation 5', 'Swit...",95.0,8.4,Atlus,Sega,"['Role-Playing', ', ...",No Online Multiplayer,M,Prepare for an all-new RPG experience in Perso...,https://metacritic.com/game/xbox-series-x/pers...,86.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19634,Resident Evil: Survivor,playstation,"Aug 30, 2000",,39.0,6.2,Capcom,Capcom,"['Action Adventure', ', ...",1 Player,M,At last a first-person shooting gamer based in...,https://metacritic.com/game/playstation/reside...,32.8
19635,ECW Anarchy Rulz,dreamcast,"Dec 30, 2000",,38.0,6.8,Acclaim Studios Salt Lake City,Acclaim,"['Action', ', ...",,M,Anarchy in the USA!\r *All new control system!...,https://metacritic.com/game/dreamcast/ecw-anar...,31.2
19636,Duke Nukem: Land of the Babes,playstation,"Sep 19, 2000",,37.0,6.9,n-Space,GT Interactive,"['Action', ', ...",1-2,M,I am basing this review on the four votes tha...,https://metacritic.com/game/playstation/duke-n...,30.1
19637,Mortal Kombat: Special Forces,playstation,"Jun 30, 2000",,28.0,2.8,Midway,Midway,"['Action', ', ...",1 Player,M,Игра унылейшая. В свое время купил её думая чт...,https://metacritic.com/game/playstation/mortal...,25.2


In [4]:
# Check dtypes
df_games.dtypes

title                         object
platform                      object
release_date                  object
other_platforms               object
metascore                    float64
user_score                   float64
developer                     object
publisher                     object
genre                         object
players                       object
rating                        object
summary                       object
url                           object
metacritic_user_differenc    float64
dtype: object

## User Reviews Dataset

In [5]:
df_user_rev = pd.read_csv("data/metacritic_user_reviews_full.csv")
df_user_rev_full = df_user_rev # full copy
df_user_rev.head()

Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page
0,0,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,10,Trix122,"\n100+ h of main story , good gameplay and pro...","Oct 25, 2022",user,https://metacritic.com/game/pc/persona-5-royal...
1,1,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,7,Runwin,El juego tiene una cantidad de relleno abrumad...,"Oct 26, 2022",user,https://metacritic.com/game/pc/persona-5-royal...
2,2,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,10,Godskrieg,\nAbsolute god of recent JRPGs.\rIf you like t...,"Oct 24, 2022",user,https://metacritic.com/game/pc/persona-5-royal...
3,3,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,10,jackspade152,\nSimply amazing. I finished this Game on ps4 ...,"Oct 25, 2022",user,https://metacritic.com/game/pc/persona-5-royal...
4,4,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,10,MLGANDREWPLAYS,People have been waiting for a long time for t...,"Oct 25, 2022",user,https://metacritic.com/game/pc/persona-5-royal...


### Title aggreagted data
Remove platforma granularity.

In [6]:
# Group by title so we remove the platform granularity
df_user_rev.groupby(['title']).size().sort_values()

# Group by and perform and a series of operations
df_user_rev = df_user_rev.groupby(['title']).agg({'url':'count', 'user_score':'mean', 'reviewer':'count'}).sort_values(by='url')
df_user_rev = df_user_rev.rename(columns={'url':'num_user_reviews', 'user_score': 'avg_user_score', 'reviewer':'num_user_reviewers'})
df_user_rev.sort_values(by='num_user_reviews', ascending=False)

Unnamed: 0_level_0,num_user_reviews,avg_user_score,num_user_reviewers
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cyberpunk 2077,12058,5.056394,12058
Warcraft III: Reforged,10100,0.070198,10100
The Last of Us Part II,10100,4.112376,10100
Ghost of Tsushima,9505,9.289637,9505
Death Stranding,8764,7.219763,8764
...,...,...,...
Trulon: The Shadow Engine,1,6.000000,1
Tumblestone,1,0.000000,1
Fallen Legion: Revenants,1,9.000000,1
Oblitus,1,4.000000,1


## Critic Reviews Dataset

In [7]:
df_critic_rev = pd.read_csv("data/metacritic_critic_reviews_full.csv")
df_critic_rev_full = df_critic_rev #full copy
df_critic_rev.head()

Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page
0,0,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,99,Game Rant,From its captivating music and memorable story...,"Oct 17, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...
1,1,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,97,God is a Geek,Persona 5 Royal is quite frankly one of the be...,"Oct 17, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...
2,2,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,95,The Mako Reactor,Persona 5 Royal was already the best version o...,"Oct 17, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...
3,3,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,95,Hooked Gamers,Persona 5 Royal is chock full of amazing music...,"Oct 28, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...
4,4,https://metacritic.com/game/pc/persona-5-royal,Persona 5 Royal,pc,94,Hobby Consolas,"If you love RPGs and didn't play it on PS4, do...","Oct 17, 2022",critic,https://metacritic.com/game/pc/persona-5-royal...


In [8]:
df_critic_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433384 entries, 0 to 433383
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   433384 non-null  int64 
 1   url          433384 non-null  object
 2   title        433384 non-null  object
 3   platform     433384 non-null  object
 4   user_score   433384 non-null  int64 
 5   reviewer     433384 non-null  object
 6   review       433362 non-null  object
 7   date         244119 non-null  object
 8   review_type  433384 non-null  object
 9   url_page     433384 non-null  object
dtypes: int64(2), object(8)
memory usage: 33.1+ MB


### Title Aggregated Data
Remove platforma granularity.

In [9]:
# Group by title so we remove the platform granularity
df_critic_rev.groupby(['title']).size().sort_values()

df_critic_rev = df_critic_rev.groupby(['title']).agg({'url':'count', 'user_score':'mean', 'reviewer':'count'}).sort_values(by='url')
df_critic_rev = df_critic_rev.rename(columns={'url':'num_critic_reviews', 'user_score': 'avg_critic_score', 'reviewer':'num_critic_reviewers'})

In [10]:
df_critic_rev

Unnamed: 0_level_0,num_critic_reviews,avg_critic_score,num_critic_reviewers
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EyePet: Move Edition,1,91.000000,1
Kingdom Hearts 358/2 Days,1,90.000000,1
BioShock 2: Minerva's Den,1,91.000000,1
DCS: Black Shark,1,94.000000,1
Tournament of Legends,1,80.000000,1
...,...,...,...
Madden NFL 07,224,80.156250,224
X-Men: The Official Game,224,52.843750,224
Marvel: Ultimate Alliance,229,80.371179,229
Resident Evil 5,231,81.155844,231


## Join on the Games DF the aggregate of User and Critic Reviews

In [11]:
# join on user_rev
df = df_games.merge(df_user_rev, on='title', how='left', indicator=True)
df = df.rename(columns={'_merge': 'merge_indicator_user_revs'})

In [12]:
# join on critic_rev
df = df.merge(df_critic_rev, on='title', how='left', indicator=True)
df = df.rename(columns={'_merge': 'merge_indicator_critic_revs'})
df

Unnamed: 0,title,platform,release_date,other_platforms,metascore,user_score,developer,publisher,genre,players,...,url,metacritic_user_differenc,num_user_reviews,avg_user_score,num_user_reviewers,merge_indicator_user_revs,num_critic_reviews,avg_critic_score,num_critic_reviewers,merge_indicator_critic_revs
0,Persona 5 Royal,pc,"Oct 21, 2022","['PlayStation 4', 'PlayStation 5', 'Switch', '...",97.0,8.9,Atlus,Sega,"['Role-Playing', ', ...",No Online Multiplayer,...,https://metacritic.com/game/pc/persona-5-royal,88.1,973.0,8.861254,973.0,both,70.0,91.314286,70.0,both
1,Elden Ring,xbox-series-x,"Feb 25, 2022","['PC', 'PlayStation 4', 'PlayStation 5', 'Xbox...",96.0,7.7,From Software,Bandai Namco Games,"['Role-Playing', ', ...",Up to 4,...,https://metacritic.com/game/xbox-series-x/elde...,88.3,7070.0,7.438755,7070.0,both,89.0,90.988764,89.0,both
2,Elden Ring,playstation-5,"Feb 25, 2022","['PC', 'PlayStation 4', 'Xbox One', 'Xbox Seri...",96.0,7.9,From Software,Bandai Namco Games,"['Role-Playing', ', ...",Up to 4,...,https://metacritic.com/game/playstation-5/elde...,88.1,7070.0,7.438755,7070.0,both,89.0,90.988764,89.0,both
3,Portal Companion Collection,switch,"Jun 28, 2022",,96.0,8.3,Valve Software,Valve Software,"['Miscellaneous', ', ...",,...,https://metacritic.com/game/switch/portal-comp...,87.7,28.0,7.250000,28.0,both,5.0,93.000000,5.0,both
4,Persona 5 Royal,xbox-series-x,"Oct 21, 2022","['PC', 'PlayStation 4', 'PlayStation 5', 'Swit...",95.0,8.4,Atlus,Sega,"['Role-Playing', ', ...",No Online Multiplayer,...,https://metacritic.com/game/xbox-series-x/pers...,86.6,973.0,8.861254,973.0,both,70.0,91.314286,70.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19634,Resident Evil: Survivor,playstation,"Aug 30, 2000",,39.0,6.2,Capcom,Capcom,"['Action Adventure', ', ...",1 Player,...,https://metacritic.com/game/playstation/reside...,32.8,,,,left_only,10.0,47.700000,10.0,both
19635,ECW Anarchy Rulz,dreamcast,"Dec 30, 2000",,38.0,6.8,Acclaim Studios Salt Lake City,Acclaim,"['Action', ', ...",,...,https://metacritic.com/game/dreamcast/ecw-anar...,31.2,,,,left_only,15.0,40.133333,15.0,both
19636,Duke Nukem: Land of the Babes,playstation,"Sep 19, 2000",,37.0,6.9,n-Space,GT Interactive,"['Action', ', ...",1-2,...,https://metacritic.com/game/playstation/duke-n...,30.1,,,,left_only,7.0,43.285714,7.0,both
19637,Mortal Kombat: Special Forces,playstation,"Jun 30, 2000",,28.0,2.8,Midway,Midway,"['Action', ', ...",1 Player,...,https://metacritic.com/game/playstation/mortal...,25.2,,,,left_only,7.0,31.000000,7.0,both


In [13]:
print('before droppping rows with null reviews: ', df.shape)
print('after droppping rows with null reviews: ', df.dropna(subset=['num_user_reviews']).shape)

# drop games without user reviews
df = df.dropna(subset=['num_user_reviews'])

before droppping rows with null reviews:  (19639, 22)
after droppping rows with null reviews:  (10751, 22)


# NLP

### Define Games to be Analysized
We're not analysing all of the games, either because some of them does not have a relevant number of reviews or because it'd be too computationnally expensive to do so.

In [14]:
# Get rid of the plataform granularity
# !Change to include averages
df = df.groupby('title').first()
df = df.sort_values(by='num_user_reviews', ascending=False).head(1) # CHANGE HERE IN ORDER TO SELECT MORE GAMES

In [15]:
# Get the top N games and pass to a list
lst_games = df.index.tolist()
print(lst_games)
print("Number of games to be analyzed: ", len(lst_games))

['Cyberpunk 2077']
Number of games to be analyzed:  1


In [16]:
# Select only a subset of the reviews based on the lst_games
df_user_rev = df_user_rev_full[df_user_rev_full['title'].isin(lst_games)]
print('Number of reviews: ' ,df_user_rev.shape)

Number of reviews:  (12058, 10)


### Statistical Summary Before Pre-Processing

#### [TODO] Statistical Summary of the NLP's Data Frame

#### Number of Words per Review

In [17]:
# Create word Count 
df_user_rev['word_count'] = df_user_rev['review'].apply(lambda x: len(str(x).split()))

df_user_rev['word_count'].iplot(
    kind='hist',
    bins=300,
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Review Text Length Distribution',
    )

## Pre-process
Perfom all the pre-processing tasks on the review text.

### Clean Text
Since most of the text is already cleaned from the scrapper process, here very little has to be done

In [18]:
# Function to clean the text
def preprocess(text):
    text = text.replace('\n', "")
    text = text.replace('\r', "")
    text = re.sub(' +', ' ', text) #remove multiple spaces on the text
    return text

In [19]:
# Apply preprocess fuction
tqdm.pandas()
df_user_rev['review'] = df_user_rev['review'].progress_apply(lambda x: preprocess(x))

100%|██████████| 12058/12058 [00:00<00:00, 22225.34it/s]


In [21]:
#inspect the text
print('3 random review \n')
cl = df_user_rev.sample(3)['review'].values
for c in cl:
    print(c)

3 random review 

Overhyped past-gen unoptimized poop < that's what this game is now.Gameplay is Normal, nothing that you can't see in other game, RPG ? < did not find any, there are some RPG elements, but they mostly suck, very banal, Characters are normal, nothing special and nothing good, Story is OKish, not best not worst, BUT - still can't understand how this past-gen graphics require so much resources to show somehow acceptable FPS on PC, how it is possible, that Korean MMORPG from 2012 looks more Next-gen, than this 2020 Next-gen game, which requires a lot of resources to have somehow OKish FPS ?Feels like a downgraded version of Saints Row 3..... Looks worse, than modern Korean Pay to Win MMORPGs, Story is banal, characters are banal, world is interesting for a while, but quite empty and repetitive, gameplay is banal and works like a Bitcoin Miner.... Also CDPR lied a lot about this game, enough time to say that this game is a Fraud. P.S. - Music is really good but sound effect

### Create column with Spacy Objects for each Review
After a basic cleaning of the text, we create a column with the spacy object DOC for each review

In [22]:
# Load neccessary libraries in other to detect the language and perform other operations
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("language_detector")



<spacy_fastlang.LanguageDetector at 0x7fea2f415110>

In [24]:
df_user_rev['review_spacy_obj'] = df_user_rev['review'].progress_apply(lambda x: nlp(x))

100%|██████████| 12058/12058 [05:30<00:00, 36.44it/s]


In [35]:
print("The column review_spacy_obj is of the:" , type(df_user_rev.at[33592, 'review_spacy_obj']))
# With this column we can apply whichever methods and operations spacy's allow us.

The column review_spacy_obj is of the: <class 'spacy.tokens.doc.Doc'>


In [43]:
# Uncomment to inspect how spacy objects behave

# doc = df_user_rev.at[33592, 'review_spacy_obj']

# print(" ".join([token.text for token in doc]))

# print(" ".join([token.lemma_ for token in doc]))
# print(type(" ".join([token.lemma_ for token in doc])))

This is not fun open world game like GTA 4/5 or Watch Dogs , no sir . This is dating simulator for people who have never spoken with women before . Lifeless city , boring story and soulless characters . Not recommending this game to anyone .
this be not fun open world game like GTA 4/5 or Watch Dogs , no sir . this be date simulator for people who have never speak with woman before . lifeless city , boring story and soulless character . not recommend this game to anyone .
<class 'str'>


### Remove non Enlgish Reviews

Samping the data we noticed that the dataset contained reviews in multiple languagues. In this section, we'll investitage this.

In [None]:
# Create column with language
df_user_rev['language'] =  df_user_rev['review_spacy_obj'].progress_apply(lambda doc: doc._.language)

100%|██████████| 12058/12058 [00:00<00:00, 143310.29it/s]


In [45]:
print("Before filtering out non-english reviews: ", len(df_user_rev['language']))
# select languages with only English Reviews
df_user_rev = df_user_rev[df_user_rev['language'] == 'en']
print("Only english reviews: ", len(df_user_rev['language']))
df_user_rev.sample(5)

Before filtering out non-english reviews:  9538
Only english reviews:  9538


Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page,word_count,review_spacy_obj,language
119487,119487,https://metacritic.com/game/pc/cyberpunk-2077,Cyberpunk 2077,pc,0,erv9,"I feel really sorry for all, who can not enjoy...","Dec 12, 2020",user,https://metacritic.com/game/pc/cyberpunk-2077/...,121,"(I, feel, really, sorry, for, all, ,, who, can...",en
198338,198338,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,twrm5555,This game is unplayable and should not have be...,"Dec 11, 2020",user,https://metacritic.com/game/playstation-4/cybe...,36,"(This, game, is, unplayable, and, should, not,...",en
195539,195539,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,cricetus,"on ps4, the game is not optimized at all and w...","Dec 11, 2020",user,https://metacritic.com/game/playstation-4/cybe...,27,"(on, ps4, ,, the, game, is, not, optimized, at...",en
123950,123950,https://metacritic.com/game/pc/cyberpunk-2077,Cyberpunk 2077,pc,0,MightyG,I give 0 points to counter these bought 10/10 ...,"Jan 2, 2021",user,https://metacritic.com/game/pc/cyberpunk-2077/...,134,"(I, give, 0, points, to, counter, these, bough...",en
198462,198462,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,Milliudes,Thank you so much for the politics in the vide...,"Mar 4, 2022",user,https://metacritic.com/game/playstation-4/cybe...,64,"(Thank, you, so, much, for, the, politics, in,...",en


### Lemmatization
Since we already have the spacy doc obj we simply perform an interation over the token and return it the text verion of the lemma 

In [46]:
# Download packages in order to lemmatize
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/gio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/gio/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [48]:
# Function to lemmatize sentence
def lemmatize_spacy(doc):
    return " ".join([token.lemma_ for token in doc])

In [49]:
# Lemmatize words
df_user_rev['lemmatized_user_rev'] = df_user_rev['review_spacy_obj'].progress_map(lambda doc: lemmatize_spacy(doc))

100%|██████████| 9538/9538 [00:01<00:00, 9145.99it/s] 


In [52]:
# inspect some reviews
print('3 Lemmatized Reviews: \n')
cl = df_user_rev.loc[df_user_rev.user_score > 0, ['lemmatized_user_rev']].sample(3).values
for c in cl:
    print(c[0])

3 Lemmatized Reviews: 

30 hour into the game . the game be very fun to play . some glitch and bug but nothing game breaking ( most just need to reload a previous autosave ) . however I think the over - marketing before the release of the game be seriously hurt the review of this game . free - roam be basically non - existent as nearly all content be in the quest . for example , a building will only be occupy if you accept a certain quest . so while I do like this game and can recommend it without hesitation , I do not think I will do multiple playthrough unless big change be make .
the game be a masterpiece . one of the good game ever . everything be fine ( non - critical bug ) . it be a pity that the release of the game turn into a scandal due to the lack of the game at the time of release , the creator be as close to incredible success as possible . in any case , such game be rarely make
I be about 15 hour into the game so far . the game be good , not great , do not live up to the h

## Slicing Based on occurency of "gameplay"

In [None]:
# Uncomment if you like to visualize how the regex is working

# df_test = df_user_rev[df_user_rev['title'] == "Cyberpunk 2077" ].head(1)
# print(df_test['lemmatized_user_rev'].values)
# df_test[df_test["lemmatized_user_rev"].str.contains(r"\bdate simulators\b")]


In [53]:
# Select only reviews that contain the word "gameplay"
df_user_rev = df_user_rev[df_user_rev["lemmatized_user_rev"].str.contains(r"\bgameplay\b")]
df_user_rev

Unnamed: 0.1,Unnamed: 0,url,title,platform,user_score,reviewer,review,date,review_type,url_page,word_count,review_spacy_obj,language,lemmatized_user_rev
33598,33598,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,8,Awanka,This review is for the 1.5 next gen version th...,"Apr 29, 2022",user,https://metacritic.com/game/playstation-5/cybe...,386,"(This, review, is, for, the, 1.5, next, gen, v...",en,this review be for the 1.5 next gen version th...
33603,33603,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,7,HaloFever,Cyberpunk 2077 is a living example of failure ...,"Apr 29, 2022",user,https://metacritic.com/game/playstation-5/cybe...,779,"(Cyberpunk, 2077, is, a, living, example, of, ...",en,cyberpunk 2077 be a live example of failure in...
33604,33604,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,8,80sChild,"After the ""Next Gen"" update the game is finall...","Mar 11, 2022",user,https://metacritic.com/game/playstation-5/cybe...,152,"(After, the, "", Next, Gen, "", update, the, gam...",en,"after the "" Next Gen "" update the game be fina..."
33608,33608,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,10,Samyar91,"Well..just I have to say,I’m glad that I didn’...","Jun 14, 2022",user,https://metacritic.com/game/playstation-5/cybe...,162,"(Well, .., just, I, have, to, say, ,, I’m, gla...",en,"well .. just I have to say , I’m glad that I d..."
33610,33610,https://metacritic.com/game/playstation-5/cybe...,Cyberpunk 2077,playstation-5,9,Entertainer,"Aw man, just finished C2077 on PS5... goosebum...","Mar 15, 2022",user,https://metacritic.com/game/playstation-5/cybe...,139,"(Aw, man, ,, just, finished, C2077, on, PS5, ....",en,"aw man , just finish C2077 on PS5 ... goosebum..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199055,199055,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,shadow_arc,Literally doesn't work. Shambles of a release ...,"Jan 1, 2021",user,https://metacritic.com/game/playstation-4/cybe...,31,"(Literally, does, n't, work, ., Shambles, of, ...",en,literally do not work . shamble of a release a...
199064,199064,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,1,monti86,"Looks like a scam from CDPR, they said that it...","Dec 11, 2020",user,https://metacritic.com/game/playstation-4/cybe...,24,"(Looks, like, a, scam, from, CDPR, ,, they, sa...",en,"look like a scam from CDPR , they say that it ..."
199067,199067,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,0,ViliusRepublic,"Gameplay and story is good, it's getting bette...","Dec 11, 2020",user,https://metacritic.com/game/playstation-4/cybe...,16,"(Gameplay, and, story, is, good, ,, it, 's, ge...",en,"gameplay and story be good , it be get well ev..."
199088,199088,https://metacritic.com/game/playstation-4/cybe...,Cyberpunk 2077,playstation-4,4,juanrga912,DO NOT play this game on the base PS4. I playe...,"Dec 16, 2020",user,https://metacritic.com/game/playstation-4/cybe...,347,"(DO, NOT, play, this, game, on, the, base, PS4...",en,do not play this game on the base PS4 . I play...


In [54]:
cl = df_user_rev.loc[df_user_rev.user_score > -100, ['lemmatized_user_rev']].sample(3).values
for c in cl:
    print(c[0])

the bad " game " I have ever play . you can not see anything , you feel like you ’re go to throw up or have a seizure , and the AI be legitimately the bad in any game I ’ve ever see . they spend all this time on customize your penis and pubic hair , and hide * * * * all over the map ( not joke about either of these ) but decide the gameplay be not important at all . I request a refund and you should too . what an absolute disgrace of a game . and what ’ bad be how they deliberately try to hide all these issue , even prevent people from show video of the gameplay . shame on you .
the user score be highly misleading , due to their be a lot of piss off people right now , because of the bug . this game be a masterpeice and CD Project Red have clearly put a huge amount of effort into this . the problem with this game be that it push the xbox 1 to the limit of its capability . it still look good on xbox 1 but there be a night and day difference , between how it look on the xbox 1 and the ser

## Game Aesthetics Analysis

**Goal** is to get the top words that describe certain important features of a game (e.g. gameplay, story and so on):

**How:** Count the adjectives that modify gameplay for each game. Then we'll have:
Cyber Punk (smooth: 30, fast:24, tedious: 20)

**Usage:** Foreseeing a business application, we can use this feature to probide a more detaile description of the game "The gameplay is best described by the users as "Smooth, fast, and tedious"

**Glossay**
* amod -> adjectival modifier (it can modify noun or verb)
* conj -> conjunction

In [None]:
spacy.explain("conj")

### Adjectival modifiers to "gameplay" 
e.g. "smooth gameplay", "fast gameplay"

In [134]:
# updates a global variable adj_count, which holds a count of every adjective for each noun
def count_adjs(doc, matcher):
    matches = matcher(doc) #Pass the doc into a matcher object

    # Load teh count for each adjective in a dictionary
    for match_id, start, end in matches:
        
        # Depending on the rule, the adj and noun are placed different in the text span returned by spacy
        # case 1 when rule is "noun + lemma = be + adj"
        if nlp.vocab.strings[match_id] == "noun_lemma_adj":
            # "gameplay was smooth"
            noun_modified = doc[start].lower_ #get the lower case of the noun
            adj_modifier = doc[end-1].lower_ #get the lower case fo the adj

        # case 2 when the rule is "adj_noun"
        if nlp.vocab.strings[match_id] == "adj_noun":
            # "gameplay was smooth"
            adj_modifier = doc[start].lemma_.lower() #get the lower case of the adj
            noun_modified = doc[end-1].lemma_.lower() #get the lower case fo the noun

        # UPDATE DICTIONARY
        # Add noun if don't exist
        if noun_modified not in adj_count.keys():
            adj_count[noun_modified] = {}
        else:
            pass
        
        # Add count of a specific adjective for a specifi noun
        if adj_modifier in adj_count[noun_modified].keys():
            adj_count[noun_modified][adj_modifier] += 1
        else:
            adj_count[noun_modified][adj_modifier] = 1

In [135]:
# DEFINE MATCHER AND PATTERNS
matcher = Matcher(nlp.vocab)

# this pattern will match for exemple: "smooth gameplay", "tedious characters"
pattern_adj_noun = [
        {'POS': {'IN': ['ADJ']}}, 
        {'POS': {'IN': ['NOUN']}}
        ]# IN is used instead of adding all the patterns mannually.

# this pattern will match e.g."gameplay was smooth", "gameplay is smooth", which based on our EDA is a very commom form of describing the words
pattern_noun_lemma_adj = [
        {'POS': {'IN': ['NOUN']}}, 
        {'LEMMA': {'IN': ['be']}},
        {'POS': {'IN': ['ADJ']}},
]

matcher.add('adj_noun', [pattern_adj_noun])
matcher.add('noun_lemma_adj', [pattern_noun_lemma_adj])

In [140]:
# test matcher
doc = nlp("The smooth gameplays")
matches = matcher(doc)

adj_count = {}
count_adjs(doc, matcher=matcher)
adj_count

{'gameplay': {'smooth': 1}}

In [137]:
# iterate over the reviews and uptdate the global variable "adj_count"
adj_count = {}

df_user_rev['review_spacy_obj'].progress_map(lambda doc: count_adjs(doc=doc, matcher=matcher))

100%|██████████| 1208/1208 [00:00<00:00, 1518.06it/s]


33598     None
33603     None
33604     None
33608     None
33610     None
          ... 
199055    None
199064    None
199067    None
199088    None
199105    None
Name: review_spacy_obj, Length: 1208, dtype: object

In [138]:
print(pd.DataFrame(adj_count['gameplay'], index=['count']).T.sort_values(by='count', ascending=False))
print()
print(pd.DataFrame(adj_count['gameplay'], index=['count']).T.sort_values(by='count', ascending=False).sum())

             count
great           38
fun             34
good            29
actual          10
bad             10
...            ...
huge             1
unpolished       1
stunning         1
beautiful        1
interactive      1

[157 rows x 1 columns]

count    415
dtype: int64


#### Normalization
1. lemmatize adjectives and aggregate counts (e.g. best and bests, Great and great)
2. lemmatize and aggregate adjective count dictionaries (e.g. graphic and graphics)

In [None]:
# pass a text and the noun to be a
def get_adj_modifiers(text, word):
    doc = nlp(text) #pass in to a spacy doc object
    for chunk in doc.noun_chunks:
        adj = []
        noun = ""
        for tok in chunk:
            if tok.pos_ == "NOUN":
                if tok.text not in noun_adj_pairs: # if noun (e.g. not in dictionary keys)
                    noun_adj_pairs[tok.text] = 1 #create key for the NOUN
                else:
                     noun_adj_pairs[tok.pos] += 1
            if tok.pos_ == "ADJ":
                adj.append(tok.text)
        if noun:
            noun_adj_pairs.update({noun:adj})
    return noun_adj_pairs

# Manual testing

#Load spacy loader #pass in to a spacy doc object
nlp = spacy.load('en_core_web_sm')

# get pair of {NOUN : {ADJ:count}}
noun_adj_pairs = {} 

In [None]:
phrases_text_adj = "fast gameplay. bad character smooth gameplay"

print(get_adj_modifiers(phrases_text_adj, 'gampeplay'))
# print(get_adj_modifiers(phrases_text_adj[1], 'gampeplay'))

Experiementing Adjectives extracting
* Extract all the adjectives which wereused as a pre-nominal modifier to “gameplay”
* Adjectival complement of “gameplay” (e.g. “gameplay was smooth”)

In [None]:
df_user_rev[df_user_rev["lemmatized_user_rev"].str.contains(r"\bsmooth gameplay\b")]

print(df_user_rev[df_user_rev['Unnamed: 0'] == 121017]['review'].values)

In [None]:
print(get_adj_modifiers("gameplay was smooth", 'gameplay'))

In [None]:
# check all tags
doc = nlp("Gameplay is also pretty damn good")
for token in doc.noun_chunks:
    print(token.text)

displacy.render(doc, style="dep")

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("gameplay was smooth")
for chunk in doc.noun_chunks:
    print(chunk.text)

In [None]:
spacy.explain("attr")

In [None]:
print('3 random reviews: \n')
cl = df_user_rev.loc[df_user_rev.polarity > -1, ['review']].sample(3).values
for c in cl:    
    print(c[0])

## Gameplay Aesthetics from Game Reviews  
How do players understand, perceive, and talk about gameplay?

1. Extracting all sentences in which the word “gameplay” (and variations such as “gameplays”) appeared.
2. identified the POS of the words and parsed the sentences using a POS tagger and a parser
3. Extract all the adjectives which wereused as a pre-nominal modifier to “gameplay” (e.g. “smooth gameplay”) or as an adjectival complement of “gameplay” (e.g. “gameplay was smooth”)

In [None]:
# 1 Extracting all sentences in which the word “gameplay” (and variations such as “gameplays”) appeared.


# Sentiment Analysis

### Blob Polarity Analys

In [None]:
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#Create a function to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [None]:
df_user_rev['polarity'] = df_user_rev['review'].map(lambda text: TextBlob(text).sentiment.polarity)

Positive Reviews

In [None]:
print('3 random reviews with the highest positive sentiment polarity: \n')
cl = df_user_rev.loc[df_user_rev.polarity == 1, ['review']].sample(3).values
for c in cl:
    print(c[0])

Neutral Reviews

In [None]:
print('3 random neutral reviews: \n')
cl = df_user_rev.loc[df_user_rev.polarity == 0, ['review']].sample(3).values
for c in cl:
    print(c[0])

Negative Reviews

In [None]:
print('3 random negative reviews: \n')
cl = df_user_rev.loc[df_user_rev.polarity == -1, ['review']].sample(3).values
for c in cl:
    print(c[0])

Distribution of Polarity Score

In [None]:
#The distribution of review sentiment polarity score

df_user_rev['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')

### Distribution of Review Ratings

In [None]:
df_user_rev['user_score'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Review Rating Distribution')

## [TODO] Blob Subjectivity Analysis
Subjectivity quantifies the amount of personal opinion and factual information contained in the text. The higher subjectivity means that the text contains personal opinion rather than factual information

## N-Grams Distribution

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

### 1-Gram Analysis

In [None]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
    
common_words = get_top_n_words(df_user_rev['lemmatized_user_rev'], 40)
# for word, freq in common_words:
#     print(word, freq)
df2 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in review after removing stop words')


### 2-Grams Analysis

In [None]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_bigram(df_user_rev['lemmatized_user_rev'], 20)

# for word, freq in common_words:
#     print(word, freq)
df4 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in review after removing stop words')


### 3-Grams Analysis

In [None]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_trigram(df_user_rev['lemmatized_user_rev'], 20)

# for word, freq in common_words:
#     print(word, freq)
df6 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in review after removing stop words')

## Part-Of-Speech Tagging (POS) Analysis

In [None]:
nltk.download('averaged_perceptron_tagger')

In [None]:
blob = TextBlob(str(df_user_rev['lemmatized_user_rev']))
pos_df = pd.DataFrame(blob.tags, columns = ['word' , 'pos'])
pos_df = pos_df.pos.value_counts()[:20]
pos_df.iplot(
    kind='bar',
    xTitle='POS',
    yTitle='count', 
    title='Top 20 Part-of-speech tagging for review corpus')

# How to count


### Return a dictionary from a mtacher

In [None]:
# return of mathces
for id,start,end in matches:
    noun_modified = doc[start]
    print(noun_modified)

In [None]:
test_phrases = ["The smooth gameplay",
                "The tedious gameplay",
                "The smooth gameplay",
                "The horrible graphics"]
# final output {smooth : 2, tedious : 1}

In [None]:
# iterate over the phrases

adj_count = {}

for phrase in test_phrases:
    count_adjs(review=phrase, matcher=new_matcher)

print(adj_count)

In [None]:
from collections import Counter

doc = nlp(u'The smooth gameplay. The fast gameplay')

# all tokens that arent stop words or punctuations
words = [token.text
         for token in doc
         if not token.is_stop and not token.is_punct]

# noun tokens that arent stop words or punctuations
nouns = [token.text
         for token in doc
         if (not token.is_stop and
             not token.is_punct and
             token.pos_ == "NOUN")]

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common(5)

# five most common noun tokens
noun_freq = Counter(nouns)
common_nouns = noun_freq.most_common(5)


print(words)

common_words
common_nouns

In [None]:
import spacy
import spacy.attrs
nlp = spacy.load("en_core_web_sm")
doc = nlp("It all happened between November 2007 and November 2008")

# Returns integers that map to parts of speech
counts_dict = doc.count_by(spacy.attrs.IDS['POS'])

# Print the human readable part of speech tags
for pos, count in counts_dict.items():
    human_readable_tag = doc.vocab[pos].text
    print(human_readable_tag, count)

In [None]:
new_matcher = Matcher(nlp.vocab)
pattern = [{'POS': {'IN': ['ADJ']}}, 
           {'POS': 'NOUN'}]# IN is used instead of adding all the patterns mannually.
new_matcher.add('PIZZA_RULE', [pattern])

doc = nlp("The fast gameplay")
matches = new_matcher(doc)
for match_id, start, end in matches:
    string_label = nlp.vocab.strings[match_id] 
    matched_text = doc[start:end]
    print(string_label, matched_text)


### Basic Exemple Dependency Matcher

In [None]:
nlp = spacy.load('en_core_web_sm')
doc = nlp('The fast gameplay')

noun_adj_pairs = {}
for chunk in doc.noun_chunks:
    adj = []
    noun = ""
    print(chunk)
    for tok in chunk:
        if tok.pos_ == "NOUN":
            noun = tok.text
        if tok.pos_ == "ADJ":
            adj.append(tok.text)
        if tok.dep_ == 
    if noun:
        noun_adj_pairs.update({noun:adj})

# expected output
noun_adj_pairs

### Expericing with Matchers

In [None]:
spacy.explain("nsubj")

In [None]:
matcher = Matcher(nlp.vocab)

patterns = [
    [{'POS':'ADJ'}, {'POS':'NOUN'}],
    ]
matcher.add("demo", patterns)

doc = nlp("The gameplay was smooth.")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)
displacy.render(doc, style="dep")


In [None]:
doc = nlp("The gameplay was smooth and fast.")
displacy.render(doc, style="dep")

for tok in doc:
    print(tok.text, tok.pos_, tok.dep_, tok.tag_)

In [None]:
spacy.explain("VBD")

### Dependency Matcher

In [None]:
doc = nlp("The smooth, easy ,and fast gameplay")
displacy.render(doc, style="dep")


In [None]:
nlp = spacy.load("en_core_web_sm")
pattern = [
  {
    "RIGHT_ID": "target",
    "RIGHT_ATTRS": {"POS": "NOUN"}
  },
  # founded -> subject
  {
    "LEFT_ID": "target",
    "REL_OP": ">",
    "RIGHT_ID": "modifier",
    "RIGHT_ATTRS": {"DEP": {"IN": ["det", "nummod"]}}
  },
]

matcher = DependencyMatcher(nlp.vocab)
matcher.add("FOUNDED", [pattern])


text = "The smooth, easy ,and fast gameplay"
doc = nlp(text)
for match_id, (target, modifier) in matcher(doc):
    print(doc[modifier], doc[target], sep="\t")

In [None]:
for tok in doc:
    if tok.dep_ == 'amod':
        print(tok.text)

In [None]:
for tok in doc:
    print(tok.text, tok.pos_)

In [None]:
nlp = spacy.load('en_core_web_sm')
phrase = "Giovanni is doing some nice research about NLP and Jonah seems to be sick, that motherfucker"
doc = nlp(phrase)
displacy.render(doc, style="dep")
spacy.explain("amod")

In [None]:
nlp = spacy.load('en_core_web_sm')
phrase = "It's a gameplay smooth"
doc = nlp(phrase)
displacy.render(doc, style="dep")
spacy.explain("compound")

### Basic Exemple Dependency Matcher

### Expericing with Matchers

In [None]:
matcher = Matcher(nlp.vocab)

patterns = [
    [{'POS':'ADJ'}, {'POS':'NOUN'}],
    ]
matcher.add("demo", patterns)

doc = nlp("The smooth gameplay.")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)
displacy.render(doc, style="dep")


In [None]:
doc = nlp("The smooth gameplay.")
displacy.render(doc, style="dep")

### Dependency Matcher

In [None]:
nlp = spacy.load("en_core_web_sm")
pattern = [
  {
    "RIGHT_ID": "target",
    "RIGHT_ATTRS": {"POS": "NOUN"}
  },
  # founded -> subject
  {
    "LEFT_ID": "target",
    "REL_OP": ">",
    "RIGHT_ID": "modifier",
    "RIGHT_ATTRS": {"DEP": {"IN": ["amod", "nummod", "acomp"]}}
  },
]

matcher = DependencyMatcher(nlp.vocab)
matcher.add("FOUNDED", [pattern])

text = "The gameplay was smooth"
doc = nlp(text)
for match_id, (target, modifier) in matcher(doc):
    print(doc[modifier], doc[target], sep="\t")

In [None]:
for tok in doc:
    if tok.dep_ == 'amod':
        print(tok.text)

In [None]:
nlp = spacy.load('en_core_web_sm')
phrase = "It's a smooth gameplay"
doc = nlp(phrase)
displacy.render(doc, style="dep")
spacy.explain("amod")

In [None]:
nlp = spacy.load('en_core_web_sm')
phrase = "It's a gameplay smooth"
doc = nlp(phrase)
displacy.render(doc, style="dep")
spacy.explain("compound")

# Word Similirarity

In [None]:

tokens = nlp("good great")
  
for token in tokens:
    # Printing the following attributes of each token.
    # text: the word string, has_vector: if it contains
    # a vector representation in the model, 
    # vector_norm: the algebraic norm of the vector,
    # is_oov: if the word is out of vocabulary.
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
  
token1, token2 = tokens[0], tokens[1]
  
print("Similarity:", token1.similarity(token2))

# Noun Chunks

In [None]:
txt = "A smooth and fast gameplay"
doc = nlp(txt)

chunks = []
for chunk in doc.noun_chunks:
    out = {}
    noun = chunk.root
    if noun.pos_ != 'NOUN':
        continue
    out['noun'] = noun
    for tok in chunk:
        if tok != noun:
            out[tok.pos_] = tok
    chunks.append(out)
    
print(chunks)

In [None]:
for chunk in doc.noun_chunks:
    print(chunk.root)

In [None]:
pattern = [
  {
    "RIGHT_ID": "target",
    "RIGHT_ATTRS": {"POS": "NOUN"}
  },
  # founded -> subject
  {
    "LEFT_ID": "target",
    "REL_OP": ">",
    "RIGHT_ID": "modifier",
    "RIGHT_ATTRS": {"DEP": {"IN": ["amod", "nummod"]}}
  },
]

matcher = DependencyMatcher(nlp.vocab)
matcher.add("FOUNDED", [pattern])

text = "A smooth and fast gameplay"
doc = nlp(text)
for match_id, (target, modifier) in matcher(doc):
    print(doc[modifier], doc[target], sep="\t")