In [1]:
# Utility cell

import re
# Utility: trim_content

def trim_content(content: str) -> str:
    """Collapse whitespace runs in ``content`` and drop any remaining newlines.

    Every run of two or more whitespace characters (spaces, tabs, newlines)
    becomes a single space; afterwards each newline that is still present
    (i.e. a lone newline) is removed without a replacement.

    NOTE(review): a lone newline between two words therefore glues them
    together ("a\\nb" -> "ab"), and leading/trailing single spaces are kept.
    Confirm that this is intended before relying on word boundaries.

    Args:
        content: Text to normalise.

    Returns:
        The normalised text.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    collapsed = re.sub(r"\s\s+", " ", content)
    return collapsed.replace("\n", "")

# Utility: list_filter

def list_filter(list_obj, callback):
    """Return a new list with the items of ``list_obj`` that ``callback`` accepts.

    Args:
        list_obj: Iterable of items to test, in order.
        callback: One-argument callable; an item is kept when its result is truthy.

    Returns:
        A new list of the accepted items, in their original order.
    """
    return [item for item in list_obj if callback(item)]


In [2]:
# Metacritic Data cell

import pandas as pd
import requests
from bs4 import BeautifulSoup


class MetacriticData:
    """Look a title up on Metacritic and scrape its critic and user reviews.

    An instance only carries configuration: the search URL, the HTTP headers
    sent with each kind of request and the placeholder values returned when a
    field cannot be read from a page.
    """

    # Seconds to wait for a response before requests gives up.
    REQUEST_TIMEOUT = 30
    # Highest ``?page=`` index web_scrapping() is willing to request.
    MAX_PAGE_INDEX = 50
    # Text whose presence in a paginated page ends the pagination loop.
    NO_REVIEWS_MARKER = 'There are no user reviews yet -'

    def __init__(self):
        # Placeholders used when a text / float field is missing from a review.
        self.DEFAULT_RESPONSE = 'Unknown'
        self.DEFAULT_RESPONSE_FLOAT = float(-1)
        self.search_url = "https://www.metacritic.com/autosearch"
        # Headers sent with the autosearch POST request.
        # NOTE(review): the cookie value looks like a captured browser session -
        # confirm it is still required and acceptable to keep in the notebook.
        self.search_headers = {
            'authority': 'www.metacritic.com',
            'accept': '*/*',
            'x-requested-with': 'XMLHttpRequest',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/95.0.4638.54 Safari/537.36',
            'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'sec-gpc': '1',
            'origin': 'https://www.metacritic.com',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://www.metacritic.com/game/playstation-4/grand-theft-auto-v',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
            'cookie': 'mc_s_s=d_2; OptanonAlertBoxClosed=2021-10-31T12:22:29.841Z; '
                    'ctk=NjE3ZThhOTEwMjAwMDAwMGRmN2Q3M2I4NmRiMg%3D%3D; omniEvents[pageFindingMethod]=Internal:Search; '
                    '_BB.bs=a|3; metapv=3; _BB.d=0|||2; '
                    'OptanonConsent=isIABGlobal=false&datestamp=Sun+Oct+31+2021+13%3A23%3A10+GMT%2B0100+('
                    'Central+European+Standard+Time)&version=6.20.0&hosts=&consentId=316c06a0-7ce1-4feb-8905-2c9cfeec47a2'
                    '&interactionCount=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0002%3A1%2CC0003%3A1%2CC0004%3A1'
                    '&geolocation=PL%3B24&AwaitingReconsent=false '
        }
        # Headers sent with the GET requests that download review pages.
        self.web_scrapping_headers = {
            'authority': 'www.metacritic.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/95.0.4638.54 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
                    'application/signed-exchange;v=b3;q=0.9',
            'sec-gpc': '1',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'referer': 'https://www.metacritic.com/game/playstation-4/grand-theft-auto-v',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
            'cookie': 'mc_s_s=d_2; OptanonAlertBoxClosed=2021-10-31T12:22:29.841Z; '
                    'ctk=NjE3ZThhOTEwMjAwMDAwMGRmN2Q3M2I4NmRiMg%3D%3D; _BB.bs=a|3; metapv=11; _BB.d=0|||10; '
                    'OptanonConsent=isIABGlobal=false&datestamp=Sun+Oct+31+2021+16%3A12%3A56+GMT%2B0100+('
                    'Central+European+Standard+Time)&version=6.20.0&hosts=&consentId=316c06a0-7ce1-4feb-8905-2c9cfeec47a2'
                    '&interactionCount=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0002%3A1%2CC0003%3A1%2CC0004%3A1'
                    '&geolocation=PL%3B24&AwaitingReconsent=false '
        }

    def _select_content(self, web_content: list) -> str:
        """Return the text of the first matched element.

        Args:
            web_content: Sequence of matched elements exposing ``.text``
                (the result of a ``findAll`` call).

        Returns:
            The first element's text, or ``DEFAULT_RESPONSE`` when the
            sequence is empty.
        """
        if len(web_content) > 0:
            return web_content[0].text
        return self.DEFAULT_RESPONSE

    def _select_content_as_number(self, web_content: list, default_value: int = int(-1)) -> int:
        """Return the first matched element's text parsed as an integer.

        Args:
            web_content: Sequence of matched elements exposing ``.text``.
            default_value: Value returned when nothing matched or the text is
                not an integer.

        Returns:
            The parsed integer, or ``default_value``.
        """
        if len(web_content) == 0:
            return default_value
        try:
            return int(web_content[0].text)
        except (AttributeError, TypeError, ValueError):
            # Missing or non-numeric text: fall back instead of failing the scrape.
            return default_value

    def _select_reliability(self, ups: str, thumbs: str) -> float:
        """Return ``ups / thumbs`` as the share of positive votes.

        Args:
            ups: Number of up-votes (anything ``int()`` accepts).
            thumbs: Total number of votes (anything ``int()`` accepts).

        Returns:
            The ratio, or ``DEFAULT_RESPONSE_FLOAT`` when there are no votes
            or either value cannot be converted to an integer.
        """
        try:
            return int(ups) / int(thumbs)
        except (TypeError, ValueError, ZeroDivisionError):
            return self.DEFAULT_RESPONSE_FLOAT

    def search(self, title: str, platform: str = "PC Game") -> dict:
        """Find the autosearch entry whose name and platform match exactly.

        Args:
            title: Exact title to look for.
            platform: Value the entry's ``refType`` must equal.

        Returns:
            The first matching result entry (a dict; ``get_reviews`` reads its
            ``'url'`` key).

        Raises:
            ValueError: If the search request does not return HTTP 200 or no
                entry matches both ``title`` and ``platform``.
        """
        # Passing a dict lets requests form-encode the values, so titles that
        # contain '&', '=' or spaces are transmitted intact.
        payload = {'search_term': title, 'image_size': '98', 'search_each': 'true'}
        response = requests.request(
            "POST",
            self.search_url,
            headers=self.search_headers,
            data=payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            raise ValueError('Cannot search for the title \'' + title + '\'.')

        results = response.json()['autoComplete']['results']
        matches = [
            elem for elem in results
            if elem.get('refType') == platform and elem.get('name') == title
        ]

        if not matches:
            raise ValueError('The title \'' + title + '\' on platform \'' + platform + '\' was not found.')

        return matches[0]

    def _fetch_page(self, url: str) -> str:
        """Download ``url`` with the scraping headers and return the body text.

        Raises:
            ValueError: If the response status is not 200.
        """
        page = requests.request(
            "GET",
            url,
            headers=self.web_scrapping_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        if page.status_code != 200:
            raise ValueError('Cannot fetch reviews for \'' + url + '\'.')
        return page.text

    def web_scrapping(self, base_url: str, endpoint: str, pages_enabled: bool = False) -> list:
        """Download the HTML of a review listing.

        Args:
            base_url: URL of the title's page.
            endpoint: Path appended to ``base_url`` (e.g. ``'/user-reviews'``).
            pages_enabled: When true, request ``?page=0``, ``?page=1``, ...
                until a page contains ``NO_REVIEWS_MARKER``.

        Returns:
            A list with the HTML text of every downloaded page that held
            content (exactly one item when ``pages_enabled`` is false).

        Raises:
            ValueError: If a request does not return HTTP 200, or more than
                ``MAX_PAGE_INDEX`` + 1 pages would have to be fetched.
        """
        if not pages_enabled:
            return [self._fetch_page(base_url + endpoint)]

        pages = list()
        page_index = 0

        while True:
            url = base_url + endpoint + '?page=' + str(page_index)

            # Checked before the request so the limit never costs an extra download.
            if page_index > self.MAX_PAGE_INDEX:
                raise ValueError('Probably fetched too many pages for \'' + url + '\'.')

            page_text = self._fetch_page(url)
            if self.NO_REVIEWS_MARKER in page_text:
                return pages

            pages.append(page_text)
            page_index += 1

    def find_reviews(self, web_contents: list, expert: bool = False) -> list:
        """Extract review records from downloaded review pages.

        Args:
            web_contents: HTML texts as returned by ``web_scrapping``.
            expert: When true, build critic records (the ``source`` element is
                used as the name, no vote fields); otherwise build user records.

        Returns:
            A list of dicts with the keys ``name``, ``date``, ``score`` and
            ``content``; user records additionally carry ``ups``, ``thumbs``
            and ``reliability``.
        """
        response = list()
        name_class = "source" if expert else "name"

        for web_content in web_contents:
            soup = BeautifulSoup(web_content, 'html.parser')
            main_content = soup.find('div', attrs={"id": "main"})
            if main_content is None:
                # No #main container: search the whole document instead of failing.
                main_content = soup

            # Elements are queried in place; re-parsing their markup is not needed.
            for review in main_content.findAll('li', attrs={"class": "review"}):
                record = {
                    'name': trim_content(self._select_content(review.findAll('div', attrs={"class": name_class}))),
                    'date': trim_content(self._select_content(review.findAll('div', attrs={"class": "date"}))),
                    'score': trim_content(self._select_content(review.findAll('div', attrs={"class": "metascore_w"}))),
                    'content': trim_content(self._select_content(review.findAll('div', attrs={"class": "review_body"}))),
                }

                if not expert:
                    review_ups = self._select_content_as_number(
                        review.findAll('span', attrs={"class": "total_ups"}), int(0))
                    review_thumbs = self._select_content_as_number(
                        review.findAll('span', attrs={"class": "total_thumbs"}), int(0))
                    record['ups'] = review_ups
                    record['thumbs'] = review_thumbs
                    record['reliability'] = self._select_reliability(review_ups, review_thumbs)

                response.append(record)

        return response

    def get_reviews(self, title: str, platform: str = 'PC Game') -> list:
        """Search for ``title`` and return its critic and user reviews.

        Args:
            title: Exact title to look up.
            platform: Platform label passed on to ``search``.

        Returns:
            ``[critic_reviews, user_reviews]`` - two pandas DataFrames built
            from the records produced by ``find_reviews``.

        Raises:
            ValueError: Propagated from ``search`` / ``web_scrapping``.
        """
        results = self.search(title, platform)

        # Critic reviews come from a single page; user reviews are paginated.
        critic_pages = self.web_scrapping(results['url'], '/critic-reviews', False)
        user_pages = self.web_scrapping(results['url'], '/user-reviews', True)

        critic_reviews = pd.DataFrame(self.find_reviews(critic_pages, True))
        user_reviews = pd.DataFrame(self.find_reviews(user_pages))

        return [critic_reviews, user_reviews]

In [4]:
# Expert reviews
#MetacriticData().get_reviews('The Sims 4', 'PC Game')[0]

In [5]:
import pandas as pd

# User reviews: get_reviews() returns [critic_reviews, user_reviews]; keep the second frame.
df = MetacriticData().get_reviews('Sackboy: A Big Adventure', 'PS5 Game')[1]

df


Eee?


Unnamed: 0,name,date,score,content,ups,thumbs,reliability
0,SparkeyMark86,"Nov 17, 2020",9,The most fun PlayStation platformer I played s...,2,2,1.0
1,Jleffel22,"Nov 14, 2020",9,Such a fun cute game! I love all these unique ...,2,2,1.0
2,felipenavarro92,"Dec 10, 2020",10,"Pensei que seria um jogo generico, mas a cada ...",1,1,1.0
3,Drawnimo,"Nov 16, 2020",10,Sackboy: A Big Adventure is a whimsically fant...,1,1,1.0
4,amarosa,"Nov 20, 2020",10,Mundo mágico top PlayStation 5 arrasou!!!Diver...,1,1,1.0
...,...,...,...,...,...,...,...
134,Cookie_Punki,"Jan 3, 2022",8,Sackboy: A Big Adventure es el juego de la sag...,0,0,-1.0
135,jaimesafe,"Oct 29, 2021",9,"Divertido, variadas y originales mecánicas, di...",0,0,-1.0
136,THEENDACCOUNT,"Jan 29, 2022",10,GOODGOODGOODGOODGOODGOODGOODGOODgoodgoodgoodgo...,0,0,-1.0
137,penmumble,"Feb 15, 2022",10,I cannot express how much I adore this game. A...,0,0,-1.0


In [7]:
import pandas as pd
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is probabilistic; a fixed seed makes repeated runs label the
# same text with the same language.
DetectorFactory.seed = 0


def detect_language(text: str) -> str:
    """Return the language code langdetect assigns to ``text``.

    Returns ``'unknown'`` when langdetect cannot analyse the text (it raises
    LangDetectException for input without usable features, e.g. an empty
    string), so one unusable review does not abort the whole column.
    """
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


df['Language'] = df['content'].apply(detect_language)

df

Unnamed: 0,name,date,score,content,ups,thumbs,reliability,Language
0,SparkeyMark86,"Nov 17, 2020",9,The most fun PlayStation platformer I played s...,2,2,1.0,en
1,Jleffel22,"Nov 14, 2020",9,Such a fun cute game! I love all these unique ...,2,2,1.0,en
2,felipenavarro92,"Dec 10, 2020",10,"Pensei que seria um jogo generico, mas a cada ...",1,1,1.0,pt
3,Drawnimo,"Nov 16, 2020",10,Sackboy: A Big Adventure is a whimsically fant...,1,1,1.0,en
4,amarosa,"Nov 20, 2020",10,Mundo mágico top PlayStation 5 arrasou!!!Diver...,1,1,1.0,pt
...,...,...,...,...,...,...,...,...
134,Cookie_Punki,"Jan 3, 2022",8,Sackboy: A Big Adventure es el juego de la sag...,0,0,-1.0,es
135,jaimesafe,"Oct 29, 2021",9,"Divertido, variadas y originales mecánicas, di...",0,0,-1.0,es
136,THEENDACCOUNT,"Jan 29, 2022",10,GOODGOODGOODGOODGOODGOODGOODGOODgoodgoodgoodgo...,0,0,-1.0,so
137,penmumble,"Feb 15, 2022",10,I cannot express how much I adore this game. A...,0,0,-1.0,en


In [8]:
import numpy as np

# Keep English reviews only. The explicit copy makes `df` an independent frame,
# so adding a column below does not write to a slice of the previous frame
# (which triggers pandas' SettingWithCopyWarning).
df = df.loc[df['Language'] == "en"].copy()

# Convert the score once, then label it: below 4 -> negative, above 7 -> positive,
# everything in between -> neutral.
score_as_int = df['score'].astype(int)
df['desc'] = np.where(
    score_as_int < 4, "negative", np.where(
    score_as_int > 7, "positive", "neutral"))


df

Unnamed: 0,name,date,score,content,ups,thumbs,reliability,Language,desc
0,SparkeyMark86,"Nov 17, 2020",9,The most fun PlayStation platformer I played s...,2,2,1.0,en,positive
1,Jleffel22,"Nov 14, 2020",9,Such a fun cute game! I love all these unique ...,2,2,1.0,en,positive
3,Drawnimo,"Nov 16, 2020",10,Sackboy: A Big Adventure is a whimsically fant...,1,1,1.0,en,positive
6,Tunisaiyan,"Nov 14, 2020",10,It's so good to play it with your brother. The...,1,1,1.0,en,positive
7,Kratos_0611,"Nov 14, 2020",10,This game had low expectations from me but it ...,1,1,1.0,en,positive
...,...,...,...,...,...,...,...,...,...
131,konjakthesober,"Oct 11, 2021",8,Fun game to play witch others but no so much a...,0,0,-1.0,en,positive
132,VGConsoleAPE,"Sep 27, 2021",10,Jim Ryan came out swinging at PS5's launch whe...,0,0,-1.0,en,positive
133,fadz13,"Sep 28, 2021",10,Sackboy is LOADS of fun. It's a great single-p...,0,0,-1.0,en,positive
137,penmumble,"Feb 15, 2022",10,I cannot express how much I adore this game. A...,0,0,-1.0,en,positive


In [9]:
import contractions

# Lowercase the review text.
df['text'] = df['content'].str.lower()
# Expand contractions word by word (e.g. "hadn't" -> "had not") and join the
# words back into one string separated by single spaces.
df['text'] = df['text'].apply(
    lambda text: ' '.join(str(contractions.fix(word)) for word in text.split())
)

# Remove punctuation such as "." and ",". regex=True is required: without it
# recent pandas versions treat the pattern as literal text and nothing is removed.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
# Remove numbers.
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)
df

ModuleNotFoundError: No module named 'contractions'

In [None]:
import nltk
# Download NLTK's 'stopwords' resource (presumably what stopwords.words() below reads).
nltk.download('stopwords')
from nltk.corpus import stopwords
from stop_words import get_stop_words
# English stop words from NLTK.
# NOTE(review): `stop` is not referenced by any other cell visible in this
# notebook - confirm whether it was meant to be merged into `stop_words`.
stop = stopwords.words('english')
# English stop words from the `stop_words` package, materialised as a list;
# this is the collection the stop-word filtering cell uses.
stop_words = list(get_stop_words('en')) 

In [10]:
# A set gives constant-time membership tests; checking every token against
# the stop-word list would rescan the whole list each time.
stop_word_set = set(stop_words)
# Drop stop words from the cleaned text and re-join the remaining words.
df['text_without_stopwords'] = df['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_word_set)
)
df

KeyError: 'text'

In [None]:

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Download the NLTK resources listed below.
# NOTE(review): presumably these back word_tokenize, nltk.tag.pos_tag and
# WordNetLemmatizer used in the following cells - confirm which are required.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:


# Tokenise each stop-word-free text and tag every token with its part of speech;
# 'tokenized' ends up holding the tagger's (word, tag) pairs.
df['tokenized'] = df['text_without_stopwords'].apply(
    lambda text: nltk.tag.pos_tag(word_tokenize(text))
)


def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` decides the result: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty one) is treated as a noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)
# Swap every token's POS tag for the WordNet constant the lemmatizer is given.
df['wordnet_pos'] = df['tokenized'].apply(
    lambda tagged: [(token, get_wordnet_pos(tag)) for token, tag in tagged]
)
wnl = WordNetLemmatizer()
# Lemmatise each token with its WordNet POS and join the lemmas into one string.
df['lemmatized'] = df['wordnet_pos'].apply(
    lambda pairs: ' '.join(str(wnl.lemmatize(token, pos)) for token, pos in pairs)
)
df

In [None]:
from matplotlib import pyplot as plt

# Horizontal bar chart with the number of reviews per `desc` label,
# smallest group first.
fig, ax = plt.subplots()
fig.suptitle("desc", fontsize=12)
label_counts = df["desc"].reset_index().groupby("desc").count()
label_counts = label_counts.sort_values(by="index")
bar_axes = label_counts.plot(kind="barh", legend=False, ax=ax)
bar_axes.grid(axis='x')
plt.show()

In [None]:
# Most frequent words
from collections import Counter
# Count every whitespace-separated word across all lemmatised reviews.
cnt = Counter(
    word
    for text in df["lemmatized"].values
    for word in text.split()
)

cnt.most_common(10)

In [11]:
# Count vectorizer: term counts for every lemmatised review
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
lemmatized_reviews = df["lemmatized"]
X_train_counts = vectorizer.fit_transform(lemmatized_reviews)

# Show the matrix dimensions first, then the counts as a dense array.
print(X_train_counts.shape)
print(X_train_counts.toarray())


ModuleNotFoundError: No module named 'sklearn'

In [12]:
# Term counts of the first review, most frequent terms first.
# The column label must match the name used in sort_values() below.
df_names = pd.DataFrame(
    X_train_counts[0].T.todense(),
    # get_feature_names() was removed from scikit-learn; get_feature_names_out()
    # is its replacement.
    index=vectorizer.get_feature_names_out(),
    columns=["Count Vectorizer"],
)
df_names = df_names.sort_values('Count Vectorizer', ascending=False)
df_names

NameError: name 'X_train_counts' is not defined

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer  = TfidfTransformer()
# Build the term counts with the CountVectorizer, then let the transformer
# re-weight them; storing the vectorizer output directly would only repeat
# the raw counts under a TF-IDF name.
term_counts = vectorizer.fit_transform(df.lemmatized)
X_train_tfidf = tfidf_transformer.fit_transform(term_counts)

print(X_train_tfidf.shape)
print(X_train_tfidf.toarray())

ModuleNotFoundError: No module named 'sklearn'

In [14]:
# Weights stored in X_train_tfidf for the first review, highest first.
df_names = pd.DataFrame(
    X_train_tfidf[0].T.todense(),
    # get_feature_names() was removed from scikit-learn; get_feature_names_out()
    # is its replacement.
    index=vectorizer.get_feature_names_out(),
    columns=["TF-IDF"],
)
df_names = df_names.sort_values('TF-IDF', ascending=False)
df_names

NameError: name 'X_train_tfidf' is not defined

In [15]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# encode() embeds sentences (strings), so pass the review texts rather than
# the sparse feature matrix.
# NOTE(review): the lemmatised text is used because it is the corpus the
# feature matrices were built from - confirm whether the raw `content`
# column should be embedded instead.
corpus_embeddings = embedder.encode(df['lemmatized'].tolist())

ModuleNotFoundError: No module named 'sentence_transformers'