# What do you like in boardgames
The goal of this project is to explore the data from boardgamegeek.com and try to discover the most-liked and disliked aspects of each of the top 10 games (as of 25/06/2024)

In [44]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import math
import time
import ast
import string
import pickle
import os.path
import matplotlib.pyplot as plt

import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, EarlyStoppingCallback
import torch.nn.functional as F
import fasttext
from setfit import AbsaModel, AbsaTrainer, TrainingArguments
from pyabsa import available_checkpoints, TaskCodeOption, AspectTermExtraction as ATEPC, ModelSaveOption, DeviceTypeOption
from datasets import load_dataset
import simplemma
from nltk.stem import WordNetLemmatizer
import warnings

transformers.logging.set_verbosity_error()

In [2]:
# loading data from boarggamegeek about boardgames downloaded from boardgamegeek.com (26/06/2024)
df_boardgames = pd.read_csv('./content/drive/MyDrive/Boardgames/boardgames_ranks.csv')

In [3]:
# keep only the top 10 games, the df is already sorted by highest rated
df_boardgames_10 = df_boardgames[:10]

In [5]:
def donwload_game_comments(game_id):
    comments=[]
    base_url = f'https://api.geekdo.com/xmlapi2/thing?type=boardgame&id={game_id}&comments=1'
    print(f'Downloading comments for game with id {game_id}')
    r = requests.get(base_url)
    # parse downloaded xml
    root = ET.fromstring(r.content)
    # extract the total number of comments for that game
    number_of_comments = int(root[0].find('comments').attrib['totalitems'])
    # calculate the number of pages to request to download all comments
    number_of_pages = math.ceil(number_of_comments / 100)
    time.sleep(1)
    # download comments from all pages
    for i in range(number_of_pages):
        # create url for the next page
        url = f'{base_url}&page={i+1}'
        while(True):
            r = requests.get(url)
            # parse downloaded xml
            root = ET.fromstring(r.content)
            # iterate over comments to store them
            for comment in root.iter('comment'):
                comment.attrib['boardgame_id'] = game_id
                comments.append(comment.attrib)
            print(f"{len(comments)}/{number_of_comments}")
            time.sleep(1.5)
            if len(list(root.iter('comment')))>0:
                break
            else:
                print('repeating page download since no comment was received')
    # the API does not give comments sometimes, so we check if the number of comments obtained so far matches the expectation
    if len(comments)!=number_of_comments:
        print(f'Failed to download ALL comments for game with id: {game_id}')
    return pd.DataFrame(comments)

In [4]:
# if reviews are not already downloaded, download them and store them
if not os.path.isfile('./content/drive/MyDrive/Boardgames/boardgames_comments.csv'):
    df_comments = pd.DataFrame()
    for i,boardgame in df_boardgames_10.iterrows():
        df_game_comments = donwload_game_comments(boardgame['id'])
        df_comments = pd.concat([df_comments, df_game_comments])
    df_comments.to_csv('/content/drive/MyDrive/Boardgames/boardgames_comments.csv')
else:
    df_comments = pd.read_csv('./content/drive/MyDrive/Boardgames/boardgames_comments.csv',index_col=0)

In [7]:
df_comments

Unnamed: 0,username,rating,value,boardgame_id
0,1 Family Meeple,,SLEEVED[IMG]https://cf.geekdo-static.com/mbs/m...,224517
1,13inha,,G,224517
2,1bez,10.0,"Great game, full controllo of your strategy th...",224517
3,2bit,7.5,"Very clever game, enjoyable overall. Plus poi...",224517
4,2d20,9.0,Brilliant! Fits right into my wheelhouse all ...,224517
...,...,...,...,...
67905,Zvonmirus,7.5,Only played the beginning scenarios with my el...,291457
67906,Zygomax,,BGS Prize Nov. 2021,291457
67907,_Kenneth,9.5,Cooperative Legacy (2P-3P) ✓ Completion,291457
67908,_LSK_,5.0,Too hard under the rules in the game and a bit...,291457


In [5]:
def clean_comments(df_comments,length=None):
    df_comments_cleaned = df_comments.copy()
    # make sure comments are strings
    df_comments_cleaned['value'] = df_comments_cleaned['value'].map(lambda value: str(value))
    # initialized model to detect language
    model_lang_detection = fasttext.load_model('./content/drive/MyDrive/Boardgames/lid.176.ftz')
    # add column to dataframe with language
    df_comments_cleaned['lang'] = df_comments_cleaned['value'].map(lambda comment: model_lang_detection.predict(comment, k=1)[0][0].replace('__label__',''))
    # filter dataframe to keep english language only
    df_comments_cleaned = df_comments_cleaned[df_comments_cleaned['lang']=='en']
    # remove short comments, remove bottom 25% elements
    if length==None:
        length = df_comments_cleaned['value'].map(lambda comment: len(comment)).quantile(0.25)
    df_comments_cleaned = df_comments_cleaned[df_comments_cleaned['value'].map(lambda t:len(t)>length)]
    df_comments_cleaned = df_comments_cleaned.reset_index()
    return df_comments_cleaned

In [6]:
df_comments_only_eng_short = clean_comments(df_comments)
df_comments_only_eng_long = clean_comments(df_comments, length=100)

In [7]:
# print some elements to use as example in the report
for elem in df_comments_only_eng_short[df_comments_only_eng_short['value'].apply(lambda v: 'luck' in v and len(v)<100)]['value'][:20]:
    print(elem)

Low luck, high skill game which good moves are not obvious and many strategies are viable.
Great mechanics. Too much luck involved (drawing cards)
Unluckily I sold it before playing it because I did not have a group to play with.
Feel like going through the motions alot of chance and luck in this game.
Great fun but can be very luck dependent.
players -  2 - 4 rules -    MEDIUM luck -     MEDIUM strategy - HIGH  CO-OP
A true masterpiece ! I only wish the mechanics were a tiny bit less luck-based.
Beautiful but too much based on luck
Nothing else comes close if you like the Card Management system and lack of luck.
Solo only - a bit too much luck of the drawer for my taste.
Played on TTS. Preordered. Card luck like Wingspan. Some boards and sponsor powers are broken.
My initial gripes about the luck factor mostly disappeared the more I played.
Overhyped. Too much luck involved with drawing of cards.
too much luck in the cards for the lenght and the complexity of this game
Variability of 

In [8]:
# download reviews about everdell and uno to use as ""control variables""
if not os.path.isfile('./content/drive/MyDrive/Boardgames/boardgames_comments_everdell.csv'):
    df_everdell = donwload_game_comments(199792)
else:
    df_everdell = pd.read_csv('./content/drive/MyDrive/Boardgames/boardgames_comments_everdell.csv',index_col=0)

if not os.path.isfile('./content/drive/MyDrive/Boardgames/boardgames_comments_uno.csv'):
    df_uno = donwload_game_comments(2223)
else:
    df_uno = pd.read_csv('./content/drive/MyDrive/Boardgames/boardgames_comments_uno.csv',index_col=0)

In [9]:
# clean datasets of the newly downloaded games
df_everdell_cleaned = clean_comments(df_everdell)
df_uno_cleaned = clean_comments(df_uno)

In [10]:
df_boardgames_control = df_boardgames[(df_boardgames['id']==2223)|(df_boardgames['id']==199792)]
df_boardgames_control

Unnamed: 0,id,name,yearpublished,rank,bayesaverage,average,usersrated,is_expansion,abstracts_rank,cgs_rank,childrensgames_rank,familygames_rank,partygames_rank,strategygames_rank,thematic_rank,wargames_rank
34,199792,Everdell,2018,35,7.84704,8.02615,54686,0,,,,3.0,,40.0,,
26430,2223,UNO,1971,26431,5.33672,5.45676,29135,0,,,,3179.0,,,,


In [11]:
df_boardgames_10_control = pd.concat([df_boardgames_10,df_boardgames_control])

# Hugging face - Deberta v3 base absa


In [114]:
# Load Aspect-Based Sentiment Analysis model
absa_tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1", use_fast=False)
absa_model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
absa_pipeline = pipeline('text-classification', model=absa_model, tokenizer=absa_tokenizer, max_length=512, truncation=True,device=0)

In [115]:
def map_row_to_value(row, aspect):
    if row[f'aspect_{aspect}_label']=='Positive':
        return 1
    elif row[f'aspect_{aspect}_label']=='Negative':
        return -1
    elif row[f'aspect_{aspect}_label']=='Neutral':
        return 0
def map_row_to_value2(row, aspect):
    if row[f'aspect_{aspect}_label']=='Positive':
        return row[f'aspect_{aspect}_score']
    elif row[f'aspect_{aspect}_label']=='Negative':
        return -row[f'aspect_{aspect}_score']
    elif row[f'aspect_{aspect}_label']=='Neutral':
        return 0

In [13]:
aspects = ['luck','bookkeeping','downtime','interaction','bash the leader','complicated','complex']
def analyze_aspects(df_comments, aspects=aspects):
    df_comments_with_aspects = df_comments.copy()
    for aspect in aspects:
        print(f'Analyzing {aspect}')
        results = absa_pipeline(df_comments['value'].to_list(),  text_pair=aspect)
        df_comments_with_aspects[f'aspect_{aspect}'] = results
        df_comments_with_aspects[f'aspect_{aspect}_label'] = [res['label'] for res in results]
        df_comments_with_aspects[f'aspect_{aspect}_score'] = [res['score'] for res in results]
        df_comments_with_aspects[f'aspect_{aspect}_mapped'] = df_comments_with_aspects.apply(lambda row:map_row_to_value(row,aspect),axis=1)
        df_comments_with_aspects[f'aspect_{aspect}_mapped2'] = df_comments_with_aspects.apply(lambda row:map_row_to_value2(row,aspect),axis=1)
    return df_comments_with_aspects

In [None]:
df_comments_only_eng_short_with_aspects = analyze_aspects(df_comments_only_eng_short)

In [129]:
df_everdell_with_aspects = analyze_aspects(df_everdell_cleaned)

Analyzing luck
Analyzing bookkeeping
Analyzing downtime
Analyzing interaction
Analyzing bash the leader
Analyzing complicated
Analyzing complex


In [130]:
df_uno_with_aspects = analyze_aspects(df_uno_cleaned)

Analyzing luck
Analyzing bookkeeping
Analyzing downtime
Analyzing interaction
Analyzing bash the leader
Analyzing complicated
Analyzing complex


In [139]:
df_comments_only_eng_short_with_aspects.to_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_default_aspects.csv')
df_everdell_with_aspects.to_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_everdell_default_aspects.csv')
df_uno_with_aspects.to_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_uno_default_aspects.csv')

In [14]:
df_comments_only_eng_short_with_aspects = pd.read_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_default_aspects.csv',index_col=0)
df_everdell_with_aspects = pd.read_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_everdell_default_aspects.csv',index_col=0)
df_uno_with_aspects = pd.read_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_uno_default_aspects.csv',index_col=0)

In [87]:
# code to fix dictionaries that are stored as string by pd.from_csv 
def fix_dictionaries(row,aspect):
    string_dict = row[f'aspect_{aspect}']
    dict = ast.literal_eval(string_dict)
    return dict
for aspect in aspects:
    srs_dict = df_comments_only_eng_short_with_aspects.apply(lambda row:fix_dictionaries(row,aspect),axis=1)
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}'] = srs_dict
for aspect in aspects:
    results = df_comments_only_eng_short_with_aspects[f'aspect_{aspect}'].to_list()
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}_label'] = [res['label'] for res in results]
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}_score'] = [res['score'] for res in results]
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}_mapped'] = df_comments_only_eng_short_with_aspects.apply(lambda row:map_row_to_value(row,aspect),axis=1)
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}_mapped2'] = df_comments_only_eng_short_with_aspects.apply(lambda row:map_row_to_value2(row,aspect),axis=1)

In [173]:
for boardgame_id in df_boardgames_10['id']:
    boardgame_comments = df_comments_only_eng_short_with_aspects[df_comments_only_eng_short_with_aspects['boardgame_id']==boardgame_id]
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    print(f"{boardgame_data['name'].iloc[0]} ({boardgame_data['id'].iloc[0]})")
    scores = []
    for aspect in aspects:
        boardgame_comments_polars = boardgame_comments
        score1 = boardgame_comments[f'aspect_{aspect}_mapped'].sum()/len(boardgame_comments_polars)
        score2 = boardgame_comments[f'aspect_{aspect}_mapped2'].sum()/len(boardgame_comments_polars)
        scores.append(f'{score1:.2}')
        #print(f"{aspect}:  : {score2:.2}")
    print(' & '.join(scores))

Brass: Birmingham
0.37 & 0.26 & 0.31 & 0.41 & 0.24 & -0.4 & 0.095
Pandemic Legacy: Season 1
0.34 & 0.24 & 0.33 & 0.39 & 0.21 & -0.33 & 0.1
Gloomhaven
0.29 & 0.17 & 0.22 & 0.3 & 0.17 & -0.42 & -0.0082
Ark Nova
0.24 & 0.2 & 0.18 & 0.25 & 0.16 & -0.4 & 0.034
Twilight Imperium: Fourth Edition
0.32 & 0.21 & 0.2 & 0.33 & 0.17 & -0.44 & 0.033
Dune: Imperium
0.38 & 0.31 & 0.37 & 0.42 & 0.3 & -0.26 & 0.22
Terraforming Mars
0.28 & 0.2 & 0.22 & 0.29 & 0.17 & -0.4 & 0.063
War of the Ring: Second Edition
0.38 & 0.28 & 0.32 & 0.42 & 0.24 & -0.42 & 0.073
Star Wars: Rebellion
0.38 & 0.29 & 0.32 & 0.42 & 0.25 & -0.33 & 0.13
Gloomhaven: Jaws of the Lion
0.35 & 0.25 & 0.33 & 0.4 & 0.23 & -0.38 & 0.038


In [15]:
print("UNO")
scores = []
for aspect in aspects:
    boardgame_comments_polars = df_uno_with_aspects
    score1 = df_uno_with_aspects[f'aspect_{aspect}_mapped'].sum()/len(boardgame_comments_polars)
    score2 = df_uno_with_aspects[f'aspect_{aspect}_mapped2'].sum()/len(boardgame_comments_polars)
    print(f"{aspect}: {score1} - {score2}")
    scores.append(f'{score1:.2}')
print(' & '.join(scores))

UNO
luck: 0.12915234822451319 - 0.08718912821489239
bookkeeping: 0.036368843069874 - 0.028064031567838457
downtime: 0.12371134020618557 - 0.09373746944627674
interaction: 0.14461626575028638 - 0.11044979105700332
bash the leader: -0.06786941580756013 - -0.053911250592438766
complicated: -0.6660939289805269 - -0.5434149868982627
complex: -0.16237113402061856 - -0.1334041057566324
0.13 & 0.036 & 0.12 & 0.14 & -0.068 & -0.67 & -0.16


In [16]:
print("Everdell")
scores = []
for aspect in aspects:
    boardgame_comments_polars = df_everdell_with_aspects
    score1 = df_everdell_with_aspects[f'aspect_{aspect}_mapped'].sum()/len(boardgame_comments_polars)
    score2 = df_everdell_with_aspects[f'aspect_{aspect}_mapped2'].sum()/len(boardgame_comments_polars)
    print(f"{aspect}: {score1} - {score2}")
    scores.append(f'{score1:.2}')
print(' & '.join(scores))

Everdell
luck: 0.39028658936775323 - 0.3163189872809193
bookkeeping: 0.3218114198206082 - 0.2649124281473166
downtime: 0.37716035878363596 - 0.3226796336858337
interaction: 0.42375847735725225 - 0.36864200744266923
bash the leader: 0.30781010719754975 - 0.25930177459985937
complicated: -0.23911616714066944 - -0.19680007381372447
complex: 0.20017501640778823 - 0.16144633592413854
0.39 & 0.32 & 0.38 & 0.42 & 0.31 & -0.24 & 0.2


# Setfit ABSA

## Pretrained

In [33]:
# Download from the 🤗 Hub
model = AbsaModel.from_pretrained(
    "tomaarsen/setfit-absa-bge-small-en-v1.5-restaurants-aspect",
    "tomaarsen/setfit-absa-bge-small-en-v1.5-restaurants-polarity",
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [149]:
game_aspects = {}
for boardgame_id in df_boardgames_10['id']:
    boardgame_comments = df_comments_only_eng_short[df_comments_only_eng_short['boardgame_id']==boardgame_id]
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    print(f"{boardgame_data['name'].iloc[0]} ({boardgame_data['id'].iloc[0]})")
    aspects=[]
    polarities=[]
    for comment in boardgame_comments['value']:
        aspect_prediction = model(comment)
        for elem in aspect_prediction:
            if isinstance(elem, dict):
                aspects.append(elem['span'])
                polarities.append(elem['polarity'])
    game_aspects[f'{boardgame_id}'] = (aspects, polarities)

Brass: Birmingham
Pandemic Legacy: Season 1
Gloomhaven
Ark Nova
Twilight Imperium: Fourth Edition
Dune: Imperium
Terraforming Mars
War of the Ring: Second Edition
Star Wars: Rebellion
Gloomhaven: Jaws of the Lion


In [101]:
game_aspects_long = {}
for boardgame_id in df_boardgames_10['id']:
    boardgame_comments = df_comments_only_eng_long[df_comments_only_eng_long['boardgame_id']==boardgame_id]
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    print(f"{boardgame_data['name'].iloc[0]} ({boardgame_data['id'].iloc[0]})")
    aspects=[]
    polarities=[]
    for comment in boardgame_comments['value']:
        aspect_prediction = model(comment)
        for elem in aspect_prediction:
            if isinstance(elem, dict):
                aspects.append(elem['span'])
                polarities.append(elem['polarity'])
    game_aspects[f'{boardgame_id}'] = (aspects, polarities)

Brass: Birmingham
Pandemic Legacy: Season 1
Gloomhaven
Ark Nova
Twilight Imperium: Fourth Edition
Dune: Imperium
Terraforming Mars
War of the Ring: Second Edition
Star Wars: Rebellion
Gloomhaven: Jaws of the Lion


In [None]:
# uno
aspects=[]
polarities=[]
boardgame_id='2223'
for comment in df_uno_cleaned['value']:
    aspect_prediction = model(comment)
    for elem in aspect_prediction:
        if isinstance(elem, dict):
            aspects.append(elem['span'])
            polarities.append(elem['polarity'])
(aspects, polarities)
game_aspects[f'{boardgame_id}'] = (aspects, polarities)

In [None]:
# everdell
aspects=[]
polarities=[]
boardgame_id='199792'
for comment in df_everdell_cleaned['value']:
    aspect_prediction = model(comment)
    for elem in aspect_prediction:
        if isinstance(elem, dict):
            aspects.append(elem['span'])
            polarities.append(elem['polarity'])
game_aspects[f'{boardgame_id}'] = (aspects, polarities)

In [12]:
try:
    game_aspects
    with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_dict.pkl','wb') as f:
        pickle.dump(game_aspects, f)
except NameError:
    print('loaded from storage')
    with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_dict.pkl', 'rb') as f:
        game_aspects = pickle.load(f)

loaded from storage


In [39]:
try:
    game_aspects_long
    with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_long_dict.pkl','wb') as f:
        pickle.dump(game_aspects_long, f)
except NameError:
    print('loaded from storage')
    with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_long_dict.pkl', 'rb') as f:
        game_aspects_long = pickle.load(f)

loading from storage


In [64]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
def extract_list(game_aspects, min_cutoff=20, n_aspects=10, lemmatization=False):
    for boardgame_id in df_boardgames_10_control['id'].to_list():
        boardgame_data = df_boardgames_10_control[df_boardgames_10_control['id']==boardgame_id]
        filtered_words = ['game','games','Game','Games']
        if(not boardgame_data.empty):
            print(f"{boardgame_data['name'].iloc[0]} ({boardgame_data['id'].iloc[0]})")
            filtered_words.append(boardgame_data['name'].iloc[0])
            name_without_punct = boardgame_data['name'].iloc[0].translate(str.maketrans('', '', string.punctuation))
            filtered_words += name_without_punct.lower().split(' ')
        print(filtered_words)
        srs_aspects = pd.Series(game_aspects[f'{boardgame_id}'][0])
        srs_polarities = pd.Series(game_aspects[f'{boardgame_id}'][1])
        if lemmatization:
            srs_aspects = srs_aspects.apply(lambda aspect:lemmatizer.lemmatize(aspect))
            #srs_aspects = srs_aspects.apply(lambda aspect:simplemma.lemmatize(aspect,lang='en'))
        generated_aspects = pd.DataFrame()
        generated_aspects['id'] = [boardgame_id for _ in range(len(srs_aspects))]
        generated_aspects['aspects'] = srs_aspects.apply(lambda x:x.lower())
        generated_aspects['polarities'] = srs_polarities.apply(lambda x:x.lower())
        generated_aspects_filtered = generated_aspects[~generated_aspects['aspects'].isin(filtered_words)]
        #generated_aspects_all = pd.concat([generated_aspects_all, generated_aspects])
        # consider only aspects with at least 20 observations
        valid_aspects = generated_aspects_filtered['aspects'].value_counts()
        valid_aspects = valid_aspects[valid_aspects>=min_cutoff].index
        #print(valid_aspects)
        results = []
        for aspect in valid_aspects:
            aspect_polarities = generated_aspects_filtered[generated_aspects_filtered['aspects']==aspect]['polarities']
            positive_aspect_count = aspect_polarities.where(lambda v:v=='positive').count()
            negative_aspect_count = aspect_polarities.where(lambda v:v=='negative').count()
            neutral_aspect_count = aspect_polarities.where(lambda v:v=='neutral').count()
            score = (positive_aspect_count-negative_aspect_count)/len(aspect_polarities)
            #print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {score}')
            results.append((aspect,positive_aspect_count,negative_aspect_count,neutral_aspect_count,score))
        scores = [res[4] for res in results]
        aspects = [res[0] for res in results]
        df_aspect_score = pd.DataFrame()
        df_aspect_score['aspect'] = aspects
        df_aspect_score['score'] = scores
        n_aspects = n_aspects if len(aspects)>n_aspects*2 else len(aspects)//2
        print(n_aspects)
        print(f"Best: {df_aspect_score.sort_values(by=['score'],ascending=False)['aspect'][:n_aspects].to_list()}")
        print(f"Worst: {df_aspect_score.sort_values(by=['score'],ascending=True)['aspect'][:n_aspects].to_list()}")
        print('\n')

In [65]:
extract_list(game_aspects, n_aspects=10, min_cutoff=25, lemmatization=True)

Brass: Birmingham (224517)
['game', 'games', 'Game', 'Games', 'Brass: Birmingham', 'brass', 'birmingham']
10
Best: ['martin wallace', 'weight', 'iron clays', 'player interaction', 'location', 'production', 'deluxe edition', 'interaction', 'economy', 'masterpiece']
Worst: ['work', 'downtime', 'score', 'taste', 'hype', 'reason', 'move', 'luck', 'rating', 'connection']


Pandemic Legacy: Season 1 (161936)
['game', 'games', 'Game', 'Games', 'Pandemic Legacy: Season 1', 'pandemic', 'legacy', 'season', '1']
10
Best: ['board game experience', 'gaming experiences', 'co-op game', 'journey', 'gaming experience', 'adventure', 'legacy aspects', 'ride', 'legacy experience', 'love']
Worst: ['chore', 'taste', 'mistake', 'ending', 'replay', 'player game', 'interest', 'desire', 'deck', 'quarterbacking']


Gloomhaven (174430)
['game', 'games', 'Game', 'Games', 'Gloomhaven', 'gloomhaven']
10
Best: ['masterpiece', 'downside', 'variety', 'beast', 'replayability', 'leveling', 'depth', 'character classes', '

# PyABSA - Fast LCF
https://www.mdpi.com/2076-3417/9/16/3389

In [82]:
config = (
    ATEPC.ATEPCConfigManager.get_atepc_config_english()
)  # this config contains 'pretrained_bert', it is based on pretrained models
config.model = ATEPC.ATEPCModelList.FAST_LCF_ATEPC  # improved version of LCF-ATEPC

In [11]:
dataset = ATEPC.ATEPCDatasetList.Restaurant14

In [12]:
warnings.filterwarnings("ignore")

config.batch_size = 16
config.patience = 2
config.log_step = -1
config.seed = [1]
config.verbose = False  # If verbose == True, PyABSA will output the model strcture and seversal processed data examples
config.notice = (
    "This is an training example for aspect term extraction"  # for memos usage
)

trainer = ATEPC.ATEPCTrainer(
    config=config,
    dataset=dataset,
    from_checkpoint="english",  # if you want to resume training from our pretrained checkpoints, you can pass the checkpoint name here
    auto_device=DeviceTypeOption.AUTO,  # use cuda if available
    checkpoint_save_mode=ModelSaveOption.SAVE_MODEL_STATE_DICT,  # save state dict only instead of the whole model
    load_aug=False,  # there are some augmentation dataset for integrated datasets, you use them by setting load_aug=True to improve performance
)

[2024-07-04 14:12:49] (2.4.1.post1) Set Model Device: cuda:0
[2024-07-04 14:12:49] (2.4.1.post1) Device Name: NVIDIA GeForce GTX 1070


config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

2024-07-04 14:12:49,956 INFO: PyABSA version: 2.4.1.post1
2024-07-04 14:12:49,958 INFO: Transformers version: 4.42.3
2024-07-04 14:12:49,959 INFO: Torch version: 2.3.1+cu121+cuda12.1
2024-07-04 14:12:49,961 INFO: Device: NVIDIA GeForce GTX 1070
2024-07-04 14:12:49,989 INFO: Searching dataset 114.Restaurant14 in https://github.com/yangheng95/ABSADatasets
[2024-07-04 14:12:49] (2.4.1.post1) Clone ABSADatasets from https://github.com/yangheng95/ABSADatasets.git
2024-07-04 14:12:59,978 INFO: You can set load_aug=True in a trainer to augment your dataset (English only yet) and improve performance.
2024-07-04 14:12:59,979 INFO: Please use a new folder to perform new text augment if the former augment in integrated_datasets/atepc_datasets/110.SemEval/114.restaurant14 errored unexpectedly


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

[2024-07-04 14:13:02] (2.4.1.post1) Can not load en_core_web_sm from spacy, try to download it in order to parse syntax tree: [32m
python -m spacy download en_core_web_sm[0m
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 6.5 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


convert examples to features:  16%|█▋        | 589/3608 [00:00<00:03, 865.03it/s]



convert examples to features:  47%|████▋     | 1680/3608 [00:02<00:02, 837.98it/s]



convert examples to features:  56%|█████▌    | 2022/3608 [00:02<00:01, 822.76it/s]



convert examples to features:  89%|████████▉ | 3226/3608 [00:03<00:00, 838.44it/s]



convert examples to features: 100%|██████████| 3608/3608 [00:04<00:00, 810.49it/s]

2024-07-04 14:13:14,326 INFO: Dataset Label Details: {'Neutral': 637, 'Negative': 807, 'Positive': 2160, 'Sum': 3604}



convert examples to features:  50%|█████     | 563/1120 [00:00<00:00, 902.64it/s]



convert examples to features:  98%|█████████▊| 1092/1120 [00:01<00:00, 761.54it/s]



convert examples to features: 100%|██████████| 1120/1120 [00:01<00:00, 828.81it/s]

2024-07-04 14:13:16,263 INFO: Dataset Label Details: {'Neutral': 196, 'Negative': 196, 'Positive': 726, 'Sum': 1118}





pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

2024-07-04 14:13:55,046 INFO: Save cache dataset to fast_lcf_atepc.Restaurant14.dataset.3180c1a8d9b4b975f1c495da780d0298592e143b1cb505f57bc6ff1965589266.cache
2024-07-04 14:13:55,624 INFO: cuda memory allocated:764963840
2024-07-04 14:13:55,625 INFO: ABSADatasetsVersion:None	-->	Calling Count:0
2024-07-04 14:13:55,626 INFO: IOB_label_to_index:{'B-ASP': 1, 'I-ASP': 2, 'O': 3, '[CLS]': 4, '[SEP]': 5}	-->	Calling Count:1
2024-07-04 14:13:55,628 INFO: MV:<metric_visualizer.metric_visualizer.MetricVisualizer object at 0x7fa93cd207f0>	-->	Calling Count:0
2024-07-04 14:13:55,630 INFO: PyABSAVersion:2.4.1.post1	-->	Calling Count:1
2024-07-04 14:13:55,631 INFO: SRD:3	-->	Calling Count:9444
2024-07-04 14:13:55,632 INFO: TorchVersion:2.3.1+cu121+cuda12.1	-->	Calling Count:1
2024-07-04 14:13:55,635 INFO: TransformersVersion:4.42.3	-->	Calling Count:1
2024-07-04 14:13:55,636 INFO: auto_device:True	-->	Calling Count:3
2024-07-04 14:13:55,637 INFO: batch_size:16	-->	Calling Count:4
2024-07-04 14:13:5

Downloading checkpoint: 579MB [00:56, 10.16MB/s]                         

Find zipped checkpoint: ./checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip, unzipping





Done.
[2024-07-04 14:14:58] (2.4.1.post1) [33mIf the auto-downloading failed, please download it via browser: https://huggingface.co/spaces/yangheng/PyABSA/resolve/main/checkpoints/English/ATEPC/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip [0m
2024-07-04 14:14:58,759 INFO: Checkpoint downloaded at: checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43
2024-07-04 14:14:59,138 INFO: Resume trainer from Checkpoint: checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43!
2024-07-04 14:14:59,139 INFO: ***** Running training for Aspect Term Extraction and Polarity Classification *****
2024-07-04 14:14:59,140 INFO:   Num examples = 3604
2024-07-04 14:14:59,142 INFO:   Batch size = 16
2024-07-04 14:14:59,143 INFO:   Num steps = 2250


Epoch:  0| loss_apc:0.0256 | loss_ate:0.0828 |: 100%|██████████| 226/226 [03:26<00:00,  1.09it/s,  APC_ACC: 88.28(max:88.28) | APC_F1: 82.70(max:82.70) | ATE_F1: 83.74(max:83.79)]
Epoch:  1| loss_apc:0.0050 | loss_ate:0.0135 |: 100%|██████████| 226/226 [03:19<00:00,  1.13it/s,  APC_ACC: 88.01(max:88.28) | APC_F1: 81.83(max:82.70) | ATE_F1: 85.10(max:85.10)]
Epoch:  2| loss_apc:0.0008 | loss_ate:0.0074 |: 100%|██████████| 226/226 [03:35<00:00,  1.05it/s,  APC_ACC: 87.30(max:88.28) | APC_F1: 80.56(max:82.70) | ATE_F1: 84.41(max:85.10)]


2024-07-04 14:25:42,127 INFO: 
-------------------------------------------------------------------- Raw Metric Records --------------------------------------------------------------------
╒════════════════════════════════╤═══════════════════════════════════════════════════════╤══════════╤═══════════╤══════════╤═══════╤═══════╤═══════╤═══════╕
│ Metric                         │ Trial                                                 │ Values   │  Average  │  Median  │  Std  │  IQR  │  Min  │  Max  │
╞════════════════════════════════╪═══════════════════════════════════════════════════════╪══════════╪═══════════╪══════════╪═══════╪═══════╪═══════╪═══════╡
│ Max-APC-Test-Acc w/o Valid Set │ fast_lcf_atepc-Restaurant14-microsoft/deberta-v3-base │ [88.28]  │   88.28   │  88.28   │   0   │   0   │ 88.28 │ 88.28 │
├────────────────────────────────┼───────────────────────────────────────────────────────┼──────────┼───────────┼──────────┼───────┼───────┼───────┼───────┤
│ Max-APC-Test-F1 w/o Valid

In [13]:
aspect_extractor = trainer.load_trained_model()
assert isinstance(aspect_extractor, ATEPC.AspectExtractor)

[2024-07-04 14:28:38] (2.4.1.post1) Load aspect extractor from checkpoints/fast_lcf_atepc_Restaurant14_cdw_apcacc_88.01_apcf1_81.83_atef1_85.1/
[2024-07-04 14:28:38] (2.4.1.post1) config: checkpoints/fast_lcf_atepc_Restaurant14_cdw_apcacc_88.01_apcf1_81.83_atef1_85.1/fast_lcf_atepc.config
[2024-07-04 14:28:38] (2.4.1.post1) state_dict: checkpoints/fast_lcf_atepc_Restaurant14_cdw_apcacc_88.01_apcf1_81.83_atef1_85.1/fast_lcf_atepc.state_dict
[2024-07-04 14:28:38] (2.4.1.post1) model: None
[2024-07-04 14:28:38] (2.4.1.post1) tokenizer: checkpoints/fast_lcf_atepc_Restaurant14_cdw_apcacc_88.01_apcf1_81.83_atef1_85.1/fast_lcf_atepc.tokenizer
[2024-07-04 14:28:38] (2.4.1.post1) Set Model Device: cuda:0
[2024-07-04 14:28:38] (2.4.1.post1) Device Name: NVIDIA GeForce GTX 1070


In [16]:
for comment in df_comments_only_eng_short['value'][:1]:
    aspect_extractor.predict(comment)

[2024-07-04 14:30:27] (2.4.1.post1) The results of aspect term extraction have been saved in /home/fra/uni/nlp/project/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2024-07-04 14:30:27] (2.4.1.post1) Example 0: Great [32m<game:Positive Confidence:0.9985>[0m , full controllo of your [32m<strategy:Positive Confidence:0.999>[0m through constant adjustment of your tactic watching what your opponents do .
[2024-07-04 14:30:28] (2.4.1.post1) The results of aspect term extraction have been saved in /home/fra/uni/nlp/project/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2024-07-04 14:30:28] (2.4.1.post1) Example 0: Very clever game , enjoyable overall . Plus points : Great production values for all components and very good value for money . Lovely artwork everywhere . Smooth game play mechanisms - ironed out some oddities of the original game . Fortunes change frequently , so the winner could be anyone . Minus points : The b

In [115]:
ckpts = (
    available_checkpoints(TaskCodeOption.Aspect_Term_Extraction_and_Classification)
)  
aspect_extractor = ATEPC.AspectExtractor(
    checkpoint="english"
)

[2024-07-10 15:37:31] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-07-10 15:37:31] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-07-10 15:37:31] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-07-10 15:37:31] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-07-10 15:37:31] (2.4.1.post1) [32mDownloading checkpoint:english [0m
[2024-07-10 15:37:31] (2.4.1.post1) [31mNotice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets[0m
[2024-07-10 15:37:31] (2.4.1.post1) Checkpoint already downloaded, skip
[2024-07-10 15:37:31] (2.4.1.post1) Load aspect extractor from checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apc



In [21]:
result = aspect_extractor.predict(
    text=df_comments_only_eng_short['value'].to_list(),
    print_result=False,
    ignore_error=True,  # ignore an invalid example, if it is False, invalid examples will raise Exceptions
    eval_batch_size=64,
)

preparing ate inference dataloader: 100%|██████████| 45144/45144 [01:42<00:00, 440.35it/s]
extracting aspect terms: 100%|██████████| 1411/1411 [34:02<00:00,  1.45s/it]
preparing apc inference dataloader: 100%|██████████| 53730/53730 [02:49<00:00, 317.21it/s]
classifying aspect sentiments: 100%|██████████| 1680/1680 [41:58<00:00,  1.50s/it]


[2024-07-04 16:07:50] (2.4.1.post1) The results of aspect term extraction have been saved in /home/fra/uni/nlp/project/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json


In [116]:
result_everdell = aspect_extractor.predict(
    text=df_everdell_cleaned['value'].to_list(),
    print_result=False,
    ignore_error=True,  # ignore an invalid example, if it is False, invalid examples will raise Exceptions
    eval_batch_size=64,
)

preparing ate inference dataloader: 100%|██████████| 4571/4571 [00:09<00:00, 478.91it/s]
extracting aspect terms: 100%|██████████| 72/72 [04:08<00:00,  3.46s/it]
preparing apc inference dataloader: 100%|██████████| 6653/6653 [00:18<00:00, 367.33it/s]
  lcf_cdm_vec = torch.tensor(
classifying aspect sentiments: 100%|██████████| 104/104 [06:09<00:00,  3.55s/it]


[2024-07-10 15:48:56] (2.4.1.post1) The results of aspect term extraction have been saved in /home/fra/uni/nlp/project/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json


In [124]:
result_uno = aspect_extractor.predict(
    text=df_uno_cleaned['value'].to_list(),
    print_result=False,
    ignore_error=True,  # ignore an invalid example, if it is False, invalid examples will raise Exceptions
    eval_batch_size=64,
)

preparing ate inference dataloader: 100%|██████████| 3492/3492 [00:04<00:00, 856.57it/s]
extracting aspect terms: 100%|██████████| 55/55 [03:25<00:00,  3.73s/it]
preparing apc inference dataloader: 100%|██████████| 3254/3254 [00:06<00:00, 499.92it/s]
classifying aspect sentiments: 100%|██████████| 51/51 [02:55<00:00,  3.44s/it]

[2024-07-10 15:57:19] (2.4.1.post1) The results of aspect term extraction have been saved in /home/fra/uni/nlp/project/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json





In [63]:
with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/pyabsa_aspect_extractor_result.pkl','wb') as f:
    pickle.dump(result, f)

In [94]:
with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/pyabsa_aspect_extractor_result.pkl','rb') as f:
    result = pickle.load(f)

In [105]:
game_aspects_pyabsa = {}
for id in df_boardgames_10['id']:
    game_comments = df_comments_only_eng_short[df_comments_only_eng_short['boardgame_id']==id]
    aspects = []
    polarities = []
    for i in game_comments.index:
        aspects += result[i]['aspect']
        polarities += result[i]['sentiment']
    game_aspects_pyabsa[f'{id}'] = (aspects,polarities)

In [122]:
aspects = []
polarities = []
for i in df_everdell_cleaned.index:
    aspects += result_everdell[i]['aspect']
    polarities += result_everdell[i]['sentiment']
game_aspects_pyabsa[f'199792'] = (aspects,polarities)

In [125]:
aspects = []
polarities = []
for i in df_uno_cleaned.index:
    aspects += result_uno[i]['aspect']
    polarities += result_uno[i]['sentiment']
game_aspects_pyabsa[f'2223'] = (aspects,polarities)

In [31]:
try:
    game_aspects_pyabsa
    with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_pyabsa_dict.pkl','wb') as f:
        pickle.dump(game_aspects_pyabsa, f)
except NameError:
    print('loaded from storage')
    with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_pyabsa_dict.pkl', 'rb') as f:
        game_aspects_pyabsa = pickle.load(f)

loaded from storage


In [68]:
extract_list(game_aspects_pyabsa, n_aspects=5, min_cutoff=25, lemmatization=True)

Brass: Birmingham (224517)
['game', 'games', 'Game', 'Games', 'Brass: Birmingham', 'brass', 'birmingham']
5
Best: ['mechanic', 'interaction', 'design', 'player interaction', 'component']
Worst: ['teach', 'time', 'rule', 'board', 'learn']


Pandemic Legacy: Season 1 (161936)
['game', 'games', 'Game', 'Games', 'Pandemic Legacy: Season 1', 'pandemic', 'legacy', 'season', '1']
5
Best: ['gaming', 'experience', 'playing', 'money', 'narrative']
Worst: ['ending', 'replayability', 'card', 'rule', 'difficulty']


Gloomhaven (174430)
['game', 'games', 'Game', 'Games', 'Gloomhaven', 'gloomhaven']
5
Best: ['value', 'world', 'content', 'system', 'mechanic']
Worst: ['set up', 'set', 'setup time', 'setup', 'tear down']


Ark Nova (342942)
['game', 'games', 'Game', 'Games', 'Ark Nova', 'ark', 'nova']
5
Best: ['theme', 'mechanic', 'gameplay', 'design', 'play']
Worst: ['randomness', 'luck', 'hour', 'length', 'playtime']


Twilight Imperium: Fourth Edition (233078)
['game', 'games', 'Game', 'Games', 'Twil

In [187]:
df_test = df_comments_only_eng_short[df_comments_only_eng_short['boardgame_id']==224517]['value']
for v in df_test[df_test.apply(lambda x: 'tea' in x.split(' '))].sample(20,replace=True):
    print(v)

Meh.  Not bad but I don’t get all the excitement about this game.  I didn’t really care for the “reset” midway through, either. A decent game but not my cup of tea I guess.
It's a good game, just not my cup of tea for most game nights. Once in awhile is fine.
It's a good game, just not my cup of tea for most game nights. Once in awhile is fine.
Meh.  Not bad but I don’t get all the excitement about this game.  I didn’t really care for the “reset” midway through, either. A decent game but not my cup of tea I guess.
It's a good game, just not my cup of tea for most game nights. Once in awhile is fine.
It's a good game, just not my cup of tea for most game nights. Once in awhile is fine.
Meh.  Not bad but I don’t get all the excitement about this game.  I didn’t really care for the “reset” midway through, either. A decent game but not my cup of tea I guess.
It's a good game, just not my cup of tea for most game nights. Once in awhile is fine.
Because the game is so popular lately i wanted

In [176]:
srs_aspects.apply(lambda x:x.lower())

0        clay coins
1              game
2          strategy
3       interaction
4               art
           ...     
4568           play
4569           game
4570      resources
4571           play
4572       learning
Length: 4573, dtype: object

# Lemmatizer benchmark

In [54]:
import time
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Example list of aspect terms
aspects = ["cards", "card", "gameplay", "playing"] * 1000  # Larger dataset for timing

# Measure the time taken by spaCy lemmatizer
start_time = time.time()
lemmatized_aspects_spacy = [nlp(aspect)[0].lemma_ for aspect in aspects]
spacy_time = time.time() - start_time

print(f"spaCy lemmatization time: {spacy_time:.4f} seconds")

spaCy lemmatization time: 15.1394 seconds


In [58]:
import time
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize(aspect)

# Example list of aspect terms
aspects = ["cards", "card", "gameplay", "playing"] * 1000  # Larger dataset for timing

# Measure the time taken by NLTK lemmatizer
start_time = time.time()
lemmatized_aspects_nltk = [lemmatizer.lemmatize(aspect) for aspect in aspects]
nltk_time = time.time() - start_time

print(f"NLTK lemmatization time: {nltk_time:.4f} seconds")

NLTK lemmatization time: 4.3462 seconds
