# What do you like in boardgames
The goal of this project is to explore the data from boardgamegeek.com and try to discover the most-liked and disliked aspects of each of the top 10 games (as of 25/06/2024)

In [140]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import math
import time
import ast
import string
import statistics
import pickle
import os.path
import matplotlib.pyplot as plt

import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, EarlyStoppingCallback
import torch.nn.functional as F
import fasttext
from setfit import AbsaModel, AbsaTrainer, TrainingArguments
from pyabsa import available_checkpoints, TaskCodeOption, AspectTermExtraction as ATEPC, ModelSaveOption, DeviceTypeOption
from datasets import load_dataset
import warnings

transformers.logging.set_verbosity_error()

In [2]:
# loading data from boarggamegeek about boardgames downloaded from boardgamegeek.com (26/06/2024)
df_boardgames = pd.read_csv('./content/drive/MyDrive/Boardgames/boardgames_ranks.csv')

In [3]:
# keep only the top 10 games, the df is already sorted by highest rated
df_boardgames_10 = df_boardgames[:10]

In [118]:
def donwload_game_comments(game_id):
    comments=[]
    base_url = f'https://api.geekdo.com/xmlapi2/thing?type=boardgame&id={game_id}&comments=1'
    print(f'Downloading comments for game with id {game_id}')
    r = requests.get(base_url)
    # parse downloaded xml
    root = ET.fromstring(r.content)
    # extract the total number of comments for that game
    number_of_comments = int(root[0].find('comments').attrib['totalitems'])
    # calculate the number of pages to request to download all comments
    number_of_pages = math.ceil(number_of_comments / 100)
    time.sleep(1)
    # download comments from all pages
    for i in range(number_of_pages):
        # create url for the next page
        url = f'{base_url}&page={i+1}'
        while(True):
            r = requests.get(url)
            # parse downloaded xml
            root = ET.fromstring(r.content)
            # iterate over comments to store them
            for comment in root.iter('comment'):
                comment.attrib['boardgame_id'] = game_id
                comments.append(comment.attrib)
            print(f"{len(comments)}/{number_of_comments}")
            time.sleep(1.5)
            if len(list(root.iter('comment')))>0:
                break
            else:
                print('repeating page download since no comment was received')
    # the API does not give comments sometimes, so we check if the number of comments obtained so far matches the expectation
    if len(comments)!=number_of_comments:
        print(f'Failed to download ALL comments for game with id: {game_id}')
    return pd.DataFrame(comments)

In [142]:
# if reviews are not already downloaded, download them and store them
if not os.path.isfile('./content/drive/MyDrive/Boardgames/boardgames_comments.csv'):
    df_comments = pd.DataFrame()
    for i,boardgame in df_boardgames_10.iterrows():
        df_game_comments = donwload_game_comments(boardgame['id'])
        df_comments = pd.concat([df_comments, df_game_comments])
    df_comments.to_csv('/content/drive/MyDrive/Boardgames/boardgames_comments.csv')
else:
    df_comments = pd.read_csv('./content/drive/MyDrive/Boardgames/boardgames_comments.csv',index_col=0)

In [143]:
df_comments

Unnamed: 0,username,rating,value,boardgame_id
0,1 Family Meeple,,SLEEVED[IMG]https://cf.geekdo-static.com/mbs/m...,224517
1,13inha,,G,224517
2,1bez,10.0,"Great game, full controllo of your strategy th...",224517
3,2bit,7.5,"Very clever game, enjoyable overall. Plus poi...",224517
4,2d20,9.0,Brilliant! Fits right into my wheelhouse all ...,224517
...,...,...,...,...
67905,Zvonmirus,7.5,Only played the beginning scenarios with my el...,291457
67906,Zygomax,,BGS Prize Nov. 2021,291457
67907,_Kenneth,9.5,Cooperative Legacy (2P-3P) ✓ Completion,291457
67908,_LSK_,5.0,Too hard under the rules in the game and a bit...,291457


In [5]:
def clean_comments(df_comments,length=None):
    df_comments_cleaned = df_comments.copy()
    # make sure comments are strings
    df_comments_cleaned['value'] = df_comments_cleaned['value'].map(lambda value: str(value))
    # initialized model to detect language
    model_lang_detection = fasttext.load_model('./content/drive/MyDrive/Boardgames/lid.176.ftz')
    # add column to dataframe with language
    df_comments_cleaned['lang'] = df_comments_cleaned['value'].map(lambda comment: model_lang_detection.predict(comment, k=1)[0][0].replace('__label__',''))
    # filter dataframe to keep english language only
    df_comments_cleaned = df_comments_cleaned[df_comments_cleaned['lang']=='en']
    # remove short comments, remove bottom 25% elements
    if length==None:
        length = df_comments_cleaned['value'].map(lambda comment: len(comment)).quantile(0.25)
    df_comments_cleaned = df_comments_cleaned[df_comments_cleaned['value'].map(lambda t:len(t)>length)]
    return df_comments_cleaned

In [152]:
df_comments_only_eng_short = clean_comments(df_comments)
df_comments_only_eng_long = clean_comments(df_comments, length=100)

In [146]:
# print some elements to use as example in the report
for elem in df_comments_only_eng[df_comments_only_eng['value'].apply(lambda v: 'luck' in v and len(v)<100)]['value'][:20]:
    print(elem)

Low luck, high skill game which good moves are not obvious and many strategies are viable.
Great mechanics. Too much luck involved (drawing cards)
Unluckily I sold it before playing it because I did not have a group to play with.
Feel like going through the motions alot of chance and luck in this game.
Great fun but can be very luck dependent.
players -  2 - 4 rules -    MEDIUM luck -     MEDIUM strategy - HIGH  CO-OP
A true masterpiece ! I only wish the mechanics were a tiny bit less luck-based.
Beautiful but too much based on luck
Nothing else comes close if you like the Card Management system and lack of luck.
Solo only - a bit too much luck of the drawer for my taste.
Played on TTS. Preordered. Card luck like Wingspan. Some boards and sponsor powers are broken.
My initial gripes about the luck factor mostly disappeared the more I played.
Overhyped. Too much luck involved with drawing of cards.
too much luck in the cards for the lenght and the complexity of this game
Variability of 

In [149]:
# download reviews about everdell and uno to use as ""control variables""
if not os.path.isfile('./content/drive/MyDrive/Boardgames/boardgames_comments_everdell.csv'):
    df_everdell = donwload_game_comments(199792)
else:
    df_everdell = pd.read_csv('./content/drive/MyDrive/Boardgames/boardgames_comments_everdell.csv',index_col=0)

if not os.path.isfile('./content/drive/MyDrive/Boardgames/boardgames_comments_uno.csv'):
    df_uno = donwload_game_comments(2223)
else:
    df_uno = pd.read_csv('./content/drive/MyDrive/Boardgames/boardgames_comments_uno.csv',index_col=0)

In [124]:
# clean datasets of the newly downloaded games
df_everdell_cleaned = clean_comments(df_everdell)
df_uno_cleaned = clean_comments(df_uno)

Unnamed: 0,username,rating,value,boardgame_id,lang
0,1 Family Meeple,6,[IMG]https://cf.geekdo-static.com/mbs/mb_17587...,199792,en
1,1000rpm,8,"Very good, but brain burning in the last few m...",199792,en
2,100pcBlade,8,Everdell doesn’t really offer anything new but...,199792,en
3,1arska,8,"Great and beautiful game, like Wingspan! But.....",199792,en
5,1point21gigawatts,8,Kickstarted - Available September 2019,199792,en
...,...,...,...,...,...
7028,zyater,8,Deluxe version with wooden tree,199792,en
7029,Zygmoont,10,"My best game so far, best artwork and graphics...",199792,en
7031,Zyyer,9,Loved the little pieces and the amazing art. I...,199792,en
7034,z_zadar,8,"A beautiful game, just a little bit long and o...",199792,en


# Hugging face - Deberta v3 base absa


In [114]:
# Load Aspect-Based Sentiment Analysis model
absa_tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1", use_fast=False)
absa_model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
absa_pipeline = pipeline('text-classification', model=absa_model, tokenizer=absa_tokenizer, max_length=512, truncation=True,device=0)

In [115]:
def map_row_to_value(row, aspect):
    if row[f'aspect_{aspect}_label']=='Positive':
        return 1
    elif row[f'aspect_{aspect}_label']=='Negative':
        return -1
    elif row[f'aspect_{aspect}_label']=='Neutral':
        return 0
def map_row_to_value2(row, aspect):
    if row[f'aspect_{aspect}_label']=='Positive':
        return row[f'aspect_{aspect}_score']
    elif row[f'aspect_{aspect}_label']=='Negative':
        return -row[f'aspect_{aspect}_score']
    elif row[f'aspect_{aspect}_label']=='Neutral':
        return 0

In [128]:
aspects = ['luck','bookkeeping','downtime','interaction','bash the leader','complicated','complex']
def analyze_aspects(df_comments, aspects=aspects):
    df_comments_with_aspects = df_comments.copy()
    for aspect in aspects:
        print(f'Analyzing {aspect}')
        results = absa_pipeline(df_comments['value'].to_list(),  text_pair=aspect)
        df_comments_with_aspects[f'aspect_{aspect}'] = results
        df_comments_with_aspects[f'aspect_{aspect}_label'] = [res['label'] for res in results]
        df_comments_with_aspects[f'aspect_{aspect}_score'] = [res['score'] for res in results]
        df_comments_with_aspects[f'aspect_{aspect}_mapped'] = df_comments_with_aspects.apply(lambda row:map_row_to_value(row,aspect),axis=1)
        df_comments_with_aspects[f'aspect_{aspect}_mapped2'] = df_comments_with_aspects.apply(lambda row:map_row_to_value2(row,aspect),axis=1)
    return df_comments_with_aspects

In [None]:
df_comments_only_eng_short_with_aspects = analyze_aspects(df_comments_only_eng_short)

In [129]:
df_everdell_with_aspects = analyze_aspects(df_everdell_cleaned)

Analyzing luck
Analyzing bookkeeping
Analyzing downtime
Analyzing interaction
Analyzing bash the leader
Analyzing complicated
Analyzing complex


In [130]:
df_uno_with_aspects = analyze_aspects(df_uno_cleaned)

Analyzing luck
Analyzing bookkeeping
Analyzing downtime
Analyzing interaction
Analyzing bash the leader
Analyzing complicated
Analyzing complex


In [139]:
df_comments_only_eng_short_with_aspects.to_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_default_aspects.csv')
df_everdell_with_aspects.to_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_everdell_default_aspects.csv')
df_uno_with_aspects.to_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_uno_default_aspects.csv')

In [None]:
df_comments_only_eng_short_with_aspects = pd.read_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_default_aspects.csv',index_col=0)
df_everdell_with_aspects = pd.read_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_everdell_default_aspects.csv',index_col=0)
df_uno_with_aspects = pd.read_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_uno_default_aspects.csv',index_col=0)

In [87]:
# code to fix dictionaries that are stored as string by pd.from_csv 
def fix_dictionaries(row,aspect):
    string_dict = row[f'aspect_{aspect}']
    dict = ast.literal_eval(string_dict)
    return dict
for aspect in aspects:
    srs_dict = df_comments_only_eng_short_with_aspects.apply(lambda row:fix_dictionaries(row,aspect),axis=1)
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}'] = srs_dict
for aspect in aspects:
    results = df_comments_only_eng_short_with_aspects[f'aspect_{aspect}'].to_list()
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}_label'] = [res['label'] for res in results]
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}_score'] = [res['score'] for res in results]
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}_mapped'] = df_comments_only_eng_short_with_aspects.apply(lambda row:map_row_to_value(row,aspect),axis=1)
    df_comments_only_eng_short_with_aspects[f'aspect_{aspect}_mapped2'] = df_comments_only_eng_short_with_aspects.apply(lambda row:map_row_to_value2(row,aspect),axis=1)

In [173]:
for boardgame_id in df_boardgames_10['id']:
    boardgame_comments = df_comments_only_eng_short_with_aspects[df_comments_only_eng_short_with_aspects['boardgame_id']==boardgame_id]
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    print(f"{boardgame_data['name'].iloc[0]}")
    scores = []
    for aspect in aspects:
        boardgame_comments_polars = boardgame_comments
        score1 = boardgame_comments[f'aspect_{aspect}_mapped'].sum()/len(boardgame_comments_polars)
        score2 = boardgame_comments[f'aspect_{aspect}_mapped2'].sum()/len(boardgame_comments_polars)
        scores.append(f'{score1:.2}')
        #print(f"{aspect}:  : {score2:.2}")
    print(' & '.join(scores))

Brass: Birmingham
0.37 & 0.26 & 0.31 & 0.41 & 0.24 & -0.4 & 0.095
Pandemic Legacy: Season 1
0.34 & 0.24 & 0.33 & 0.39 & 0.21 & -0.33 & 0.1
Gloomhaven
0.29 & 0.17 & 0.22 & 0.3 & 0.17 & -0.42 & -0.0082
Ark Nova
0.24 & 0.2 & 0.18 & 0.25 & 0.16 & -0.4 & 0.034
Twilight Imperium: Fourth Edition
0.32 & 0.21 & 0.2 & 0.33 & 0.17 & -0.44 & 0.033
Dune: Imperium
0.38 & 0.31 & 0.37 & 0.42 & 0.3 & -0.26 & 0.22
Terraforming Mars
0.28 & 0.2 & 0.22 & 0.29 & 0.17 & -0.4 & 0.063
War of the Ring: Second Edition
0.38 & 0.28 & 0.32 & 0.42 & 0.24 & -0.42 & 0.073
Star Wars: Rebellion
0.38 & 0.29 & 0.32 & 0.42 & 0.25 & -0.33 & 0.13
Gloomhaven: Jaws of the Lion
0.35 & 0.25 & 0.33 & 0.4 & 0.23 & -0.38 & 0.038


In [174]:
print("UNO")
scores = []
for aspect in aspects:
    boardgame_comments_polars = df_uno_comments_with_aspects
    score1 = df_uno_comments_with_aspects[f'aspect_{aspect}_mapped'].sum()/len(boardgame_comments_polars)
    score2 = df_uno_comments_with_aspects[f'aspect_{aspect}_mapped2'].sum()/len(boardgame_comments_polars)
    print(f"{aspect}: {score1} - {score2}")
    scores.append(f'{score1:.2}')
print(' & '.join(scores))

UNO
luck: 0.12915234822451319 - 0.08718912821489239
bookkeeping: 0.036368843069874 - 0.028064031567838457
downtime: 0.12371134020618557 - 0.09373746944627674
interaction: 0.14461626575028638 - 0.11044979105700332
bash the leader: -0.06786941580756013 - -0.053911250592438766
complicated: -0.6660939289805269 - -0.5434149868982627
complex: -0.16237113402061856 - -0.1334041057566324
0.13 & 0.036 & 0.12 & 0.14 & -0.068 & -0.67 & -0.16


In [175]:
print("Everdell")
scores = []
for aspect in aspects:
    boardgame_comments_polars = df_everdell_with_aspects
    score1 = df_everdell_with_aspects[f'aspect_{aspect}_mapped'].sum()/len(boardgame_comments_polars)
    score2 = df_everdell_with_aspects[f'aspect_{aspect}_mapped2'].sum()/len(boardgame_comments_polars)
    print(f"{aspect}: {score1} - {score2}")
    scores.append(f'{score1:.2}')
print(' & '.join(scores))

Everdell
luck: 0.39028658936775323 - 0.3163189872809193
bookkeeping: 0.3218114198206082 - 0.2649124281473166
downtime: 0.37716035878363596 - 0.3226796336858337
interaction: 0.42375847735725225 - 0.36864200744266923
bash the leader: 0.30781010719754975 - 0.25930177459985937
complicated: -0.23911616714066944 - -0.19680007381372447
complex: 0.20017501640778823 - 0.16144633592413854
0.39 & 0.32 & 0.38 & 0.42 & 0.31 & -0.24 & 0.2


# Hugging face - Twitter-XLM-RoBERTa-Base-Sentiment

In [53]:
# Load a traditional Sentiment Analysis model
sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = pipeline("sentiment-analysis", model=sentiment_model_path,
                          tokenizer=sentiment_model_path, max_length=512, truncation=True,device=0)

In [93]:
sentence = df_comments_only_eng_short['value'].iloc[4]
print(f"Sentence: {sentence}")
print()

# ABSA of "food"
aspect = "player interaction"
inputs = absa_tokenizer(f"[CLS] {sentence} [SEP] {aspect} [SEP]", return_tensors="pt")
outputs = absa_model(**inputs)
probs = F.softmax(outputs.logits, dim=1)
probs = probs.detach().numpy()[0]
print(f"Sentiment of aspect '{aspect}' is:")
for prob, label in zip(probs, ["negative", "neutral", "positive"]):
  print(f"Label {label}: {prob}")
print()

# Overall sentiment of the sentence
sentiment = sentiment_model([sentence])[0]
print(f"Overall sentiment: {sentiment['label']} with score {sentiment['score']}")

Sentence: Absolutely brilliant!  I never played the original Brass, and now I don't need to.  The art, mechanics, and pacing of the game is excellent.  I absolutely love the the "tale of two halves" type of mechanic that rewards preparation.  I have limited experience with Euro's, but this game has become my personal Euro litmus test. 

Sentiment of aspect 'player interaction' is:
Label negative: 0.0034680410753935575
Label neutral: 0.01620234176516533
Label positive: 0.9803296327590942

Overall sentiment: positive with score 0.8710653185844421


In [79]:
df_game_statistics = pd.DataFrame()
df_game_statistics['id'] = df_boardgames_10['id']
df_game_statistics['name'] = df_boardgames_10['name']
df_game_statistics

Unnamed: 0,id,name
0,224517,Brass: Birmingham
1,161936,Pandemic Legacy: Season 1
2,174430,Gloomhaven
3,342942,Ark Nova
4,233078,Twilight Imperium: Fourth Edition
5,316554,Dune: Imperium
6,167791,Terraforming Mars
7,115746,War of the Ring: Second Edition
8,187645,Star Wars: Rebellion
9,291457,Gloomhaven: Jaws of the Lion


In [82]:
positive_count_list = []
neutral_count_list = []
negative_count_list = []
positive_values_list = []
neutral_values_list = []
negative_values_list = []
for boardgame_id in df_boardgames_10['id']:
    boardgame_comments = df_comments_only_eng_short[df_comments_only_eng_short['boardgame_id']==boardgame_id]
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    print(f"{boardgame_data['name'].iloc[0]}")
    positive_count = 0
    neutral_count = 0
    negative_count = 0 
    positive_values = 0
    neutral_values = 0
    negative_values = 0
    for comment in boardgame_comments['value']:
        sentiment = sentiment_model(comment)[0]
        if sentiment['label'] == 'neutral':
            neutral_count+=1
            neutral_values+=sentiment['score']
        elif sentiment['label'] == 'positive':
            positive_count+=1
            positive_values+=sentiment['score']
        elif sentiment['label'] == 'negative':
            negative_count+=1
            negative_values+=sentiment['score']
        else:
            raise ValueError('Unexpected label')
    print(f'positive count:{positive_count}')
    print(f'negative count:{negative_count}')
    print(f'neutral count:{neutral_count}')
    print(f'positive values:{positive_values}')
    print(f'negative values:{negative_values}')
    print(f'neutral values:{neutral_values}')
    positive_count_list.append(positive_count)
    neutral_count_list.append(neutral_count)
    negative_count_list.append(negative_count)
    positive_values_list.append(positive_values)
    neutral_values_list.append(neutral_values)
    negative_values_list.append(negative_values)
    print('\n')
df_game_statistics['positive_count'] = positive_count_list
df_game_statistics['negative_count'] = negative_count_list
df_game_statistics['neutral_count'] = neutral_count_list
df_game_statistics['positive_values'] = positive_values_list
df_game_statistics['negative_values'] = negative_values_list
df_game_statistics['neutral_values'] = neutral_values_list

Brass: Birmingham
positive count:2162
negative count:890
neutral count:784
positive values:1686.5857102274895
negative values:562.9474938809872
neutral values:567.607346534729


Pandemic Legacy: Season 1
positive count:3424
negative count:1335
neutral count:807
positive values:2753.8470543026924
negative values:891.0502565205097
neutral values:543.871523141861


Gloomhaven
positive count:4052
negative count:2257
neutral count:1398
positive values:3168.175465911627
negative values:1459.651128321886
neutral values:949.1069539189339


Ark Nova
positive count:1899
negative count:1028
neutral count:775
positive values:1435.8439967036247
negative values:649.9842830896378
neutral values:516.3303357064724


Twilight Imperium: Fourth Edition
positive count:1380
negative count:685
neutral count:425
positive values:1070.431633412838
negative values:424.35453057289124
neutral values:281.38977962732315


Dune: Imperium
positive count:2172
negative count:905
neutral count:855
positive values:1696.64

In [91]:
positive_count = 0
neutral_count = 0
negative_count = 0 
positive_values = 0
neutral_values = 0
negative_values = 0
for comment in df_uno_cleaned['value']:
    sentiment = sentiment_model(comment)[0]
    if sentiment['label'] == 'neutral':
        neutral_count+=1
        neutral_values+=sentiment['score']
    elif sentiment['label'] == 'positive':
        positive_count+=1
        positive_values+=sentiment['score']
    elif sentiment['label'] == 'negative':
        negative_count+=1
        negative_values+=sentiment['score']
uno_data = {'id':2223,'name':'UNO','positive_count':positive_count,'negative_count':negative_count,'neutral_count':neutral_count,
            'positive_values':positive_values,'negative_values':negative_values,'neutral_values':neutral_values}
uno_data

{'id': 2223,
 'name': 'UNO',
 'positive_count': 1707,
 'negative_count': 812,
 'neutral_count': 970,
 'positive_values': 1267.8382885158062,
 'negative_values': 592.9178346097469,
 'neutral_values': 633.9069083333015}

In [99]:
positive_count = 0
neutral_count = 0
negative_count = 0 
positive_values = 0
neutral_values = 0
negative_values = 0
for comment in df_everdell_cleaned['value']:
    sentiment = sentiment_model(comment)[0]
    if sentiment['label'] == 'neutral':
        neutral_count+=1
        neutral_values+=sentiment['score']
    elif sentiment['label'] == 'positive':
        positive_count+=1
        positive_values+=sentiment['score']
    elif sentiment['label'] == 'negative':
        negative_count+=1
        negative_values+=sentiment['score']
everdell_data = {'id':199792,'name':'Everdell','positive_count':positive_count,'negative_count':negative_count,'neutral_count':neutral_count,
            'positive_values':positive_values,'negative_values':negative_values,'neutral_values':neutral_values}
everdell_data

{'id': 199792,
 'name': 'Everdell',
 'positive_count': 2631,
 'negative_count': 874,
 'neutral_count': 1058,
 'positive_values': 2086.8162736296654,
 'negative_values': 558.2833794057369,
 'neutral_values': 764.798805475235}

In [100]:
test_df = df_game_statistics.copy()
dct_uno = {k:[v] for k,v in uno_data.items()}
uno_df = pd.DataFrame.from_dict(dct_uno)
dct_everdell = {k:[v] for k,v in everdell_data.items()}
everdell_df = pd.DataFrame.from_dict(dct_everdell)
test_df = pd.concat([test_df,uno_df,everdell_df])
test_df

Unnamed: 0,id,name,positive_count,negative_count,neutral_count,positive_values,negative_values,neutral_values
0,224517,Brass: Birmingham,2162,890,784,1686.58571,562.947494,567.607347
1,161936,Pandemic Legacy: Season 1,3424,1335,807,2753.847054,891.050257,543.871523
2,174430,Gloomhaven,4052,2257,1398,3168.175466,1459.651128,949.106954
3,342942,Ark Nova,1899,1028,775,1435.843997,649.984283,516.330336
4,233078,Twilight Imperium: Fourth Edition,1380,685,425,1070.431633,424.354531,281.38978
5,316554,Dune: Imperium,2172,905,855,1696.646231,559.537782,592.406301
6,167791,Terraforming Mars,4787,2510,1941,3721.409585,1683.336336,1337.058583
7,115746,War of the Ring: Second Edition,1379,469,557,1085.069836,284.955896,382.520318
8,187645,Star Wars: Rebellion,1821,765,608,1427.392635,476.621553,411.690258
9,291457,Gloomhaven: Jaws of the Lion,1754,714,606,1365.200654,453.625805,427.952996


In [105]:
test_df.to_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_comments_sentiment_analysis.csv')

In [103]:
negativity_strength = test_df['negative_values'] / test_df['negative_count']
negativity_strength

0    0.632525
1    0.667453
2    0.646722
3    0.632280
4    0.619496
5    0.618274
6    0.670652
7    0.607582
8    0.623035
9    0.635330
0    0.730194
0    0.638768
dtype: float64

In [104]:
positivity_strength = test_df['positive_values'] / test_df['positive_count']
positivity_strength

0    0.780104
1    0.804278
2    0.781879
3    0.756105
4    0.775675
5    0.781145
6    0.777399
7    0.786853
8    0.783851
9    0.778336
0    0.742729
0    0.793165
dtype: float64

In [109]:
polarization_values = test_df['positive_values'] / test_df['negative_values']
polarization_values

0    2.995991
1    3.090563
2    2.170502
3    2.209044
4    2.522494
5    3.032228
6    2.210734
7    3.807852
8    2.994813
9    3.009530
0    2.138304
0    3.737916
dtype: float64

In [110]:
polarization_count = test_df['positive_count'] / test_df['negative_count']
polarization_count

0    2.429213
1    2.564794
2    1.795304
3    1.847276
4    2.014599
5    2.400000
6    1.907171
7    2.940299
8    2.380392
9    2.456583
0    2.102217
0    3.010297
dtype: float64

# Setfit ABSA

## Pretrained

In [None]:
!pip install setfit

Collecting setfit
  Downloading setfit-1.0.3-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.3.0 (from setfit)
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers>=2.2.1 (from setfit)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate>=0.3.0 (from setfit)
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets>=2.3.0->setfit)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_

In [111]:
!spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [100]:


# Download from the 🤗 Hub
model = AbsaModel.from_pretrained(
    "tomaarsen/setfit-absa-bge-small-en-v1.5-restaurants-aspect",
    "tomaarsen/setfit-absa-bge-small-en-v1.5-restaurants-polarity",
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [149]:
game_aspects = {}
for boardgame_id in df_boardgames_10['id']:
    boardgame_comments = df_comments_only_eng_short[df_comments_only_eng_short['boardgame_id']==boardgame_id]
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    print(f"{boardgame_data['name'].iloc[0]}")
    aspects=[]
    polarities=[]
    for comment in boardgame_comments['value']:
        aspect_prediction = model(comment)
        for elem in aspect_prediction:
            if isinstance(elem, dict):
                aspects.append(elem['span'])
                polarities.append(elem['polarity'])
    game_aspects[f'{boardgame_id}'] = (aspects, polarities)

Brass: Birmingham
Pandemic Legacy: Season 1
Gloomhaven
Ark Nova
Twilight Imperium: Fourth Edition
Dune: Imperium
Terraforming Mars
War of the Ring: Second Edition
Star Wars: Rebellion
Gloomhaven: Jaws of the Lion


In [101]:
game_aspects_long = {}
for boardgame_id in df_boardgames_10['id']:
    boardgame_comments = df_comments_only_eng_long[df_comments_only_eng_long['boardgame_id']==boardgame_id]
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    print(f"{boardgame_data['name'].iloc[0]}")
    aspects=[]
    polarities=[]
    for comment in boardgame_comments['value']:
        aspect_prediction = model(comment)
        for elem in aspect_prediction:
            if isinstance(elem, dict):
                aspects.append(elem['span'])
                polarities.append(elem['polarity'])
    game_aspects_long[f'{boardgame_id}'] = (aspects, polarities)

Brass: Birmingham
Pandemic Legacy: Season 1
Gloomhaven
Ark Nova
Twilight Imperium: Fourth Edition
Dune: Imperium
Terraforming Mars
War of the Ring: Second Edition
Star Wars: Rebellion
Gloomhaven: Jaws of the Lion


In [107]:
with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_long_dict.pkl','wb') as f:
    pickle.dump(game_aspects_long, f)

In [8]:
with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_long_dict.pkl', 'rb') as f:
    game_aspects_long2 = pickle.load(f)

In [182]:
# uno
aspects=[]
polarities=[]
boardgame_id='2223'
for comment in df_uno_cleaned['value']:
    aspect_prediction = model(comment)
    for elem in aspect_prediction:
        if isinstance(elem, dict):
            aspects.append(elem['span'])
            polarities.append(elem['polarity'])
(aspects, polarities)
game_aspects[f'{boardgame_id}'] = (aspects, polarities)

(['Player eliminiation',
  'game',
  'game',
  'rules',
  'UNO',
  'cards',
  'snoozo Uno',
  'parlor game',
  'game',
  'grass',
  'collection',
  'house',
  'vibes',
  'fun',
  'party card game',
  'people',
  'draw',
  'card',
  'rule',
  'contention',
  'people',
  'game',
  'house',
  'people',
  'people',
  'discussion',
  'house',
  'game',
  'cutthroat game',
  'Uno',
  'family game',
  'games',
  'fact',
  'lot',
  'times',
  '=)',
  'game',
  'tabletop gaming',
  'card game',
  'deck',
  'house',
  'game',
  'turd',
  'Uno',
  'turd',
  'card games',
  'players',
  'Asda Cost',
  'Date',
  'Uno',
  'Crazy Eights',
  'deck',
  'Standard playing cards',
  'Players Best',
  'Boring',
  'game',
  'game',
  'kids UNO',
  'numbers',
  'tactics',
  'player',
  'players hand',
  'grownups',
  'UNO',
  'time activity',
  'Choices',
  'cards',
  'problem',
  'game',
  'minutes',
  'round',
  'game',
  'ratings',
  'Arcane Tinmen',
  'Medium',
  'cards',
  'B&L Own',
  'game',
  'party 

In [184]:
# everdell
aspects=[]
polarities=[]
boardgame_id='199792'
for comment in df_everdell_cleaned['value']:
    aspect_prediction = model(comment)
    for elem in aspect_prediction:
        if isinstance(elem, dict):
            aspects.append(elem['span'])
            polarities.append(elem['polarity'])
game_aspects[f'{boardgame_id}'] = (aspects, polarities)

In [199]:
generated_aspects_all = pd.DataFrame()

In [9]:
# with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_dict.pkl','wb') as f:
#     pickle.dump(game_aspects, f)
with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_dict.pkl', 'rb') as f:
    game_aspects = pickle.load(f)

In [53]:

"Brass: Birmingham".translate(str.maketrans('', '', string.punctuation))

'Brass Birmingham'

In [55]:
for boardgame_id in df_boardgames_10['id'].to_list() + ['2223','199792']:
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    filtered_words = ['game','games']
    if(not boardgame_data.empty):
        print(f"{boardgame_data['name'].iloc[0]}")
        filtered_words.append(boardgame_data['name'].iloc[0])
        name_without_punct = boardgame_data['name'].iloc[0].translate(str.maketrans('', '', string.punctuation))
        filtered_words += name_without_punct.split(' ')
        filtered_words += name_without_punct.lower().split(' ')
    print(filtered_words)
    srs_aspects = pd.Series(game_aspects[f'{boardgame_id}'][0])
    srs_polarities = pd.Series(game_aspects[f'{boardgame_id}'][1])
    generated_aspects = pd.DataFrame()
    generated_aspects['id'] = [boardgame_id for _ in range(len(srs_aspects))]
    generated_aspects['aspects'] = srs_aspects
    generated_aspects['polarities'] = srs_polarities
    generated_aspects_filtered = generated_aspects[~generated_aspects['aspects'].isin(filtered_words)]
    #generated_aspects_all = pd.concat([generated_aspects_all, generated_aspects])
    #print(generated_aspects)
    for aspect in generated_aspects_filtered['aspects'].value_counts().index[:20]:
        aspect_polarities = generated_aspects_filtered[generated_aspects_filtered['aspects']==aspect]['polarities']
        positive_aspect_count = aspect_polarities.where(lambda v:v=='positive').count()
        negative_aspect_count = aspect_polarities.where(lambda v:v=='negative').count()
        neutral_aspect_count = aspect_polarities.where(lambda v:v=='neutral').count()
        print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {positive_aspect_count/(negative_aspect_count)}')
    print('\n')

Brass: Birmingham
['game', 'games', 'Brass: Birmingham', 'Brass', 'Birmingham', 'brass', 'birmingham']
players: 383, 27, 80 -> 14.185185185185185
Lancashire: 265, 16, 32 -> 16.5625
beer: 222, 39, 47 -> 5.6923076923076925
rules: 218, 31, 52 -> 7.032258064516129
strategy: 197, 20, 33 -> 9.85
play: 166, 26, 42 -> 6.384615384615385
cards: 183, 26, 25 -> 7.038461538461538
plays: 145, 27, 46 -> 5.37037037037037
board: 158, 39, 21 -> 4.051282051282051
time: 135, 14, 36 -> 9.642857142857142
player: 118, 9, 31 -> 13.11111111111111
mechanics: 136, 14, 3 -> 9.714285714285714
player interaction: 145, 1, 4 -> 145.0
gameplay: 116, 20, 4 -> 5.8
resources: 116, 11, 10 -> 10.545454545454545
actions: 106, 6, 17 -> 17.666666666666668
interaction: 121, 2, 4 -> 60.5
decisions: 117, 2, 7 -> 58.5
design: 110, 9, 5 -> 12.222222222222221
industries: 107, 7, 8 -> 15.285714285714286


Pandemic Legacy: Season 1
['game', 'games', 'Pandemic Legacy: Season 1', 'Pandemic', 'Legacy', 'Season', '1', 'pandemic', 'legacy

  print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {positive_aspect_count/(negative_aspect_count)}')
  print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {positive_aspect_count/(negative_aspect_count)}')
  print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {positive_aspect_count/(negative_aspect_count)}')


time: 57, 8, 39 -> 7.125
deck: 42, 15, 35 -> 2.8
house rules: 43, 14, 26 -> 3.0714285714285716
fun: 73, 3, 3 -> 24.333333333333332
player: 30, 4, 37 -> 7.5
play: 27, 6, 23 -> 4.5
family game: 45, 2, 5 -> 22.5
classic: 48, 0, 1 -> inf


['game', 'games']
cards: 906, 319, 155 -> 2.8401253918495297
Everdell: 422, 41, 25 -> 10.292682926829269
art: 426, 26, 7 -> 16.384615384615383
players: 283, 80, 82 -> 3.5375
worker placement: 365, 61, 16 -> 5.983606557377049
expansions: 322, 43, 22 -> 7.488372093023256
artwork: 356, 25, 1 -> 14.24
components: 296, 31, 4 -> 9.548387096774194
tree: 221, 81, 16 -> 2.728395061728395
card: 191, 48, 51 -> 3.9791666666666665
theme: 254, 21, 1 -> 12.095238095238095
resources: 211, 40, 23 -> 5.275
gameplay: 211, 44, 9 -> 4.795454545454546
time: 175, 19, 26 -> 9.210526315789474
play: 143, 33, 34 -> 4.333333333333333
combos: 147, 35, 26 -> 4.2
plays: 141, 27, 35 -> 5.222222222222222
luck: 132, 52, 19 -> 2.5384615384615383
table: 146, 23, 24 -> 6.3478260869565215
pl

  print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {positive_aspect_count/(negative_aspect_count)}')


In [68]:
for boardgame_id in df_boardgames_10['id'].to_list():
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    filtered_words = ['game','games']
    if(not boardgame_data.empty):
        print(f"{boardgame_data['name'].iloc[0]}")
        filtered_words.append(boardgame_data['name'].iloc[0])
        name_without_punct = boardgame_data['name'].iloc[0].translate(str.maketrans('', '', string.punctuation))
        filtered_words += name_without_punct.split(' ')
        filtered_words += name_without_punct.lower().split(' ')
    srs_aspects = pd.Series(game_aspects_long2[f'{boardgame_id}'][0])
    srs_polarities = pd.Series(game_aspects_long2[f'{boardgame_id}'][1])
    generated_aspects = pd.DataFrame()
    generated_aspects['id'] = [boardgame_id for _ in range(len(srs_aspects))]
    generated_aspects['aspects'] = srs_aspects
    generated_aspects['polarities'] = srs_polarities
    generated_aspects_filtered = generated_aspects[~generated_aspects['aspects'].isin(filtered_words)]
    #generated_aspects_all = pd.concat([generated_aspects_all, generated_aspects])
    results = []
    for aspect in generated_aspects_filtered['aspects'].value_counts().index[:20]:
        aspect_polarities = generated_aspects_filtered[generated_aspects_filtered['aspects']==aspect]['polarities']
        positive_aspect_count = aspect_polarities.where(lambda v:v=='positive').count()
        negative_aspect_count = aspect_polarities.where(lambda v:v=='negative').count()
        neutral_aspect_count = aspect_polarities.where(lambda v:v=='neutral').count()
        score = positive_aspect_count/negative_aspect_count
        results.append((aspect,positive_aspect_count,negative_aspect_count,neutral_aspect_count,score))
    scores = [res[4] for res in results]
    median_score = statistics.median(scores)
    print(median_score)
    above_average = []
    below_average = []
    for (aspect,positive_aspect_count,negative_aspect_count,neutral_aspect_count,score) in results:
        print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {score}')
        if score>=median_score:
            above_average.append(aspect)
        else:
            below_average.append(aspect)
    print(f'Top: {above_average}')
    print(f'Bottom: {below_average}')
    print('\n')

Brass: Birmingham
10.382575757575758
players: 358, 25, 71 -> 14.32
beer: 208, 38, 46 -> 5.473684210526316
rules: 211, 27, 52 -> 7.814814814814815
Lancashire: 238, 13, 32 -> 18.307692307692307
strategy: 181, 18, 32 -> 10.055555555555555
cards: 179, 26, 23 -> 6.884615384615385
board: 154, 37, 21 -> 4.162162162162162
play: 150, 21, 37 -> 7.142857142857143
plays: 136, 21, 37 -> 6.476190476190476
time: 127, 12, 36 -> 10.583333333333334
player: 114, 9, 30 -> 12.666666666666666
player interaction: 139, 1, 4 -> 139.0
resources: 112, 11, 10 -> 10.181818181818182
mechanics: 115, 13, 2 -> 8.846153846153847
actions: 106, 6, 17 -> 17.666666666666668
gameplay: 106, 17, 4 -> 6.235294117647059
decisions: 114, 2, 7 -> 57.0
industries: 105, 6, 7 -> 17.5
strategies: 102, 3, 12 -> 34.0
interaction: 111, 2, 4 -> 55.5
Top: ['players', 'Lancashire', 'time', 'player', 'player interaction', 'actions', 'decisions', 'industries', 'strategies', 'interaction']
Bottom: ['beer', 'rules', 'strategy', 'cards', 'board'

  score = positive_aspect_count/negative_aspect_count


In [202]:
generated_aspects_all.to_csv('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/boardgames_aspect_extraction.csv')

In [59]:
for boardgame_id in df_boardgames_10['id'].to_list() + ['2223','199792']:
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    if(not boardgame_data.empty):
        print(f"{boardgame_data['name'].iloc[0]}")
        filtered_words.append(boardgame_data['name'].iloc[0])
        name_without_punct = boardgame_data['name'].iloc[0].translate(str.maketrans('', '', string.punctuation))
        filtered_words += name_without_punct.split(' ')
        filtered_words += name_without_punct.lower().split(' ')
    else:
        filtered_words += ["uno","Uno","UNO","everdell","Everdell"]
    srs_aspects = pd.Series(game_aspects[f'{boardgame_id}'][0])
    srs_polarities = pd.Series(game_aspects[f'{boardgame_id}'][1])
    generated_aspects = pd.DataFrame()
    generated_aspects['aspects'] = srs_aspects
    generated_aspects['polarities'] = srs_polarities
    generated_aspects_filtered = generated_aspects[~generated_aspects['aspects'].isin(filtered_words)]
    positive_aspects = generated_aspects_filtered[generated_aspects_filtered['polarities']=='positive']
    print('positives: ')
    print(f"{positive_aspects['aspects'].value_counts().index[:10].to_list()}")
    negative_aspects = generated_aspects_filtered[generated_aspects_filtered['polarities']=='negative']
    print('negatives: ')
    print(f"{negative_aspects['aspects'].value_counts().index[:10].to_list()}")
    print('\n')

Brass: Birmingham
positives: 
['players', 'Lancashire', 'beer', 'rules', 'strategy', 'cards', 'play', 'board', 'player interaction', 'plays']
negatives: 
['board', 'beer', 'rules', 'players', 'plays', 'cards', 'play', 'theme', 'strategy', 'gameplay']


Pandemic Legacy: Season 1
positives: 
['experience', 'campaign', 'time', 'group', 'story', 'players', 'legacy game', 'fun', 'rules', 'characters']
negatives: 
['campaign', 'rules', 'time', 'story', 'cards', 'rating', 'experience', 'group', 'plays', 'players']


Gloomhaven
positives: 
['characters', 'time', 'campaign', 'cards', 'scenarios', 'character', 'content', 'gameplay', 'players', 'group']
negatives: 
['cards', 'time', 'setup', 'characters', 'character', 'campaign', 'scenarios', 'rules', 'story', 'combat']


Ark Nova
positives: 
['cards', 'players', 'zoo', 'theme', 'mechanics', 'animals', 'time', 'plays', 'play', 'deck']
negatives: 
['cards', 'luck', 'players', 'animals', 'time', 'deck', 'zoo', 'play', 'mechanics', 'card']


Twiligh

In [106]:
for boardgame_id in df_boardgames_10['id'].to_list():
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    if(not boardgame_data.empty):
        print(f"{boardgame_data['name'].iloc[0]}")
    srs_aspects = pd.Series(game_aspects_long[f'{boardgame_id}'][0])
    srs_polarities = pd.Series(game_aspects_long[f'{boardgame_id}'][1])
    stop_words = ['game','games','players']
    generated_aspects = pd.DataFrame()
    generated_aspects['aspects'] = srs_aspects
    generated_aspects['polarities'] = srs_polarities
    generated_aspects = generated_aspects[generated_aspects['aspects']]
    positive_aspects = generated_aspects[generated_aspects['polarities']=='positive']
    print('positives: ')
    print(f"{positive_aspects['aspects'].value_counts().index[:10].to_list()}")
    negative_aspects = generated_aspects[generated_aspects['polarities']=='negative']
    print('negatives: ')
    print(f"{negative_aspects['aspects'].value_counts().index[:10].to_list()}")
    print('\n')

Brass: Birmingham
positives: 
['game', 'Brass', 'players', 'games', 'Lancashire', 'rules', 'beer', 'strategy', 'cards', 'board']
negatives: 
['game', 'games', 'beer', 'board', 'Brass', 'rules', 'cards', 'players', 'play', 'plays']


Pandemic Legacy: Season 1
positives: 
['game', 'games', 'experience', 'Pandemic', 'campaign', 'time', 'Legacy', 'group', 'story', 'players']
negatives: 
['game', 'games', 'Pandemic', 'rules', 'campaign', 'time', 'story', 'Legacy', 'cards', 'rating']


Gloomhaven
positives: 
['game', 'Gloomhaven', 'characters', 'time', 'cards', 'games', 'campaign', 'scenarios', 'character', 'content']
negatives: 
['game', 'cards', 'time', 'games', 'Gloomhaven', 'setup', 'characters', 'character', 'campaign', 'scenarios']


Ark Nova
positives: 
['game', 'cards', 'games', 'Terraforming Mars', 'Ark Nova', 'players', 'zoo', 'theme', 'mechanics', 'animals']
negatives: 
['game', 'cards', 'games', 'Terraforming Mars', 'luck', 'players', 'animals', 'Ark Nova', 'time', 'deck']


Twil

In [168]:
generated_aspects[generated_aspects['aspects']==aspect]['polarities']

4        positive
9        positive
10       negative
16       negative
20       positive
           ...   
24667    positive
24670    positive
24671    positive
24679    positive
24681    positive
Name: polarities, Length: 2883, dtype: object

In [166]:
generated_aspects[generated_aspects['aspects']==aspect]['polarities']

4        positive
9        positive
10       negative
16       negative
20       positive
           ...   
24667    positive
24670    positive
24671    positive
24679    positive
24681    positive
Name: polarities, Length: 2883, dtype: object

In [141]:
pd.Series(aspects).value_counts()[:10]

game          2883
Brass          585
players        490
games          442
Lancashire     313
beer           308
rules          301
strategy       250
play           234
cards          234
Name: count, dtype: int64

## Custom

In [None]:
!pip install -U "setfit[absa]"



In [None]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install datasets



In [17]:
model = AbsaModel.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2",
    spacy_model="en_core_web_lg",
)

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [18]:
dataset = load_dataset("tomaarsen/setfit-absa-semeval-restaurants", split="train")
train_dataset = dataset.select(range(128))
eval_dataset = dataset.select(range(128, 256))

Downloading readme:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3693 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1134 [00:00<?, ? examples/s]

In [20]:
args = TrainingArguments(
    output_dir="models",
    num_epochs=1,
    use_amp=True,
    batch_size=128,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
)
args.eval_strategy = args.evaluation_strategy

trainer = AbsaTrainer(
    model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)
trainer.train()

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 21624
  Batch size = 128
  Num epochs = 1
  Total optimization steps = 169


Step,Training Loss,Validation Loss,Embedding Loss,Rate
100,No log,No log,0.2254,9e-06


  0%|          | 0/205 [00:00<?, ?it/s]

Loading best SentenceTransformer model from step 100.
***** Running training *****
  Num unique pairs = 8670
  Batch size = 128
  Num epochs = 1
  Total optimization steps = 68


Step,Training Loss,Validation Loss


In [None]:
# !pip install numba

# from numba import cuda
# device = cuda.get_current_device()
# device.reset()



In [21]:
metrics = trainer.evaluate(eval_dataset)
print(metrics)

***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

***** Running evaluation *****


{'aspect': {'accuracy': 0.8407079646017699}, 'polarity': {'accuracy': 0.8031496062992126}}


In [39]:
model.save_pretrained(
    "models/setfit-absa-model-aspect",
    "models/setfit-absa-model-polarity",
)

In [22]:
preds = model.predict([
    "I love the combinations of scoring in this game from worker placement spaces that allow you to get points at the end game, to card bonuses, to points on cards in your tableau. This combined with the uniqueness of the seasons, unlocking workers, and being able to pass while other players are still in previous seasons adds so much variability and interesting gameplay that can occur. The theme is phenomenal and the pairing of creatures with one another or with buildings really drive the theme home. The only thing preventing this from knocking on my top-10 door is the fact that it suffers from the issue that many tableau games have, but to an even greater degree: it is difficult to see what is in-front of other players, and the text and some symbolism is very small, making it very difficult to see what other players are going for or need.",
])
print(preds)

[[{'span': 'combinations', 'polarity': 'positive'}, {'span': 'worker placement spaces', 'polarity': 'positive'}, {'span': 'card bonuses', 'polarity': 'positive'}, {'span': 'cards', 'polarity': 'positive'}, {'span': 'tableau', 'polarity': 'positive'}, {'span': 'workers', 'polarity': 'positive'}, {'span': 'gameplay', 'polarity': 'positive'}, {'span': 'pairing', 'polarity': 'positive'}, {'span': 'creatures', 'polarity': 'positive'}, {'span': 'buildings', 'polarity': 'positive'}, {'span': 'tableau games', 'polarity': 'negative'}]]


In [None]:
preds = model.predict([
    "The game itself is not interesting enough to learn it's metagame. Also, it's terrible at demonstrating the possible value of different strategies, other than the general \"rail building\" path. Sadly that's one of the best point source in a 4 player game.   Lack of game balance and play-testing is an issue. The game is mechanically solid, (It has some beautiful interactions!) But this game itself is under developed.  Exploring a game by playing it is fun, but  it's usually about learning how to do a strategy, and not finding out, how well a strategy performs.",
])
print(preds)

[[{'span': 'game', 'polarity': 'positive'}, {'span': 'metagame', 'polarity': 'negative'}, {'span': 'value', 'polarity': 'negative'}, {'span': 'strategies', 'polarity': 'negative'}, {'span': 'rail building', 'polarity': 'negative'}, {'span': 'player game', 'polarity': 'negative'}, {'span': 'game balance', 'polarity': 'negative'}, {'span': 'play', 'polarity': 'negative'}, {'span': 'game', 'polarity': 'negative'}, {'span': 'interactions', 'polarity': 'positive'}, {'span': 'game', 'polarity': 'negative'}, {'span': 'game', 'polarity': 'negative'}, {'span': 'strategy', 'polarity': 'neutral'}, {'span': 'strategy', 'polarity': 'positive'}]]


In [31]:
game_aspects_custom = {}
for boardgame_id in df_boardgames_10['id']:
    boardgame_comments = df_comments_only_eng[df_comments_only_eng['boardgame_id']==boardgame_id]
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    print(f"{boardgame_data['name'].iloc[0]}")
    aspects=[]
    polarities=[]
    for comment in boardgame_comments['value']:
        aspect_prediction = model(comment)
        for elem in aspect_prediction:
            if isinstance(elem, dict):
                aspects.append(elem['span'])
                polarities.append(elem['polarity'])
    game_aspects_custom[f'{boardgame_id}'] = (aspects, polarities)

Brass: Birmingham
Pandemic Legacy: Season 1
Gloomhaven
Ark Nova
Twilight Imperium: Fourth Edition
Dune: Imperium
Terraforming Mars
War of the Ring: Second Edition
Star Wars: Rebellion
Gloomhaven: Jaws of the Lion


In [40]:
with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_custom_dict.pkl','wb') as f:
    pickle.dump(game_aspects_custom, f)

In [36]:
with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/saved_game_aspects_custom_dict.pkl', 'rb') as f:
    game_aspects_custom = pickle.load(f)

EOFError: Ran out of input

In [33]:
for boardgame_id in df_boardgames_10['id'].to_list():
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    if(not boardgame_data.empty):
        print(f"{boardgame_data['name'].iloc[0]}")
    srs_aspects = pd.Series(game_aspects_custom[f'{boardgame_id}'][0])
    srs_polarities = pd.Series(game_aspects_custom[f'{boardgame_id}'][1])
    generated_aspects = pd.DataFrame()
    generated_aspects['id'] = [boardgame_id for _ in range(len(srs_aspects))]
    generated_aspects['aspects'] = srs_aspects
    generated_aspects['polarities'] = srs_polarities
    #generated_aspects_all = pd.concat([generated_aspects_all, generated_aspects])
    for aspect in srs_aspects.value_counts().index[:20]:
        aspect_polarities = generated_aspects[generated_aspects['aspects']==aspect]['polarities']
        positive_aspect_count = aspect_polarities.where(lambda v:v=='positive').count()
        negative_aspect_count = aspect_polarities.where(lambda v:v=='negative').count()
        neutral_aspect_count = aspect_polarities.where(lambda v:v=='neutral').count()
        print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {positive_aspect_count/(negative_aspect_count)}')
    print('\n')

Brass: Birmingham
game: 1294, 109, 56 -> 11.871559633027523
Brass: 484, 38, 22 -> 12.736842105263158
games: 298, 52, 19 -> 5.730769230769231
players: 218, 39, 56 -> 5.589743589743589
cards: 160, 32, 26 -> 5.0
play: 134, 31, 12 -> 4.32258064516129
beer: 129, 26, 20 -> 4.961538461538462
plays: 108, 32, 18 -> 3.375
board: 111, 21, 15 -> 5.285714285714286
rules: 105, 29, 10 -> 3.6206896551724137
player interaction: 130, 3, 0 -> 43.333333333333336
mechanics: 111, 14, 2 -> 7.928571428571429
gameplay: 110, 14, 1 -> 7.857142857142857
strategy: 103, 9, 8 -> 11.444444444444445
theme: 89, 27, 0 -> 3.2962962962962963
design: 100, 11, 0 -> 9.090909090909092
resources: 86, 14, 4 -> 6.142857142857143
player: 62, 6, 31 -> 10.333333333333334
coal: 77, 8, 8 -> 9.625
industries: 86, 4, 2 -> 21.5


Pandemic Legacy: Season 1
game: 1493, 219, 122 -> 6.8173515981735155
games: 437, 156, 90 -> 2.801282051282051
Pandemic: 345, 98, 30 -> 3.520408163265306
campaign: 237, 57, 60 -> 4.157894736842105
group: 175, 53

  print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {positive_aspect_count/(negative_aspect_count)}')


# PyABSA - Fast LCF
https://www.mdpi.com/2076-3417/9/16/3389

In [None]:
!pip install pyabsa[dev] -U

Collecting pyabsa[dev]
  Downloading pyabsa-2.4.1.post1-py3-none-any.whl (575 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m575.5/575.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting findfile>=2.0.0 (from pyabsa[dev])
  Downloading findfile-2.0.1-py3-none-any.whl (7.0 kB)
Collecting autocuda>=0.16 (from pyabsa[dev])
  Downloading autocuda-0.16-py3-none-any.whl (5.1 kB)
Collecting metric-visualizer>=0.9.6 (from pyabsa[dev])
  Downloading metric_visualizer-0.9.13.post1-py3-none-any.whl (24 kB)
Collecting boostaug>=2.3.5 (from pyabsa[dev])
  Downloading boostaug-2.3.5-py3-none-any.whl (16 kB)
Collecting seqeval (from pyabsa[dev])
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting update-checker (from pyabsa[dev])
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB

[2024-07-04 14:12:14] (2.4.1.post1) [31mPyABSA(2.4.1.post1): If your code crashes on Colab, please use the GPU runtime. Then run "pip install pyabsa[dev] -U" and restart the kernel.
Or if it does not work, you can use v1.x versions, e.g., pip install pyabsa<2.0 -U




Try to downgrade transformers<=4.29.0.



[0m


  from distutils.version import StrictVersion
  _warn(f"unclosed running multiprocessing pool {self!r}",


In [10]:
config = (
    ATEPC.ATEPCConfigManager.get_atepc_config_english()
)  # this config contains 'pretrained_bert', it is based on pretrained models
config.model = ATEPC.ATEPCModelList.FAST_LCF_ATEPC  # improved version of LCF-ATEPC

In [11]:
dataset = ATEPC.ATEPCDatasetList.Restaurant14

In [12]:
warnings.filterwarnings("ignore")

config.batch_size = 16
config.patience = 2
config.log_step = -1
config.seed = [1]
config.verbose = False  # If verbose == True, PyABSA will output the model strcture and seversal processed data examples
config.notice = (
    "This is an training example for aspect term extraction"  # for memos usage
)

trainer = ATEPC.ATEPCTrainer(
    config=config,
    dataset=dataset,
    from_checkpoint="english",  # if you want to resume training from our pretrained checkpoints, you can pass the checkpoint name here
    auto_device=DeviceTypeOption.AUTO,  # use cuda if available
    checkpoint_save_mode=ModelSaveOption.SAVE_MODEL_STATE_DICT,  # save state dict only instead of the whole model
    load_aug=False,  # there are some augmentation dataset for integrated datasets, you use them by setting load_aug=True to improve performance
)

[2024-07-04 14:12:49] (2.4.1.post1) Set Model Device: cuda:0
[2024-07-04 14:12:49] (2.4.1.post1) Device Name: NVIDIA GeForce GTX 1070


config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

2024-07-04 14:12:49,956 INFO: PyABSA version: 2.4.1.post1
2024-07-04 14:12:49,958 INFO: Transformers version: 4.42.3
2024-07-04 14:12:49,959 INFO: Torch version: 2.3.1+cu121+cuda12.1
2024-07-04 14:12:49,961 INFO: Device: NVIDIA GeForce GTX 1070
2024-07-04 14:12:49,989 INFO: Searching dataset 114.Restaurant14 in https://github.com/yangheng95/ABSADatasets
[2024-07-04 14:12:49] (2.4.1.post1) Clone ABSADatasets from https://github.com/yangheng95/ABSADatasets.git
2024-07-04 14:12:59,978 INFO: You can set load_aug=True in a trainer to augment your dataset (English only yet) and improve performance.
2024-07-04 14:12:59,979 INFO: Please use a new folder to perform new text augment if the former augment in integrated_datasets/atepc_datasets/110.SemEval/114.restaurant14 errored unexpectedly


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

[2024-07-04 14:13:02] (2.4.1.post1) Can not load en_core_web_sm from spacy, try to download it in order to parse syntax tree: [32m
python -m spacy download en_core_web_sm[0m
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 6.5 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


convert examples to features:  16%|█▋        | 589/3608 [00:00<00:03, 865.03it/s]



convert examples to features:  47%|████▋     | 1680/3608 [00:02<00:02, 837.98it/s]



convert examples to features:  56%|█████▌    | 2022/3608 [00:02<00:01, 822.76it/s]



convert examples to features:  89%|████████▉ | 3226/3608 [00:03<00:00, 838.44it/s]



convert examples to features: 100%|██████████| 3608/3608 [00:04<00:00, 810.49it/s]

2024-07-04 14:13:14,326 INFO: Dataset Label Details: {'Neutral': 637, 'Negative': 807, 'Positive': 2160, 'Sum': 3604}



convert examples to features:  50%|█████     | 563/1120 [00:00<00:00, 902.64it/s]



convert examples to features:  98%|█████████▊| 1092/1120 [00:01<00:00, 761.54it/s]



convert examples to features: 100%|██████████| 1120/1120 [00:01<00:00, 828.81it/s]

2024-07-04 14:13:16,263 INFO: Dataset Label Details: {'Neutral': 196, 'Negative': 196, 'Positive': 726, 'Sum': 1118}





pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

2024-07-04 14:13:55,046 INFO: Save cache dataset to fast_lcf_atepc.Restaurant14.dataset.3180c1a8d9b4b975f1c495da780d0298592e143b1cb505f57bc6ff1965589266.cache
2024-07-04 14:13:55,624 INFO: cuda memory allocated:764963840
2024-07-04 14:13:55,625 INFO: ABSADatasetsVersion:None	-->	Calling Count:0
2024-07-04 14:13:55,626 INFO: IOB_label_to_index:{'B-ASP': 1, 'I-ASP': 2, 'O': 3, '[CLS]': 4, '[SEP]': 5}	-->	Calling Count:1
2024-07-04 14:13:55,628 INFO: MV:<metric_visualizer.metric_visualizer.MetricVisualizer object at 0x7fa93cd207f0>	-->	Calling Count:0
2024-07-04 14:13:55,630 INFO: PyABSAVersion:2.4.1.post1	-->	Calling Count:1
2024-07-04 14:13:55,631 INFO: SRD:3	-->	Calling Count:9444
2024-07-04 14:13:55,632 INFO: TorchVersion:2.3.1+cu121+cuda12.1	-->	Calling Count:1
2024-07-04 14:13:55,635 INFO: TransformersVersion:4.42.3	-->	Calling Count:1
2024-07-04 14:13:55,636 INFO: auto_device:True	-->	Calling Count:3
2024-07-04 14:13:55,637 INFO: batch_size:16	-->	Calling Count:4
2024-07-04 14:13:5

Downloading checkpoint: 579MB [00:56, 10.16MB/s]                         

Find zipped checkpoint: ./checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip, unzipping





Done.
[2024-07-04 14:14:58] (2.4.1.post1) [33mIf the auto-downloading failed, please download it via browser: https://huggingface.co/spaces/yangheng/PyABSA/resolve/main/checkpoints/English/ATEPC/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip [0m
2024-07-04 14:14:58,759 INFO: Checkpoint downloaded at: checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43
2024-07-04 14:14:59,138 INFO: Resume trainer from Checkpoint: checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43!
2024-07-04 14:14:59,139 INFO: ***** Running training for Aspect Term Extraction and Polarity Classification *****
2024-07-04 14:14:59,140 INFO:   Num examples = 3604
2024-07-04 14:14:59,142 INFO:   Batch size = 16
2024-07-04 14:14:59,143 INFO:   Num steps = 2250


Epoch:  0| loss_apc:0.0256 | loss_ate:0.0828 |: 100%|██████████| 226/226 [03:26<00:00,  1.09it/s,  APC_ACC: 88.28(max:88.28) | APC_F1: 82.70(max:82.70) | ATE_F1: 83.74(max:83.79)]
Epoch:  1| loss_apc:0.0050 | loss_ate:0.0135 |: 100%|██████████| 226/226 [03:19<00:00,  1.13it/s,  APC_ACC: 88.01(max:88.28) | APC_F1: 81.83(max:82.70) | ATE_F1: 85.10(max:85.10)]
Epoch:  2| loss_apc:0.0008 | loss_ate:0.0074 |: 100%|██████████| 226/226 [03:35<00:00,  1.05it/s,  APC_ACC: 87.30(max:88.28) | APC_F1: 80.56(max:82.70) | ATE_F1: 84.41(max:85.10)]


2024-07-04 14:25:42,127 INFO: 
-------------------------------------------------------------------- Raw Metric Records --------------------------------------------------------------------
╒════════════════════════════════╤═══════════════════════════════════════════════════════╤══════════╤═══════════╤══════════╤═══════╤═══════╤═══════╤═══════╕
│ Metric                         │ Trial                                                 │ Values   │  Average  │  Median  │  Std  │  IQR  │  Min  │  Max  │
╞════════════════════════════════╪═══════════════════════════════════════════════════════╪══════════╪═══════════╪══════════╪═══════╪═══════╪═══════╪═══════╡
│ Max-APC-Test-Acc w/o Valid Set │ fast_lcf_atepc-Restaurant14-microsoft/deberta-v3-base │ [88.28]  │   88.28   │  88.28   │   0   │   0   │ 88.28 │ 88.28 │
├────────────────────────────────┼───────────────────────────────────────────────────────┼──────────┼───────────┼──────────┼───────┼───────┼───────┼───────┤
│ Max-APC-Test-F1 w/o Valid

In [13]:
aspect_extractor = trainer.load_trained_model()
assert isinstance(aspect_extractor, ATEPC.AspectExtractor)

[2024-07-04 14:28:38] (2.4.1.post1) Load aspect extractor from checkpoints/fast_lcf_atepc_Restaurant14_cdw_apcacc_88.01_apcf1_81.83_atef1_85.1/
[2024-07-04 14:28:38] (2.4.1.post1) config: checkpoints/fast_lcf_atepc_Restaurant14_cdw_apcacc_88.01_apcf1_81.83_atef1_85.1/fast_lcf_atepc.config
[2024-07-04 14:28:38] (2.4.1.post1) state_dict: checkpoints/fast_lcf_atepc_Restaurant14_cdw_apcacc_88.01_apcf1_81.83_atef1_85.1/fast_lcf_atepc.state_dict
[2024-07-04 14:28:38] (2.4.1.post1) model: None
[2024-07-04 14:28:38] (2.4.1.post1) tokenizer: checkpoints/fast_lcf_atepc_Restaurant14_cdw_apcacc_88.01_apcf1_81.83_atef1_85.1/fast_lcf_atepc.tokenizer
[2024-07-04 14:28:38] (2.4.1.post1) Set Model Device: cuda:0
[2024-07-04 14:28:38] (2.4.1.post1) Device Name: NVIDIA GeForce GTX 1070


In [16]:
for comment in df_comments_only_eng['value'][:1]:
    aspect_extractor.predict(comment)

[2024-07-04 14:30:27] (2.4.1.post1) The results of aspect term extraction have been saved in /home/fra/uni/nlp/project/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2024-07-04 14:30:27] (2.4.1.post1) Example 0: Great [32m<game:Positive Confidence:0.9985>[0m , full controllo of your [32m<strategy:Positive Confidence:0.999>[0m through constant adjustment of your tactic watching what your opponents do .
[2024-07-04 14:30:28] (2.4.1.post1) The results of aspect term extraction have been saved in /home/fra/uni/nlp/project/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2024-07-04 14:30:28] (2.4.1.post1) Example 0: Very clever game , enjoyable overall . Plus points : Great production values for all components and very good value for money . Lovely artwork everywhere . Smooth game play mechanisms - ironed out some oddities of the original game . Fortunes change frequently , so the winner could be anyone . Minus points : The b

In [11]:
ckpts = (
    available_checkpoints(TaskCodeOption.Aspect_Term_Extraction_and_Classification)
)  
aspect_extractor = ATEPC.AspectExtractor(
    checkpoint="english"
)

[2024-07-04 14:41:13] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-07-04 14:41:13] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-07-04 14:41:13] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-07-04 14:41:13] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-07-04 14:41:13] (2.4.1.post1) [32mDownloading checkpoint:english [0m
[2024-07-04 14:41:13] (2.4.1.post1) [31mNotice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets[0m
[2024-07-04 14:41:13] (2.4.1.post1) Checkpoint already downloaded, skip
[2024-07-04 14:41:13] (2.4.1.post1) Load aspect extractor from checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apc



In [21]:
result = aspect_extractor.predict(
    text=df_comments_only_eng['value'].to_list(),
    print_result=False,
    ignore_error=True,  # ignore an invalid example, if it is False, invalid examples will raise Exceptions
    eval_batch_size=64,
)

preparing ate inference dataloader: 100%|██████████| 45144/45144 [01:42<00:00, 440.35it/s]
extracting aspect terms: 100%|██████████| 1411/1411 [34:02<00:00,  1.45s/it]
preparing apc inference dataloader: 100%|██████████| 53730/53730 [02:49<00:00, 317.21it/s]
classifying aspect sentiments: 100%|██████████| 1680/1680 [41:58<00:00,  1.50s/it]


[2024-07-04 16:07:50] (2.4.1.post1) The results of aspect term extraction have been saved in /home/fra/uni/nlp/project/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json


In [63]:
with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/pyabsa_aspect_extractor_result.pkl','wb') as f:
    pickle.dump(result, f)

In [64]:
with open('/home/fra/uni/nlp/project/content/drive/MyDrive/Boardgames/pyabsa_aspect_extractor_result.pkl','rb') as f:
    result2 = pickle.load(f)

In [68]:
result2[100]

{'sentence': "out of dozens of plays i ' m still never tired of playing brass",
 'IOB': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ASP'],
 'tokens': ['out',
  'of',
  'dozens',
  'of',
  'plays',
  'i',
  "'",
  'm',
  'still',
  'never',
  'tired',
  'of',
  'playing',
  'brass'],
 'aspect': ['brass'],
 'position': [[13]],
 'sentiment': ['Positive'],
 'probs': [[0.0003659271460492164, 0.0002612849639263004, 0.9993727803230286]],
 'confidence': [0.9994]}

In [59]:
game_aspects_pyabsa = {}
for id in df_boardgames_10['id']:
    game_comments = df_comments_only_eng_index[df_comments_only_eng_index['boardgame_id']==id]
    aspects = []
    polarities = []
    for i in game_comments.index:
        aspects += result[i-1]['aspect']
        polarities += result[i-1]['sentiment']
    game_aspects_pyabsa[f'{id}'] = (aspects,polarities)

In [62]:
for boardgame_id in df_boardgames_10['id'].to_list():
    boardgame_data = df_boardgames_10[df_boardgames_10['id']==boardgame_id]
    if(not boardgame_data.empty):
        print(f"{boardgame_data['name'].iloc[0]}")
    srs_aspects = pd.Series(game_aspects_pyabsa[f'{boardgame_id}'][0])
    srs_polarities = pd.Series(game_aspects_pyabsa[f'{boardgame_id}'][1])
    generated_aspects = pd.DataFrame()
    generated_aspects['id'] = [boardgame_id for _ in range(len(srs_aspects))]
    generated_aspects['aspects'] = srs_aspects
    generated_aspects['polarities'] = srs_polarities
    #generated_aspects_all = pd.concat([generated_aspects_all, generated_aspects])
    for aspect in srs_aspects.value_counts().index[:20]:
        aspect_polarities = generated_aspects[generated_aspects['aspects']==aspect]['polarities']
        positive_aspect_count = aspect_polarities.where(lambda v:v=='Positive').count()
        negative_aspect_count = aspect_polarities.where(lambda v:v=='Negative').count()
        neutral_aspect_count = aspect_polarities.where(lambda v:v=='Neutral').count()
        print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {positive_aspect_count/(negative_aspect_count)}')
    print('\n')

Brass: Birmingham
game: 886, 185, 51 -> 4.789189189189189
rules: 60, 114, 16 -> 0.5263157894736842
games: 101, 21, 13 -> 4.809523809523809
beer: 39, 29, 64 -> 1.3448275862068966
theme: 51, 56, 4 -> 0.9107142857142857
gameplay: 84, 17, 3 -> 4.9411764705882355
interaction: 88, 10, 1 -> 8.8
Brass: 33, 18, 32 -> 1.8333333333333333
components: 68, 6, 4 -> 11.333333333333334
design: 66, 7, 0 -> 9.428571428571429
art: 63, 9, 1 -> 7.0
board: 26, 28, 10 -> 0.9285714285714286
artwork: 50, 13, 0 -> 3.8461538461538463
play: 43, 10, 2 -> 4.3
mechanics: 41, 1, 3 -> 41.0
strategy: 32, 4, 6 -> 8.0
cards: 15, 13, 14 -> 1.1538461538461537
time: 10, 23, 0 -> 0.43478260869565216
Beer: 10, 7, 13 -> 1.4285714285714286
teach: 9, 20, 0 -> 0.45


Pandemic Legacy: Season 1
game: 1118, 191, 72 -> 5.853403141361256
Pandemic: 157, 73, 127 -> 2.1506849315068495
story: 183, 53, 17 -> 3.452830188679245
games: 124, 51, 28 -> 2.4313725490196076
rules: 28, 108, 22 -> 0.25925925925925924
gaming: 144, 2, 9 -> 72.0
campaig

  print(f'{aspect}: {positive_aspect_count}, {negative_aspect_count}, {neutral_aspect_count} -> {positive_aspect_count/(negative_aspect_count)}')


rules: 27, 16, 6 -> 1.6875
design: 34, 10, 1 -> 3.4
components: 24, 15, 5 -> 1.6
art: 23, 16, 1 -> 1.4375
Dune: 15, 4, 20 -> 3.75
board: 10, 23, 6 -> 0.43478260869565216
Worker: 5, 4, 29 -> 1.25


Terraforming Mars
game: 2110, 374, 113 -> 5.641711229946524
theme: 591, 38, 11 -> 15.552631578947368
cards: 241, 201, 74 -> 1.199004975124378
components: 53, 346, 17 -> 0.1531791907514451
expansions: 112, 46, 138 -> 2.4347826086956523
games: 215, 37, 22 -> 5.8108108108108105
gameplay: 153, 51, 8 -> 3.0
engine: 166, 32, 10 -> 5.1875
art: 38, 147, 4 -> 0.2585034013605442
time: 34, 119, 1 -> 0.2857142857142857
interaction: 23, 118, 1 -> 0.19491525423728814
play: 96, 33, 4 -> 2.909090909090909
board: 39, 59, 29 -> 0.6610169491525424
card: 58, 35, 26 -> 1.6571428571428573
expansion: 29, 10, 74 -> 2.9
rules: 50, 49, 12 -> 1.0204081632653061
artwork: 21, 88, 1 -> 0.23863636363636365
design: 39, 47, 0 -> 0.8297872340425532
player boards: 9, 53, 23 -> 0.16981132075471697
mechanics: 70, 8, 5 -> 8.75


