In [307]:
### Necessary imports and data load

In [308]:
import json
import statistics
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from pandas import json_normalize

plt.style.use('ggplot')

In [None]:
# Download every NLTK resource this notebook asks for, in the original order.
for nltk_resource in (
    'words',
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'vader_lexicon',
):
    nltk.download(nltk_resource)

In [310]:
# Preparing model for sentiment analysis

from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
# Identifier handed to `from_pretrained` for both the tokenizer and the classifier.
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# Loaded once at module level; roberta_scores() reuses these two objects for every comment.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Shared VADER analyzer used by vader_scores().
sia = SentimentIntensityAnalyzer()

In [311]:
# Prepare json data files: one raw JSON export per round plus the number of votes in that round.

def _load_round(path):
    """Return the parsed JSON content of ``path``.

    The file is decoded explicitly as UTF-8 so that loading does not depend on
    the operating system's default locale encoding.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


gdko2023r1 = _load_round('Data/gdko-2023-round-1.json')
r2023r1 = 2182  # number of votes in the round

gdko2023r2 = _load_round('Data/gdko-2023-round-2.json')
r2023r2 = 1003

gdko2023r3 = _load_round('Data/gdko-2023-round-3.json')
r2023r3 = 603

gdko2024r1 = _load_round('Data/gdko-2024-round-1.json')
r2024r1 = 1502

gdko2024r2 = _load_round('Data/gdko-2024-round-2.json')
r2024r2 = 715

gdko2024r3 = _load_round('Data/gdko-2024-round-3.json')
r2024r3 = 312

gdko2025r1 = _load_round('Data/gdko-2025-round-1.json')
r2025r1 = 4032

gdko2025r2 = _load_round('Data/gdko-2025-round-2.json')
r2025r2 = 1192

gdko2025r3 = _load_round('Data/gdko-2025-round-3.json')
r2025r3 = 412

In [312]:
### Model builder methods

In [313]:
# Preprocessing. Creating training and prediction datasets with defined features (columns).

def platforms_to_num(platforms):
    """Encode platform support as a 4-bit integer.

    Bits, most significant first: web (8), windows (4), linux (2), osx (1).
    Membership is tested with ``in``, so ``platforms`` may be any container;
    if it is a string, substring matching applies.
    """
    code = 0
    for name in ('web', 'windows', 'linux', 'osx'):
        # Shift the bits collected so far and append this platform's flag.
        code = (code << 1) | (name in platforms)
    return code
    
def data_preprocessing(data, precision, ratings_max, is_training=True):
    """Flatten one round's raw entries into a feature DataFrame.

    Parameters
    ----------
    data : dict
        Mapping of entry id to that entry's record (a dict). Each record is
        expected to carry at least 'participant_info', 'rating_count',
        'platforms', 'comments_received' and the other columns selected below.
    precision : int
        Number of decimals kept for the averaged 'overal_raw_rating'
        (training mode only).
    ratings_max : int or float
        Value that 'rating_count' is scaled against to obtain
        'rating_count_normalized' (a percentage rounded to one decimal).
    is_training : bool, default True
        When True, 'overal_raw_rating' is averaged per entry and appended as
        the last column of the result.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the selected columns. The sentiment score
        columns are zero-filled float placeholders, meant to be populated
        afterwards by vader_scores() and roberta_scores().
    """
    records = [{'entry_id': entry_id, **entry_data} for entry_id, entry_data in data.items()]
    df = pd.DataFrame(records)

    # Expand the nested 'participant_info' dicts into top-level columns
    # (presumably the source of 'participant', 'coolness' and 'profile_url' -
    # verify against the JSON schema).
    df = pd.concat([df, json_normalize(df['participant_info'])], axis=1)
    df = df.drop(columns=['participant_info'])

    df['rating_count_normalized'] = round(df['rating_count'] * 100.0 / ratings_max, 1)

    # Float placeholders: the scoring functions later write fractional values
    # into these cells, and writing floats into int columns is an
    # incompatible-dtype assignment in recent pandas.
    score_columns = [
        'vader_score_neg',
        'vader_score_neu',
        'vader_score_pos',
        'vader_score_compound',
        'roberta_score_neg',
        'roberta_score_neu',
        'roberta_score_pos',
    ]
    for column in score_columns:
        df[column] = 0.0

    df['platforms_numeric'] = df['platforms'].apply(platforms_to_num)
    df['comments_received'] = df['comments_received'].replace('', '0')

    columns = [
        'game_title',
        'game_url',
        'platforms',
        'platforms_numeric',
        'rating_count',
        'rating_count_normalized',
        'comments_received',
        'participant',
        'coolness',
        'profile_url',
        'comments_raw',
        *score_columns,
    ]

    if is_training:
        def _mean_rating(raw):
            # The first element is skipped; the remainder is coerced to
            # numbers and averaged. Nothing left to average -> NaN.
            values = raw[1:]
            if len(values) > 0:
                return round(pd.to_numeric(values, errors='coerce').mean(), precision)
            return np.nan

        df['overal_raw_rating'] = df['overal_raw_rating'].apply(_mean_rating)
        columns.append('overal_raw_rating')

    # Explicit copy so that later in-place writes (score columns, results)
    # target an independent frame rather than a slice of `df`.
    return df[columns].copy()

In [None]:
# Calculations for sentiment analysis scores. The scores will be used as features for predictions.
# An important factor for the RoBERTa scores is the tokenizer's max_length: it only considers that many tokens per comment; longer comments are trimmed and shorter ones may be filled (padded).
def vader_scores(data, precision):
    """Fill the VADER score columns of ``data`` in place.

    For every row, each comment in 'comments_raw' is scored with the
    module-level ``sia`` analyzer. The per-comment 'compound', 'neg', 'pos'
    and 'neu' values are averaged, multiplied by 100, rounded to ``precision``
    decimals and written to the matching 'vader_score_*' column. Rows without
    comments get 0 in all four columns.

    Rows are addressed by the labels 0 .. len(data) - 1, so ``data`` needs a
    default integer index. Returns None.
    """
    for i in tqdm(range(0, len(data))):
        # Score every comment exactly once and reuse the result for all four
        # components instead of re-running the analyzer per component.
        polarities = [sia.polarity_scores(comment) for comment in data['comments_raw'][i]]
        for component in ('compound', 'neg', 'pos', 'neu'):
            values = [polarity[component] for polarity in polarities]
            data.loc[i, 'vader_score_' + component] = (
                round(100.0 * sum(values) / len(values), precision) if len(values) > 0 else 0
            )

def roberta_scores(data, precision):
    """Fill the RoBERTa score columns of ``data`` in place.

    For every row, each comment in 'comments_raw' is tokenized (truncated to
    at most 128 tokens) and classified with the module-level ``tokenizer`` and
    ``model``. The softmax of the three output logits is read as
    (negative, neutral, positive). Per row these are averaged, multiplied by
    100, rounded to ``precision`` decimals and written to
    'roberta_score_neg' / 'roberta_score_neu' / 'roberta_score_pos'.
    Rows without comments get 0 in all three columns.

    Rows are addressed by the labels 0 .. len(data) - 1, so ``data`` needs a
    default integer index. Returns None.
    """
    # Inference only: disabling autograd avoids building a gradient graph for
    # every forward pass, which saves memory and time without changing outputs.
    with torch.no_grad():
        for i in tqdm(range(0, len(data))):
            negative = []
            neutral = []
            positive = []
            for comment in data['comments_raw'][i]:
                encoded_comment = tokenizer(comment, return_tensors='pt', padding=True, truncation=True, max_length=128)
                output = model(**encoded_comment)
                # output[0] holds the logits; [0] selects the single sequence in the batch.
                scores = softmax(output[0][0].detach().numpy())
                negative.append(scores[0])
                neutral.append(scores[1])
                positive.append(scores[2])
            data.loc[i, 'roberta_score_neg'] = round(100.0 * sum(negative) / len(negative), precision) if len(negative) > 0 else 0
            data.loc[i, 'roberta_score_pos'] = round(100.0 * sum(positive) / len(positive), precision) if len(positive) > 0 else 0
            data.loc[i, 'roberta_score_neu'] = round(100.0 * sum(neutral) / len(neutral), precision) if len(neutral) > 0 else 0

In [315]:
# Prediction and validation methods.

def validate_model(training_dataset, split, random_state=None):
    """Fit a random forest on a random train/test split and return the test MSE.

    Parameters
    ----------
    training_dataset : pandas.DataFrame
        Feature columns plus the target column 'overal_raw_rating'.
    split : float or int
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Seed forwarded to both the split and the forest. The default (None)
        keeps the previous unseeded behaviour; pass an int for reproducible
        results.

    Returns
    -------
    float
        Mean squared error of the predictions on the held-out part.
    """
    features = training_dataset.drop('overal_raw_rating', axis=1)
    target = training_dataset['overal_raw_rating']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=split, random_state=random_state
    )
    rf_regressor = RandomForestRegressor(random_state=random_state)
    rf_regressor.fit(X_train, y_train)
    predictions = rf_regressor.predict(X_test)
    return mean_squared_error(y_test, predictions)

def predict(training_dataset, realtime_dataset, random_state=None):
    """Fit a random forest on the full training data and predict new ratings.

    Parameters
    ----------
    training_dataset : pandas.DataFrame
        Feature columns plus the target column 'overal_raw_rating'.
    realtime_dataset : pandas.DataFrame
        Rows to predict; it is passed straight to the fitted model, so it must
        contain the same feature columns as ``training_dataset`` without the
        target.
    random_state : int or None, default None
        Seed for the forest. The default (None) keeps the previous unseeded
        behaviour; pass an int for reproducible predictions.

    Returns
    -------
    numpy.ndarray
        Predicted ratings, one per row of ``realtime_dataset``. The array is
        also printed.
    """
    features = training_dataset.drop('overal_raw_rating', axis=1)
    target = training_dataset['overal_raw_rating']

    rf_regressor = RandomForestRegressor(random_state=random_state)
    rf_regressor.fit(features, target)

    predicted_result = rf_regressor.predict(realtime_dataset)
    print(predicted_result)
    return predicted_result

In [316]:
### Using the model to predict based on loaded data

In [317]:
# Preprocess the first eight rounds in training mode (ratings rounded to 1 decimal).
d1, d2, d3, d4, d5, d6, d7, d8 = (
    data_preprocessing(raw_round, 1, vote_total)
    for raw_round, vote_total in (
        (gdko2023r1, r2023r1),
        (gdko2023r2, r2023r2),
        (gdko2023r3, r2023r3),
        (gdko2024r1, r2024r1),
        (gdko2024r2, r2024r2),
        (gdko2024r3, r2024r3),
        (gdko2025r1, r2025r1),
        (gdko2025r2, r2025r2),
    )
)
# Passing False marks this round as the prediction dataset; otherwise it would be a training dataset.
d9 = data_preprocessing(gdko2025r3, 1, r2025r3, False)

# Concatenated training dataset with a fresh 0..n-1 index.
td = pd.concat([d1, d2, d3, d4, d5, d6, d7, d8], ignore_index=True)

In [None]:
# (rows, columns) of the training frame `td`, then of the prediction frame `d9`.
training_shape, prediction_shape = td.shape, d9.shape
print(training_shape, prediction_shape)

In [None]:
# Fill the RoBERTa score columns: training frame first, then the prediction frame.
for scored_frame in (td, d9):
    roberta_scores(scored_frame, 1)

In [None]:
# Fill the VADER score columns: training frame first, then the prediction frame.
for scored_frame in (td, d9):
    vader_scores(scored_frame, 1)

In [None]:
def prediction_dataset(ds, is_training=True):
    """Select the columns fed to the model.

    Parameters
    ----------
    ds : pandas.DataFrame
        Preprocessed frame containing at least the selected feature columns
        (and 'overal_raw_rating' when ``is_training`` is True).
    is_training : bool, default True
        When True the target column 'overal_raw_rating' is appended and its
        missing values are replaced with 0.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns; ``ds`` itself is not modified.
    """
    # Features currently in use. Other candidates present in `ds` but not
    # selected: rating_count_normalized, comments_received, vader_score_neg,
    # vader_score_neu, vader_score_compound, roberta_score_neg,
    # roberta_score_neu, coolness.
    columns = [
        'rating_count',
        'vader_score_pos',
        'platforms_numeric',
        'roberta_score_pos',
    ]
    if not is_training:
        return ds[columns]

    columns.append('overal_raw_rating')
    # Fill on the selected copy rather than through a chained in-place call on
    # `ds[...]`: that form is unreliable (a no-op under pandas Copy-on-Write)
    # and would also mutate the caller's frame.
    return ds[columns].fillna({'overal_raw_rating': 0})
    

In [None]:
# Model input for training: selected features plus the target column.
training_dataset = prediction_dataset(td, is_training=True)
training_dataset.head(n=3)

In [None]:
# Model input for prediction: selected features only, no target column.
realtime_dataset = prediction_dataset(d9, is_training=False)
realtime_dataset.head(n=3)

In [None]:
# Per-column count of cells that hold an empty string.
empty_per_column = training_dataset.eq('').sum()
print("Number of empty cells per column:")
print(empty_per_column)

# Per-column count of missing (NaN) cells.
missing_per_column = training_dataset.isnull().sum()
print("Number of missing values per column:")
print(missing_per_column)

In [None]:
# Run ten validation rounds (35% of the rows held out in each) and show the average MSE.
# `statistics` is imported in the notebook's top import cell.
mse = [validate_model(training_dataset, 0.35) for _ in range(10)]
statistics.mean(mse)

In [None]:
# Lowest and highest MSE observed across the validation rounds.
min(mse), max(mse)

In [None]:
# Train on the full training set and predict ratings for the prediction set.
results = predict(
    training_dataset=training_dataset,
    realtime_dataset=realtime_dataset,
)

In [355]:
# Store the predictions next to their entries, rounded to two decimals.
d9['results'] = np.round(results, 2)

In [None]:
# Timestamp the export in UTC using only filename-safe characters:
# str(datetime) contains spaces and colons, which Windows rejects in file names,
# and datetime.utcnow() is deprecated in favour of an aware "now" in UTC.
export_timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
d9.to_csv('predictions_' + export_timestamp + '.csv', index=False)