## Setup
This section installs and imports packages for use in this script.

In [None]:
#necessary installations (more may be needed depending on your system)
pip install stanza
stanza.download('en')
nltk.download('sentiwordnet')

In [None]:
#necessary imports
from urllib.request import urlopen
import pandas as pd
import re
from random import randint
import stanza
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from collections import Counter
from operator import itemgetter
import math
from scipy import stats
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
import numpy as np
import statistics as st
import random
from matplotlib import pyplot as plt

## SWN modifications
The section below is used to investigate SWN (SentiWordNet) and determine what kind of modifications may need to be made to it. Note that nothing in this section implements any actual changes: modifications must be made within the SWN txt file itself and/or through the 'intercept layer' (see the 'Rule-based system' section).

In [None]:
#set up the Stanza pipeline for English
#only the language is given, so the choice of processors is left to Stanza
#the resulting `nlp` object is called on raw text by the cells and functions further down
nlp = stanza.Pipeline(lang='en')

In [None]:
#some useful swn/wn functionality:
#every result is printed explicitly, because a notebook cell only echoes a bare expression when it is the last one in the cell
print(list(swn.senti_synsets('heavy'))) #lists all synset entries for a word

print(wn.synset('ferocity.n.01').definition()) #shows the definition for a specific synset entry

print(swn.senti_synset('breakdown.n.03')) #shows the pos/neg scores of a specific synset entry

In [None]:
#runs the pipeline over a text called "review" and prints one line of parse info per word
review = nlp("This is a review example. It has two sentences.")
#each line shows the word's id and text, the id and text of its head ("ROOT" when the head id is not positive),
#its dependency relation and its part-of-speech tag
print("\n".join(
    f'id: {word.id}\tword: {word.text}\thead id: {word.head}\t'
    f'head: {sent.words[word.head-1].text if word.head > 0 else "ROOT"}\t'
    f'deprel: {word.deprel}\tpos: {word.pos}'
    for sent in review.sentences
    for word in sent.words
))

In [None]:
#a function to check how many definitions (synset entries) a word has in swn.
#returns a list of those with just one definition, and a list of all the others
#input should be a list of words you want to (potentially) modify

def number_of_defs(words_to_modify):
    """Split words by whether SWN holds exactly one synset entry for them.

    Prints the entry count of every word, then the list of single-entry words.

    Args:
        words_to_modify: iterable of word strings to look up in SWN.

    Returns:
        A tuple (single_def_words, multi_def_words). Every word whose count is
        not exactly 1 goes into the second list, which therefore also receives
        words that SWN has no entry for.
    """
    single_def_words = []
    multi_def_words = []
    for word in words_to_modify:
        #look the word up once and reuse the count for both the report and the test
        def_count = len(list(swn.senti_synsets(word)))
        print("The word",word,"has this number of definitions in swn:",def_count)
        if def_count == 1:
            single_def_words.append(word)
        else:
            multi_def_words.append(word)
    print("The following words have only a single definition:",single_def_words)
    return single_def_words, multi_def_words

In [None]:
#a function to see which swn definition is picked most often for a word
#input should be a list of sample sentences, the word you want to check, and what type that word is
#word_type should match the types used in swn: 'n' (noun), 'r' (adverb), 'v' (verb), 'a' (adjective), 's' (adjective satellite)
#note that adjectives have two possible types, 's' or 'a', see here: https://wordnet.princeton.edu/documentation/wndb5wn

def which_def(review_texts, word, word_type):
    """Run Lesk disambiguation for `word` against every sentence and print a tally.

    Each text is passed through the Stanza pipeline `nlp`; the token texts of
    each sentence form the context handed to nltk.wsd.lesk together with
    `word` and `word_type`. One result is recorded per sentence.
    Nothing is returned; the most common result and the full result list are printed.
    """
    senses = []
    for text in review_texts:
        for sentence in nlp(text).sentences:
            context = [token.text for token in sentence.tokens]
            senses.append(nltk.wsd.lesk(context, word, word_type))
    sense_counts = Counter(senses)
    print("The word is:",word,"\nThe most common result for that word was:",sense_counts.most_common(1),"\nTotal results for this word were:",senses,"\n")

In [None]:
#sample usage: to disambiguate the word 'brutality' as a noun, we write a few made-up sentences that use it
#and pass them to which_def

brutality_snippets = [
    "The brutality of this music is great.",
    "There's so much brutality in this album.",
    "Brutality, that's the dominant force here.",
    "The music is brimming over with brutality.",
    "What this album really needs is more brutality.",
]
which_def(brutality_snippets, word='brutality', word_type='n')

In [None]:
#a function to search through a series of texts and find unrecognised words
#the parser defines these as nouns by default, and they will be returned if they also have no definition in SWN
#these can then be examined to look for new sentiment terms

def new_words(reviews, existing_set=None, to_skip=None):
    """Collect nouns from `reviews` that have no synset entry in SWN.

    Args:
        reviews: iterable of review texts; each one is parsed with the Stanza pipeline `nlp`.
        existing_set: optional container of words that are already known and must not be
            reported again. Only membership tests (`in`) are performed on it.
        to_skip: optional container of review index numbers (0-based) to leave unparsed.

    Returns:
        A list of the new words, in order of first appearance and without repeats.
    """
    #None stands in for "empty" so that no mutable object is shared between calls
    existing_set = () if existing_set is None else existing_set
    to_skip = () if to_skip is None else to_skip

    found = []
    seen = set() #mirrors `found` for fast membership tests
    for index, review in enumerate(reviews):
        print("Review index number:",index)
        if index in to_skip:
            continue
        doc = nlp(review)
        for sentence in doc.sentences:
            for word in sentence.words:
                text = word.text
                #cheap checks first, so the SWN lookup only runs for unseen candidate nouns
                if word.pos != "NOUN" or text in seen or text in existing_set:
                    continue
                if len(list(swn.senti_synsets(text))) < 1:
                    print("New word being added:",text)
                    found.append(text)
                    seen.add(text)
    return found

## Creation of corpus
In this section we scrape websites to build the corpus. Note that the real details have been replaced with fictional examples here, and this section will need to be customised based on the sites you wish to scrape and the data you need from them.

In [None]:
#a function that can be used to scrape a given URL.
#can be run in print mode (where it prints the results) or return mode, where it returns them

def scrape_check(url, output="print"):
    """Fetch `url` and give back its body decoded as UTF-8.

    Args:
        url: address passed to urllib.request.urlopen.
        output: "print" prints the page and returns None; "return" returns the page
            as a string. Any other value fetches the page but neither prints nor returns it.

    Raises:
        UnicodeDecodeError: if the body is not valid UTF-8.
        Whatever urlopen raises when the URL cannot be opened.
    """
    #the context manager closes the connection even if reading or decoding fails
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    if output == "print":
        print(html)
    elif output == "return":
        return html

In [None]:
#a dictionary to use for fixing incorrectly encoded characters
#note that this is not a comprehensive list, but one based on my own scraping efforts
#new characters may need to be added, or existing ones altered, depending on your setup and corpus
#
#keys are the character codes as they appear in the scraped text; values are what they get replaced with
#the entries are applied in the order listed, so a code with a trailing ';' must come before its ';'-less variant
#values are used as re.sub replacement strings, which is why a single backslash is written as '\\\\'
#each key should appear only once: when a key is repeated, Python silently keeps just the last value

fixdict = {
    '&Auml;': 'Ä',
    '&auml;': 'ä',
    '&Euml;': 'Ë',
    '&euml;': 'ë',
    '&Iuml;': 'Ï',
    '&iuml;': 'ï',
    '&Ouml;': 'Ö',
    '&ouml;': 'ö',
    '&Uuml;': 'Ü',
    '&uuml;': 'ü',
    '&Yuml;': 'Ÿ',
    '&yuml;': 'ÿ',
    '&Aacute;': 'Á',
    '&aacute;': 'á',
    '&Cacute;': 'Ć',
    '&cacute;': 'ć',
    '&Eacute;': 'É',
    '&eacute;': 'é',
    '&Iacute;': 'Í',
    '&iacute;': 'í',
    '&Nacute;': 'Ń',
    '&nacute;': 'ń',
    '&Oacute;': 'Ó',
    '&oacute;': 'ó',
    '&Sacute;': 'Ś',
    '&sacute;': 'ś',
    '&Uacute;': 'Ú',
    '&uacute;': 'ú',
    '&Yacute;': 'Ý',
    '&yacute;': 'ý',
    '&Acirc;': 'Â',
    '&acirc;': 'â',
    '&Ecirc;': 'Ê',
    '&ecirc;': 'ê',
    '&Icirc;': 'Î',
    '&icirc;': 'î',
    '&Ocirc;': 'Ô',
    '&ocirc;': 'ô',
    '&Ucirc;': 'Û',
    '&ucirc;': 'û',
    '&AElig;': 'Æ',
    '&aelig;': 'æ',
    '&Ccaron;': 'Č',
    '&ccaron;': 'č',
    '&Agrave;': 'À',
    '&agrave;': 'à',
    '&Egrave;': 'È',
    '&egrave;': 'è',
    '&Ograve;': 'Ò',
    '&ograve;': 'ò',
    '&Ugrave;': 'Ù',
    '&ugrave;': 'ù',
    '&Aring;': 'Å',
    '&aring;': 'å',
    '&Oslash;': 'Ø',
    '&oslash;': 'ø',
    '&eth;': 'ð',
    '&ntilde;': 'ñ',
    '&atilde;': 'ã',
    '&oelig;': 'œ',
    '&ccedil;': 'ç',
    '&THORN;': 'Þ',
    '&amp;': '&',
    '&quot;': '"',
    '&quot': '"',
    '&tilde;': '~',
    '&lt;': '<',
    '&gt;': '>',
    '&ldquo;': '"',
    '&rdquo;': '"',
    '&mdash;': '-',
    '&ndash;': '-',
    '&hellip;': '...',
    '<sup>2</sup>': '^2',
    '<sup>3</sup>': '^3',
    '<sup>23</sup>': '^23',
    '&#1048;': 'И',
    '&#1103;': 'я',
    '&#1084;': 'м',
    '&#1085;': 'н',
    '&#1077;': 'е',
    '&#1051;': 'Л',
    '&#1075;': 'г',
    '&#1080;': 'и',
    '&#1086;': 'о',
    '&#363;': 'ū',
    '&#257;': 'ā',
    '&#269;': 'č',
    '&#283;': 'ě',
    '&#378;': 'ź',
    '&#263;': 'ć',
    '&#279;': 'ė',
    '&#0322;': 'ł',
    '&#335;': 'ŏ',
    '&#240;': 'ð',
    '&#382;': 'ž',
    '&#337;': 'ő',
    '&#328;': 'ň',
    '&#039;': "'",
    '&lsquo;': "'",
    '&rsquo;': "'",
    '&nbsp;': '',
    '&thorn;': 'þ',
    '&macr;': '¯',
    '&#1079;': 'з',
    '&#1061;': 'Х',
    '&#1072;': 'a',
    '&#1089;': 'c',
    '&#1044;': 'Д',
    '&#1088;': 'р',
    '&#1074;': 'в',
    '&#1093;': 'х',
    '&#1042;': 'В',
    '&#51665;': '집',
    '&Epsilon;': 'Ε',
    '&tau;': 'τ',
    '&epsilon;': 'ε',
    '&rho;': 'ρ',
    '&phi;': 'φ',
    '&omega;': 'ω',
    '&sigmaf;': 'ς',
    '&Kappa;': 'Κ',
    '&Tau;': 'Τ',
    '&omicron;': 'o',
    '&nu;': 'ν',
    '&Delta;': 'Δ',
    '&alpha;': 'α',
    '&mu;': 'μ',
    '&Mu;': 'M',
    '&upsilon;': 'υ',
    '&#22116;': '噤',
    '&#22818;': '夢',
    '&#947;': 'γ',
    '&sup2;': '^2',
    '&sup3;': '^3',
    '&#23798;': '島',
    '&#23996;': '嶼',
    '&#31070;': '神',
    '&#35441;': '話',
    '&#491;': 'ǫ',
    '&#259;': 'ă',
    '&#966;': 'φ',
    '&#351;': 'ş',
    'O&#776;': 'Ö',
    '&#1082;': 'к',
    '&#1102;': 'ю',
    '&#1095;': 'ч',
    '&#19968;': '一',
    '&#26399;': '期',
    '&#20250;': '会',
    '&#268;': 'Č',
    '&#937;': 'Ω',
    '&#1090;': 'т',
    '&#322;': 'ł',
    '&#1064;': 'Ш',
    '&#1081;': 'й',
    '&#921;': 'Ι',
    '&#963;': 'σ',
    '&#972;': 'ό',
    '&#952;': 'θ',
    '&#949;': 'ε',
    '&#959;': 'ο',
    '&#962;': 'ς',
    '&#1110;': 'i',
    '&#1054;': 'O',
    '&#1050;': 'К',
    '&#1056;': 'Р',
    '&#1101;': 'э',
    '&#1087;': 'п',
    '&#1073;': 'б',
    '&#1083;': 'л',
    '&#1105;': 'ё',
    '&#1076;': 'д',
    '&#1047;': 'З',
    '&#1099;': 'ы',
    '&#1053;': 'H',
    '&#1055;': 'П',
    '&#1097;': 'щ',
    '&#1091;': 'y',
    '&#1096;': 'ш',
    '&#1043;': 'Г',
    '&#1078;': 'ж',
    '&#1057;': 'C',
    '&#1063;': 'Ч',
    '&#1098;': 'ъ',
    '&#1058;': 'T',
    '&#1052;': 'M',
    '&#1100;': 'ь',
    '&#8203;': '',
    '&#133;': '...',
    '&#353;': 'š',
    '&#380;': 'ż',
    '&#183;': '·',
    '&#345;': 'ř',
    '&#357;': 'ť',
    '&#352;': 'Š',
    '&#277;': 'ĕ',
    '&#1059;': 'У',
    '&#154;': 'š',
    '&#X10c;': 'Č',
    '&#X10C;': 'Č',
    '&#X107;': 'ć',
    '&#92;': '\\\\',
    '&#39740;': '鬼',
    '&#7717;': 'ḥ',
    '&#73773;': '𒀭',
    '&#7789;': 'ṭ',
    '&#12295;': '〇',
    '&#7809;': 'ẁ',
    '&#8734;': '∞',
    #Devanagari: each code maps to its own character (the values were previously shifted by one entry)
    '&#2357;': 'व',
    '&#2366;': '\u093e', #the vowel sign AA, written as an escape because it is a combining mark
    '&#2328;': 'घ',
    '&#2344;': 'न',
    '&#2326;': 'ख',
    '&#301;': 'ĭ',
    '&#537;': 'ș',
    '&#917;': 'Ε',
    '&#964;': 'τ',
    '&#961;': 'ρ',
    'ο&#769;': 'ό',
    '&#969;': 'ω',
    '&#913;': 'Α',
    '&#945;': 'α',
    '&#960;': 'π',
    '&#9839;': '#',
    '&#272;': 'Đ',
    '&#273;': 'đ',
    '&#9578;': '╪',
    '&#650;': 'ʊ',
    '&#26286;': '暮',
    '&#23665;': '山',
    '&#33337;': '船',
    '&#24433;': '影',
    '&#23396;': '孤',
    '&#29128;': '燈',
    '&#24494;': '微',
    '&#38593;': '雁',
    '&#32027;': '紛',
    '&#39131;': '飛',
    '&#28784;': '灰',
    '&#26376;': '月',
    '&#28472;': '漸',
    '&#26126;': '明',
    '&#24565;': '念',
    '&#20234;': '伊',
    '&#20154;': '人',
    '&#29694;': '現',
    '&#35937;': '象',
    '&#40441;': '鷹',
    '&#34892;': '行',
    '&#21109;': '創',
    '&#19990;': '世',
    '&#27946;': '洪',
    '&#27700;': '水',
    '&#24040;': '巨',
    '&#39250;': '饒',
    '&#20126;': '亞',
    '&#21746;': '哲',
    '&#923;': 'Λ',
    '&#942;': 'ή',
    '&#951;': 'η',
    '&#920;': 'Θ',
    '&#954;': 'κ',
    '&#946;': 'β',
    '&#943;': 'ί',
    '&#291;': 'ģ',
    '&#916;': 'Δ',
    '&#948;': 'δ',
    '&#940;': 'ά',
    '&#7941;': 'ἅ',
    '&#7940;': 'ἄ',
    '&#8032;': 'ὠ',
    '&#955;': 'λ',
    '&#953;': 'ι',
    '&#539;': 'ț',
    '&#1041;': 'Б',
    '&#1237;': 'ӕ',
    '&#1040;': 'А',
    '&#928;': 'Π',
    '&#905;': 'Ή',
    '&#924;': 'Μ',
    '&#931;': 'Σ',
    '&#957;': 'ν',
    '&#927;': 'Ο',
    '&#922;': 'Κ',
    '&#965;': 'υ',
    '&#974;': 'ώ',
    '&#941;': 'έ',
    '&#956;': 'μ',
    '&#967;': 'χ',
    '&#932;': 'Τ',
    '&#324;': 'ń',
    '&#350;': 'Ş',
    '&#973;': 'ύ',
    '&#300;': 'Ĭ',
    '&#355;': 'ţ',
    '&#958;': 'ξ',
    '&#950;': 'ζ',
    '&#596;': 'ɔ',
    '&#720;': 'ː',
    '&#536;': 'Ș',
    '&#1046;': 'Ж',
    '&#1094;': 'ц',
    '&#1071;': 'Я',
    '&#369;': 'ű',
    #NOTE(review): 769 is the combining acute accent on its own; mapping it to 'í' looks corpus-specific - confirm
    '&#769;': 'í',
    '&#8206;': '',
    '&#8195;': '',
    '&#919;': 'Η',
    '&#914;': 'Β',
    '&#902;': 'Ά',
    '&#968;': 'ψ',
    '&#321;': 'Ł',
    '&#1092;': 'ф', #a second '&#1092;' entry further down used to override this with 'к'
    '&#5573;': 'ᗅ',
    '&#5626;': 'ᗺ',
    '&#5623;': 'ᗷ',
    '&#354;': 'Ţ',
    #a letter followed by code 822 (a combining strike-through) is struck-out text, so the letter is dropped as well
    'p&#822;': '',
    'e&#822;': '',
    'w&#822;': '',
    'i&#822;': '',
    'l&#822;': '',
    ' &#822;': '',
    'd&#822;': '',
    'f&#822;': '',
    'n&#822;': '',
    't&#822;': '',
    'y&#822;': '',
    'b&#822;': '',
    'a&#822;': '',
    'c&#822;': '',
    'k&#822;': '',
    's&#822;': '',
    'o&#822;': '',
    'm&#822;': '',
    'h&#822;': '',
    'u&#822;': '',
    'g&#822;': '',
    '&#9834;': '♪',
    '&#9835;': '♫',
    '&#9745;': '☑',
    '&#8213;': '-',
    '&#1062;': 'Ц',
    '&#1030;': 'І',
    '&#8220;': '"',
    '&#8221;': '"',
    '&#8221': '',
    '&#305;': 'ı',
    '&#233;': 'é',
    '&#248;': 'ø',
    '&#228;': 'ä',
    '&#244;': 'ô',
    '&#232;': 'è',
    '&#231;': 'ç',
    '&#245;': 'õ',
    '&#227;': 'ã',
    '&#229;': 'å',
    '&#224;': 'à',
    '&#250;': 'ú',
    '&#252;': 'ü',
    '&#8211;': '-',
    '&#8216;': "'",
    '&#8217;': "'",
    '&#225;': 'á',
    '&#261;': 'ą',
    '&#601;': 'ə',
    '&#712;': "'",
    '&#246;': 'ö',
    '&#238;': 'î',
    '&#235;': 'ë', #235 is the lowercase letter (the uppercase 'Ë' is 203)
    '&#196;': 'Ä',
    '&#246': 'ö',
    '&#237;': 'í',
    '&#347;': 'ś',
    '&#8230;': '...',
    '&#12484;': 'ツ',
    '&#39072;': '颠',
    '&#35206;': '覆',
    '&#10003;': '✓',
    '&#8197;': ' ',
    '&#147;': '"',
    '&#148;': '"',
    '&#145;': "'",
    '&#150;': "-",
    '&#149;': "•",
    '&#508;': "Ǽ",
    '&#506;': "Ǻ",
    '&#935;': "Χ",
    '&#915;': "Γ",
    '&#367;': "ů",
    '&#934;': "Φ",
    '&#926;': "Ξ",
    '&#1605;': '',
    '&#1576;': '',
    '&#1610;': '',
    '&#1583;': '',
    '&#1575;': '',
    '&#1604;': '',
    '&#1617;': '',
    '&#1606;': '',
    '&#61521;': '',
    '&#61484;': '',
    '&#61472;': '',
    '&#333;': 'ō',
    '&#1108;': 'є',
    '&#1045;': 'E',
    '&#379;': 'Ż',
    '&#908;': 'Ό',
    '&#38291;': '間',
    '&#332;': 'Ō',
    '&#493;': 'ǭ', #493 is the lowercase letter (the uppercase 'Ǭ' is 492)
    '&#x65e0;': '无',
    '&#x540d;': '名',
    '&#x645;': '',
    '&#x6CC;': '',
    '&#x631;': '',
    '&#x627;': '',
    '&#x62B;': '',
    '&#903;': '·',
    '&#216;': 'Ø',
    '&#226;': 'â',
    '&#8208;': '-',
    '&#374;': 'Ŷ',
    '&#198;': 'Æ',
    '&Lcy;': 'Л',
    '&iecy;': 'е',
    '&gcy;': 'г',
    '&ncy;': 'н',
    '&dcy;': 'д',
    '&acy;': 'а',
    '&#x20;': ' ',
    '&Ocy;': 'О',
    '&#344;': 'Ř',
    '&#26885;': '椅',
    '&#23376;': '子',
    '&#1578;': '',
    '&#1581;': '',
    '&#1602;': '',
    #NOTE(review): 32 is the space character, yet it is removed here while '&#x20;' (the same character) becomes ' ' - confirm which is intended
    '&#32;': '',
    '&#1585;': '',
    '&#1587;': '',
    '&#1489;': 'ב',
    '&#929;': 'P',
    '&#255;': 'ÿ', #255 is the lowercase letter (the uppercase 'Ÿ' is 376)
    '&#78;': 'N',
    '&chi;': 'χ',
    '&iota;': 'ι',
    '&pi;': 'π',
    '&lambda;': 'λ',
    '&eta;': 'η',
    '&sigma;': 'σ',
    '&#8237;': '',
    '&#8236;': '',
    '&#146;': "'"
}

In [None]:
#functions used to clean scraped text.
#character_code_cleanup fixes based on the fixdict dictionary
#clean_text removes anything within angled brackets (which will typically be formatting markers)

def character_code_cleanup(column, mapping=None):
    """Replace character codes in every text of `column`.

    Args:
        column: iterable of strings to clean.
        mapping: optional dict of {code: replacement}; defaults to the module-level
            `fixdict`. Codes are matched as literal text and applied in the dict's
            order, each one working on the result of the previous ones.
            Replacements are re.sub replacement strings, so a literal backslash
            has to be written doubled.

    Returns:
        A new list with the cleaned strings, in the same order as `column`.
    """
    if mapping is None:
        mapping = fixdict
    #compile every code once, escaped so that characters such as '.' or '(' in a code
    #are matched literally instead of being read as regex syntax
    replacements = [(re.compile(re.escape(code)), character) for code, character in mapping.items()]
    cleaned = []
    for entry in column:
        for pattern, character in replacements:
            entry = pattern.sub(character, entry)
        cleaned.append(entry)
    return cleaned

def clean_text(text):
    """Return `text` with every '<...>' span that has at least one character between the brackets removed."""
    tag_pattern = '<[^>]+>'
    return re.sub(tag_pattern, '', text)

In [None]:
#an example of a function for scraping a particular site of reviews.
#note that this will need to be modified depending on the site's particular layout/architecture, and what you wish to scrape

def _extract_field(page, start_marker, end_marker):
    """Return (field_text, rest_of_page) for the field that follows `start_marker`.

    `rest_of_page` is `page` cut so that it begins right after `start_marker`;
    `field_text` is everything in it up to the next `end_marker`.
    Raises ValueError (from str.index) when either marker is missing.
    """
    field_start = page.index(start_marker) + len(start_marker)
    rest_of_page = page[field_start:]
    return rest_of_page[:rest_of_page.index(end_marker)], rest_of_page


def example_scrape(dir_links, id_length=5):
    """Scrape every review linked from the given directory pages.

    Args:
        dir_links: iterable of directory page URLs; each is fetched with scrape_check.
        id_length: number of characters read after each review-ID marker. Only the
            digits within that window are kept, so the window may be wider than the ID.
            You may need to tweak this to ensure that the correct piece of text is
            extracted and nothing more.

    Returns:
        A tuple (all_review_texts, review_titles, review_ratings) of parallel lists.
        A review without a rating gets the rating 'N/A'.

    Raises:
        ValueError: if a review page lacks the title or review-body markers.
    """
    #lists to contain review links, texts, ratings and titles
    review_links_list = []
    all_review_texts = []
    review_titles = []
    review_ratings = []

    #markers for the start and end of each of the relevant fields
    #each start marker should be a unique piece of text that marks where the relevant field begins on that webpage. If it is not unique, the scraping will pick up incorrect info
    #the end marker should then mark where it ends (though it does not need to be unique, as the code will simply find the next instance of it after the start)
    #the start/end points do not need to be perfect, but the more unnecessary text that is scraped, the more cleaning will have to be done afterwards
    title_start_marker = 'review_title='
    title_end_marker = '/end_title'
    review_start_marker = 'review_body='
    review_end_marker = '/end_body'
    rating_start_marker = 'review_rating='
    rating_end_marker = '/end_rating'

    #in this example, we have a website with directories of all review URL links
    #these directories will be scraped to gather all review URLs, which can each then be scraped for the data we want
    #in this particular case, every review URL takes the form "http://www.example.com/reviewid=", then a numbered ID of the review
    #so we can scrape the directories for the numbered IDs, then paste them onto this base URL to create links to the reviews
    #id_start_marker indicates the marker within the directory pages that precedes a numbered review ID
    example_review_baselink = "http://www.example.com/reviewid="
    id_start_marker = '<a href="/reviews/id='

    #we work through each directory listed in dir_links, scraping it
    for directory in dir_links:
        current_page = scrape_check(directory, output="return")
        #we walk through the page from marker to marker, moving a search position forward
        #instead of cutting the page down after every link
        search_from = 0
        while len(current_page) - search_from > 1:
            marker_at = current_page.find(id_start_marker, search_from)
            if marker_at == -1:
                #no further review links on this page
                break
            id_start = marker_at + len(id_start_marker)
            id_end = id_start + id_length
            #keeping only the digits ensures that just the review ID number is extracted
            revid = "".join(filter(str.isdigit, current_page[id_start:id_end]))
            review_links_list.append(example_review_baselink + revid)
            search_from = id_end

    #having gone through every directory, we now have a list of all review URLs
    #dict.fromkeys drops repeated links while keeping the order in which they were found
    review_links_list = list(dict.fromkeys(review_links_list))
    #now we scrape every review URL in that list, and extract the desired information from it
    #in this example specifically, we are extracting the review title, review text and review rating, and they appear in that order on the page
    #note that each extraction discards the part of the page before the field, so the below must go in the order the desired fields appear on the page
    for revlink in review_links_list:
        print("Current review scrape:",revlink)
        full_page = scrape_check(revlink, output="return")
        title_text, full_page = _extract_field(full_page, title_start_marker, title_end_marker)
        review_titles.append(title_text)
        review_text, full_page = _extract_field(full_page, review_start_marker, review_end_marker)
        all_review_texts.append(review_text)
        try:
            review_rating, full_page = _extract_field(full_page, rating_start_marker, rating_end_marker)
        #in some cases the review might not have a rating, and is recorded as 'N/A'
        except ValueError:
            review_rating = 'N/A'
        review_ratings.append(review_rating)
    #then we return all scraped data
    return all_review_texts, review_titles, review_ratings

In [None]:
#building the example lists
#this example has 54 directory pages, whose URLs all end in a number from 1 to 54
#so the full list of directory pages is simply the base URL with each of those numbers appended

example_directory_baselink = 'http://www.example.com/directory='
example_directory_links = [
    example_directory_baselink + str(page_number)
    for page_number in range(1,55)
]

#scraping that collection of directory links yields the review links, and then the desired data from each review
example_reviews, example_titles, example_ratings = example_scrape(example_directory_links)

In [None]:
#cleanup of the review column
#again, the exact details will vary by site
#here the first 13 characters of each review text are unwanted and get cut off,
#then the character encodings are fixed and any formatting markers within <> brackets are stripped

trimmed_reviews = [review[13:] for review in example_reviews]
decoded_reviews = character_code_cleanup(trimmed_reviews)
example_reviews = [clean_text(review) for review in decoded_reviews]

In [None]:
#the titles column is cleaned up as well (it has no formatting markers, so only the character encoding cleanup is needed)
#before that, the last 10 characters of each title, which are unnecessary text, are cut off

trimmed_titles = [title[:-10] for title in example_titles]
example_titles = character_code_cleanup(trimmed_titles)

In [None]:
#two more lists to act as columns in the final dataframe: one naming the source that was scraped,
#and one holding a running index that can later act as a unique ID number for each review
example_origin = ['example.com' for _ in example_ratings]
example_index = list(range(len(example_ratings)))

In [None]:
#the lists are combined into a pandas dataframe, which is then written out as a csv file
#neither the dataframe's row index nor the column names are written to the file
d_example = {
    'Index': example_index,
    'Source': example_origin,
    'Title': example_titles,
    'Rating': example_ratings,
    'Review': example_reviews,
}
df_example = pd.DataFrame(data=d_example)
df_example.to_csv('example.csv', header=False, index=False)

## Rule-based system
This section contains the main rule-based system of the project.

In [None]:
#if not done earlier, we set up the Stanza pipeline
#building the pipeline loads its models again, so an `nlp` object that already exists in this session is kept as it is

if 'nlp' not in globals():
    nlp = stanza.Pipeline(lang='en')

In [None]:
#we define a dictionary structure with aspect categories as keys and all of their terms as values
#this dictionary is the single place where terms are maintained: the flat lists of explicit and implicit terms
#are derived from it, so that the three structures cannot drift apart
#(the hand-written explicit list was missing 'inspiration', which the 'creativity' category contains)

aspect_synonyms = {
    'implicit': ['song', 'album', 'track', 'sound', 'band', 'artist', 'group', 'musician', 'performer', 'member', 'music'],
    'guitars': ['guitar', 'guitarist', 'riff', 'riffing', 'lead', 'solo', 'soloing', 'noodling', 'riffage'],
    'vocals': ['vocalist', 'singer', 'vocal', 'vocalisation', 'vocalising', 'vokill', 'singing', 'scream', 'screaming', 'growl', 'growling', 'roar', 'roaring', 'shriek', 'shrieking', 'wail', 'wailing', 'snarl', 'snarling', 'bark', 'barking', 'howl', 'howling', 'yell', 'yelling', 'chant', 'chanting', 'croon', 'crooning', 'voice', 'vox'],
    'drums': ['drummer', 'percussionist', 'drum', 'drumming', 'fill', 'beat', 'percussion'],
    'production': ['production', 'mastering', 'master', 'mix', 'production job', 'producer', 'recording'],
    'breakdowns': ['breakdown'],
    'lyrics': ['lyric', 'theme'],
    'ambience': ['ambience', 'mood', 'atmosphere'],
    'writing': ['composition', 'structure', 'writing', 'songwriting', 'arrangement'],
    'bass': ['bass', 'bassline', 'bassist'],
    'melodies': ['melody', 'harmony', 'rhythm', 'grooves'],
    'technicality': ['technicality', 'complexity', 'signature', 'skill', 'talent', 'musicianship'],
    'keyboards': ['keyboard', 'synth', 'piano'],
    'symphonics': ['orchestra', 'choir', 'symphonic', 'orchestration'],
    'exotic instruments': ['sax', 'saxophone', 'tambourine', 'trumpet', 'flute'],
    'creativity': ['creativity', 'experimentation', 'variation', 'variety', 'diversity', 'inspiration'],
    'comedy': ['humor', 'humour', 'comedy'],
    'catchiness': ['memorability', 'catchiness'],
}

#the implicit terms, as a list of their own (a copy, so that changing it leaves the dictionary untouched)
implicit_aspects = list(aspect_synonyms['implicit'])

#every explicit term, i.e. the terms of all categories other than 'implicit', in the order of the dictionary
full_aspect_list = [
    term
    for category, terms in aspect_synonyms.items()
    if category != 'implicit'
    for term in terms
]

In [None]:
#the intercept layer for new words and their sentiment scores.
#this includes words completely new to SWN and those which do exist in SWN, but have a new specific meaning within this domain which is not used by SWN.
#these new meaning entries should only be added where the new, domain-specific meaning is very dominant (i.e. if you see this word in this domain, it will almost always be used in this new sense)
#note that hyphenated terms are merged, as the system will modify review text to convert such terms and allow them to be recognised
#the scores in this table lie between -1.0 and 1.0; a score of 0 marks a term as carrying no sentiment
#each term should appear only once: when a key is repeated, Python silently keeps just the last value

new_sentiment_terms = {
    "hardrocking": 0.375,
    "hardrock": 0.375,
    "headbanging": 0.375,
    "rock": 0.375,
    "rocker": 0.375,
    "masterclass": 0.625,
    "masterwork": 0.625,
    "brutalise": 0.25,
    "brutalize": 0.25,
    "brutaliser": 0.25,
    "brutalizer": 0.25,
    "bythenumbers": -0.25,
    "runofthemill": -0.25,
    "cringeworthy": -0.75,
    "cringey": -0.75,
    "cringy": -0.75,
    "wankery": -0.5,
    "thrashy": 0.375,
    "catchiness": 0.125,
    "headbang": 0.375,
    "headbanger": 0.375,
    "bang": 0.375,
    "banger": 0.375,
    "skipworthy": -0.25,
    "forgettability": -0.5,
    "rollercoaster": 0.375,
    "cliché": -0.375,
    "cliche": -0.375,
    "trve": -0.5,
    "tr00": -0.5,
    "kvlt": -0.5,
    "cvlt": -0.5,
    "stratospheric": 0.25,
    "asshat": -0.625,
    "knuckledragger": -0.375,
    "clubcore": -0.5,
    "br00tal": -0.5,
    "brootal": -0.5,
    "br00tality": -0.5,
    "brootality": -0.5,
    "musthear": 0.375,
    "mustlisten": 0.375,
    "poorman's": -0.875,
    "standout": 0.375,
    "mallcore": -0.5,
    "oversaturation": -0.375,
    "oversaturated": -0.375,
    "oversaturate": -0.375,
    "butthurt": -0.625,
    "burnout": -0.5,
    "burntout": -0.5,
    "fistpump": 0.375,
    "hornthrow": 0.375,
    "necksnap": 0.375,
    "fistpumper": 0.375,
    "hornthrower": 0.375,
    "necksnapper": 0.375,
    "mudslinging": 0.375,
    "listenability": 0.625,
    "awesomeness": 0.75,
    "thrashiness": 0.375,
    "doomster": 0.375,
    "noholdsbarred": 0.25,
    "snorefest": -0.25,
    "snoozefest": -0.25,
    "borefest": -0.25,
    "musthave": 0.375,
    "vomitinduce": -0.5,
    "turnoff": -0.75,
    "pitchperfect": 1.0,
    "clunker": -0.625,
    "ripoff": -0.25,
    "pulserace": 0.375,
    "trainwreck": -0.375,
    "badass": 0.375,
    "infectiousness": 0.125,
    #this term used to be listed twice (0.125 here, 0.25 further down); 0.25 is the value that was in effect
    #NOTE(review): confirm that 0.25, and not 0.125 as for "infectiousness", is the intended score
    "infectious": 0.25,
    "crustiness": 0.375,
    "selfparody": -0.375,
    "sellout": -0.625,
    "soldout": -0.625,
    "skullcrush": 0.25,
    "skullcrushingly": 0.25,
    "eargasm": 0.375,
    "hardhitting": 0.25,
    "hardhit": 0.25,
    "earworm": 0.125,
    "heavy": 0.375,
    "heaviness": 0.375,
    "harsh": 0,
    "harshness": 0,
    "filler": -0.25,
    "generic": -0.375,
    "shredding": 0,
    "shred": 0,
    "killer": 0.375,
    "solid": 0.25,
    "distorted": 0,
    "distortion": 0,
    "brutal": 0.25,
    "clean": 0,
    "dark": 0.25,
    "rubbish": -0.375,
    "dirty": 0,
    "filthy": 0,
    "comeback": 0,
    "live": 0,
    "progressive": 0,
    "thrash": 0,
    "sludge": 0,
    "industrial": 0,
    "fulllength": 0
}

In [None]:
#a dictionary of modifier terms which will alter the sentiment score of an attached word by a certain proportion
#positive values belong to strengthening terms ("very"), negative values to weakening or negating terms ("slightly", "not")
#multi-word modifiers are written without spaces, which is the form the system compresses them into
#the misspelled keys "collossal", "unimagiable" and "visable" are kept so that nothing which matched before stops matching;
#their correct spellings are listed right after them with the same value

modifiers = {
    "less": -1.5,
    "barely": -1.5,
    "hardly": -1.5,
    "almost": -1.5,
    "nottoo": -1.5,
    "notonly": 0.5,
    "notjust": 0.5,
    "notsimply": 0.5,
    "only": -0.5,
    "alittle": -0.5,
    "alittlebit": -0.5,
    "slightly": -0.5,
    "marginally": -0.5,
    "relatively": -0.3,
    "mildly": -0.3,
    "moderately": -0.3,
    "somewhat": -0.3,
    "partially": -0.3,
    "abit": -0.3,
    "arguably": -0.2,
    "mostly": -0.2,
    "mainly": -0.2,
    "theleastbit": -0.9,
    "tosomeextent": -0.2,
    "toacertainextent": -0.2,
    "sortof": -0.3,
    "sorta": -0.3,
    "kindof": -0.3,
    "kinda": -0.3,
    "fairly": -0.2,
    "pretty": -0.1,
    "rather": -0.1,
    "immediately": 0.1,
    "quite": 0.1,
    "perfectly": 0.1,
    "consistently": 0.1,
    "really": 0.2,
    "clearly": 0.2,
    "obviously": 0.2,
    "certainly": 0.2,
    "completely": 0.2,
    "definitely": 0.2,
    "absolutely": 0.2,
    "constantly": 0.2,
    "highly": 0.2,
    "very": 0.2,
    "significantly": 0.2,
    "noticeably": 0.2,
    "distinctively": 0.2,
    "frequently": 0.2,
    "awfully": 0.2,
    "totally": 0.2,
    "largely": 0.2,
    "fully": 0.2,
    "extra": 0.3,
    "truly": 0.3,
    "especially": 0.3,
    "particularly": 0.3,
    "damn": 0.3,
    "intensively": 0.3,
    "downright": 0.3,
    "entirely": 0.3,
    "strongly": 0.3,
    "remarkably": 0.3,
    "majorly": 0.3,
    "amazingly": 0.3,
    "strikingly": 0.3,
    "stunningly": 0.3,
    "quintessentially": 0.3,
    "unusually": 0.3,
    "dramatically": 0.3,
    "intensely": 0.3,
    "extremely": 0.4,
    "so": 0.4,
    "incredibly": 0.4,
    "terribly": 0.4,
    "hugely": 0.4,
    "immensely": 0.4,
    "such": 0.4,
    "unbelievably": 0.4,
    "insanely": 0.4,
    "outrageously": 0.4,
    "radically": 0.4,
    "blisteringly": 0.4,
    "exceptionally": 0.4,
    "exceedingly": 0.4,
    "withoutadoubt": 0.4,
    "way": 0.4,
    "vastly": 0.4,
    "deeply": 0.4,
    "super": 0.4,
    "profoundly": 0.4,
    "universally": 0.4,
    "abundantly": 0.4,
    "infinitely": 0.4,
    "exponentially": 0.4,
    "enormously": 0.4,
    "thoroughly": 0.4,
    "passionately": 0.4,
    "tremendously": 0.4,
    "ridiculously": 0.4,
    "obscenely": 0.4,
    "wildly": 0.4,
    "extraordinarily": 0.5,
    "spectacularly": 0.5,
    "phenomenally": 0.5,
    "monumentally": 0.5,
    "mind-bogglingly": 0.5,
    "utterly": 0.5,
    "more": -0.5,
    "evenmore": 0.5,
    "morethan": 0.5,
    "themost": 1.0,
    "utmost": 1.0,
    "total": 0.5,
    "monumental": 0.5,
    "great": 0.5,
    "greatly": 0.5,
    "huge": 0.5,
    "tremendous": 0.5,
    "complete": 0.4,
    "infinite": 0.4,
    "endless": 0.4,
    "absolute": 0.5,
    "resounding": 0.4,
    "unabashed": 0.4,
    "dropdead": 0.4,
    "massive": 0.5,
    "collossal": 0.5,
    "colossal": 0.5,
    "incredible": 0.5,
    "unimagiable": 0.5,
    "unimaginable": 0.5,
    "abject": 0.5,
    "sucha": 0.4,
    "suchan": 0.4,
    "utter": 0.4,
    "double": 0.3,
    "clear": 0.3,
    "clearer": 0.2,
    "clearest": 0.5,
    "big": 0.3,
    "bigger": 0.2,
    "biggest": 0.5,
    "obvious": 0.3,
    "serious": 0.3,
    "deep": 0.3,
    "deeper": 0.2,
    "deepest": 0.5,
    "considerable": 0.2,
    "important": 0.3,
    "major": 0.2,
    "crucial": 0.3,
    "immediate": 0.1,
    "visable": 0.1,
    "visible": 0.1,
    "noticeable": 0.1,
    "consistent": 0.1,
    "high": 0.2,
    "higher": 0.1,
    "highest": 0.5,
    "real": 0.2,
    "true": 0.2,
    "pure": 0.2,
    "definite": 0.2,
    "much": 0.2,
    "small": -0.3,
    "smaller": -0.2,
    "smallest": -0.5,
    "minor": -0.3,
    "moderate": -0.3,
    "mild": -0.3,
    "slight": -0.5,
    "slightest": -0.9,
    "insignificant": -0.5,
    "inconsequential": -0.5,
    "low": -2.0,
    "lowest": -3.0,
    "few": -2.0,
    "fewer": -1.5,
    "fewest": -3.0,
    "alot": 0.3,
    "numerous": 0.3,
    "several": 0.2,
    "multiple": 0.2,
    "various": 0.2,
    "afew": -0.3,
    "acouple": -0.3,
    "acoupleof": -0.3,
    "alotof": 0.3,
    "lotsof": 0.3,
    "atall": -0.5,
    "agreatdealof": 0.5,
    "awholelotof": 0.5,
    "ahugeamountof": 0.5,
    "hugenumbersof": 0.5,
    "aheckofa": 0.5,
    "ahellofa": 0.5,
    "aplethoraof": 0.5,
    "amultitudeof": 0.5,
    "heckuva": 0.5,
    "atonof": 0.5,
    "tonsof": 0.5,
    "abunchof": 0.3,
    "bunchesof": 0.3,
    "plentyof": 0.3,
    "acertainamountof": -0.2,
    "some": -0.2,
    "alittlebitof": -0.5,
    "abitof": -0.5,
    "abitofa": -0.5,
    "difficultto": -1.5,
    "hardto": -1.5,
    "toughto": -1.5,
    "nowherenear": -3.0,
    "notallthat": -1.2,
    "notthat": -1.5,
    "outof": -2.0,
    "scarcely": -1.5,
    "endlessly": 0.4,
    "resoundingly": 0.4,
    "unabashedly": 0.4,
    "massively": 0.5,
    "abjectly": 0.5,
    "doubly": 0.3,
    "seriously": 0.3,
    "purely": 0.2,
    "lackof": -0.5,
    "toolittle": -0.3,
    "toofew": -0.3,
    "toomuch": -0.3,
    "toomany": -0.3,
    "not": -0.5,
    "no": -0.5,
    "noone": -0.5,
    "none": -0.5,
    "nobody": -0.5,
    "nothing": -0.5,
    "neither": -0.5,
    "nor": -0.5,
    "nowhere": -0.5,
    "n't": -0.5,
    "without": -0.5
}

In [None]:
#multi-word or hyphenated new terms and modifier terms, which may not be recognised, are compressed by the system

#maps each multi-word or hyphenated term to the compressed single-token form used by the lexicons
words_to_modify = {
    "hard-rocking": "hardrocking",
    "head-banging": "headbanging",
    "by-the-numbers": "bythenumbers",
    "run-of-the-mill": "runofthemill",
    "cringe-worthy": "cringeworthy",
    "thrash-y": "thrashy",
    "re-defining": "redefining",
    "skip-worthy": "skipworthy",
    "must-hear": "musthear",
    "must-listen": "mustlisten",
    "poor-man's": "poorman's",
    "over-saturation": "oversaturation",
    "over-saturated": "oversaturated",
    "re-hash": "rehash",
    "butt-hurt": "butthurt",
    "burn-out": "burnout",
    "burnt-out": "burntout",
    "fist-pumping": "fistpumping",
    "horn-throwing": "hornthrowing",
    "neck-snapping": "necksnapping",
    "fist-pumper": "fistpumper",
    "over-saturate": "oversaturate",
    "horn-thrower": "hornthrower",
    "neck-snapper": "necksnapper",
    "no-holds-barred": "noholdsbarred",
    "snore-fest": "snorefest",
    "snooze-fest": "snoozefest",
    "bore-fest": "borefest",
    "must-have": "musthave",
    "vomit-inducing": "vomitinducing",
    "turn-off": "turnoff",
    "pitch-perfect": "pitchperfect",
    "rip-off": "ripoff",
    "pulse-racing": "pulseracing",
    "bad-ass": "badass",
    "jaw-dropping": "jawdropping",
    "jaw-droppingly": "jawdroppingly",
    "self-parody": "selfparody",
    "pit-fall": "pitfall",
    "sell-out": "sellout",
    "sold-out": "soldout",
    "skull-crushing": "skullcrushing",
    "skull-crushingly": "skullcrushingly",
    "hard-hitting": "hardhitting",
    "full-length": "fulllength",
    "not too": "nottoo",
    "not only": "notonly",
    "not just": "notjust",
    "not simply": "notsimply",
    "a little": "alittle",
    "a little bit": "alittlebit",
    "a bit": "abit",
    "the least bit": "theleastbit",
    "to some extent": "tosomeextent",
    "to a certain extent": "toacertainextent",
    "sort of": "sortof",
    "kind of": "kindof",
    "without a doubt": "withoutadoubt",
    "even more": "evenmore",
    "more than": "morethan",
    "the most": "themost",
    "drop dead": "dropdead",
    "such a": "sucha",
    "such an": "suchan",
    "a lot": "alot",
    "a few": "afew",
    "a couple": "acouple",
    "a couple of": "acoupleof",
    "a lot of": "alotof",
    "lots of": "lotsof",
    "at all": "atall",
    "a great deal of": "agreatdealof",
    "a whole lot of": "awholelotof",
    "a huge amount of": "ahugeamountof",
    "huge numbers of": "hugenumbersof",
    "a heck of a": "aheckofa",
    "a hell of a": "ahellofa",
    "a plethora of": "aplethoraof",
    "a multitude of": "amultitudeof",
    "a ton of": "atonof",
    "tons of": "tonsof",
    "a bunch of": "abunchof",
    "bunches of": "bunchesof",
    "plenty of": "plentyof",
    "a certain amount of": "acertainamountof",
    "a little bit of": "alittlebitof",
    "a bit of": "abitof",
    "a bit of a": "abitofa",
    "difficult to": "difficultto",
    "hard to": "hardto",
    "tough to": "toughto",
    "nowhere near": "nowherenear",
    "not all that": "notallthat",
    "not that": "notthat",
    "out of": "outof",
    "lack of": "lackof",
    "too little": "toolittle",
    "too few": "toofew",
    "too much": "toomuch",
    "too many": "toomany",
    "no one": "noone",
}

#the scoring functions apply these replacements in the dictionary's iteration order, so the
#longest terms must come first: otherwise "a lot" is compressed before "a lot of" is tried,
#leaving "alot of" in the text and the longer form "alotof" is never produced.
#sorted() is stable, so terms of equal length keep the order they were written in above.
words_to_modify = dict(
    sorted(words_to_modify.items(), key=lambda entry: len(entry[0]), reverse=True)
)

In [None]:
#the full rule-based system

#a function that checks for the presence of aspect terms within a sentence. Implicit only counted if a noun.
#also checks for compound terms.
#returns the id of all found aspect terms

def aspect_detection(sent):
    """Locate the aspect terms in a parsed text and describe each of them.

    A word counts as an aspect term when its lemma is in ``full_aspect_list``,
    or when it is tagged NOUN and its lemma is in ``implicit_aspects``.

    Returns a list with one entry per aspect word, of the form
    ``[id, lemma, head]``. When the aspect word's dependency relation is
    "compound", the head of its own head is appended as a fourth item.
    """
    found = []
    for parsed in sent.sentences:
        for token in parsed.words:
            is_explicit = token.lemma in full_aspect_list
            if not is_explicit and not (token.lemma in implicit_aspects and token.upos == "NOUN"):
                continue
            entry = [token.id, token.lemma, token.head]
            if token.deprel == "compound":
                #look up the word this compound attaches to, and record ITS head as well
                entry.extend(other.head for other in parsed.words if other.id == token.head)
            found.append(entry)
    return found

#a function that checks the category of a given aspect term
def aspect_category(foundword):
    """Return the category of an aspect term, or None when it is not listed.

    Every key of ``aspect_synonyms`` is a category name and its value is the
    collection of terms belonging to it. If the term appears under several
    categories, the last matching category (in dictionary order) is returned.
    """
    category = None
    for name, synonyms in aspect_synonyms.items():
        if foundword in synonyms:
            category = name
    return category

#a function that finds all words in a sentence with a particular aspect (by its id) as their head
#returns the id of all words with the given aspect term as their head
def find_aspect_tails(sent, aspect_id):
    """Return the ids of all words whose head is the word with id ``aspect_id``.

    Ids are collected across every sentence of ``sent``, in reading order.
    """
    return [
        word.id
        for sentence in sent.sentences
        for word in sentence.words
        if word.head == aspect_id
    ]

#a function that takes a selection of word ids in a sentence, gets their lemma form and upos tag.
#also converts their upos tag into a POS tag that SWN can utilise.
#returns a list of triples, one for each word id originally provided. Each triple lists the lemma, pos-tag and id for that word.
def get_lemmas_and_types(sent, ids):
    """Describe the words of ``sent`` whose id is in ``ids``.

    Returns one ``[lemma, pos_tag, id]`` list per selected word, in reading
    order. The upos tag is translated to the letter passed on to the SWN
    lookup: ADJ -> 's', ADV -> 'r', VERB -> 'v'; NOUN and every other tag
    become 'n'.
    """
    swn_tags = {'ADJ': 's', 'ADV': 'r', 'VERB': 'v', 'NOUN': 'n'}
    return [
        [word.lemma, swn_tags.get(word.upos, 'n'), word.id]
        for sentence in sent.sentences
        for word in sentence.words
        if word.id in ids
    ]

#a function for scaling sentiment scores, putting them onto a -1.0 to +1.0 scale.
def scale_score(min_score, max_score, score):
    """Map ``score`` linearly from [min_score, max_score] onto [-1.0, +1.0].

    ``min_score`` maps to -1.0 and ``max_score`` to +1.0. The two bounds must
    differ, otherwise the division raises ZeroDivisionError.
    """
    span = max_score - min_score
    fraction = (score - min_score) / span
    return 2 * fraction - 1

#a function that takes the lemma and type of a word in a sentence, and checks its SWN score.
#returns the positive and negative scores of that word.
def get_swn_scores(sent, lemma, word_type):
    """Return the SentiWordNet (positive, negative) scores of a word in context.

    The token texts of ``sent`` are used as the context for Lesk word-sense
    disambiguation of ``lemma`` with part of speech ``word_type``. If no sense
    is found the lookup is retried as a plain adjective ('a'), because a word
    requested as 's' may only be listed as 'a'. When both attempts fail the
    word is treated as neutral and (0.0, 0.0) is returned.
    """
    tokenized_sent = [token.text for sentence in sent.sentences for token in sentence.tokens]
    synset = nltk.wsd.lesk(tokenized_sent, lemma, word_type)
    if synset is None:
        synset = nltk.wsd.lesk(tokenized_sent, lemma, 'a')
    if synset is None:
        return 0.0, 0.0
    #ask the synset for its name (e.g. 'heavy.a.01') rather than slicing it out of the repr string
    senti_details = swn.senti_synset(synset.name())
    return senti_details.pos_score(), senti_details.neg_score()

#a function to check for all words linked by the parser to a sentiment term, to check for modifiers
def get_sentiment_links(sent, sentiment_id):
    """Return the texts of the words the parser links to a sentiment term.

    The linked words are the dependents of the word with id ``sentiment_id``
    plus that word's own head; callers compare these texts against the
    modifier lexicon. The head id found in one sentence is kept while
    scanning any later sentences of ``sent``.
    """
    linked_texts = []
    head_id = None
    for sentence in sent.sentences:
        #first pass: find the head of the sentiment term (the last match wins)
        for candidate in sentence.words:
            if candidate.id == sentiment_id:
                head_id = candidate.head
        #second pass: collect its dependents and its head, in reading order
        linked_texts.extend(
            word.text
            for word in sentence.words
            if word.head == sentiment_id or word.id == head_id
        )
    return linked_texts

#a function to replace a term within a sentence, used to handle multi-word/hyphenated modifiers and new sentiment terms
def multi_word_slice(sentence, term, new_term):
    """Return ``sentence`` with every occurrence of ``term`` replaced by ``new_term``.

    Used to compress multi-word/hyphenated modifiers and new sentiment terms
    into single tokens. All occurrences are replaced, so a phrase that is
    repeated within one sentence is compressed each time it appears. If
    ``term`` is empty or does not occur, the sentence is returned unchanged.
    """
    if not term:
        return sentence
    return sentence.replace(term, new_term)

#the main function that scores all the aspects within a text
#takes the full review text as input, along with three parameters:
#1.) intro_to_ignore defines what percentage (always rounding down) of the sentences at the start to consider the 'introduction', and to consequently ignore
#2.) iem_weights, which defines what percentage weight to assign to the implicit aspect, explicit aspects (collectively) and unconnected sentiment, respectively. Must sum to 100.
#3.) the mode, which defines the mode to run the function in and affects its outputs. Can be 1, 2, 3, 4 or 5.
#mode 1 = verbose, printing a lot of information as it runs, and at the end
#mode 2 = default, makes and returns a rating prediction
#mode 3 = detailed aspect polarity outputs, returns a dataframe detailing all the aspect scores across each sentence
#mode 4 = emphasis on the weightiest sentences, returns the indices of the most important sentences as this system sees it
#mode 5 = emphasis on rationales, returns the weightiest sentences
def _apply_modifiers(parsed_sentence, word_id, base_score):
    """Return ``base_score`` adjusted by the modifiers linked to the word ``word_id``.

    The texts of the linked words (see get_sentiment_links) are looked up in the
    ``modifiers`` dictionary; the matching weights are summed and the score is
    changed by that proportion of itself.
    """
    modifier = 0.0
    for linked_text in get_sentiment_links(parsed_sentence, word_id):
        if linked_text in modifiers:
            modifier += modifiers[linked_text]
    return base_score + (base_score * modifier)


def score_aspects(full_review, intro_to_ignore=8, iem_weights=[40, 40, 20], mode=2):
    """Score the sentiment attached to every aspect of a review and predict its rating.

    Parameters:
        full_review: the full review text.
        intro_to_ignore: percentage (rounded down) of the sentences at the start
            of the review to treat as the introduction and ignore.
        iem_weights: percentage weights given to the implicit aspect, the
            explicit aspects (collectively) and the unconnected sentiment, in
            that order. Must sum to 100.
        mode: selects what is printed and returned:
            1 = verbose, prints details while running; returns None
            2 = returns the predicted rating (out of 10, one decimal place)
            3 = prints and returns a dataframe of per-sentence aspect scores
                (values before limits, scaling and weighting) plus a totals row
            4 = prints and returns the indices of the (up to three) weightiest
                sentences, counted after the intro has been removed
            5 = returns the texts of those weightiest sentences

    Returns None (after printing an error) when the weights do not sum to 100.
    """
    #check to ensure the weights sum to 100
    if iem_weights[0] + iem_weights[1] + iem_weights[2] != 100:
        print("Error: weights must sum to 100.")
        return
    #tokenize the review into sentences so they can be worked through one by one
    review = nltk.sent_tokenize(full_review)
    num_sents = len(review)
    #remove the intro if specified
    if intro_to_ignore > 0:
        intro_sents = math.floor(num_sents/(100/intro_to_ignore))
        review = review[intro_sents:]
    #one row of 20 columns per sentence for recording the per-aspect scores, if needed for mode 3
    if mode == 3:
        sentence_details_lists = [ ([0] * 20) for x in range(len(review)) ]
        aspect_column_refs = {'guitars': 0, 'vocals': 1, 'drums': 2, 'production': 3, 'breakdowns': 4, 'lyrics': 5, 'ambience': 6, 'writing': 7, 'bass': 8, 'melodies': 9, 'technicality': 10, 'keyboards': 11, 'symphonics': 12, 'exotic instruments': 13, 'creativity': 14, 'comedy': 15, 'catchiness': 16, 'implicit': 17, 'unattached': 18}
    #dictionary for recording sentiment scores across review
    aspect_scores = {'implicit': 0, 'guitars': 0, 'vocals': 0, 'drums': 0, 'production': 0, 'breakdowns': 0, 'lyrics': 0, 'ambience': 0, 'writing': 0, 'bass': 0, 'melodies': 0, 'technicality': 0, 'keyboards': 0, 'symphonics': 0, 'exotic instruments': 0, 'creativity': 0, 'comedy': 0, 'catchiness': 0}
    total_unconnected_sentiment = 0
    #sentence counter; the first processed sentence is number 1
    current_sent = 0
    #[sentence number, total score] pairs, used later to pull up the weightiest sentences
    all_sents = []
    for sentence in review:
        if mode == 1:
            print("Sentence being evaluated:\n",sentence,"\n")
        #compress multi-word/hyphenated terms within the sentence if needed
        for term in words_to_modify.keys():
            if term in sentence:
                new_term = words_to_modify[term]
                sentence = multi_word_slice(sentence, term, new_term)
        current_sent += 1
        sentence_score = [current_sent, 0]
        t_sentence = nlp(sentence)
        #every word id starts out as "not checked"; ids linked to an aspect are removed as we go,
        #so that whatever remains can be checked for unconnected sentiment afterwards
        #NOTE(review): ids are taken from num_tokens; confirm this matches the word count for the pipeline in use
        not_checked = set(range(1, t_sentence.num_tokens + 1))
        aspect_ids = aspect_detection(t_sentence)
        if mode == 1:
            if len(aspect_ids) > 0:
                print("Aspects detected.")
            else:
                print("No aspects detected in the sentence.")
        #work through each set of aspect details found (as there may be multiple aspects in a single sentence)
        for aspect_details in aspect_ids:
            #see which aspect category this word belongs to
            #NOTE(review): aspect_category returns None for a term missing from aspect_synonyms, which would
            #raise KeyError below - confirm the aspect lists and aspect_synonyms are kept in sync
            aspect_name = aspect_category(aspect_details[1])
            #the words connected to this aspect term are its head and its dependent terms (tails)
            all_linked = [aspect_details[2]] + find_aspect_tails(t_sentence, aspect_details[0])
            #a fourth item means a compound was found: it is the head of the head
            if len(aspect_details) == 4:
                #so we add this, and then its tails, to the full list of potentially relevant ids
                all_linked.append(aspect_details[3])
                all_linked.extend(find_aspect_tails(t_sentence, aspect_details[3]))
            #the aspect term itself should not be scored as linked sentiment
            if aspect_details[0] in all_linked:
                all_linked.remove(aspect_details[0])
            #exclude the linked ids from not_checked. This must be a set DIFFERENCE: a symmetric
            #difference would put back any id already removed for an earlier aspect in this sentence,
            #and that word would then be counted again as unconnected sentiment
            not_checked.difference_update(all_linked)
            #we then get the lemmas of all linked words, to check for their sentiment scores
            lemmas = get_lemmas_and_types(t_sentence, all_linked)
            #the score for this aspect in this sentence starts at 0
            sentence_aspect_score = 0
            sentiment_terms = []
            for lemma in lemmas:
                lemma_score = 0
                #if the word exists in the intercept layer, then use the sentiment score there
                if lemma[0] in new_sentiment_terms:
                    sentiment_terms.append(lemma[0])
                    lemma_score = _apply_modifiers(t_sentence, lemma[2], new_sentiment_terms[lemma[0]])
                #if the word is not in the intercept layer, then check for it in SWN
                elif len(list(swn.senti_synsets(lemma[0]))) > 0:
                    sentiment_terms.append(lemma[0])
                    pos_score, neg_score = get_swn_scores(t_sentence, lemma[0], lemma[1])
                    lemma_score = (pos_score - neg_score)
                    if mode == 1:
                        print("The term '",lemma[0],"' yields a score difference of",lemma_score)
                    lemma_score = _apply_modifiers(t_sentence, lemma[2], lemma_score)
                #either way, the word's score updates the aspect score and the overall sentence score
                sentence_aspect_score += lemma_score
                sentence_score[1] += lemma_score
            #we now have a total score for this aspect instance in this sentence
            #if we're in mode 3, we also need to update the table recording these details
            if mode == 3:
                column = aspect_column_refs[aspect_name]
                sentence_details_lists[(current_sent-1)][column] += sentence_aspect_score
            #modify the appropriate part of the aspect_scores dictionary with the score
            aspect_scores[aspect_name] += sentence_aspect_score
            if mode == 1:
                print("CATEGORY TYPE:\t\t",aspect_name,"\nASPECT KEYWORD:\t\t'",aspect_details[1],"'\nSCORE ASSIGNED:\t\t",sentence_aspect_score,"\nLINKED SENTIMENT TERMS:\n",sentiment_terms,"\n")
        #once all aspects (implicit and explicit) are checked, pass through any remaining unconnected sentiment
        additional_lemmas = get_lemmas_and_types(t_sentence, not_checked)
        unconnected_sentence_score = 0
        #the same basic process as for the aspects, just now for unconnected sentiment
        for adlem in additional_lemmas:
            lemma_score = 0
            if adlem[0] in new_sentiment_terms:
                lemma_score = new_sentiment_terms[adlem[0]]
                if mode == 1:
                    print("The UNCONNECTED term '",adlem[0],"' yields a score difference of",lemma_score)
                lemma_score = _apply_modifiers(t_sentence, adlem[2], lemma_score)
                if mode == 1:
                    print("\tWith modification, this becomes:",lemma_score)
            elif len(list(swn.senti_synsets(adlem[0]))) > 0:
                pos_score, neg_score = get_swn_scores(t_sentence, adlem[0], adlem[1])
                lemma_score = (pos_score - neg_score)
                if mode == 1:
                    print("The UNCONNECTED term '",adlem[0],"' yields a score difference of",lemma_score)
                lemma_score = _apply_modifiers(t_sentence, adlem[2], lemma_score)
                if mode == 1:
                    print("With modification, this becomes:",lemma_score)
            unconnected_sentence_score += lemma_score
        sentence_score[1] += unconnected_sentence_score
        all_sents.append(sentence_score)
        total_unconnected_sentiment += unconnected_sentence_score
        if mode == 3:
            sentence_details_lists[(current_sent-1)][18] += unconnected_sentence_score
            sentence_details_lists[(current_sent-1)][19] = sum(sentence_details_lists[(current_sent-1)][0:19])
        if mode == 1:
            print("\nThe value of the unconnected sentiment in this sentence is:",unconnected_sentence_score,"\n")
            print("\nThe total sentiment value of this sentence is:",sentence_score[1])
            print("==============================================================")
    #having gone through every sentence, we now need to see which had the greatest weight
    all_weights = [abs(0.0 - sent[1]) for sent in all_sents]
    #find the index values of the three biggest weights (ordered from smallest to biggest)
    biggest = sorted(range(len(all_weights)), key = lambda sub: all_weights[sub])[-3:]
    if mode == 1:
        if len(biggest) == 0:
            print("No aspect sentiment found!")
        else:
            print("\nThe sentences with the strongest impact on sentiment were:")
            for i in biggest:
                print("Sentence:",review[(all_sents[i][0])-1],"\nScore:",all_sents[i][1],"\n")
    #remove implicit from the aspects list, as it will now need to be handled differently
    imp_score = aspect_scores['implicit']
    del aspect_scores['implicit']
    #limit the scores: +/-2 for each explicit aspect, +/-4 for implicit and for unconnected sentiment
    for key, value in aspect_scores.items():
        if value > 2:
            aspect_scores[key] = 2
        elif value < -2:
            aspect_scores[key] = -2
    if imp_score > 4:
        imp_score = 4
    elif imp_score < -4:
        imp_score = -4
    if total_unconnected_sentiment > 4:
        total_unconnected_sentiment = 4
    elif total_unconnected_sentiment < -4:
        total_unconnected_sentiment = -4
    #scale all scores, putting them all on a -1 to +1 scale
    scaled_imp = scale_score(-4, 4, imp_score)
    scaled_unconnected = scale_score(-4, 4, total_unconnected_sentiment)
    #explicit aspects share their weight equally between those that actually received a score
    scores_range = list(aspect_scores.values())
    num_scored_aspects = sum(1 for score in scores_range if score != 0)
    divisor = num_scored_aspects if num_scored_aspects > 0 else 1
    scaled_scores = [scale_score(-2, 2, score)/divisor for score in scores_range]
    #convert each scaled value into a weight for the overall rating prediction
    scaled_imp = (scaled_imp/100)*iem_weights[0]
    scaled_exp = ((sum(scaled_scores))/100)*iem_weights[1]
    scaled_misc = (scaled_unconnected/100)*iem_weights[2]
    #sum up the weights to get a full score, on the -1 to +1 scale
    final_score = scaled_imp + scaled_exp + scaled_misc
    #convert into /10 rating
    predicted_rating = (final_score + 1) * 5
    #print and return results, depending on mode
    if mode == 1:
        print("The predicted score for this review is",final_score,", which translates to a review rating of",round(predicted_rating, 1))
        print("\nAll scores were as follows:",aspect_scores)
        print("The number of scored explicit aspects was:",num_scored_aspects)
        print("\nThe implicit aspect rating was",imp_score)
        print("The unconnected sentiment was worth",total_unconnected_sentiment)
    if mode == 2:
        return round(predicted_rating, 1)
    if mode == 3:
        #NOTE: All values shown in this table are before limits, scaling and weighting
        columns_list = ['guitars', 'vocals', 'drums', 'production', 'breakdowns', 'lyrics', 'ambience', 'writing', 'bass', 'melodies', 'technicality', 'keyboards', 'symphonics', 'exotic instruments', 'creativity', 'comedy', 'catchiness', 'implicit', 'unattached', 'SENTENCE TOTAL']
        #naming the columns up front also gives a well-formed (empty) table when there are no sentences
        df = pd.DataFrame(sentence_details_lists, columns=columns_list)
        totals = {column: df[column].sum() for column in columns_list}
        #DataFrame.append no longer exists in pandas 2.x, so the totals row is added with concat
        df = pd.concat([df, pd.DataFrame([totals])], ignore_index=True)
        print(df)
        return df
    if mode == 4:
        print("The indices of the sentences with the biggest impact were:",biggest,"\nNote: remember that the first sentence is index 0.")
        return biggest
    if mode == 5:
        rationales = []
        for i in biggest:
            rationales.append(review[(all_sents[i][0])-1])
        return rationales

In [None]:
#a function to conduct the score_aspects function across a batch of reviews.
#this takes the same inputs as score_aspects, except that it takes a dataframe including a 'Review' column containing the review texts (and may need other columns depending on the mode it's run in), and a filename to give to an output file
#the modes generally match to the modes of score_aspects, collecting the outputs of score_aspects (in that mode) into an output for this function
#if run in mode 2 (the default), it will attempt to check the result metrics with the check_metrics function

def review_batch_aspects(all_reviews, mode=2, intro_to_ignore=8, iem_weights=[40, 40, 20], filename='aspectresults.csv'):
    """Run score_aspects over every row of ``all_reviews`` and collect the outputs.

    ``all_reviews`` is a dataframe with 'Review' and 'ID' columns; mode 2 also
    reads 'Rating' and 'Title', and mode 4 reads 'Title'. ``intro_to_ignore``
    and ``iem_weights`` are passed straight through to score_aspects.

    Behaviour by mode:
        2: writes ID/Title/Actual rating/Prediction to ``filename``, prints the
           metrics from check_metrics and returns (actual ratings, predictions)
        3: writes one 'review<ID>.csv' per review with its aspect table; returns None
        4: writes the weightiest sentence indices to ``filename`` and returns them
        5: writes the rationale sentences to ``filename`` and returns them
    Any other mode does nothing and returns None.
    """
    predictions, actual_ratings = [], []
    ids, titles = [], []
    sentence_weights, rationale_sets, review_texts = [], [], []
    for index, row in all_reviews.iterrows():
        review_text = row['Review']
        review_ID = row['ID']
        if mode == 2:
            print("Processing review",index)
            predictions.append(score_aspects(full_review=review_text, intro_to_ignore=intro_to_ignore, iem_weights=iem_weights, mode=2))
            actual_ratings.append(row['Rating'])
            ids.append(review_ID)
            titles.append(row['Title'])
        elif mode == 3:
            details = score_aspects(full_review=review_text, intro_to_ignore=intro_to_ignore, iem_weights=iem_weights, mode=3)
            #one output file per review, named after the review's ID
            details.to_csv('review' + str(review_ID) + '.csv', index=True, header=True)
        elif mode == 4:
            print("Processing review",index)
            sentence_weights.append(score_aspects(full_review=review_text, intro_to_ignore=intro_to_ignore, iem_weights=iem_weights, mode=4))
            ids.append(review_ID)
            titles.append(row['Title'])
        elif mode == 5:
            print("Processing review",index)
            rationale_sets.append(score_aspects(full_review=review_text, intro_to_ignore=intro_to_ignore, iem_weights=iem_weights, mode=5))
            ids.append(review_ID)
            review_texts.append(review_text)
    if mode == 2:
        table = pd.DataFrame(list(zip(ids, titles, actual_ratings, predictions)), columns =['ID', 'Title', 'Actual rating', 'Prediction'])
        table.to_csv(filename)
        mape, rmse, rho, pval = check_metrics(actual_ratings, predictions, mode='return')
        print("The following results were achieved:\n\tMAPE:",mape,"\n\tRMSE:",rmse,"\n\tRHO:",rho,"\n\tPVAL:",pval)
        return actual_ratings, predictions
    if mode == 4:
        table = pd.DataFrame(list(zip(ids, titles, sentence_weights)), columns =['ID', 'Title', 'Weightiest sentences'])
        table.to_csv(filename)
        return sentence_weights
    if mode == 5:
        table = pd.DataFrame(list(zip(ids, review_texts, rationale_sets)), columns = ['ID', 'Review', 'Rationales'])
        table.to_csv(filename)
        return rationale_sets

## Metrics and hyperparameter tuning
This section contains functions for checking the metrics of the systems, as well as tuning the hyperparameters of the rule-based system (the simplistic system requires no tuning and BERT has its own methods for that).

In [None]:
#functions for calculating MAPE, RMSE, Spearman's rho correlation coefficient and p-value
#note that MAPE cannot handle actual values of 0.0, so if any such entries exist, they should be removed first

def calculate_rmse(errors):
    """Return the root mean squared error of a non-empty sequence of errors.

    An empty sequence raises ZeroDivisionError.
    """
    total = 0
    for err in errors:
        total += err * err
    return math.sqrt(total / len(errors))

def calculate_mape(errors, actual_values):
    """Return the mean absolute percentage error, as a fraction (not multiplied by 100).

    ``errors[i]`` is divided by ``actual_values[i]``, so no actual value may be
    0 and ``errors`` must not be empty.
    """
    ape = [
        abs((error * (-1)) / actual_values[position])
        for position, error in enumerate(errors)
    ]
    return sum(ape) / len(ape)

def check_metrics(actual_values, predictions, mode='verbose'):
    """Compute MAPE, RMSE, Spearman's rho and its p-value for a set of predictions.

    Each error is ``prediction - actual``. In mode 'verbose' the four values
    are printed and nothing is returned; in mode 'return' they are returned as
    the tuple (mape, rmse, rho, pval). Any other mode computes the values but
    neither prints nor returns them.
    """
    errors = [
        predicted - actual_values[position]
        for position, predicted in enumerate(predictions)
    ]
    mape = calculate_mape(errors, actual_values)
    rmse = calculate_rmse(errors)
    rho, pval = stats.spearmanr(actual_values, predictions)
    if mode == 'return':
        return mape, rmse, rho, pval
    if mode == 'verbose':
        print("The mean absolute percentage error value is:",mape)
        print("The root mean squared error value is:",rmse)
        print("The Spearman's rho correlation coefficient value is:",rho)
        print("The p-value is:",pval)

In [None]:
#a grid search function that checks different possible intro_to_ignore values

def hypersearch_intro(validation_set, param_range):
    """Grid-search the intro_to_ignore value of score_aspects.

    For each value in ``param_range``, every review in ``validation_set``
    (a dataframe with 'Review' and 'Rating' columns) is scored in mode 2 and
    the predictions are compared with the actual ratings via check_metrics.

    Returns a list of ``[value, [mape, rmse, rho, pval]]`` entries, one per
    value tried, in the order given.
    """
    outcomes = []
    for candidate in param_range:
        print("Now checking this value:",candidate)
        predictions = []
        ratings = []
        for index, row in validation_set.iterrows():
            print("Processing review",index)
            text, rating = row['Review'], row['Rating']
            predictions.append(score_aspects(full_review=text, intro_to_ignore=candidate, mode=2))
            ratings.append(rating)
        mape, rmse, rho, pval = check_metrics(ratings, predictions, mode='return')
        outcomes.append([candidate, [mape, rmse, rho, pval]])
        print("\nFor this intro-to-ignore value:",candidate,", the following results were achieved:\n\tMAPE:",mape,"\n\tRMSE:",rmse,"\n\tRHO:",rho,"\n\tPVAL:",pval)
    return outcomes

In [None]:
#a grid search function that checks different implicit/explicit/miscellaneous weights values

def hypersearch_weights(validation_set, param_range):
    """Grid-search the implicit/explicit/miscellaneous weights of score_aspects.

    Each entry of ``param_range`` is passed as ``iem_weights``, with
    intro_to_ignore fixed at 8. Every review in ``validation_set`` (a
    dataframe with 'Review' and 'Rating' columns) is scored in mode 2 and the
    predictions are compared with the actual ratings via check_metrics.

    Returns a list of ``[weights, [mape, rmse, rho, pval]]`` entries, one per
    weights distribution tried, in the order given.
    """
    outcomes = []
    for weights in param_range:
        print("Now checking this value:",weights)
        predictions = []
        ratings = []
        for index, row in validation_set.iterrows():
            print("Processing review",index)
            text, rating = row['Review'], row['Rating']
            predictions.append(score_aspects(full_review=text, intro_to_ignore=8, iem_weights=weights, mode=2))
            ratings.append(rating)
        print("Sanity check - The predictions being passed into check_metrics are:",predictions)
        mape, rmse, rho, pval = check_metrics(ratings, predictions, mode='return')
        outcomes.append([weights, [mape, rmse, rho, pval]])
        print("\nFor this weights distribution:",weights,", the following results were achieved:\n\tMAPE:",mape,"\n\tRMSE:",rmse,"\n\tRHO:",rho,"\n\tPVAL:",pval)
    return outcomes

## Simplistic system
This section contains the simplistic sentiment counting system.

In [None]:
#a function that takes a simpler approach
#this function just checks every word, disambiguates it and checks it against the intercept layer and SWN, and counts up all the sentiment scores in this fashion

def brute_force_count(full_review):
    """Predict a rating for ``full_review`` by summing per-word sentiment scores.

    Each sentence (split with nltk) first has any term from ``words_to_modify``
    rewritten via ``multi_word_slice`` and is then parsed with the Stanza
    pipeline ``nlp``. Every word is scored from ``new_sentiment_terms`` when its
    lemma is listed there, otherwise from ``get_swn_scores`` (positive minus
    negative) when SentiWordNet has at least one entry for the lemma; all other
    words contribute nothing.

    The summed score is clamped to [-10, 10], passed through ``scale_score`` and
    mapped with ``(scaled + 1) * 5``. Returns that value rounded to one decimal.
    """
    # UPOS tag -> word-type code handed to get_swn_scores; any other tag falls back to 'n'
    upos_to_type = {'ADJ': 's', 'ADV': 'r', 'VERB': 'v', 'NOUN': 'n'}

    running_total = 0
    for sentence in nltk.sent_tokenize(full_review):
        # replacements are applied one after another, each seeing the previous result
        for term, replacement in words_to_modify.items():
            if term in sentence:
                sentence = multi_word_slice(sentence, term, replacement)
        parsed = nlp(sentence)
        for parsed_sentence in parsed.sentences:
            for token in parsed_sentence.words:
                lemma = token.lemma
                # the intercept layer takes priority over SentiWordNet
                if lemma in new_sentiment_terms:
                    running_total += new_sentiment_terms[lemma]
                    continue
                if not list(swn.senti_synsets(lemma)):
                    continue
                kind = upos_to_type.get(token.upos, 'n')
                positive, negative = get_swn_scores(parsed, lemma, kind)
                running_total += positive - negative

    # clamp to the range that is handed to scale_score
    if running_total > 10:
        running_total = 10
    elif running_total < -10:
        running_total = -10
    scaled = scale_score(-10, 10, running_total)
    return round((scaled + 1) * 5, 1)

In [None]:
#a function for running the simplistic count across a batch of reviews, and outputting the results as a csv file
#it will attempt to check the result metrics with the check_metrics function

def review_batch_brute(all_reviews, filename='bruteforceresults.csv'):
    """Run brute_force_count over every row of ``all_reviews`` and save the results.

    Each row must provide 'Review', 'Rating', 'ID' and 'Title'. A CSV with the
    columns 'ID', 'Title', 'Actual rating' and 'Prediction' is written to
    ``filename``, the metrics from ``check_metrics`` are printed, and the tuple
    ``(actual_ratings, predictions)`` (two lists in row order) is returned.
    """
    records = []
    for idx, record in all_reviews.iterrows():
        print("Processing review",idx)
        predicted = brute_force_count(record['Review'])
        rating = record['Rating']
        # one output row per review, in the column order used for the CSV
        records.append((record['ID'], record['Title'], rating, predicted))

    output_columns = ['ID', 'Title', 'Actual rating', 'Prediction']
    pd.DataFrame(records, columns=output_columns).to_csv(filename)

    all_actual_ratings = [entry[2] for entry in records]
    all_predictions = [entry[3] for entry in records]
    mape, rmse, rho, pval = check_metrics(all_actual_ratings, all_predictions, mode='return')
    print("The following results were achieved:\n\tMAPE:",mape,"\n\tRMSE:",rmse,"\n\tRHO:",rho,"\n\tPVAL:",pval)
    return all_actual_ratings, all_predictions

## Preparing files for BERT script
This section contains code for preparing files for the BERT script. Note that the methods needed may vary significantly depending on exactly how you implement BERT.

In [None]:
#BERT may require some additional characters to be changed in order to function properly
#in this case, these characters were simply removed for the sake of time
#but be careful not to perform this on the original dataset, as this will lose information unnecessarily
#instead, perform this on copies specifically set aside for BERT

# Replacement table consumed by character_code_cleanup2: each key is passed to re.sub as the
# pattern and its value is used as the replacement text.
#   - newline, carriage return and tab are turned into a single space
#   - every other listed character is deleted (replaced with '')
# NOTE(review): several keys are repeated further down (e.g. 'Л', 'г', 'н'); a dict literal keeps
# one entry per key and every repeat maps to '', so the repeats are redundant.
# NOTE(review): three keys display as an empty string ('': ''); presumably they held non-printing
# characters in the original notebook - confirm before editing this table.
fixdict2 = {
    '\n': ' ',
    '\r': ' ',
    '\t': ' ',
    'И': '',
    'я': '',
    'м': '',
    'н': '',
    'Л': '',
    'г': '',
    'и': '',
    '噤': '',
    '夢': '',
    'γ': '',
    '島': '',
    '嶼': '',
    '神': '',
    '話': '',
    '暮': '',
    '山': '',
    '船': '',
    '影': '',
    '孤': '',
    '燈': '',
    '微': '',
    '雁': '',
    '紛': '',
    '飛': '',
    '灰': '',
    '月': '',
    '漸': '',
    '明': '',
    '念': '',
    '伊': '',
    '人': '',
    '現': '',
    '象': '',
    '鷹': '',
    '行': '',
    '創': '',
    '世': '',
    '洪': '',
    '水': '',
    '巨': '',
    '饒': '',
    '亞': '',
    '哲': '',
    '鬼': '',
    'ḥ': '',
    '𒀭': '',
    'ṭ': '',
    '〇': '',
    'ẁ': '',
    '∞': '',
    'वा': '',
    'घ': '',
    'न': '',
    'ख': '',
    'χ': '',
    'ι': '',
    'π': '',
    'λ': '',
    'η': '',
    'σ': '',
    'Ř': '',
    '椅': '',
    '子': '',
    'ב': '',
    'Л': '',
    'г': '',
    'н': '',
    'д': '',
    'Д': '',
    '집': '',
    'τ': '',
    'ε': '',
    'ρ': '',
    'φ': '',
    'ω': '',
    'ς': '',
    'ツ': '',
    '颠': '',
    '覆': '',
    '✓': '',
    "•": '',
    "Γ": '',
    "Φ": '',
    "Ξ": '',
    '': '',
    'є': '',
    '間': '',
    '无': '',
    '名': '',
    'Δ': '',
    'α': '',
    'μ': '',
    'υ': '',
    'ǫ': '',
    'φ': '',
    'ş': '',
    'к': '',
    'ю': '',
    'ч': '',
    '期': '',
    '会': '',
    'τ': '',
    'ρ': '',
    'ω': '',
    'α': '',
    'π': '',
    '∞': '',
    '╪': '',
    'ʊ': '',
    'Λ': '',
    'ή': '',
    'η': '',
    'Θ': '',
    'κ': '',
    'β': '',
    'Δ': '',
    'δ': '',
    'ά': '',
    'ἅ': '',
    'ἄ': '',
    'ὠ': '',
    'λ': '',
    'ι': '',
    'Б': '',
    'Ω': '',
    'т': '',
    'ł': '',
    'Ш': '',
    'й': '',
    'Ι': '',
    'σ': '',
    'ό': '',
    'θ': '',
    'ε': '',
    'ς': '',
    'э': '',
    'п': '',
    'б': '',
    'л': '',
    'д': '',
    'З': '',
    'ы': '',
    'П': '',
    'щ': '',
    'ш': '',
    'Г': '',
    'ж': '',
    'Ч': '',
    'ъ': '',
    'ξ': '',
    'ζ': '',
    'ɔ': '',
    'Ş': '',
    'ύ': '',
    'Σ': '',
    'Π': '',
    'Ή': '',
    'ώ': '',
    'έ': '',
    'μ': '',
    'χ': '',
    'Ș': '',
    'Ж': '',
    'ц': '',
    'Я': '',
    'ű': '',
    'Ά': '',
    'ψ': '',
    'Ł': '',
    'ф': '',
    'ᗅ': '',
    'ᗺ': '',
    'ᗷ': '',
    'Ţ': '',
    '♪': '',
    '♫': '',
    '☑': '',
    'Ц': '',
    'з': '',
    'Х': '',
    'в': '',
    'ь': '',
    'œ': '',
    'Þ': '',
    'к': '',
    'Ό': '',
    'Ō': '',
    'Ǭ': '',
    'ł': '',
    'þ': '',
    'В': '',
    'Κ': '',
    'Τ': '',
    'ν': '',
    'ο': '',
    'К': '',
    'Р': '',
    '·': '',
    '·': '',
    'Β': '',
    'ё': '',
    'У': '',
    'Ε': '',
    'Đ': '',
    'đ': '',
    'υ': '',
    'Η': '',
    'ı': '',
    'О': '',
    'І': '',
    'ţ': '',
    'о': '',
    'Α': '',
    'Μ': '',
    'ν': '',
    'Τ': '',
    'ί': '',
    'Á': '',
    'á': '',
    'Ć': '',
    'ć': '',
    'É': '',
    'é': '',
    'Í': '',
    'í': '',
    'Ń': '',
    'ń': '',
    'Ó': '',
    'ó': '',
    'Ś': '',
    'ś': '',
    'Ú': '',
    'ú': '',
    'Ý': '',
    'ý': '',
    'Â': '',
    'â': '',
    'Ê': '',
    'ê': '',
    'Î': '',
    'î': '',
    'Ô': '',
    'ô': '',
    'Û': '',
    'û': '',
    'Č': '',
    'č': '',
    'À': '',
    'à': '',
    'È': '',
    'è': '',
    'Ò': '',
    'ò': '',
    'Ù': '',
    'ù': '',
    'ñ': '',
    'ã': '',
    'ç': '',
    'ū': '',
    'ā': '',
    'č': '',
    'ě': '',
    'ź': '',
    'ć': '',
    'ė': '',
    'ŏ': '',
    'ž': '',
    'ő': '',
    'ň': '',
    '¯': '',
    'ă': '',
    'Č': '',
    'š': '',
    'ż': '',
    'ř': '',
    'ť': '',
    'Š': '',
    'ĕ': '',
    'š': '',
    'Č': '',
    'ć': '',
    'ĕ': '',
    'ĭ': '',
    'ș': '',
    'ό': '',
    'ģ': '',
    'ț': '',
    'ń': '',
    'Ĭ': '',
    'í': '',
    'é': '',
    'ô': '',
    'è': '',
    'ç': '',
    'õ': '',
    'ã': '',
    'à': '',
    'ú': '',
    'á': '',
    'ą': '',
    'ə': '',
    'î': '',
    'í': '',
    'ś': '',
    'ō': '',
    'Ż': '',
    'â': '',
    'Ŷ': '',
    'Χ': '',
    'х': '',
    'е': '',
    'р': '',
    'А': '',
    'Ǽ': '',
    'Ǻ': '',
    'ů': '',
    '': '',
    'Æ': '',
    'æ': '',
    'ӕ': '',
    'ğ': '',
    '': '',
    'Ο': ''
}

def character_code_cleanup2(column, replacements=None):
    """Return a new list in which every string of ``column`` has been cleaned.

    Parameters:
        column: iterable of review texts.
        replacements: optional mapping of substring -> replacement text. When
            omitted, the module-level ``fixdict2`` table is used.

    The replacements are applied in the mapping's order, each one operating on
    the result of the previous one. Keys are matched literally (they are not
    interpreted as regular expressions, so characters such as '.' or '(' are
    safe to list). Entries that are not strings (e.g. missing values) are
    passed through unchanged. The input is not modified.
    """
    if replacements is None:
        replacements = fixdict2
    # materialise once so the table is not re-walked through the dict view per entry
    substitutions = list(replacements.items())

    def _clean(entry):
        if not isinstance(entry, str):
            return entry
        for target, substitute in substitutions:
            entry = entry.replace(target, substitute)
        return entry

    # single pass over the column instead of rebuilding the whole list once per key
    return [_clean(entry) for entry in column]

def clean_review_in_set(dataframe, review_col):
    """Return a version of ``dataframe`` whose ``review_col`` column has been cleaned.

    The column's values are run through ``character_code_cleanup2``. The column
    is then dropped and re-added, so in the returned frame it is the last column.
    """
    cleaned_reviews = character_code_cleanup2(dataframe[review_col].tolist())
    # drop() hands back a new frame, so the cleaned column is attached to that one
    result = dataframe.drop(columns=[review_col])
    result[review_col] = cleaned_reviews
    return result

In [None]:
#a function that can be used to mask sentences before passing them to BERT
#this function requires review text and the indices of the sentences to be masked, and returns the masked text
#this function will remove the intro just as the rule-based system does (the value should be the same as when you found the indices of the weightiest sentences), but add it back on again after the masking
#also note that if the review is shorter than the number of sentences to mask, it will be empty, so you may need to manually replace the least weighty sentence afterwards

def mask_sents(review, sents_to_mask, intro_to_ignore):
    """Return ``review`` with the selected sentences removed.

    Parameters:
        review: the review text; it is split into sentences with nltk.
        sents_to_mask: container of sentence positions to remove. Positions are
            counted from 0 AFTER the intro sentences have been set aside.
        intro_to_ignore: percentage of the review (by sentence count, rounded
            down) treated as the intro. The intro is never masked and is put
            back at the front of the result. 0 means there is no intro.

    Returns the kept sentences joined with single spaces (an empty string when
    nothing is left).
    """
    sentences = nltk.sent_tokenize(review)
    # 100/intro_to_ignore raises ZeroDivisionError for 0, so "no intro" is handled
    # explicitly. Non-zero values keep the original expression so the split stays
    # the same as before (presumably matching the one used when the indices in
    # sents_to_mask were computed - confirm against the rule-based system).
    if intro_to_ignore == 0:
        intro_count = 0
    else:
        intro_count = math.floor(len(sentences)/(100/intro_to_ignore))
    intro = sentences[:intro_count]
    body = sentences[intro_count:]
    kept = intro + [
        sentence for position, sentence in enumerate(body)
        if position not in sents_to_mask
    ]
    return ' '.join(kept)

## Results analysis
This section contains functions for adding categories to the outputs, getting the best and worst results, and plotting sentence sentiment. This can also be done in conjunction with the earlier functions for checking metrics used during hyperparameter tuning.

In [None]:
#a function for adding categories to the results
#this requires a dataframe containing results, including an 'Actual_rating' column and a 'Prediction' column
#it counts a rating below 5.1 as negative and anything else (5.1 and above) as positive, but this threshold may be adjusted as you wish, and further categories could easily be added by branching the if statements further

def add_cats(resultsframe, threshold=5.1):
    """Return a copy of ``resultsframe`` with sentiment categories added.

    Parameters:
        resultsframe: DataFrame with an 'Actual_rating' and a 'Prediction' column.
        threshold: values strictly below this are labelled 'Negative', all
            others 'Positive'. Defaults to 5.1.

    Returns a new DataFrame with two extra columns: 'Actual_category' (derived
    from 'Actual_rating') and 'Predicted_category' (derived from 'Prediction').
    The frame passed in is left unchanged.
    """
    def _label(value):
        return 'Negative' if value < threshold else 'Positive'

    categorised = resultsframe.copy()
    categorised['Actual_category'] = [_label(rating) for rating in resultsframe['Actual_rating']]
    # each prediction is labelled from its own value (not from the last actual rating)
    categorised['Predicted_category'] = [_label(prediction) for prediction in resultsframe['Prediction']]
    return categorised

In [None]:
#a function that can be used to get the best and worst results based on the difference between predictions and actual ratings
#takes a dataframe of results as input, containing a column for "Actual_rating" and a column for "Prediction"
#note that num_best and num_worst must be NEGATIVE versions of the desired numbers (i.e. if you want the top ten, then num_best = -10)

def best_and_worst(resultsframe, num_best, num_worst):
    """Write the closest and the furthest predictions to CSV files.

    The absolute difference between 'Actual_rating' and 'Prediction' (rounded to
    one decimal) is computed for every row. ``num_worst`` and ``num_best`` are
    used directly as slice starts on the sorted row positions, which is why they
    must be negative: -10 keeps the last ten positions of each ordering.

    Rows with the largest differences go to 'worstresults.csv' and rows with the
    smallest differences go to 'bestresults.csv' (both without the index).
    Nothing is returned.
    """
    actual = resultsframe['Actual_rating'].tolist()
    predicted = resultsframe['Prediction'].tolist()
    gaps = [round(abs(a - p), 1) for a, p in zip(actual, predicted)]

    positions = range(len(gaps))
    by_gap = gaps.__getitem__
    # ascending order: the tail holds the largest gaps
    worst_rows = sorted(positions, key=by_gap)[num_worst:]
    # descending order: the tail holds the smallest gaps
    best_rows = sorted(positions, key=by_gap, reverse=True)[num_best:]

    worst_frame = resultsframe.iloc[worst_rows]
    best_frame = resultsframe.iloc[best_rows]
    worst_frame.to_csv('worstresults.csv', index=False)
    best_frame.to_csv('bestresults.csv', index=False)

In [None]:
#a function that plots the sentiment scores of all sentences across a review
#note, this function takes as input the output csv of score_aspects (mode 3). This file must be named 'review[ID].csv'
#this function will display the plot and download it as a png

def plot_sents(reviewid):
    """Plot the per-sentence sentiment totals of one review and save the plot.

    Parameters:
        reviewid: ID of the review. The data is read from 'review<ID>.csv' and
            the figure is saved as '<ID>.png'. Strings and other values that
            can be converted with str() (e.g. ints) are accepted.

    The CSV's first line is skipped and the columns are named explicitly; the
    last row is dropped before plotting (presumably a totals row - confirm
    against the mode 3 output of score_aspects). 'SENTENCE TOTAL' is plotted
    against 'index', the figure is saved and then displayed.
    """
    review_label = str(reviewid)
    column_names = ['index', 'guitars', 'vocals', 'drums', 'production', 'breakdowns', 'lyrics', 'ambience', 'writing', 'bass', 'melodies', 'technicality', 'keyboards', 'symphonics', 'exotic instruments', 'creativity', 'comedy', 'catchiness', 'implicit', 'unattached', 'SENTENCE TOTAL']
    filetitle = 'review' + review_label + '.csv'
    results_sheet = pd.read_csv(filetitle, skiprows=1, header=None, names=column_names)
    results_sheet.drop(results_sheet.tail(1).index,inplace=True)
    index_list = results_sheet['index'].tolist()
    sentence_score_list = results_sheet['SENTENCE TOTAL'].tolist()
    plt.plot(index_list, sentence_score_list)
    plt.xlabel('Sentence index')
    plt.ylabel('Sentiment score')
    # the title previously referenced an undefined name; use the review ID instead
    plt.title(review_label)
    savename = review_label + '.png'
    # save before show(): the figure may no longer be available once it has been displayed
    plt.savefig(savename, bbox_inches='tight')
    plt.show()