In [30]:
'''
Install dependencies
'''
# Install or upgrade (-U) the third-party packages listed below into the environment that `pip3` resolves to.
# NOTE(review): no versions are pinned, so a later re-run may resolve to different releases - consider pinning.
!pip3 install -U spacy contractions beautifulsoup4 spacy-cleaner pyspellchecker requests pandas scikit-learn
# Download the large English spaCy pipeline; it is imported further down as the `en_core_web_lg` package.
!python3 -m spacy download en_core_web_lg

Collecting spacy
  Using cached spacy-3.5.3-cp310-cp310-macosx_11_0_arm64.whl (6.6 MB)
Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp310-cp310-macosx_12_0_arm64.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scipy>=1.3.2 (from scikit-learn)
  Downloading scipy-1.10.1-cp310-cp310-macosx_12_0_arm64.whl (28.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.8/28.8 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.2.2 scipy-1.10.1 threadpoolctl-3.1.0
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [31]:
'''
Import statements
'''
import re
from functools import lru_cache

import contractions
import en_core_web_lg
import numpy as np
import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from spellchecker import SpellChecker

In [25]:
@lru_cache(maxsize=1)
def _get_nlp():
    '''Load the en_core_web_lg spaCy pipeline once and reuse it on later calls'''
    return en_core_web_lg.load()


@lru_cache(maxsize=1)
def _get_spell_checker():
    '''Build the SpellChecker once and reuse it on later calls'''
    return SpellChecker()


@lru_cache(maxsize=1)
def _get_slang_dict() -> dict:
    '''Scrape the noslang.com dictionary pages (a-z) once and cache the result

    Returns
        dict: Mapping of slang abbreviation -> lowercase expansion
    '''
    slang = {}
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        # The timeout keeps an unresponsive server from hanging the notebook. Request errors
        # propagate to the caller and are not cached, so the next call retries the download.
        res = requests.get('https://www.noslang.com/dictionary/' + letter, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        for abbr in soup.find_all('abbr'):
            full = abbr.get('title')
            term = abbr.find('dt')
            # Skip <abbr> elements that do not carry both an expansion and a term
            if not full or term is None:
                continue
            # NOTE(review): the last two characters of the <dt> text are dropped, presumably a
            # trailing separator in the site markup - verify against the live page
            slang[term.text[:-2]] = full.lower()
    return slang


def normalize(review: str) -> list:
    '''Method to normalize (preprocess and lemmatize) a review into a tokenized list of words
    Steps:
        1. HTML decoding
        2. Lowercase conversion
        3. Contractions expansion
        4. Entities detection (multi-word entities are joined with underscores)
        5. Digits and punctuations removal
        6. Slang correction
        7. Spelling correction
        8. Word tokenization
        9. Stop words removal and removal of insignificant words (less than 2 characters)
        10. Lemmatization

    The spaCy pipeline, the spell checker and the scraped slang dictionary are created on the
    first call and reused afterwards; the slang dictionary requires network access.

    Required Args
        review (str): Text to be preprocessed

    Returns
        list: List of tokenized words (empty list for empty input)
    '''
    # Nothing to process: avoids loading the model / hitting the network for empty input
    if not review:
        return []
    # HTML decoding (explicit parser so the result does not depend on which parsers are installed)
    review = BeautifulSoup(review, 'html.parser').get_text()
    # Convert words to lowercase
    review = review.lower()
    # Expand contractions word by word and keep the result; lowercase again in case an
    # expansion reintroduces capital letters
    review = ' '.join(contractions.fix(word) for word in review.split()).lower()
    # Make entities a single token eg. new york -> new_york
    nlp = _get_nlp()
    for entity in nlp(review).ents:
        parts = entity.text.split()
        if len(parts) > 1:
            review = review.replace(entity.text, '_'.join(parts))
    # Digits and punctuations removal (underscores are kept so joined entities survive)
    tokens = re.sub('[^a-zA-Z0-9_]', ' ', review).split()
    tokens = [word for word in tokens if not re.search(r'\d', word)]
    # Correct slang words / phrases on whole tokens only, so an abbreviation such as "u" is
    # never rewritten inside a longer word; expansions may be several words, hence the re-split
    slang = _get_slang_dict()
    tokens = ' '.join(slang.get(word, word) for word in tokens).split()
    # Correct spelling, again token by token; joined entities are left untouched
    corrector = _get_spell_checker()
    corrected = {}
    for word in corrector.unknown([word for word in tokens if '_' not in word]):
        suggestion = corrector.correction(word)
        if suggestion:
            corrected[word] = suggestion
    tokens = [corrected.get(word, word) for word in tokens]
    # Word tokenization, stop word removal and lemmatization
    tokenized_word_list = []
    for token in nlp(' '.join(tokens)):
        if '_' in token.text:
            # Joined entities are kept verbatim
            tokenized_word_list.append(token.text)
        elif not token.is_stop and len(token.text) > 1:
            # Lemmatization
            tokenized_word_list.append(token.lemma_)
    # To consider: sentence tokenization
    return tokenized_word_list

In [None]:
def tfidf(train: list, test: list) -> tuple[list, list]:
    '''Method to use the Term Frequency Inverse Document Frequency (TF-IDF) method to calculate how relevant a word in a series or corpus is to a text

    The vectorizer is fitted on the training reviews only and the same fitted vocabulary and
    IDF weights are then applied to the testing reviews. Uni-, bi- and tri-grams are used.

    Required Args
        train (list): Training dataset of movie reviews (one string per review)
        test (list): Testing dataset of movie reviews (one string per review)

    Returns
        Transformed training dataset reviews (document-term matrix returned by fit_transform)
        Transformed testing dataset reviews (document-term matrix returned by transform)
    '''
    # Initialize vectorizer
    # max_df must be the float 1.0 (a proportion): the integer 1 is an absolute count and would
    # drop every term that occurs in more than one document.
    # min_df=1 is the library default and filters nothing, since every vocabulary term occurs in
    # at least one document. NOTE(review): the former integer 0 is outside the documented integer
    # range and is believed to be rejected by newer scikit-learn releases - confirm.
    vectorizer = TfidfVectorizer(min_df=1, max_df=1.0, use_idf=True, ngram_range=(1, 3))
    # Transform training dataset
    transformed_train = vectorizer.fit_transform(train)
    # Transform testing dataset with the vectorizer fitted above
    transformed_test = vectorizer.transform(test)
    return transformed_train, transformed_test

In [None]:
def bow(train: list, test: list) -> tuple[list, list]:
    '''Method to use the Bag of Words (BOW) method to calculate the occurrence of words within a document

    The vectorizer is fitted on the training reviews only and the same fitted vocabulary is then
    applied to the testing reviews. Uni-, bi- and tri-grams are counted (binary=False keeps
    counts rather than presence flags).

    Required Args
        train (list): Training dataset of movie reviews (one string per review)
        test (list): Testing dataset of movie reviews (one string per review)

    Returns
        Transformed training dataset reviews (document-term matrix returned by fit_transform)
        Transformed testing dataset reviews (document-term matrix returned by transform)
    '''
    # Initialize vectorizer
    # max_df must be the float 1.0 (a proportion): the integer 1 is an absolute count and would
    # drop every term that occurs in more than one document.
    # min_df=1 is the library default and filters nothing, since every vocabulary term occurs in
    # at least one document. NOTE(review): the former integer 0 is outside the documented integer
    # range and is believed to be rejected by newer scikit-learn releases - confirm.
    vectorizer = CountVectorizer(min_df=1, max_df=1.0, binary=False, ngram_range=(1, 3))
    # Transform training dataset
    transformed_train = vectorizer.fit_transform(train)
    # Transform testing dataset with the vectorizer fitted above
    transformed_test = vectorizer.transform(test)
    return transformed_train, transformed_test