In [None]:
import csv
import os
import json
import numpy as np
import random
import re
import spacy
from tqdm import tqdm

if os.getcwd().split(os.sep)[-1] == 'notebook':
    os.chdir('..')

from data_preprocessing import sample_class_data
from keras.layers import Dense
from keras.models import Sequential
from langid.langid import LanguageIdentifier, model
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex
from typing import List, Any

In [None]:
# Folder holding all data files, relative to the working directory (the trailing slash is relied on below).
DATA_FOLDER = "./data/"
# Location of the already preprocessed review data.
PREPROCESSED_DATA_PATH = f"{DATA_FOLDER}preprocessed_data.json"

# Practical application - Predicting the rating of a movie review
This notebook guides you through the steps needed to predict the rating of a movie review (from bad/0 to very good/5). To give you a better understanding of the different parts of an NLP project, three parts are looked at in detail: 
1. Data preprocessing
1. Creating datasets for machine learning tasks
1. Training a model and making predictions


For each part, there is at least one task with two possible approaches depending on your knowledge about NLP and machine learning in general. 

Besides this notebook, there are additional files and folders that will be used throughout the use case. You do not have to worry about additional imports, everything is already set up for you. The files and folders are:
* data/: folder containing the data that will be used. Please do not make any changes to the content of this folder!
* data_preprocessing.py: Contains the function 'sample_class_data' which will be used for the part 'Training a model and making predictions'. This function balances the distribution of classes in a dataset and can apply oversampling and undersampling

Before you start your work, please make sure that everything is set up correctly (see README.md)!

## Data preprocessing

The goal of this part is to show the steps needed to preprocess data for NLP tasks or machine learning tasks in general. These include
* reading the data
* filtering out unusable data like empty lines or duplicates
* tokenising, and depending on the use case
    * lemmatising
    * stemming
    * Part of Speech (POS) detection
    * Named Entity Recognition (NER)
* processing classes for predictions (simplifying, casting to other types, ...)
* saving the processed data into an easy to use format

Important:
Instead of applying your code to all the data (data/filmstarts.tsv), please use the file 'selection_film_reviews.tsv'. It contains a small selection of the original dataset with which you can run this part of the use case (for the other parts of the use case, an already processed dataset is available). 

In [None]:
# Preparation for the tasks below - loading a spaCy model with a custom tokenizer component, loading the Snowball Stemmer for stemming, 
# and loading LanguageIdentifier for determining the language of a text
# German spaCy pipeline; it is the default model of tokenize_and_transform further down.
de_sm = spacy.load("de_core_news_sm")

def custom_tokenizer(model):
    """Build a spaCy ``Tokenizer`` for ``model`` with a custom infix pattern.

    Prefix and suffix handling is compiled from the language defaults of
    ``model``; only the infix pattern is replaced and no ``token_match`` is
    set.

    Note: Inside this function the parameter ``model`` shadows the ``model``
    name imported from ``langid.langid``.

    Args:
        model: Loaded spaCy pipeline providing ``vocab`` and ``Defaults``.

    Returns:
        Tokenizer: Tokenizer bound to the vocabulary of ``model``.
    """
    defaults = model.Defaults
    # Characters at which the tokenizer additionally splits inside a token.
    infix_pattern = re.compile(r'''[.\,\_\?\:\;\...\‘\’\`\“\”\"\'~]''')

    return Tokenizer(
        model.vocab,
        prefix_search=compile_prefix_regex(defaults.prefixes).search,
        suffix_search=compile_suffix_regex(defaults.suffixes).search,
        infix_finditer=infix_pattern.finditer,
        token_match=None,
    )

# Replace the tokenizer of the loaded pipeline with the custom one built by custom_tokenizer.
de_sm.tokenizer = custom_tokenizer(de_sm)

# German Snowball stemmer from nltk.
snow_stemmer_de = SnowballStemmer('german')

# Language identifier built from the model string imported from langid.langid.
# NOTE(review): norm_probs=True presumably normalises the confidence scores - confirm in the langid documentation.
langid_model = LanguageIdentifier.from_modelstring(model, norm_probs=True)

Loading data from a tsv/csv file

In [None]:
# Task: Write a function that loads the content of a .tsv or .csv file (consider the encoding of the file)
def read_csv(path: str, encoding: str = "utf-8", newline: str = "\r\n",
            delimiter:str = ";") -> List:
    """Parses the given CSV and returns an array which contains each row of the
    CSV.

    Note: As long as the TODO below is not implemented, an empty list is
    returned regardless of the arguments.

    Args:
        path (str): Path to the CSV which should be read
        encoding (str, optional): The CSV encoding. Defaults to "utf-8".
        newline (str, optional): The newline delimiter. Defaults to "\\r\\n".
        delimiter (str, optional): The delimiter between each Entry.
            Defaults to ";". Tab-separated (.tsv) files presumably need
            "\\t" here - confirm against the data files.

    Returns:
        List: Containing the parsed entries from each row.
    """

    csv_array = []
    # TODO open the file and save content of file into csv_array

    return csv_array

Using regex to find and group together repetitions of words

In [None]:
# Task: Write a function that uses regular expressions to find repetitions of a word in a sentence and return said word. If no repetitions have been found, return the input text.

# For the easier version, use the regular expressions from easy_code_help.txt and try to understand them
# Complex version: write the regular expressions yourself

# TODO write regex to find repetitions of a word at the start of a sentence
# (empty placeholder pattern until the task is solved)
REPEATER_fast = r""
# TODO write regex to group all repetitions of a word
# (empty placeholder pattern until the task is solved)
REPEATER_exact = r""

def remove_repetitions(s: str) -> str:
    """Find repetitions of a word in ``s`` and return the repeated word (exercise stub).

    In its current state ``match_fast`` is always ``None``, so the input text
    is returned unchanged.

    Args:
        s: Text to check for word repetitions.

    Returns:
        The input text ``s`` if no repetition was found; once the TODOs are
        solved, the repeated word otherwise.
    """
    # TODO check if input text contains repetitions
    match_fast = None
    if match_fast:
        # TODO apply the regex for grouping together repetitions and return repeated word
        pass
    return s

Tokenization, lemmatization, and stemming of movie reviews using spaCy (https://spacy.io/usage/spacy-101#annotations) and nltk's SnowballStemmer (https://www.nltk.org/api/nltk.stem.snowball.html#module-nltk.stem.snowball)

In [None]:
# Task: Write a function that tokenizes, lemmatizes, and applies stemming on an input text using a spaCy model and the Snowball Stemmer. 
def tokenize_and_transform(text: str,
                           spacy_model: Language=de_sm,
                           stemming: bool=False,
                           lemmatization: bool=False,
                           stopword_removal: bool=False,
                           punctuation_removal: bool=False) -> List[str]:
    """Tokenizes and transforms the input text with the use of a spaCy model
    and the nltk Snowball stemmer.

    Note: As long as the TODOs below are not implemented, an empty list is
    returned. The default ``spacy_model`` is bound when this function is
    defined, so re-assigning ``de_sm`` later does not change the default.

    Args:
        text (str): The text which should be tokenized and transformed.
        spacy_model (Language): The spaCy model which tokenizes and lemmatizes
            the text.  Defaults to de_sm.
        stemming (bool, optional): Tokens are stemmed if set to True. Defaults 
            to False.
        lemmatization (bool, optional): Tokens are lemmatized if set to True.
            Defaults to False.
        stopword_removal (bool, optional): Stopword tokens are removed if set 
            to True. Defaults to False.
        punctuation_removal (bool, optional): Punctuation tokens are removed if
            set to True. Defaults to False.

    Returns:
        List[str]: Transformed tokens of the input text
    """
    transformed_text = []
    # Run the pipeline on the single input text with the listed components
    # disabled; the result is what the TODO below iterates over.
    docs = spacy_model.pipe([text], disable=["tagger", 
                                             "parser", 
                                             "attribute_ruler", 
                                             "ner"])
    # TODO Iterate over all texts and tokens
        # TODO Skip the current token if stopword removal is requested and it is a stopword, or if punctuation removal is requested and it is punctuation
        # TODO apply lemmatization if requested 
        # TODO apply stemming if requested
        # TODO add token to output if it is not empty and is not '--' or ' '
    return transformed_text

Main function of the preprocessing task 

In [None]:
# Task: Write a function that iterates over the data and applies the following
# - read each line and add them to a list, skip empty lines
# - determine the language of a line and skip non-German lines using langid_model
# - Check for word repetitions and group repetitions
# - tokenize and transform each line
# - transform the value of the rating to a data type more usable than String
def preprocess_movie_data(movie_data: List, enable_langid: bool=False) -> List:
    """Standardizes and preprocesses the unprocessed movie data.

    Note: This is an exercise stub. ``movie_review`` is still the placeholder
    ``None`` when it is handed to ``tokenize_and_transform``, and ``None`` is
    appended for every row; both placeholders have to be replaced while
    solving the TODOs.
    
    Args:
        movie_data (List): Unprocessed movie data
        enable_langid (Bool, optional): Enables the language detection which
            removes foreign languages and unidentifiable texts. This slows down
            the performance significantly. Defaults to False
    Returns:
        List: Preprocessed movie data; per the TODO below each entry is meant
        to be ``[URL, rating, raw text, transformed text]``.
    """    
    preprocessed_data = []
    # tqdm only wraps the iterable to display a progress bar.
    for row in tqdm(movie_data):
        # TODO skip lines with missing content  
        # TODO skip lines with empty values
        # TODO group together the review text if it is contained in more than one column 
        if enable_langid:
            # TODO determine the language of the current line and skip non-German ones using langid_model if requested
            pass
        
        # TODO Check for word repetitions and group repetitions 
        movie_review = None
        
        # tokenize and transform current line (all transformations are switched on)
        tokens = tokenize_and_transform(movie_review,
                                        stemming=True,
                                        lemmatization=True,
                                        stopword_removal=True,
                                        punctuation_removal=True)

        # TODO transform the review rating into a good data format and append formatted line to output as [URL, rating, raw text, transformed text]
        preprocessed_data.append(None)
    return preprocessed_data

Create output JSON file

In [None]:
# Task: Write a function that uses all the functions you worked on above to save the preprocessed data as a .json
def create_preprocessed_dataset(path_to_tsv: str) -> dict:
    """Read a .tsv file, preprocess its rows and collect the result in a dict (exercise stub).

    Note: ``csv_array`` and all dict values are still ``None`` placeholders and
    nothing is written to disk until the TODOs are solved.

    Args:
        path_to_tsv: Path of the .tsv file with the raw movie reviews.

    Returns:
        dict: Keys ``url``, ``text_transformed``, ``text`` and ``stars``.
    """
    # TODO read data from .tsv file
    csv_array = None

    # preprocess data (language detection is switched on here)
    movie_data = preprocess_movie_data(csv_array, enable_langid=True)

    # TODO zip data and save in a data format usable for JSON exports
    url, stars, text, tokens = None, None, None, None

    preprocessed_data = {
        'url': None,
        'text_transformed': None,
        'text': None,
        'stars': None
    }
    
    # TODO Save the data to the file 'data/preprocessed_notebook.json'
    
    return preprocessed_data

In [None]:
# NOTE(review): the introduction of this part names 'selection_film_reviews.tsv' as the file to use,
# while 'film_review.tsv' is passed here - confirm which file actually exists in data/.
exercise_block_1_output = create_preprocessed_dataset(DATA_FOLDER + 'film_review.tsv')

## Creating datasets for machine learning tasks
As for all machine learning tasks, datasets for training and evaluating your model are needed. The goal of this part is to create correctly formatted training, validation, and test datasets.

In [None]:
# Preparation for tasks below
# Hand-picked movie reviews paired with their ratings, for testing purposes
list_text_stars = [
    [
        "Ein Wunderbarer schön geschriebener Film. John Fords zeitloser Klassiker! Auf gleicher Wellenlänge mit dem Buch.",
        5,
    ],
    [
        "einfach nur hammer. und superlustig. Bin Fan der Reihe und wurde nich enttäuscht. Ganz im Gegenteil.",
        4,
    ],
    [
        "Realitätsfern, aber nett anzusehen.",
        3,
    ],
    [
        "Habe den Film heute gesehen, war ziemlich langweilig! Kaum Spannung, absolut vorhersehbar...",
        0,
    ],
]

# Loading a spaCy model
# Every pipeline component named in `disable` is switched off on load; predict_movie_rating
# is meant to create the vector representation of a text with this model.
de_core = spacy.load("de_core_news_lg", disable=["tagger",
                                                 "morphologizer",
                                                 "parser",
                                                 "lemmatizer",
                                                 "attribute_ruler",
                                                 "ner"])

# loading preprocessed data and their vector representations
# For .npz archives np.load returns an NpzFile that keeps the underlying file
# open. All arrays are copied into a plain dict and the archive is closed right
# away so that no file handle is leaked.
with np.load(DATA_FOLDER + "preprocessed_de_core_vecs.npz") as data_de_core_vectors:
    preprocessed_de_core_vectors = dict(data_de_core_vectors)

# The encoding is stated explicitly so that the German review texts are read
# the same way on every platform instead of relying on the locale default.
with open(PREPROCESSED_DATA_PATH, 'r', encoding='utf-8') as f:
    preprocessed_data = json.load(f)

In [None]:
# TODO Depending on what you choose, set use_easy_code to True for easy version, False for complex version
# (True selects split_dataset_sklearn_version, False selects split_dataset_code_version at the end of this cell)
use_easy_code = True

# Task: Write a function that creates lists containing different type of information, see comments for more detail
def format_dataset(processed_data, processed_data_vectors):
    """Distribute the preprocessed data over four parallel lists (exercise stub).

    Note: Until the TODO is implemented, four empty lists are returned and
    both arguments are unused.

    Args:
        processed_data: Preprocessed review data; the call at the end of this
            cell passes the content of the preprocessed JSON file.
        processed_data_vectors: Vector representations of the texts; the call
            at the end of this cell passes the dict built from the .npz archive.

    Returns:
        Tuple ``(x, x_fulltext, x_vectors, y)``: preprocessed texts, raw texts,
        vector representations of the texts, and movie ratings.
    """
    x = [] # contains preprocessed texts
    x_fulltext = [] # contains raw texts
    x_vectors = [] # contains vector representations of texts
    y = [] # contains movie ratings

    # TODO iterate over the data and append the information of each line to the correct list from above

    return x, x_fulltext, x_vectors, y

# Task: Create a training, validation, and test dataset with a distribution of 70:15:15 (70% of the data in the training set, and 15% each of the data in the validation and test set)

# Easy version: Use the Python library 'sklearn' to create these sets and return split_data
def split_dataset_sklearn_version(x, y, random_seed=42):
    """Create a 70:15:15 train/validation/test split with sklearn (exercise stub).

    Note: All six values of the returned dict are ``None`` until the TODOs are
    implemented; ``random_seed`` is not used yet.

    Args:
        x: Samples to split.
        y: Labels belonging to ``x``.
        random_seed: Seed intended to make the split reproducible. Defaults
            to 42.

    Returns:
        dict: Keys ``x_train``, ``x_val``, ``x_test``, ``y_train``, ``y_val``
        and ``y_test``.
    """
    split_data = {}
    
    # TODO use sklearn to create the train and validation-test set (distribution: 70%-30%)
    x_train, x_val_test, y_train, y_val_test = None, None, None, None

    # TODO with x_val_test and y_val_test, use sklearn to create the validation and test set (distribution: 50%-50%)
    x_val, x_test, y_val, y_test = None, None, None, None
    
    split_data = {
        'x_train': x_train,
        'x_val': x_val,
        'x_test': x_test,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test
    }

    return split_data 
    # End of easy version

# Complex version: Write a function that creates these datasets without the help of sklearn or any other library that can create such sets. 
# Keep in mind that shuffling the data is important!
def split_dataset_code_version(x, y, percent_val, percent_test, random_seed=42):
    """Create a train/validation/test split without library support (exercise stub).

    Note: With the placeholder indices (``train_pos`` and ``val_pos`` are both
    0) the training and validation parts are empty and all data ends up in the
    test part. No shuffling or length check happens yet.

    Args:
        x: Samples to split; must support slicing.
        y: Labels belonging to ``x``; must support slicing.
        percent_val: Share of the data for the validation set; the calls at the
            end of this cell pass 0.15.
        percent_test: Share of the data for the test set; the calls at the end
            of this cell pass 0.15.
        random_seed: Seed intended to make the shuffling reproducible.
            Defaults to 42.

    Returns:
        dict: Keys ``x_train``, ``x_val``, ``x_test``, ``y_train``, ``y_val``
        and ``y_test``.
    """
    split_data = {}
    
    # TODO Check if x and y have the same length and raise a ValueError if not

    # TODO generate a random seed and shuffle data
    
    # TODO determine the percentage value of the size of the training data 
    percent_train = 0
    
    # TODO define indices to access the correct amount of data for training and validation
    train_pos = 0
    val_pos = 0
    
    # x and y are cut at the same positions so that samples and labels stay aligned.
    split_data = {
        'x_train': x[:train_pos],
        'x_val': x[train_pos:val_pos],
        'x_test': x[val_pos:],
        'y_train': y[:train_pos],
        'y_val': y[train_pos:val_pos],
        'y_test': y[val_pos:]
    }

    return split_data


x, x_fulltext, x_vectors, y = format_dataset(preprocessed_data, preprocessed_de_core_vectors)

# Three parallel splits are created from the same labels: preprocessed texts, raw texts and
# vector representations. The complex version is called with 15% validation and 15% test data.
# NOTE(review): split_data is not referenced again in the visible part of the notebook.
if use_easy_code:
    split_data = split_dataset_sklearn_version(preprocessed_data['text_transformed'],
                            preprocessed_data['stars'])
    task_1_split = split_dataset_sklearn_version(x, y)
    task_1_fulltext_split = split_dataset_sklearn_version(x_fulltext, y)
    task_1_vectors_split = split_dataset_sklearn_version(x_vectors, y)
else:
    split_data = split_dataset_code_version(preprocessed_data['text_transformed'],
                            preprocessed_data['stars'],
                            0.15,
                            0.15)
    task_1_split = split_dataset_code_version(x, y, 0.15, 0.15)
    task_1_fulltext_split = split_dataset_code_version(x_fulltext, y, 0.15, 0.15)
    task_1_vectors_split = split_dataset_code_version(x_vectors, y, 0.15, 0.15)

Transforming the representation of the data using One Hot encoding

In [None]:
# Number of rating classes, derived from the highest rating found in the training labels.
# NOTE(review): assumes ratings are integers starting at 0 and that the highest rating occurs in the training split - confirm.
num_labels_task_1 = np.max(task_1_split['y_train']) + 1

# functions to apply one hot encoding 
def num_to_one_hot(num, length):
    """Return a list of ``length`` zeros with a single 1 at position ``num``.

    Args:
        num: Class index with ``0 <= num < length``.
        length: Number of classes, i.e. the length of the encoding.

    Returns:
        list: One-hot encoding of ``num``.

    Raises:
        IndexError: If ``num`` lies outside ``[0, length)``. Negative values
            are rejected explicitly because plain list indexing would count
            from the end and silently produce a wrong label.
    """
    if not 0 <= num < length:
        raise IndexError(f"class index {num} out of range for {length} classes")
    one_hot = [0] * length
    one_hot[num] = 1
    return one_hot

def y_to_one_hot(y, length):
    """One-hot encode every integer label in ``y``.

    Python ints and NumPy integer scalars are encoded. Iterating a NumPy array
    yields NumPy scalars, which are not instances of ``int``, so both types
    have to be checked. Entries of any other type (for example rows that are
    already one-hot encoded) are passed through unchanged.

    Args:
        y: Iterable of labels.
        length: Number of classes, i.e. the length of each encoding.

    Returns:
        np.ndarray: Array built from the encoded (or passed through) entries.

    Raises:
        IndexError: If an integer label lies outside ``[0, length)``.
    """
    return np.array([
        np.array(num_to_one_hot(e, length)) if isinstance(e, (int, np.integer)) else e
        for e in y
    ])


# Task: Write a function that transforms the data into numpy arrays. 
# For each type of data set, data_split should contain a numpy representation of the text data (x sets) and a one hot encoding representation of the ratings (y sets).
def keras_transform_data(data_split):
    """Convert all parts of a data split to numpy arrays (exercise stub).

    Note: The function assigns into the dict it receives and returns that same
    dict, so the caller's dict is modified. Until the TODOs are implemented,
    all six entries are overwritten with ``None``.

    Args:
        data_split: Dict with the keys ``x_train``, ``y_train``, ``x_val``,
            ``y_val``, ``x_test`` and ``y_test``.

    Returns:
        The modified ``data_split`` dict; per the task description the x sets
        are meant to hold numpy representations of the text data and the y
        sets one-hot encoded ratings.
    """
    # TODO create a numpy array for the number of labels
    num_labels = None

    # TODO create numpy arrays for x_train and y_train
    data_split['x_train'] = None
    data_split['y_train'] = None
    
    # TODO create numpy arrays for x_val and y_val
    data_split['x_val'] = None
    data_split['y_val'] = None
    
    # TODO create numpy arrays for x_test and y_test
    data_split['x_test'] = None
    data_split['y_test'] = None
    
    return data_split

# keras_transform_data modifies and returns the dict it is given, so keras_task_1_split and
# task_1_vectors_split refer to the same object after this call.
keras_task_1_split = keras_transform_data(task_1_vectors_split)

## Training a model and making predictions
The goal of this part is to create a machine learning model that predicts the rating of a movie review.

Functions for training the model and making predictions

In [None]:
# Task: Using keras, write a function that creates the machine learning model.

# For the easier version, take the code from easy_code_help.txt
def get_model(n_inputs, n_outputs) -> Sequential:
    """Create the Keras model for rating prediction (exercise stub).

    Note: Until the TODOs are implemented, an empty ``Sequential`` model
    without layers is returned and both arguments are unused.

    Args:
        n_inputs: Input dimension of the model.
        n_outputs: Output dimension of the model.

    Returns:
        Sequential: The created model.
    """
    model = Sequential()
    # TODO create the structure of the Model with three Dense layer and the correct loss function and optimizer
    # TODO add first Dense layer
    # TODO add second Dense Layer 
    # TODO add third Dense Layer
    # TODO apply correct loss function and optimizer
    return model

# Task: Write a function that creates the machine learning model and applies a training and evaluation phase
def train_keras_model(data, batch_size=16, sampling_method="none", epochs=1):
    """Create, train, evaluate and save the rating model (exercise stub).

    Note: ``n_inputs``, ``n_outputs``, the arguments of ``sample_class_data``,
    ``x_train``, ``y_train`` and ``test_pred`` are placeholders that have to be
    replaced while solving the TODOs; ``batch_size``, ``sampling_method`` and
    ``epochs`` are not used yet.

    Args:
        data: Dict with the data split; ``data['y_test']`` is read for the F1
            score.
        batch_size: Batch size intended for training. Defaults to 16.
        sampling_method: Sampling method intended for ``sample_class_data``.
            Defaults to "none".
        epochs: Number of training epochs. Defaults to 1.

    Returns:
        The model created by ``get_model``, after it has been saved below
        ``DATA_FOLDER``.
    """
    # Start of task
    model = None
    # TODO define the input and output dimensions of the model and initialize it with this information
    n_inputs, n_outputs = 0, 0
    model = get_model(n_inputs, n_outputs)
    
    # TODO Using the sample_class_data function from data_preprocessing.py, create x_train and y_train with a sampling method
    x_train, y_train = sample_class_data(None, None, None)

    # TODO convert x_train and y_train to numpy arrays
    x_train = None
    y_train = None
    
    # TODO train the model
    
    # TODO evaluate the model on the test data set
    test_pred = None

    # Prints the f1 score of the model
    # argmax turns each row of y_test and test_pred into a class index; average=None yields one score per class.
    print(np.round(f1_score([np.argmax(e) for e in data['y_test']], 
                            [np.argmax(e) for e in test_pred], 
                            average=None), 2))
    
    # NOTE(review): no file extension is given; newer Keras versions presumably require a
    # '.keras' or '.h5' suffix for model.save - confirm against the installed version.
    model.save(DATA_FOLDER + 'keras_model_movie_reviews')

    print('Model has been saved to data/.')

    # End of task
    return model

# Task: Write a function that predicts the rating of a movie review
def predict_movie_rating(model, text) -> Any:
    """Predict the rating of a single movie review (exercise stub).

    Note: ``vec`` and ``prediction`` are ``None`` placeholders that have to be
    replaced while solving the TODOs.

    Args:
        model: Trained model used for the prediction.
        text: Raw text of the movie review.

    Returns:
        Index of the highest value in the first entry of the prediction.
    """
    # TODO create vector representation of the input text using the spaCy model 'de_core' from further above
    vec = None
    # TODO make the prediction
    prediction = None

    return np.argmax(prediction[0])

In [None]:
# Train the rating model on the transformed vector split for 10 epochs with the "mediansampling" sampling method.
model_t1_o_10 = train_keras_model(keras_task_1_split, sampling_method="mediansampling", epochs=10)

In [None]:
# Print the predicted and the correct rating for each of the hand-picked test reviews.
for movie_review, rating in list_text_stars:
    print(f"Movie review: '{movie_review}', Prediction: {predict_movie_rating(model_t1_o_10, movie_review)}, correct rating: {rating}")
    print('-----------')