# Description

This notebook reads a CSV file and processes the data.

# Set Up Environment

## Import Libraries

In [1]:
import spacy
import classy_classification
import csv
import string
import re
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_md")


  from .autonotebook import tqdm as notebook_tqdm


## Download spaCy language package

In [2]:
# One-time setup: downloads the en_core_web_md model that spacy.load("en_core_web_md") loads.
# On a fresh environment this has to run before the model can be loaded.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
                                              0.0/42.8 MB ? eta -:--:--
                                              0.1/42.8 MB 2.2 MB/s eta 0:00:20
                                              0.4/42.8 MB 4.8 MB/s eta 0:00:09
                                              0.8/42.8 MB 6.5 MB/s eta 0:00:07
     -                                        1.5/42.8 MB 8.4 MB/s eta 0:00:05
     --                                       2.4/42.8 MB 10.8 MB/s eta 0:00:04
     --                                       3.2/42.8 MB 12.0 MB/s eta 0:00:04
     ---                                      3.6/42.8 MB 12.2 MB/s eta 0:00:04
     ---                                      3.7/42.8 MB 11.3 MB/s eta 0:00:04
     ---                                      3.7/42.8 MB 11.3 MB/s eta 0:00:04
     ----                           

## Load csv file

In [3]:
filename = "data/dataset_shakes_cakes.csv"

# Read the file exactly once. A csv.reader consumes its file handle, so a
# second reader created on the same (already exhausted) handle yields no rows;
# that is why original_data used to end up as an empty list.
# newline='' is the mode the csv module asks for so that line breaks inside
# quoted fields are handled by the reader itself.
with open(filename, "r", encoding='utf-8', newline='') as file:
    new_data = list(csv.reader(file, delimiter=";"))

# Row-level copy of the parsed table: new_data rows are extended in place
# further down, and original_data must not change along with them.
original_data = [row[:] for row in new_data]

# Get only the assessment statement (third column) of every data row.
# Rows with fewer than three columns are skipped, so this list can be shorter
# than new_data[1:]; add_elements_to_sublists raises ValueError in that case.
assessment = [row[2] for row in new_data[1:] if len(row) >= 3]

## Define Functions

In [4]:
def tokenizing(not_tokenized_list):
    """Tokenize every text with the notebook-level ``nlp`` pipeline.

    Args:
        not_tokenized_list: iterable of texts; each one is passed to ``nlp``.

    Returns:
        A list with one inner list per input text, holding the ``text`` of
        every token of the resulting document, in document order.
    """
    return [[tok.text for tok in nlp(text)] for text in not_tokenized_list]

def remove_stopwords(tokens, stopwords=None):
    """Drop stop words from every token list.

    Args:
        tokens: iterable of token lists (lists of strings).
        stopwords: optional container of lowercase stop words. When omitted,
            the ``STOP_WORDS`` set imported from spaCy is used, which is the
            behaviour this function always had.

    Returns:
        A new list of lists; each inner list keeps, in order, the tokens whose
        lowercased form is not contained in ``stopwords``.
    """
    if stopwords is None:
        stopwords = STOP_WORDS
    # Tokens are lowercased only for the membership test; the kept tokens
    # retain their original casing.
    return [
        [token for token in sublist if token.lower() not in stopwords]
        for sublist in tokens
    ]

def remove_punctuation(tokens):
    """Filter out tokens that the ``nlp`` vocabulary flags as punctuation.

    Args:
        tokens: iterable of token lists (lists of strings).

    Returns:
        A new list of lists that keeps, in order, every token whose vocabulary
        entry ``nlp.vocab[token]`` has ``is_punct`` set to a falsy value.
    """
    return [
        [tok for tok in row if not nlp.vocab[tok].is_punct]
        for row in tokens
    ]

def remove_assessment_terms(sublist):
    """Return the items of ``sublist`` that do not start with an assessment marker.

    An item is dropped when ``re.match`` finds the pattern at its start, i.e.
    when it begins with "ASSESSMENT:1", "ASSESSMENT1" or "ASSESSMENT"
    (case-sensitive). Matching items are removed wherever they occur in the
    list, not only at the first position.
    """
    regex_pattern = r"^(ASSESSMENT:1|ASSESSMENT1|ASSESSMENT)"
    kept = []
    for item in sublist:
        if re.match(regex_pattern, item) is None:
            kept.append(item)
    return kept

def remove_assessment_terms_from_list(lst):
    """Apply ``remove_assessment_terms`` to every sublist of ``lst``.

    Returns:
        A new list holding the filtered sublists in the original order.
    """
    filtered = []
    for sublist in lst:
        filtered.append(remove_assessment_terms(sublist))
    return filtered

def remove_empty_strings(input_list):
    """Drop empty and whitespace-only strings from every sublist.

    Args:
        input_list: iterable of lists of strings.

    Returns:
        A new list of lists; kept words are returned unchanged (not stripped).
    """
    result = []
    for sublist in input_list:
        kept = []
        for word in sublist:
            # A word that strips down to nothing is empty or pure whitespace.
            if not word.strip():
                continue
            kept.append(word)
        result.append(kept)
    return result

def lowercase_tokens(sublists):
    """Return a copy of ``sublists`` with every token converted via ``str.lower``.

    Args:
        sublists: iterable of token lists (lists of strings).

    Returns:
        A new list of lists with the same shape and order as the input.
    """
    return [[tok.lower() for tok in row] for row in sublists]


# Data Processing

## Tokenization

In [5]:
# One list of token texts per assessment statement.
tokenized_data = tokenizing(assessment)

## Remove Stopwords

In [6]:
# Token lists with stop words filtered out (despite the name, this holds the tokens that were kept).
removed_list = remove_stopwords(tokenized_data)

## Remove Punctuation

In [7]:
# Token lists after additionally dropping punctuation tokens.
updated_tokenized_strings = remove_punctuation(removed_list)

## Process strings

Remove tokens that start with "ASSESSMENT" from every sublist, delete empty strings, and lowercase the remaining tokens.

In [8]:
# Step 1: drop tokens that start with "ASSESSMENT".
cleaned_sublists = remove_assessment_terms_from_list(updated_tokenized_strings)
# Step 2: drop empty / whitespace-only tokens.
# Note: `cleaned_sublist` (singular) is a different variable from `cleaned_sublists` above.
cleaned_sublist = remove_empty_strings(cleaned_sublists)
# Step 3: lowercase everything; this is the input of the lemmatization cell.
lowercased_tokens = lowercase_tokens(cleaned_sublist)

## Lemmatization using spaCy

In [9]:
def lemmatize_token(token):
    """Return the value of the ``lemma_`` attribute of ``token``."""
    lemma = token.lemma_
    return lemma


# Lemmatize token by token: every lowercased token is run through ``nlp`` on
# its own and the lemma of the first token of the resulting doc is kept.
# An empty doc contributes an empty string so the list lengths stay aligned.
lemmatized_data_spacy = []
for token_row in lowercased_tokens:
    lemma_row = []
    for text in token_row:
        parsed = nlp(text)
        if len(parsed) > 0:
            lemma_row.append(lemmatize_token(parsed[0]))
        else:
            lemma_row.append('')
    lemmatized_data_spacy.append(lemma_row)

# Process Data Format 

## Computations for the comprehensive dataset

In [12]:
# Split the parsed table into its header row and its data rows.
header = new_data[0]

# Only add the new column name when it is not already the last header entry,
# so re-running this cell does not append it a second time.
if header[-1] != "Processed_Tokens":
    header.append("Processed_Tokens")

# The slice creates a new outer list, but the row lists themselves are still
# the ones stored in new_data.
new_content = new_data[1:]

In [13]:
# Preview only the first few documents; printing the whole corpus floods the
# notebook output and stores every record in the saved file.
lemmatized_data_spacy[:5]

[['allergic', 'rhinitis'], ['patient', '56', 'year', 'old', 'female', 'present', 'bariatric', 'surgery', 'service', 'body', 'mass', 'index', '41', 'obesity', 'relate', 'comorbiditie', 'patient', 'interested', 'gastric', 'bypass', 'surgery', 'patient', 'appear', 'excellent', 'candidate', 'benefit', 'greatly', 'management', 'comorbiditie'], ['patient', 'status', 'post', 'lap', 'band', 'adjustment', 'total', '7', 'ml', 'band', 'tolerate', 'water', 'postprocedure', 'come', 'week', 'adjustment', 'need'], ['abdominal', 'cramp', 'past', '2', 'day', '1', '2', '10', 'intensity', 'performance', 'status', 'karnofsky', 'score', '100', 'continue', 'work', 'time', 'nutritional', 'status', 'appetite', 'depressed', 'past', 'couple', 'day', 'lose', '5', 'pound', 'week.)psychiatric', 'stress', 'upcoming', 'irs', 'audits', 'client', 'review', 'system', 'noncontributory', 'medications1', 'nyquil.2', 'timolol', 'eye', 'drops.3', 'aspirin.4', 'advil.5', 'zinc', 'physical', 'examinationgeneral', 'pleasant', 

In [14]:
def add_elements_to_sublists(main_list, sublists):
    """Append the i-th element of ``main_list`` to the i-th list in ``sublists``.

    The inner lists are modified in place and the same ``sublists`` object is
    returned.

    Args:
        main_list: elements to append, one per sublist.
        sublists: list of lists that receive the elements.

    Returns:
        ``sublists`` itself, after the appends.

    Raises:
        ValueError: if both arguments do not have the same length.
    """
    if len(sublists) != len(main_list):
        raise ValueError("Length of main_list and sublists should be the same.")

    for target, extra in zip(sublists, main_list):
        target.append(extra)
    return sublists

# Appends each row's lemma list as an extra column. The rows are extended in
# place (they are the same list objects that new_data holds), so running this
# cell a second time appends the column again.
new_content = add_elements_to_sublists(lemmatized_data_spacy, new_content)

## Computations for the dataset for the machine learning model

Save in the format: [["status", "tokens as a single string without punctuation"]]

In [15]:
#Keep only the status[4] and processed_tokens[5] column
def extract_items(list_of_lists):
    """Reduce every row to a two-element list holding its items at index 4 and 5.

    Args:
        list_of_lists: rows that each have at least six elements.

    Returns:
        A new list of ``[row[4], row[5]]`` pairs in the original row order.
    """
    extracted_items = []
    for row in list_of_lists:
        extracted_items.append([row[4], row[5]])
    return extracted_items

# Build the ML rows: [status, tokens as one string].
# The tokens are first joined with ', ' and then every comma is removed, which
# leaves them separated by single spaces (commas inside tokens vanish too).
ml_data = [
    [status, ', '.join(str(tok) for tok in tokens).replace(',', '')]
    for status, tokens in extract_items(new_content)
]




In [16]:
# Header of the ML dataset: the same two columns (index 4 and 5) that
# extract_items keeps from every row.
ml_header = [header[4], header[5]]
print(ml_header)

['Status', 'Processed_Tokens']


# Save CSV Files

In [17]:
# with open('data/data_pipelined.csv', 'w', encoding='UTF8', newline='') as f:
#     writer = csv.writer(f)

#     # write the header
#     writer.writerow(header)

#     # write multiple rows
#     writer.writerows(new_content)

In [18]:
# with open('data/ml_data_spacy.csv', 'w', encoding='UTF8', newline='') as f:
#     writer = csv.writer(f)

#     # write the header
#     writer.writerow(ml_header)

#     # write multiple rows
#     writer.writerows(ml_data)