In [None]:
import re
import pandas as pd
from numba import njit
import matplotlib.pyplot as plt
import timeit
import numpy as np

def lix_calc1(text):
    """Return the LIX readability index of ``text``.

    LIX is the average number of words per sentence plus the percentage of
    long words, where a long word has more than six word characters.
    Sentences are approximated as the non-blank segments obtained by
    splitting the text on '.', '!' and '?'.

    Args:
        text: The string to score.

    Returns:
        The LIX score as a float, or 0.0 when ``text`` contains no words
        (the index is undefined there and would otherwise divide by zero).
    """
    # Split text into words by common delimiters
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        # No words: both ratios below would be x / 0.
        return 0.0

    # Count long words (more than 6 characters)
    long_words = sum(1 for word in words if len(word) > 6)

    # Count sentences: every non-blank segment between terminators.
    # At least one segment is non-blank here because a word exists.
    sentences = re.split(r'[.!?]', text)
    num_sentences = sum(1 for sentence in sentences if sentence.strip())

    # Calculate LIX index
    return num_words / num_sentences + (long_words / num_words) * 100


def lix_calc2(text):
    """Return the LIX readability index of ``text`` (map/lambda variant).

    Computes the same quantity as a generator-based implementation would:
    words per sentence plus the percentage of words longer than six word
    characters, with sentences approximated as the non-blank segments
    between '.', '!' and '?'. The counting is expressed with ``map`` and
    ``sum`` so the two styles can be benchmarked against each other.

    Args:
        text: The string to score.

    Returns:
        The LIX score as a float, or 0.0 when ``text`` contains no words
        (avoids a division by zero on empty or punctuation-only input).
    """
    # Split text into words by common delimiters
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        # No words: both ratios below would be x / 0.
        return 0.0

    # Count long words (more than 6 characters); True sums as 1
    long_words = sum(map(lambda word: len(word) > 6, words))

    # Count sentences: non-blank segments between '.', '!' and '?'
    num_sentences = sum(map(lambda s: bool(s.strip()), re.split(r'[.!?]', text)))

    # Calculate LIX index
    return num_words / num_sentences + (long_words / num_words) * 100


def lix_calc3(text):
    """Return the LIX readability index of ``text`` (delimiter-count variant).

    Words per sentence plus the percentage of words longer than six word
    characters. Unlike a split-based sentence count, this variant counts
    every '.', '!' and '?' character as one sentence end, so abbreviations
    and trailing text without a terminator are scored differently.

    Args:
        text: The string to score.

    Returns:
        The LIX score as a float, or 0.0 when ``text`` contains no words.
        Text with words but no terminator is treated as one sentence.
    """
    # Split the text into words directly
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        # The long-word ratio below would be 0 / 0.
        return 0.0

    # Count long words (more than 6 characters)
    long_words = sum(1 for word in words if len(word) > 6)

    # Count sentences by counting delimiters directly; never fewer than one
    # so the words-per-sentence ratio cannot divide by zero.
    num_sentences = max(1, text.count('.') + text.count('!') + text.count('?'))

    # Calculate LIX index
    return num_words / num_sentences + (long_words / num_words) * 100

def time_them():
    """Print per-call timings of the three scalar LIX functions on ``test``.

    Uses the standard ``timeit`` module rather than the IPython ``%timeit``
    magic, so the file stays valid Python outside a notebook. For each
    function the loop count is chosen automatically, seven runs are taken,
    and the mean and standard deviation per call are printed.
    """
    for func in (lix_calc1, lix_calc2, lix_calc3):
        timer = timeit.Timer(lambda: func(test))
        # autorange picks a loop count large enough for a stable measurement.
        loops, _ = timer.autorange()
        per_call = [elapsed / loops
                    for elapsed in timer.repeat(repeat=7, number=loops)]
        print(f"{func.__name__}: {np.mean(per_call) * 1e6:.2f} us "
              f"+/- {np.std(per_call) * 1e6:.2f} us per loop "
              f"(7 runs, {loops} loops each)")

def lix_calc_matrix(matrix, lix_method):
    """Fill in the LIX score of every row of ``matrix`` in place.

    Args:
        matrix: Sequence of mutable two-element rows; element 0 holds the
            text and element 1 is overwritten with its LIX score.
        lix_method: 1 selects ``lix_calc1``, 2 selects ``lix_calc2``; any
            other value selects ``lix_calc3``.

    Returns:
        The same ``matrix`` object, after its rows have been updated.
    """
    # Pick the scoring function; anything other than 1 or 2 means method 3.
    scorer = (
        lix_calc1 if lix_method == 1
        else lix_calc2 if lix_method == 2
        else lix_calc3
    )

    # Score each row's text and store the result alongside it.
    for row in matrix:
        row[1] = scorer(row[0])
    return matrix
    
def lix_vectorized1(texts):
    """Compute LIX scores for many texts using pandas string methods.

    Args:
        texts: Iterable of rows whose first element is the text to score.

    Returns:
        A list of ``[text, lix]`` lists, one per input row, in input order.
        Sentences are counted as occurrences of '.', '!' or '?', with a
        count of zero treated as one. No guard is applied for texts without
        words; expect a NaN score there (pandas 0 / 0) rather than an
        exception.
    """
    # One row per text; only the first element of each input row is used.
    frame = pd.DataFrame([row[0] for row in texts], columns=['text'])
    text_col = frame['text']

    # Per-text counts, each computed with a vectorized string operation.
    word_counts = text_col.str.findall(r'\b\w+\b').str.len()
    long_counts = text_col.str.count(r'\b\w{7,}\b')  # 7+ characters
    sentence_counts = text_col.str.count(r'[.!?]').replace(0, 1)  # avoid / 0

    # Words per sentence plus percentage of long words.
    frame['lix'] = word_counts / sentence_counts + (long_counts / word_counts) * 100

    return frame[['text', 'lix']].values.tolist()

@njit
def calculate_lix(num_words, long_words, num_sentences):
    """Return the LIX score for pre-computed counts.

    Args:
        num_words: Total number of words in the text.
        long_words: Number of words counted as long.
        num_sentences: Number of sentences; zero is treated as one.

    Returns:
        ``num_words / num_sentences + 100 * long_words / num_words`` as a
        float, or 0.0 when ``num_words`` is zero.
    """
    # A text without words has no defined long-word ratio; avoid 0 / 0.
    if num_words == 0:
        return 0.0
    # Avoid division by zero
    if num_sentences == 0:
        num_sentences = 1
    return num_words / num_sentences + (long_words / num_words) * 100

def preprocess_texts(texts):
    """Collect the per-text counts needed for the LIX formula.

    Args:
        texts: Iterable of strings.

    Returns:
        A tuple of three parallel lists, in input order: word counts,
        counts of words longer than six characters, and sentence counts.
        The sentence count is the number of '.', '!' and '?' characters,
        but never less than one.
    """
    word_totals, long_totals, sentence_totals = [], [], []

    for document in texts:
        tokens = re.findall(r'\b\w+\b', document)
        terminators = sum(document.count(mark) for mark in '.!?')

        word_totals.append(len(tokens))
        long_totals.append(len([token for token in tokens if len(token) > 6]))
        # Treat text without any terminator as a single sentence.
        sentence_totals.append(terminators if terminators > 0 else 1)

    return word_totals, long_totals, sentence_totals

def lix_vectorized2(texts):
    """Compute LIX scores for many texts via ``preprocess_texts`` and ``calculate_lix``.

    Args:
        texts: Iterable of rows whose first element is the text to score.

    Returns:
        A list of ``(text, lix)`` tuples, one per input row, in input order.
    """
    # Only the first element of each row is the text.
    documents = [entry[0] for entry in texts]

    # Word, long-word and sentence counts as three parallel lists.
    word_counts, long_counts, sentence_counts = preprocess_texts(documents)

    # Score each text from its counts with the Numba-compiled formula.
    scores = list(map(calculate_lix, word_counts, long_counts, sentence_counts))

    return list(zip(documents, scores))


# Sample input: a short Danish passage describing the LIX index. The timing
# helpers in this file read this module-level name as their benchmark text.
test = "LIX er en forkortelse for læsbarhedsindeks (sv. läsbarhetsindex), der er en skala der giver et mål for en teksts læsbarhed. Det opgøres som det gennemsnitlige antal ord pr. helsætning, plus procentdelen af lange ord, altså ord der er over seks bogstaver lange. LIX blev introduceret af den svenske pædagog C.H. Björnsson (1916-1988)."


def time_big_ones(n_values, iterations=40):
    """Benchmark every batch LIX implementation at several input sizes.

    For each ``n`` a fresh list of ``n`` ``[test, 0]`` rows is built and each
    implementation is timed ``iterations`` times, one call per timing.

    Args:
        n_values: Iterable of input sizes (number of rows) to benchmark.
        iterations: Number of timed calls per implementation and size.

    Returns:
        A dict mapping each implementation label to a list of mean runtimes
        in seconds, one entry per value in ``n_values`` (same order).
    """
    def build_runs(data):
        # (label, zero-argument callable) pairs, in reporting order.
        return [
            ('lix_calc_matrix_1', lambda: lix_calc_matrix(data, 1)),
            ('lix_calc_matrix_2', lambda: lix_calc_matrix(data, 2)),
            ('lix_calc_matrix_3', lambda: lix_calc_matrix(data, 3)),
            ('lix_vectorized1', lambda: lix_vectorized1(data)),
            ('lix_vectorized2', lambda: lix_vectorized2(data)),
        ]

    # Untimed warm-up on a single row so one-time costs (e.g. Numba's JIT
    # compilation on first call) are not averaged into the first measurement.
    warmup_runs = build_runs([[test, 0]])
    runtimes = {label: [] for label, _ in warmup_runs}
    for _, run in warmup_runs:
        run()

    for n in n_values:
        big_test = [[test, 0] for _ in range(n)]

        for label, run in build_runs(big_test):
            samples = timeit.repeat(run, repeat=iterations, number=1)
            # Store the average of the repeated runtimes
            runtimes[label].append(np.mean(samples))

    return runtimes

# Plot the benchmark results
def plot_runtimes(n_values, runtimes):
    """Draw one runtime-versus-input-size line per implementation and show it.

    Args:
        n_values: Input sizes, used as the x coordinates.
        runtimes: Mapping of label to a sequence of runtimes, one per entry
            in ``n_values``; each label becomes a legend entry.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    for label, series in runtimes.items():
        ax.plot(n_values, series, label=label)

    ax.set_xlabel('n (size of big_test)')
    ax.set_ylabel('Average runtime (seconds)')
    ax.set_title('Runtime Analysis for Increasing n')
    ax.legend()
    ax.grid(True)
    plt.show()

# Benchmark driver: time every implementation at each of these input sizes.
n_values = [
    100,
    500,
    1000,
    5000,
    10000,
    20000,
    50000,
]
runtimes = time_big_ones(n_values)
# Plotting is disabled by default; uncomment to chart the collected timings.
# plot_runtimes(n_values, runtimes)