This notebook checks whether there is a significant difference between several approaches to counting the most-used emojis, with the goal of better memory efficiency.

In [1]:

from pathlib import Path
import sys
from pathlib import Path
import os

# The notebook lives one level below the project root, so the root is the
# parent of the current working directory.
PROJECT_ROOT = Path(os.getcwd()).parent

# Add the app directory to the Python path so that its modules can be imported.
# The membership check keeps the cell idempotent: re-running it does not append
# the same entry to sys.path again.
APP_DIR = PROJECT_ROOT / "app"
if str(APP_DIR) not in sys.path:
    sys.path.append(str(APP_DIR))

# Location and name of the input data file.
DATA_DIR = PROJECT_ROOT / "data"
FILENAME = "farmers-protest-tweets-2021-2-4.json"

In [10]:
import emoji
import heapq

import json
import time
from collections import Counter

from utils import memory_profile_logging_wrapper
from logger import Logger
# Instantiate the project Logger, passing the current directory as log_dir and
# "log" as log_file. Presumably this configures the logging used by
# memory_profile_logging_wrapper -- confirm in logger.py.
# As the last expression of the cell, the instance's repr is displayed as output.
Logger(log_dir=".", log_file="log")



<logger.Logger at 0x1914a2bb050>

In [15]:
@memory_profile_logging_wrapper
def emoji_first_approach(file_path: str):
    """Return the 10 most frequent emojis in the ``content`` field of a JSON-lines file.

    Every line of the file is parsed as a standalone JSON document. Emojis are
    extracted from the ``content`` value with ``emoji.analyze`` and the
    ``chars`` of each yielded token is counted. Records without a ``content``
    key are ignored.

    Args:
        file_path: Path of the JSON-lines file; passed directly to ``open``.

    Returns:
        A list of ``(emoji, count)`` tuples from ``Counter.most_common(10)``,
        ordered from most to least frequent.
    """
    emoji_counter = Counter()
    # Explicit UTF-8: relying on the platform default encoding (e.g. a legacy
    # code page on Windows) can raise or garble any non-ASCII text in the file.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            if 'content' in data:
                # Feed the tokens lazily; no intermediate list per message.
                emoji_counter.update(
                    token.chars for token in emoji.analyze(data['content'])
                )
    # Top 10 emojis
    return emoji_counter.most_common(10)

@memory_profile_logging_wrapper
def emoji_second_approach(filepath):
    """Count emojis character by character in a plain dict and return the top 10.

    The file is read line by line; each line is parsed as JSON and its
    ``content`` value (empty string when the key is missing) is scanned one
    character at a time. Characters accepted by ``emoji.purely_emoji`` are
    tallied.

    Note: because the text is scanned per character, an emoji made of several
    code points is counted as its separate components rather than as one emoji.

    Args:
        filepath: Path of the JSON-lines file; passed directly to ``open``.

    Returns:
        A list with up to 10 ``(character, count)`` tuples, highest count first.
    """
    tally = {}

    # Stream the file one line at a time
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            text = json.loads(raw_line).get('content', '')

            # Keep only the characters the emoji library recognises as emoji
            for symbol in text:
                if not emoji.purely_emoji(symbol):
                    continue
                tally[symbol] = tally.get(symbol, 0) + 1

    # Full descending sort by count, then keep the first 10 entries
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:10]
@memory_profile_logging_wrapper
def emoji_third_approach(filepath):
    """Count emojis character by character and pick the top 10 with ``heapq``.

    Counting works on single characters of each record's ``content`` value
    (empty string when the key is missing), keeping those accepted by
    ``emoji.purely_emoji``. The final selection uses ``heapq.nlargest``
    instead of sorting all entries.

    Note: because the text is scanned per character, an emoji made of several
    code points is counted as its separate components rather than as one emoji.

    Args:
        filepath: Path of the JSON-lines file; passed directly to ``open``.

    Returns:
        A list with up to 10 ``(character, count)`` tuples, highest count first.
    """
    tally = {}

    # Stream the file one line at a time
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            record = json.loads(raw_line)

            # Keep only the characters the emoji library recognises as emoji
            for symbol in record.get('content', ''):
                if emoji.purely_emoji(symbol):
                    tally[symbol] = tally.get(symbol, 0) + 1

    # Select the 10 entries with the largest counts
    return heapq.nlargest(10, tally.items(), key=lambda pair: pair[1])
# File path
file_path = DATA_DIR / FILENAME


def _timed_call(approach, path):
    """Run ``approach(path)`` and return ``(result, elapsed_seconds)``."""
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # durations; time.time() follows the wall clock and can jump if it is adjusted.
    started = time.perf_counter()
    result = approach(path)
    return result, time.perf_counter() - started


# Each measurement covers the decorated call, so it includes whatever overhead
# memory_profile_logging_wrapper adds.

# First approach: emoji.analyze + Counter
first_data, first_duration = _timed_call(emoji_first_approach, file_path)
print(first_data)

# Second approach: per-character dict counting + sorted
second_data, second_duration = _timed_call(emoji_second_approach, file_path)
print(second_data)

# Third approach: per-character dict counting + heapq.nlargest
third_data, third_duration = _timed_call(emoji_third_approach, file_path)
print(third_data)


ERROR: Could not find file C:\Users\dzimm\AppData\Local\Temp\ipykernel_15728\2644555730.py
[('🙏', 5049), ('😂', 3072), ('🚜', 2972), ('🌾', 2182), ('🇮🇳', 2086), ('🤣', 1668), ('✊', 1651), ('❤️', 1382), ('🙏🏻', 1317), ('💚', 1040)]
ERROR: Could not find file C:\Users\dzimm\AppData\Local\Temp\ipykernel_15728\2644555730.py
[('🙏', 7286), ('😂', 3072), ('️', 3061), ('🚜', 2972), ('✊', 2411), ('🌾', 2363), ('🏻', 2080), ('❤', 1779), ('🤣', 1668), ('🏽', 1218)]
ERROR: Could not find file C:\Users\dzimm\AppData\Local\Temp\ipykernel_15728\2644555730.py
[('🙏', 7286), ('😂', 3072), ('️', 3061), ('🚜', 2972), ('✊', 2411), ('🌾', 2363), ('🏻', 2080), ('❤', 1779), ('🤣', 1668), ('🏽', 1218)]


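We can see that the last two solutions return some entries that look undefined or empty. They scan the text one character at a time, so emojis made of several code points (for example skin-tone variants or emojis followed by a variation selector) are split into their components, and the counts are distorted (e.g. 🙏 is counted 7286 times instead of 5049). We consider this an erroneous approach and will use the first one.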

We will use the `heapq` module for the top-10 selection (the sort-and-find step).