This notebook was used to check if there is a significant difference between different approaches aiming for a better memory efficiency.

In [1]:

from pathlib import Path
import sys
from pathlib import Path
import os

# Add the app directory to the Python path
sys.path.append(str(Path().resolve().parent.parent / "OPTION - LATAM"))

PROJECT_ROOT = Path(os.getcwd()).parent

DATA_DIR = PROJECT_ROOT / "data"
FILENAME = "farmers-protest-tweets-2021-2-4.json"

In [2]:
import emoji
import heapq

import json
import time
from collections import Counter

from app.utils import memory_profile_logging_wrapper
from app.logger import Logger
Logger(log_dir=".", log_file="log")



<app.logger.Logger at 0x206931ebd10>

In [15]:
@memory_profile_logging_wrapper
def emoji_first_approach(file_path: str):
    emoji_counter = Counter()
    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line)
            if 'content' in data:
                content = data['content']
                # Extraer emojis de cada mensaje
                emoji_counter.update([value.chars for value in emoji.analyze(content)])
    # Top 10 emojis
    top_10_emojis = emoji_counter.most_common(10)
    return top_10_emojis

@memory_profile_logging_wrapper
def emoji_second_approach(filepath):
    emoji_counts = {}

    # Procesar el archivo línea por línea
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            tweet = json.loads(line)
            content = tweet.get('content', '')

            # Filtrar solo los emojis usando emoji librería
            for char in content:
                if emoji.purely_emoji(char):
                    if char in emoji_counts:
                        emoji_counts[char] += 1
                    else:
                        emoji_counts[char] = 1

    # Obtener los top 10 emojis más usados
    top_10_emojis = sorted(emoji_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    return top_10_emojis
@memory_profile_logging_wrapper
def emoji_third_approach(filepath):
    emoji_counts = {}

    # Procesar el archivo línea por línea
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            tweet = json.loads(line)
            content = tweet.get('content', '')

            # Filtrar solo los emojis usando emoji librería
            for char in content:
                if emoji.purely_emoji(char):
                    if char in emoji_counts:
                        emoji_counts[char] += 1
                    else:
                        emoji_counts[char] = 1

    # Obtener los top 10 emojis más usados
    top_10_emojis = heapq.nlargest(10, emoji_counts.items(), key=lambda x: x[1])
    return top_10_emojis
# File path
file_path = DATA_DIR / FILENAME

# Measure the time for original solution
start_time = time.time()
first_data = emoji_first_approach(file_path)
first_duration = time.time() - start_time
print(first_data)

# Measure the time for optimized solution
start_time = time.time()
second_data = emoji_second_approach(file_path)
second_duration = time.time() - start_time
print(second_data)

# Measure the time for optimized solution
start_time = time.time()
third_data = emoji_third_approach(file_path)
third_duration = time.time() - start_time
print(third_data)


ERROR: Could not find file C:\Users\dzimm\AppData\Local\Temp\ipykernel_15728\2644555730.py
[('🙏', 5049), ('😂', 3072), ('🚜', 2972), ('🌾', 2182), ('🇮🇳', 2086), ('🤣', 1668), ('✊', 1651), ('❤️', 1382), ('🙏🏻', 1317), ('💚', 1040)]
ERROR: Could not find file C:\Users\dzimm\AppData\Local\Temp\ipykernel_15728\2644555730.py
[('🙏', 7286), ('😂', 3072), ('️', 3061), ('🚜', 2972), ('✊', 2411), ('🌾', 2363), ('🏻', 2080), ('❤', 1779), ('🤣', 1668), ('🏽', 1218)]
ERROR: Could not find file C:\Users\dzimm\AppData\Local\Temp\ipykernel_15728\2644555730.py
[('🙏', 7286), ('😂', 3072), ('️', 3061), ('🚜', 2972), ('✊', 2411), ('🌾', 2363), ('🏻', 2080), ('❤', 1779), ('🤣', 1668), ('🏽', 1218)]


We can see that the two last solutions have some undefined or empty emojis in the list. We will consider this as a erroneous approach. To solve this we will use a fixed list of emojis instead of using the emojis en the `emoji` library. 

We don't have full control of that library and the output of its methods are not consistent. As I don't know that library and because of the erratic results, I prefer not to use it.

We will use the `heapq` library for the sort and find operation.

In [3]:
import heapq
import json
from collections import defaultdict
from typing import List, Tuple
from app.constants import EMOJI_PATTERN

from tqdm import tqdm

# @memory_profile_logging_wrapper
def q2_memory(file_path: str) -> List[Tuple[str, int]]:
    """Answer question 2 efficiently in memory.

        I use the `emoji` library to make things simpler.
        By doing so we leave it up to the library to keep
        the list of existing emojis up to date (
        thus reducing complexity and maintenance on our side).
        The downside of this is that we have no control over
        the implementation of this library and the processing
        may not be thinking about memory efficiency.

        Despite this, I consider that for this particular application,
        after some tests, this library fits very well for
        the usage scenario. The simplified emoji search
        implies that this library is an excellent solution.

    Parameters
    ----------
    file_path : str
        Path of the json file to be loaded

    Returns
    -------
    List[Tuple[datetime.date, str]]
        List of the 10 most used emojis and
        the number of times they had been used.
    """

    # Create regex pattern
    # We include every emoji with an OR ('|') operator
    emoji_counter = defaultdict(int)

    _break = False
    with open(file_path, 'r', encoding="utf-8") as file:
        for line in tqdm(file):
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            if 'content' not in data:
                continue
            # Extract all emojis from tweet and update dict with count
            for match in EMOJI_PATTERN.findall(data["content"]):
                if match:  # Only count non-None matches
                    emoji_counter[match] += 1

    # Top 10 emojis
    top_10_emojis = heapq.nlargest(10, emoji_counter.items(), key=lambda x: x[1])
    return top_10_emojis
file_path = DATA_DIR / FILENAME
start_time = time.time()
third_data = q2_memory(file_path)
third_duration = time.time() - start_time
print(third_data)


117407it [00:43, 2714.90it/s]

[('#', 331643), ('2', 13864), ('1', 12475), ('4', 11934), ('3', 10996), ('5', 10199), ('7', 9993), ('6', 9981), ('8', 9781), ('9', 9765)]



