This notebook was used to check if there was a significant difference between a pandas or a polar approach for Q1.

In [1]:

from pathlib import Path
import sys
from pathlib import Path
import os

# Add the app directory to the Python path
sys.path.append(str(Path().resolve().parent / "app"))

PROJECT_ROOT = Path(os.getcwd()).parent

DATA_DIR = PROJECT_ROOT / "data"
FILENAME = "farmers-protest-tweets-2021-2-4.json"

In [6]:
from typing import List, Tuple
from datetime import datetime
import polars as pl
import pandas as pd
import time

from extract import read_json_file


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [16]:
def polar_solution(file_path: str) -> List[Tuple[datetime.date, str]]:
    
    data = read_json_file(file_path)
    
    # Create a Polars DataFrame from the processed data
    tweets_df: pl.DataFrame = pl.DataFrame(
        data, schema=["date", "username"], orient="row"
    )

    # Convert the 'date' column to datetime format
    tweets_df = tweets_df.with_columns([
        pl.col("date").str.strptime(
            pl.Datetime, format="%Y-%m-%dT%H:%M:%S%z", strict=False
        )
    ])

    # Group by date and username, counting the number of tweets per user each day
    tweets_per_day = tweets_df.group_by([pl.col("date").dt.date(), "username"]).agg(
        pl.count().alias("tweet_count")
    )

    # Find the user with the most tweets per day
    top_user_per_day = (
        tweets_per_day
        .sort(
            "tweet_count", descending=True
        )
        .group_by("date")
        .agg(
            pl.first("username").alias("top_user")
        )
    )

    # Count the total number of tweets per day
    tweets_by_day = tweets_df.group_by(tweets_df["date"].dt.date()).agg(
        pl.count().alias("total_tweets")
    )

    # Join the total tweets with the top user by day
    top_10_dates = tweets_by_day.join(top_user_per_day, on="date")

    # Sort by total number of tweets per day and select the top 10 dates
    top_10_dates = top_10_dates.sort("total_tweets", descending=True).head(10)

    return top_10_dates


def pandas_solution(file_path: str) -> List[Tuple[datetime.date, str]]:
    # Cargar el JSON en un DataFrame de Pandas
    data = read_json_file(file_path)
    columns = ['date', 'username']
    df = pd.DataFrame(data, columns=columns)

    # Convertir la columna de fecha en formato datetime para fácil manipulación
    df['date'] = pd.to_datetime(df['date'])

    # Agrupar por fecha y usuario, y contar el número de tweets por cada usuario en cada día
    tweets_per_day = df.groupby([df['date'].dt.date, 'username']).size().reset_index(name='tweet_count')

    # Encontrar el usuario con más tweets por día
    top_user_per_day = tweets_per_day.loc[tweets_per_day.groupby('date')['tweet_count'].idxmax()]

    # Contar el número total de tweets por día
    tweets_by_day = df.groupby(df['date'].dt.date).size().reset_index(name='total_tweets')

    # Unir el total de tweets con el usuario que más tweets tiene por cada día
    top_10_dates = pd.merge(tweets_by_day, top_user_per_day, on='date')

    # Ordenar por el número de tweets en cada día y obtener el top 10
    top_10_dates = top_10_dates.sort_values(by='total_tweets', ascending=False).head(10)

    return top_10_dates

# File path
file_path = DATA_DIR / FILENAME

# Measure the time for original solution
start_time = time.time()
polars_data = polar_solution(file_path)
polars_duration = time.time() - start_time

# Measure the time for optimized solution
start_time = time.time()
pandas_data = pandas_solution(file_path)
pandas_duration = time.time() - start_time

print(f"Polars Duration: {polars_duration:.4f} seconds")
print(f"Pandas Duration: {pandas_duration:.4f} seconds")

  pl.count().alias("tweet_count")
  pl.count().alias("total_tweets")


Polars Duration: 3.2676 seconds
Pandas Duration: 3.4433 seconds


We can see there is no big difference between these two approaches. We will keep the Polars approach just to follow the best approach.

This notebook was used to check if there was a significant difference between the most common solution with the `json` library and the `orjson` library.