In [1]:
!pip uninstall -y geopandas numpy matplotlib pandas polars

Found existing installation: geopandas 1.0.1
Uninstalling geopandas-1.0.1:
  Successfully uninstalled geopandas-1.0.1
Found existing installation: numpy 2.2.0
Uninstalling numpy-2.2.0:
  Successfully uninstalled numpy-2.2.0
Found existing installation: matplotlib 3.9.3
Uninstalling matplotlib-3.9.3:
  Successfully uninstalled matplotlib-3.9.3
Found existing installation: pandas 2.2.3
Uninstalling pandas-2.2.3:
  Successfully uninstalled pandas-2.2.3
Found existing installation: polars 1.17.1
Uninstalling polars-1.17.1:
  Successfully uninstalled polars-1.17.1


In [2]:
!pip install geopandas numpy==1.26.4 matplotlib pandas polars

Collecting geopandas
  Using cached geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Collecting matplotlib
  Using cached matplotlib-3.9.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting polars
  Using cached polars-1.17.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (14 kB)
Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
Using cached geopandas-1.0.1-py3-none-any.whl (323 kB)
Using cached matplotlib-3.9.3-cp312-cp312-macosx_11_0_arm64.whl (7.8 MB)
Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
Using cached polars-1.17.1-cp39-abi3-macosx_11_0_arm64.whl (28.8 MB)
Installing collected packages: polars, numpy, pandas, matplotlib, geopandas
[31mERROR: pip's dependency resolver does not currently take into account all the

In [2]:
import polars as pl
import os

# Enable progress bar for lazy execution
pl.Config.set_tbl_cols(100)  # Optional: set table column width
pl.Config.set_tbl_rows(50)  # Optional: set table row height
pl.Config.set_verbose(True)  # Enable verbose output
# pl.Config.set_streaming(True)  # Enable lazy evaluation streaming (if supported)

DATA_FOLDER = os.path.join(os.getcwd(), '../data/processed/')

In [2]:
df_ratings_no_text = pl.read_parquet(DATA_FOLDER + 'ratings_no_text.pq')
df_ratings_text = pl.read_parquet(DATA_FOLDER + 'ratings_text_cleaned_lemma.pq')

parquet scan with parallel = Columns
parquet scan with parallel = RowGroups


In [4]:
import numpy as np
from collections import Counter
from scipy.stats import pearsonr
import re

In [14]:
import polars as pl
import numpy as np
from collections import Counter
from scipy.stats import pearsonr
import re
from tqdm import tqdm

def analyze_beer_reviews(df):
    """
    Analyze relationships between review text and other variables in beer reviews dataset.

    Parameters:
    df: polars.DataFrame with columns: text_cleaned_lemma, rating, palate, appearance,
                                     aroma, overall, year, country

    Returns:
    dict: Analysis results including common terms, correlations, and temporal trends
    """
    results = {}
    print("\n🍺 Starting beer reviews analysis...")

    # 1. Most common words by rating level
    print("\n📊 Analyzing word frequencies by rating...")
    def get_top_words(text_series, n=20):
        words = ' '.join(text_series.to_list()).split()
        return Counter(words).most_common(n)

    # Group by rating ranges
    df = df.with_columns([
        pl.col('rating').map_elements(lambda x: f"{int(x)}_stars")
        .alias('rating_group')
    ])

    results['words_by_rating'] = {}
    for rating_group in tqdm(df['rating_group'].unique(), desc="Processing rating groups"):
        group = df.filter(pl.col('rating_group') == rating_group)
        results['words_by_rating'][rating_group] = get_top_words(group['text_cleaned_lemma'])

    # 2. Word frequency correlation with numerical scores
    print("\n📈 Calculating word-score correlations...")
    def get_word_score_correlation(text_series, scores):
        # Get word frequencies for each review
        all_words = set(' '.join(text_series.to_list()).split())
        word_frequencies = {word: [] for word in all_words}

        print(f"   Processing {len(text_series)} reviews...")
        for text in tqdm(text_series, desc="Calculating word frequencies"):
            words = text.split()
            word_counts = Counter(words)
            for word in all_words:
                word_frequencies[word].append(word_counts.get(word, 0))

        # Calculate correlations
        correlations = {}
        print("   Computing correlations...")
        for word in tqdm(all_words, desc="Computing word correlations"):
            freqs = word_frequencies[word]
            if sum(freqs) > len(freqs) * 0.01:  # Only consider words that appear in >1% of reviews
                corr, p_value = pearsonr(freqs, scores)
                if abs(corr) > 0.1 and p_value < 0.05:  # Only significant correlations
                    correlations[word] = (corr, p_value)

        return dict(sorted(correlations.items(), key=lambda x: abs(x[1][0]), reverse=True)[:20])

    # Calculate correlations for each numeric rating
    for rating_col in tqdm(['rating', 'palate', 'appearance', 'aroma', 'overall'],
                           desc="Analyzing rating categories"):
        print(f"\n   Analyzing correlations for {rating_col}...")
        results[f'word_correlations_{rating_col}'] = get_word_score_correlation(
            df['text_cleaned_lemma'],
            df[rating_col].to_numpy()
        )

    # 3. Temporal analysis
    print("\n📅 Computing temporal trends...")
    yearly_stats = df.group_by('year').agg([
        pl.col('rating').mean().alias('avg_rating'),
        pl.col('text_cleaned_lemma').map_elements(len).mean().alias('avg_review_length')
    ])
    results['yearly_trends'] = yearly_stats.to_dict(as_series=False)

    # 4. Country analysis
    print("\n🌍 Analyzing country-level statistics...")
    country_stats = df.group_by('country').agg([
        pl.col('rating').mean().alias('avg_rating'),
        pl.col('text_cleaned_lemma').map_elements(len).mean().alias('avg_review_length'),
        pl.count().alias('review_count')
    ]).filter(pl.col('review_count') > 100)  # Only countries with significant reviews

    results['country_stats'] = country_stats.to_dict(as_series=False)

    print("\n✅ Analysis complete!")
    return results

def print_analysis_results(results):
    """
    Print the analysis results in a readable format.
    """
    print("\n=== 🍺 BEER REVIEWS ANALYSIS RESULTS ===\n")

    print("1. 📊 TOP WORDS BY RATING:")
    for rating, words in results['words_by_rating'].items():
        print(f"\n{rating}:")
        print(", ".join(f"{word}({count})" for word, count in words[:10]))

    print("\n2. 📈 STRONGEST WORD-SCORE CORRELATIONS:")
    for rating_type, correlations in results.items():
        if rating_type.startswith('word_correlations'):
            print(f"\n{rating_type.replace('word_correlations_', '').upper()}:")
            for word, (corr, p_value) in list(correlations.items())[:10]:
                print(f"{word}: {corr:.3f}")

    print("\n3. 📅 YEARLY TRENDS (Sample):")
    years = results['yearly_trends']['year'][:5]
    ratings = results['yearly_trends']['avg_rating'][:5]
    lengths = results['yearly_trends']['avg_review_length'][:5]
    for year, rating, length in zip(years, ratings, lengths):
        print(f"Year {year}: Avg Rating = {rating:.2f}, Avg Review Length = {length:.1f}")

    print("\n4. 🌍 COUNTRY INSIGHTS (Top 5 by review count):")
    countries = results['country_stats']['country'][:5]
    ratings = results['country_stats']['avg_rating'][:5]
    counts = results['country_stats']['review_count'][:5]
    for country, rating, count in zip(countries, ratings, counts):
        print(f"{country}: {count} reviews, Avg Rating = {rating:.2f}")

In [9]:
# concat the two dataframes
df = df_ratings_text.with_columns(
    df_ratings_no_text
)

In [15]:
# Analyze the beer reviews dataset
results = analyze_beer_reviews(df)


🍺 Starting beer reviews analysis...

📊 Analyzing word frequencies by rating...


Processing rating groups:   0%|          | 0/6 [00:00<?, ?it/s]dataframe filtered
Processing rating groups:  17%|█▋        | 1/6 [00:22<01:54, 22.99s/it]dataframe filtered
Processing rating groups:  33%|███▎      | 2/6 [00:23<00:39,  9.78s/it]dataframe filtered
Processing rating groups:  50%|█████     | 3/6 [00:27<00:21,  7.23s/it]dataframe filtered
Processing rating groups:  67%|██████▋   | 4/6 [00:28<00:09,  4.53s/it]dataframe filtered
Processing rating groups:  83%|████████▎ | 5/6 [01:03<00:15, 15.76s/it]dataframe filtered
Processing rating groups: 100%|██████████| 6/6 [01:04<00:00, 10.79s/it]



📈 Calculating word-score correlations...


Analyzing rating categories:   0%|          | 0/5 [00:00<?, ?it/s]


   Analyzing correlations for rating...
   Processing 7102520 reviews...



Calculating word frequencies:   0%|          | 0/7102520 [00:00<?, ?it/s][A
Calculating word frequencies:   0%|          | 1/7102520 [00:00<1029:29:59,  1.92it/s][A
Calculating word frequencies:   0%|          | 2/7102520 [00:00<964:24:44,  2.05it/s] [A
Calculating word frequencies:   0%|          | 3/7102520 [00:01<935:27:20,  2.11it/s][A
Calculating word frequencies:   0%|          | 4/7102520 [00:01<916:40:25,  2.15it/s][A
Calculating word frequencies:   0%|          | 5/7102520 [00:02<1023:40:00,  1.93it/s][A
Calculating word frequencies:   0%|          | 6/7102520 [00:03<1030:12:52,  1.92it/s][A
Calculating word frequencies:   0%|          | 7/7102520 [00:03<983:24:55,  2.01it/s] [A
Calculating word frequencies:   0%|          | 8/7102520 [00:03<960:51:04,  2.05it/s][A
Calculating word frequencies:   0%|          | 9/7102520 [00:04<982:57:53,  2.01it/s][A
Calculating word frequencies:   0%|          | 10/7102520 [00:04<946:56:35,  2.08it/s][A
Calculating word frequenci

KeyboardInterrupt: 

In [None]:
# Print the analysis results
print_analysis_results(results)

In [7]:
from collections import Counter
from scipy.stats import pearsonr
from tqdm import tqdm
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def analyze_beer_reviews(df):
    """
    Analyze relationships between review text and other variables in beer reviews dataset.

    Parameters:
    df: pandas.DataFrame with columns: text_cleaned_lemma, rating, palate, appearance,
                                     aroma, overall, year, country

    Returns:
    dict: Analysis results including common terms, correlations, and temporal trends
    """
    results = {}

    # 1. Most common words by rating level
    def get_top_words(text_series, n=20):
        words = ' '.join(text_series).split()
        return Counter(words).most_common(n)

    # Group by rating ranges
    df['rating_group'] = df['rating'].apply(lambda x: f"{int(x)}_stars")

    rating_groups = df.groupby('rating_group')
    results['words_by_rating'] = {}

    for rating, group in tqdm(rating_groups, desc="Processing Words by Rating"):
        results['words_by_rating'][rating] = get_top_words(group['text_cleaned_lemma'])

    # # 2. Word frequency correlation with numerical scores
    # def get_word_score_correlation(text_series, scores):
    #     # Get word frequencies for each review
    #     all_words = set(' '.join(text_series).split())
    #     word_frequencies = {word: [] for word in all_words}
    #
    #     for text in tqdm(text_series, desc="Building Word Frequencies"):
    #         words = text.split()
    #         word_counts = Counter(words)
    #         for word in all_words:
    #             word_frequencies[word].append(word_counts.get(word, 0))
    #
    #     # Calculate correlations
    #     correlations = {}
    #     for word, freqs in tqdm(word_frequencies.items(), desc="Calculating Correlations"):
    #         if sum(freqs) > len(freqs) * 0.01:  # Only consider words that appear in >1% of reviews
    #             corr, p_value = pearsonr(freqs, scores)
    #             if abs(corr) > 0.1 and p_value < 0.05:  # Only significant correlations
    #                 correlations[word] = (corr, p_value)
    #
    #     return dict(sorted(correlations.items(), key=lambda x: abs(x[1][0]), reverse=True)[:20])

    def get_word_score_correlation(text_series, scores):
        # Create a matrix of word frequencies
        words = ' '.join(text_series).split()
        unique_words = list(set(words))
        word_freq_matrix = np.zeros((len(text_series), len(unique_words)))

        word_to_idx = {word: idx for idx, word in enumerate(unique_words)}

        for i, text in enumerate(text_series):
            for word in text.split():
                if word in word_to_idx:
                    word_freq_matrix[i, word_to_idx[word]] += 1

        # Compute correlations in a vectorized manner
        correlations = {}
        for idx, word in enumerate(unique_words):
            corr, p_value = pearsonr(word_freq_matrix[:, idx], scores)
            if abs(corr) > 0.1 and p_value < 0.05:
                correlations[word] = (corr, p_value)

        return dict(sorted(correlations.items(), key=lambda x: abs(x[1][0]), reverse=True)[:20])

    for rating_col in tqdm(['rating', 'palate', 'appearance', 'aroma', 'overall'], desc="Calculating Word Correlations"):
        results[f'word_correlations_{rating_col}'] = get_word_score_correlation(
            df['text_cleaned_lemma'],
            df[rating_col]
        )

    # # 3. Temporal analysis
    # yearly_stats = df.groupby('year').agg(
    #     avg_rating=('rating', 'mean'),
    #     avg_review_length=('text_cleaned_lemma', lambda x: x.str.len().mean())
    # ).reset_index()
    # results['yearly_trends'] = yearly_stats.to_dict(orient='list')

    # 4. Country analysis
    country_stats = df.groupby('country').agg(
        avg_rating=('rating', 'mean'),
        avg_review_length=('text_cleaned_lemma', lambda x: x.str.len().mean()),
        review_count=('text_cleaned_lemma', 'count')
    ).reset_index()
    country_stats = country_stats[country_stats['review_count'] > 100]  # Only countries with significant reviews

    results['country_stats'] = country_stats.to_dict(orient='list')

    return results

def print_analysis_results(results):
    """
    Print the analysis results in a readable format.
    """
    print("=== BEER REVIEWS ANALYSIS ===\n")

    print("1. TOP WORDS BY RATING:")
    for rating, words in results['words_by_rating'].items():
        print(f"\n{rating}:")
        print(", ".join(f"{word}({count})" for word, count in words[:10]))

    print("\n2. STRONGEST WORD-SCORE CORRELATIONS:")
    for rating_type, correlations in results.items():
        if rating_type.startswith('word_correlations'):
            print(f"\n{rating_type.replace('word_correlations_', '').upper()}:")
            for word, (corr, p_value) in list(correlations.items())[:10]:
                print(f"{word}: {corr:.3f}")

    # print("\n3. YEARLY TRENDS (Sample):")
    # years = results['yearly_trends']['year'][:5]
    # ratings = results['yearly_trends']['avg_rating'][:5]
    # lengths = results['yearly_trends']['avg_review_length'][:5]
    # for year, rating, length in zip(years, ratings, lengths):
    #     print(f"Year {year}: Avg Rating = {rating:.2f}, Avg Review Length = {length:.1f}")

    print("\n4. COUNTRY INSIGHTS (Top 5 by review count):")
    countries = results['country_stats']['country'][:5]
    ratings = results['country_stats']['avg_rating'][:5]
    counts = results['country_stats']['review_count'][:5]
    for country, rating, count in zip(countries, ratings, counts):
        print(f"{country}: {count} reviews, Avg Rating = {rating:.2f}")


In [4]:
import pandas as pd

df_ratings_no_text = pd.read_parquet(DATA_FOLDER + 'ratings_no_text.pq')
df_ratings_text = pd.read_parquet(DATA_FOLDER + 'ratings_text_cleaned_lemma.pq')

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [5]:
df_ratings_text = df_ratings_text.reset_index(drop=True)
df_ratings_no_text = df_ratings_no_text.reset_index(drop=True)
df = pd.concat([df_ratings_text, df_ratings_no_text], axis=1)

In [9]:
results = analyze_beer_reviews(df)
print_analysis_results(results)

Processing Words by Rating: 100%|██████████| 6/6 [00:59<00:00,  9.98s/it]
Calculating Word Correlations:   0%|          | 0/5 [00:00<?, ?it/s]
Building Word Frequencies:   0%|          | 0/7102520 [00:00<?, ?it/s][A
Building Word Frequencies:   0%|          | 1/7102520 [00:00<866:50:19,  2.28it/s][A
Building Word Frequencies:   0%|          | 2/7102520 [00:00<871:31:42,  2.26it/s][A
Building Word Frequencies:   0%|          | 3/7102520 [00:01<851:42:38,  2.32it/s][A
Building Word Frequencies:   0%|          | 4/7102520 [00:01<840:27:33,  2.35it/s][A
Building Word Frequencies:   0%|          | 5/7102520 [00:02<856:08:13,  2.30it/s][A
Building Word Frequencies:   0%|          | 6/7102520 [00:02<847:33:39,  2.33it/s][A
Building Word Frequencies:   0%|          | 7/7102520 [00:03<838:15:03,  2.35it/s][A
Building Word Frequencies:   0%|          | 8/7102520 [00:03<833:51:30,  2.37it/s][A
Building Word Frequencies:   0%|          | 9/7102520 [00:03<880:20:18,  2.24it/s][A
Building 

KeyboardInterrupt: 