<a href="https://colab.research.google.com/github/gauravchugh2006/GreenComputing_TP1_ModelsPerformance/blob/main/TP2GreenComputing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# =========================
# TP2 — Pandas Baseline
# =========================

In [None]:
!curl -L -o /content/sample.zip https://www.kaggle.com/api/v1/datasets/download/mohamedbakhet/amazon-books-reviews

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1087M  100 1087M    0     0  37.7M      0  0:00:28  0:00:28 --:--:-- 38.0M


In [None]:
import zipfile

# Location of the downloaded archive and the directory it is unpacked into.
zip_path = '/content/sample.zip'
extract_path = '/content/sample_unzipped'

# Unpack every member of the archive; the context manager closes the file
# handle even if extraction fails part-way.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)


In [None]:
!pip install codecarbon

# 1. Imports
import pandas as pd
import numpy as np
import re
from codecarbon import EmissionsTracker
import time

# Start measuring emissions.
# Results are written to emissions_pandas.csv. The log level is raised to
# "warning" so the periodic INFO energy reports do not flood the output of
# every later cell.
tracker = EmissionsTracker(output_file="emissions_pandas.csv", log_level="warning")
tracker.start()

# Wall-clock reference point used to report the total execution time at the end.
start_time = time.time()







[codecarbon INFO @ 14:37:41] [setup] RAM Tracking...
[codecarbon INFO @ 14:37:41] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 14:37:43] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.20GHz
[codecarbon INFO @ 14:37:43] [setup] GPU Tracking...
[codecarbon INFO @ 14:37:43] No GPU found.
[codecarbon INFO @ 14:37:43] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 14:37:43] >>> Tracker's metadata:
[codecarbon INFO @ 14:37:43]   Platform system: Linux-6.6.105+-x86_64-with-glibc2.35
[codecarbon INFO @ 14:37:43]   Python version: 3.12.12
[codecarbon INFO @ 14:37:43]   CodeCarbon version: 3.0.7
[codecarbon INFO @ 14:37:43]   Available RAM : 12.671 GB
[codecarbon INFO @ 14:37:43

# =========================
# 2. Load data
# =========================

In [None]:

# Paths of the two CSV files extracted from the downloaded archive.
books_csv = "/content/sample_unzipped/books_data.csv"
reviews_csv = "/content/sample_unzipped/Books_rating.csv"

# Load book metadata and the individual reviews into memory.
books = pd.read_csv(books_csv)
reviews = pd.read_csv(reviews_csv)

# Report (rows, columns) of each frame as a quick sanity check.
print("Books shape:", books.shape)
print("Reviews shape:", reviews.shape)

[codecarbon INFO @ 14:37:58] Energy consumed for RAM : 0.000042 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 14:37:58] Delta energy consumed for CPU with constant : 0.000178 kWh, power : 42.5 W
[codecarbon INFO @ 14:37:58] Energy consumed for All CPU : 0.000178 kWh
[codecarbon INFO @ 14:37:58] 0.000219 kWh of electricity used since the beginning.
[codecarbon INFO @ 14:38:13] Energy consumed for RAM : 0.000085 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 14:38:13] Delta energy consumed for CPU with constant : 0.000182 kWh, power : 42.5 W
[codecarbon INFO @ 14:38:13] Energy consumed for All CPU : 0.000360 kWh
[codecarbon INFO @ 14:38:13] 0.000444 kWh of electricity used since the beginning.
[codecarbon INFO @ 14:38:28] Energy consumed for RAM : 0.000126 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 14:38:28] Delta energy consumed for CPU with constant : 0.000178 kWh, power : 42.5 W
[codecarbon INFO @ 14:38:28] Energy consumed for All CPU : 0.000538 kWh
[codecarbon INFO @ 14:38:28] 0.000664 kWh 

Books shape: (212404, 10)
Reviews shape: (3000000, 10)



# =========================
# 3. Data Cleaning
# =========================

In [None]:

# Replace missing values with per-column defaults.
# "categories" and "authors" get the string "[]" so they parse to an empty
# list when the stringified lists are normalized afterwards.
book_defaults = {"description": "", "publisher": "Unknown", "categories": "[]", "authors": "[]"}
review_defaults = {"Price": 0, "review/text": "", "review/summary": ""}

books.fillna(book_defaults, inplace=True)
reviews.fillna(review_defaults, inplace=True)

# Normalize authors and categories (convert stringified lists to real lists)
def clean_list_column(x):
    """Convert one cell holding a stringified list into a list of strings.

    The value is first parsed as a Python literal, so entries that contain
    apostrophes or commas (e.g. ``["Tim O'Reilly"]`` or ``['Smith, John']``)
    are kept intact. Values that are not a valid literal fall back to
    stripping brackets/single quotes and splitting on commas.

    Parameters
    ----------
    x : object
        A cell value: a stringified list, a plain string, a missing value, or
        an already-parsed list/tuple (which makes re-running the cell safe).

    Returns
    -------
    list of str
        Whitespace-stripped, non-empty entries; ``[]`` for missing values.
    """
    import ast  # local import keeps this definition self-contained

    if isinstance(x, (list, tuple)):
        # Already parsed (e.g. the cleaning cell was executed a second time).
        items = x
    elif pd.isna(x):
        return []
    else:
        text = str(x).strip()
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            items = parsed
        else:
            # Not a list literal: drop brackets and single quotes, split on commas.
            items = re.sub(r"[\[\]']", "", text).split(",")

    cleaned = (str(item).strip() for item in items)
    return [item for item in cleaned if item]

# Turn the stringified list columns into real Python lists, one column at a time.
for list_column in ("authors", "categories"):
    books[list_column] = books[list_column].apply(clean_list_column)



# =========================
# 4. Join datasets on title
# =========================

In [None]:

# Attach book metadata to every review; the inner join keeps only reviews
# whose "Title" also appears in the books table.
merged = reviews.merge(books, how="inner", on="Title")
print("Merged shape:", merged.shape)

Merged shape: (3000000, 19)


In [None]:


# =========================
# 5. Compute metrics
# =========================

In [None]:


# Average rating per author.
# Only the two columns the aggregation needs are exploded: exploding the whole
# merged frame would copy every other column (review text included) once per
# co-author just to throw it away in the groupby.
author_ratings = (
    merged[["authors", "review/score"]]
    .explode("authors")
    .groupby("authors")["review/score"]
    .mean()
    .reset_index()
    .rename(columns={"review/score": "avg_rating"})
)
author_ratings.to_csv("avg_rating_per_author.csv", index=False)

# Number of reviews per publisher: count the non-null "Id" values in each group,
# then expose the publisher as a regular column next to the count.
publisher_counts = merged.groupby("publisher")["Id"].count()
reviews_per_publisher = publisher_counts.reset_index().rename(columns={"Id": "num_reviews"})
reviews_per_publisher.to_csv("reviews_per_publisher.csv", index=False)

# Top 10 most-reviewed categories.
# As with the author aggregation, only the columns that are actually used are
# exploded, so the wide merged frame is not duplicated once per category.
category_reviews = (
    merged[["categories", "Id"]]
    .explode("categories")
    .groupby("categories")["Id"]
    .count()
    .reset_index()
    .rename(columns={"Id": "num_reviews"})
    .sort_values(by="num_reviews", ascending=False)
    .head(10)
)
category_reviews.to_csv("top10_categories.csv", index=False)


NameError: name 'merged' is not defined


# =========================
# 6. Text Processing
# =========================

In [None]:
# Compute average review length (number of whitespace-separated words)
merged["review_length"] = merged["review/text"].apply(lambda x: len(str(x).split()))
avg_review_length = merged["review_length"].mean()

# Count most frequent keywords (simple example).
# Each review is tokenized on its own and fed into the Counter, instead of
# joining every review into one string first: the joined string, its
# lowercased copy and the full token list would all have to be held in memory
# at the same time. str() keeps a stray non-string value from raising.
from collections import Counter
word_counts = Counter()
for review_text in merged["review/text"]:
    word_counts.update(str(review_text).lower().split())
most_common_words = pd.DataFrame(word_counts.most_common(10), columns=["word", "count"])
most_common_words.to_csv("top10_keywords.csv", index=False)

[codecarbon INFO @ 14:33:50] Energy consumed for RAM : 0.000252 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 14:33:50] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 14:33:50] Energy consumed for All CPU : 0.001072 kWh
[codecarbon INFO @ 14:33:50] 0.001325 kWh of electricity used since the beginning.
[codecarbon INFO @ 14:34:05] Energy consumed for RAM : 0.000294 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 14:34:05] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 14:34:05] Energy consumed for All CPU : 0.001249 kWh
[codecarbon INFO @ 14:34:05] 0.001543 kWh of electricity used since the beginning.




# =========================
# 7. Save results
# =========================

In [None]:


# The result CSVs are written in the cells that compute them, so this cell only
# finalizes the measurements.

# Stop emissions tracking. stop() can return None when no measurement is
# available, so the value is checked before it is formatted as a float.
emissions = tracker.stop()

duration = time.time() - start_time

print(f"Execution time: {duration:.2f} s")
if emissions is None:
    print("CO2 emitted: unavailable (tracker returned no measurement)")
else:
    print(f"CO2 emitted: {emissions:.6f} kg")

NameError: name 'author_ratings' is not defined