In [1]:
# Mount Google Drive at /content/drive so the archive stored under MyDrive
# can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
zip_path = '/content/drive/MyDrive/archive_3.zip'
extract_folder = '/content/github_issues/'

!unzip -q "{zip_path}" -d "{extract_folder}"

In [3]:
# Full path of the extracted CSV (extract_folder already ends with '/').
csv_path = f'{extract_folder}github_issues.csv'

In [None]:
!pip install pyarrow



In [5]:
# @title Load Less Data
import pandas as pd

# Quick look at the data: read only three text columns from the first
# 100,000 rows instead of loading the whole CSV.
file_path = '/content/github_issues/github_issues.csv'  # update to your actual file path
cols_to_load = ['issue_url', 'issue_title', 'body']

df = pd.read_csv(
    file_path,
    nrows=100000,
    usecols=cols_to_load,
)

# Shape, first rows and column dtypes, each starting on its own line.
print(df.shape, df.head(), df.dtypes, sep='\n')

(100000, 3)
                                           issue_url  \
0  "https://github.com/zhangyuanwei/node-images/i...   
1     "https://github.com/Microsoft/pxt/issues/2543"   
2  "https://github.com/MatisiekPL/Czekolada/issue...   
3  "https://github.com/MatisiekPL/Czekolada/issue...   
4  "https://github.com/MatisiekPL/Czekolada/issue...   

                                         issue_title  \
0  can't load the addon. issue to: https://github...   
1  hcl accessibility a11yblocking a11ymas mas4.2....   
2  issue 1265: issue 1264: issue 1261: issue 1260...   
3  issue 1266: issue 1263: issue 1262: issue 1259...   
4  issue 1288: issue 1285: issue 1284: issue 1281...   

                                                body  
0  can't load the addon. issue to: https://github...  
1  user experience: user who depends on screen re...  
2  ┆attachments: <a href= https:& x2f;& x2f;githu...  
3  gitlo = github x trello\n---\nthis board is no...  
4  ┆attachments: <a href= https:& x2f;&

In [6]:
# @title Chunking
import pandas as pd

chunksize = 100_000   # Rows parsed per chunk; adjust if needed
max_chunks = 5        # For demo, stop after this many chunks
rows_per_chunk = 100  # Rows kept from each (filtered) chunk
chunk_list = []

# Open the chunked reader as a context manager so the underlying file handle
# is closed even though the loop leaves early via `break`.
with pd.read_csv(
    csv_path,
    encoding='utf-8',
    engine='python',        # Use Python engine for tolerant parsing
    on_bad_lines='skip',    # Skip bad lines to avoid crashing
    chunksize=chunksize
) as reader:
    for i, chunk in enumerate(reader):
        print(f"Processing chunk {i+1} with shape {chunk.shape}")

        # Keep only open issues when the file has a 'state' column;
        # otherwise use the chunk unchanged.
        if 'state' in chunk.columns:
            filtered_chunk = chunk[chunk['state'] == 'open']
        else:
            filtered_chunk = chunk

        chunk_list.append(filtered_chunk.head(rows_per_chunk))

        if i + 1 >= max_chunks:
            break

# The concatenated frame keeps the row labels assigned while reading, so its
# index is not a contiguous 0..N-1 range.
df_sample = pd.concat(chunk_list)
print(f"\nSample dataframe shape: {df_sample.shape}")
df_sample.head()

Processing chunk 1 with shape (100000, 3)
Processing chunk 2 with shape (100000, 3)
Processing chunk 3 with shape (100000, 3)
Processing chunk 4 with shape (100000, 3)
Processing chunk 5 with shape (100000, 3)

Sample dataframe shape: (500, 3)


Unnamed: 0,issue_url,issue_title,body
0,"""https://github.com/zhangyuanwei/node-images/i...",can't load the addon. issue to: https://github...,can't load the addon. issue to: https://github...
1,"""https://github.com/Microsoft/pxt/issues/2543""",hcl accessibility a11yblocking a11ymas mas4.2....,user experience: user who depends on screen re...
2,"""https://github.com/MatisiekPL/Czekolada/issue...",issue 1265: issue 1264: issue 1261: issue 1260...,┆attachments: <a href= https:& x2f;& x2f;githu...
3,"""https://github.com/MatisiekPL/Czekolada/issue...",issue 1266: issue 1263: issue 1262: issue 1259...,gitlo = github x trello\n---\nthis board is no...
4,"""https://github.com/MatisiekPL/Czekolada/issue...",issue 1288: issue 1285: issue 1284: issue 1281...,┆attachments: <a href= https:& x2f;& x2f;githu...


In [7]:
# @title Optimize Data Types
import pandas as pd

def optimize_dtypes(df, cat_cols=None, int_cols=None, float_cols=None):
    """Shrink a DataFrame's memory footprint by converting column dtypes.

    The frame is modified in place and also returned, so both
    ``optimize_dtypes(df)`` and ``df = optimize_dtypes(df)`` work.
    Columns that are listed but absent from ``df`` are silently skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise (mutated in place).
    cat_cols : iterable of str, optional
        Columns converted to ``category``.
        Defaults to ``('state', 'author_association', 'repository_url')``.
    int_cols : iterable of str, optional
        Columns parsed with ``pd.to_numeric(..., downcast='integer')``.
        Defaults to ``('comments', 'id')``. Values that cannot be parsed
        become NaN, in which case the column ends up as a float dtype.
    float_cols : iterable of str, optional
        Columns parsed with ``pd.to_numeric(..., downcast='float')``.
        Defaults to no columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if cat_cols is None:
        cat_cols = ('state', 'author_association', 'repository_url')
    if int_cols is None:
        int_cols = ('comments', 'id')
    if float_cols is None:
        float_cols = ()

    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')

    for col in int_cols:
        if col in df.columns:
            # errors='coerce' turns unparseable values into NaN instead of raising.
            df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

    for col in float_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce', downcast='float')

    return df

# Example usage
df_sample = optimize_dtypes(df_sample)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
df_sample.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 0 to 400099
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   issue_url    500 non-null    object
 1   issue_title  500 non-null    object
 2   body         500 non-null    object
dtypes: object(3)
memory usage: 639.5 KB
None


In [8]:
# @title Sampling
import pandas as pd
from sklearn.model_selection import train_test_split

# Print original data shape
print("Original dataframe shape:", df_sample.shape)

# --- RANDOM SAMPLING ---
random_sample_frac = 0.2  # 20%
random_sample = df_sample.sample(frac=random_sample_frac, random_state=42)

# The ':.0%' format rounds the fraction, whereas int(frac * 100) would
# truncate values such as 0.29 to 28.
print(f"\nRandom sample shape ({random_sample_frac:.0%}):", random_sample.shape)
print("Random sample preview:")
print(random_sample.head())

# --- STRATIFIED SAMPLING ---

# Check if your dataframe has a suitable categorical column for stratification
stratify_col = 'state'  # Change this to your categorical column name

if stratify_col in df_sample.columns:
    # First check counts per group to ensure enough samples per group
    print("\nValue counts before filtering:")
    print(df_sample[stratify_col].value_counts())

    # Filter out rare groups with fewer than 2 samples (to avoid errors)
    valid_groups = df_sample[stratify_col].value_counts()[lambda x: x >= 2].index
    df_filtered = df_sample[df_sample[stratify_col].isin(valid_groups)]

    print("\nData shape after filtering rare groups:", df_filtered.shape)

    # Stratified split: keep `random_sample_frac` of the rows, stratified by
    # the categorical column. The kept share is driven by the same variable
    # as the label printed below, so the two cannot drift apart.
    strat_sample, _ = train_test_split(
        df_filtered,
        train_size=random_sample_frac,
        stratify=df_filtered[stratify_col],
        random_state=42
    )

    print(f"\nStratified sample shape ({random_sample_frac:.0%}):", strat_sample.shape)
    print(f"Stratified sample value counts ({stratify_col}):")
    print(strat_sample[stratify_col].value_counts())

    print("\nStratified sample preview:")
    print(strat_sample.head())

else:
    print(f"\nColumn '{stratify_col}' not found in dataframe for stratified sampling.")

Original dataframe shape: (500, 3)

Random sample shape (20%): (100, 3)
Random sample preview:
                                                issue_url  \
300061  "https://github.com/raksonibs/faker-elixir/iss...   
73      "https://github.com/MatisiekPL/Czekolada/issue...   
300074  "https://github.com/momentumfrc/Scouting-Websi...   
100055  "https://github.com/Intel-bigdata/OAP/issues/255"   
100004  "https://github.com/stavarengo/php-sigep/issue...   

                                              issue_title  \
300061        can you please add a faker for filesystem ?   
73      issue 2212: issue 2209: issue 2208: issue 2206...   
300074        can't delete search bar contents in firefox   
100055                         oap release 0.1 check list   
100004                         nova etiqueta dos correios   

                                                     body  
300061  can you please add a faker module for filesyst...  
73      gitlo = github x trello\n---\nthis board is

In [9]:
# @title Dataframe
import pandas as pd
import dask.dataframe as dd

csv_path = '/content/github_issues/github_issues.csv'
cleaned_path = '/content/github_issues/github_issues_cleaned.csv'

chunk_size = 100_000

# Clean in chunks, write to cleaned_path.
# The first chunk opens the output in 'w' mode (truncating any file left by a
# previous run) and writes the header; later chunks append without a header.
# Appending unconditionally would duplicate all rows, and put a second header
# line in the middle of the file, every time this cell is re-run.
with pd.read_csv(csv_path, engine='python', on_bad_lines='skip', encoding='utf-8', chunksize=chunk_size) as reader:
    for i, chunk in enumerate(reader):
        is_first_chunk = (i == 0)
        chunk.to_csv(
            cleaned_path,
            mode='w' if is_first_chunk else 'a',
            index=False,
            header=is_first_chunk
        )
        print(f"Cleaned chunk {i+1}")

# Now read cleaned CSV with Dask
# NOTE(review): a fixed blocksize splits the file at line breaks; if quoted
# fields can contain newlines this may mis-parse rows - confirm against the data.
cols_to_use = ['issue_url', 'issue_title', 'body']
ddf = dd.read_csv(
    cleaned_path,
    usecols=cols_to_use,
    dtype='object',
    assume_missing=True,
    encoding='utf-8',
    blocksize='16MB'
)

print(ddf.head())

Cleaned chunk 1
Cleaned chunk 2
Cleaned chunk 3
Cleaned chunk 4
Cleaned chunk 5
Cleaned chunk 6
Cleaned chunk 7
Cleaned chunk 8
Cleaned chunk 9
Cleaned chunk 10
Cleaned chunk 11
Cleaned chunk 12
Cleaned chunk 13
Cleaned chunk 14
Cleaned chunk 15
Cleaned chunk 16
Cleaned chunk 17
Cleaned chunk 18
Cleaned chunk 19
Cleaned chunk 20
Cleaned chunk 21
Cleaned chunk 22
Cleaned chunk 23
Cleaned chunk 24
Cleaned chunk 25
Cleaned chunk 26
Cleaned chunk 27
Cleaned chunk 28
Cleaned chunk 29
Cleaned chunk 30
Cleaned chunk 31
Cleaned chunk 32
Cleaned chunk 33
Cleaned chunk 34
Cleaned chunk 35
Cleaned chunk 36
Cleaned chunk 37
Cleaned chunk 38
Cleaned chunk 39
Cleaned chunk 40
Cleaned chunk 41
Cleaned chunk 42
Cleaned chunk 43
Cleaned chunk 44
Cleaned chunk 45
Cleaned chunk 46
Cleaned chunk 47
Cleaned chunk 48
Cleaned chunk 49
Cleaned chunk 50
Cleaned chunk 51
Cleaned chunk 52
Cleaned chunk 53
Cleaned chunk 54
                                           issue_url  \
0  "https://github.com/zhangyuanwei

In [10]:
# @title Pandas
import pandas as pd
import time
import psutil
import threading

# File path
csv_path = '/content/github_issues/github_issues.csv'

# Monitoring setup
monitoring = True
performance_logs = []

def monitor_performance(log_list, interval=0.5):
    """Sample this process's memory and CPU until `monitoring` turns False.

    Appends ``(timestamp, rss_mb, cpu_percent)`` tuples to ``log_list`` about
    every ``interval`` seconds. Intended to run in a background thread.
    """
    proc = psutil.Process()
    # The first cpu_percent(interval=None) call has no reference point and
    # returns a meaningless 0.0; make that call here and discard the result so
    # every logged value measures the interval that precedes it.
    proc.cpu_percent(interval=None)
    while monitoring:
        time.sleep(interval)
        mem = proc.memory_info().rss / (1024*1024)  # MB
        cpu = proc.cpu_percent(interval=None)      # %
        log_list.append((time.time(), mem, cpu))

# daemon=True so a stray monitor can never keep the interpreter alive.
monitor_thread = threading.Thread(
    target=monitor_performance, args=(performance_logs,), daemon=True
)
monitor_thread.start()

start_time = time.time()
print("===== Pandas Processing Started =====")

try:
    # Load entire dataset (traditional)
    df = pd.read_csv(csv_path, encoding='utf-8')

    print(f"Loaded {len(df)} records.")

    # Basic cleaning: fill missing, strip strings, drop duplicates
    df.fillna({'issue_title': 'No Title', 'body': ''}, inplace=True)
    for col in ['issue_title', 'body']:
        df[col] = df[col].astype(str).str.strip()

    initial_len = len(df)
    df.drop_duplicates(inplace=True)
    print(f"Removed {initial_len - len(df)} duplicate rows.")

    # Stop the clock before joining the monitor thread, so the time spent
    # waiting for its last sleep is not counted as processing time.
    end_time = time.time()
finally:
    # Always stop the monitor, even if loading or cleaning raised.
    monitoring = False
    monitor_thread.join()

# Performance metrics
total_time = end_time - start_time
num_records = len(df)
throughput = num_records / total_time
mem_samples = [m for _, m, _ in performance_logs]
cpu_samples = [c for _, _, c in performance_logs]
n_samples = len(performance_logs)
# Fall back to NaN instead of raising if no sample was recorded.
peak_mem = max(mem_samples, default=float('nan'))
avg_mem = sum(mem_samples) / n_samples if n_samples else float('nan')
peak_cpu = max(cpu_samples, default=float('nan'))
avg_cpu = sum(cpu_samples) / n_samples if n_samples else float('nan')

print("\n===== Pandas Performance Summary =====")
print(f"Execution time: {total_time:.2f} seconds")
print(f"Records processed: {num_records}")
print(f"Throughput: {throughput:.2f} records/second")
print(f"Average memory usage: {avg_mem:.2f} MB")
print(f"Peak memory usage: {peak_mem:.2f} MB")
print(f"Average CPU usage: {avg_cpu:.2f}%")
print(f"Peak CPU usage: {peak_cpu:.2f}%")
print("Ease of Processing: Simple to use, widely known, but may be slow and memory intensive for very large datasets.")
print("======================================")

===== Pandas Processing Started =====
Loaded 5332153 records.
Removed 5097 duplicate rows.

===== Pandas Performance Summary =====
Execution time: 87.15 seconds
Records processed: 5327056
Throughput: 61126.66 records/second
Average memory usage: 4012.66 MB
Peak memory usage: 5494.21 MB
Average CPU usage: 98.10%
Peak CPU usage: 224.50%
Ease of Processing: Simple to use, widely known, but may be slow and memory intensive for very large datasets.


In [1]:
# @title Polars
import polars as pl
import time
import psutil
import threading

csv_path = '/content/github_issues/github_issues.csv'

monitoring = True
performance_logs = []

def monitor_performance(log_list, interval=0.5):
    """Sample this process's memory and CPU until `monitoring` turns False.

    Appends ``(timestamp, rss_mb, cpu_percent)`` tuples to ``log_list`` about
    every ``interval`` seconds. Intended to run in a background thread.
    """
    proc = psutil.Process()
    # The first cpu_percent(interval=None) call has no reference point and
    # returns a meaningless 0.0; make that call here and discard the result so
    # every logged value measures the interval that precedes it.
    proc.cpu_percent(interval=None)
    while monitoring:
        time.sleep(interval)
        mem = proc.memory_info().rss / (1024*1024)  # MB
        cpu = proc.cpu_percent(interval=None)      # %
        log_list.append((time.time(), mem, cpu))

# daemon=True so a stray monitor can never keep the interpreter alive.
monitor_thread = threading.Thread(
    target=monitor_performance, args=(performance_logs,), daemon=True
)
monitor_thread.start()

start_time = time.time()
print("===== Polars Processing Started =====")

try:
    df = pl.read_csv(csv_path)

    print(f"Loaded {df.height} records.")

    # Fill missing
    fill_dict = {'issue_title': 'No Title', 'body': ''}
    for col, val in fill_dict.items():
        if col in df.columns:
            df = df.with_columns(pl.col(col).fill_null(val))

    # Strip whitespace
    for col in ['issue_title', 'body']:
        if col in df.columns:
            df = df.with_columns(pl.col(col).str.strip_chars())

    # Drop duplicates
    initial_len = df.height
    df = df.unique()
    print(f"Removed {initial_len - df.height} duplicate rows.")

    # Stop the clock before joining the monitor thread, so the time spent
    # waiting for its last sleep is not counted as processing time.
    end_time = time.time()
finally:
    # Always stop the monitor, even if loading or cleaning raised.
    monitoring = False
    monitor_thread.join()

total_time = end_time - start_time
num_records = df.height
throughput = num_records / total_time
mem_samples = [m for _, m, _ in performance_logs]
cpu_samples = [c for _, _, c in performance_logs]
n_samples = len(performance_logs)
# Fall back to NaN instead of raising if no sample was recorded.
peak_mem = max(mem_samples, default=float('nan'))
avg_mem = sum(mem_samples) / n_samples if n_samples else float('nan')
peak_cpu = max(cpu_samples, default=float('nan'))
avg_cpu = sum(cpu_samples) / n_samples if n_samples else float('nan')

print("\n===== Polars Performance Summary =====")
print(f"Execution time: {total_time:.2f} seconds")
print(f"Records processed: {num_records}")
print(f"Throughput: {throughput:.2f} records/second")
print(f"Average memory usage: {avg_mem:.2f} MB")
print(f"Peak memory usage: {peak_mem:.2f} MB")
print(f"Average CPU usage: {avg_cpu:.2f}%")
print(f"Peak CPU usage: {peak_cpu:.2f}%")
print("Ease of Processing: Fast and memory efficient, but requires learning Polars API.")
print("======================================")

===== Polars Processing Started =====
Loaded 5332153 records.
Removed 5097 duplicate rows.

===== Polars Performance Summary =====
Execution time: 36.17 seconds
Records processed: 5327056
Throughput: 147290.82 records/second
Average memory usage: 3790.27 MB
Peak memory usage: 10736.56 MB
Average CPU usage: 65.79%
Peak CPU usage: 193.80%
Ease of Processing: Fast and memory efficient, but requires learning Polars API.


In [2]:
# @title PyArrow
import os
import pandas as pd
import pyarrow as pa
import pyarrow.csv as pv
import pyarrow.compute as pc
import time, psutil, threading

csv_path = '/content/github_issues/github_issues.csv'
cleaned_csv_path = '/content/github_issues_cleaned.csv'

# Step 0: Clear existing cleaned file if it exists
if os.path.exists(cleaned_csv_path):
    os.remove(cleaned_csv_path)

# Step 1: Clean CSV using pandas with sampling (RAM-friendly)
# NOTE(review): only up to max_chunks * rows_per_chunk rows reach the timed
# PyArrow section below, so its figures are not directly comparable with a
# benchmark that processes the full file.
print("===== Strict Pandas Cleaning with Sampling =====")
chunksize = 100000
expected_columns = 3
max_chunks = 10  # Process only first 10 chunks
rows_per_chunk = 5000  # Sample size per chunk
sample_seed = 42  # Fixed seed so the sampled rows are the same on every run

with pd.read_csv(
    csv_path,
    chunksize=chunksize,
    encoding='utf-8',
    engine='python',
    on_bad_lines='skip',
    quoting=1,  # QUOTE_ALL
    skip_blank_lines=True
) as reader:
    for i, chunk in enumerate(reader):
        # Drop every row with a missing value, then keep only the first
        # `expected_columns` columns.
        chunk.dropna(axis=0, how='any', inplace=True)
        if len(chunk.columns) > expected_columns:
            chunk = chunk.iloc[:, :expected_columns]
        # Seed per chunk: reproducible, but not the same row positions in
        # every chunk.
        chunk_sample = chunk.sample(
            min(rows_per_chunk, len(chunk)),
            random_state=sample_seed + i
        )
        chunk_sample.to_csv(cleaned_csv_path, mode='a', index=False, header=(i == 0))
        print(f"Strictly cleaned and sampled chunk {i + 1}")
        if i + 1 >= max_chunks:
            break

# Step 2: Monitor system performance
monitoring = True
performance_logs = []

def monitor_performance(log_list, interval=0.5):
    """Sample this process's memory and CPU until `monitoring` turns False.

    Appends ``(timestamp, rss_mb, cpu_percent)`` tuples to ``log_list`` about
    every ``interval`` seconds. Intended to run in a background thread.
    """
    proc = psutil.Process()
    # The first cpu_percent(interval=None) call has no reference point and
    # returns a meaningless 0.0; make that call here and discard the result so
    # every logged value measures the interval that precedes it.
    proc.cpu_percent(interval=None)
    while monitoring:
        time.sleep(interval)
        mem = proc.memory_info().rss / (1024 * 1024)
        cpu = proc.cpu_percent(interval=None)
        log_list.append((time.time(), mem, cpu))

# daemon=True so a stray monitor can never keep the interpreter alive.
monitor_thread = threading.Thread(
    target=monitor_performance, args=(performance_logs,), daemon=True
)
monitor_thread.start()

start_time = time.time()
print("===== PyArrow Processing Started =====")

try:
    # Step 3: Load with PyArrow
    read_options = pv.ReadOptions(use_threads=True, block_size=5_000_000)
    parse_options = pv.ParseOptions(
        delimiter=',',
        quote_char='"',
        newlines_in_values=True,
        invalid_row_handler=lambda row: 'skip'
    )
    convert_options = pv.ConvertOptions(strings_can_be_null=True)

    table = pv.read_csv(
        cleaned_csv_path,
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options
    )

    print(f"Loaded {table.num_rows} records.")

    # Step 4: Clean using PyArrow
    # Fill nulls, then trim all leading/trailing whitespace (not only ' '),
    # matching the strip()/strip_chars() cleaning used with pandas and Polars.
    for col in ['issue_title', 'body']:
        if col in table.column_names:
            fill_value = 'No Title' if col == 'issue_title' else ''
            table = table.set_column(
                table.schema.get_field_index(col),
                col,
                pc.utf8_trim_whitespace(pc.fill_null(table[col], fill_value))
            )

    # Step 5: Convert to Pandas and drop duplicates
    df = table.to_pandas()
    initial_len = len(df)
    df = df.drop_duplicates()
    print(f"Removed {initial_len - len(df)} duplicate rows.")

    # Step 6: (Optional) Convert back to PyArrow table
    table = pa.Table.from_pandas(df)

    # Stop the clock before joining the monitor thread, so the time spent
    # waiting for its last sleep is not counted as processing time.
    end_time = time.time()
finally:
    # Always stop the monitor, even if loading or cleaning raised.
    monitoring = False
    monitor_thread.join()

# Step 7: Performance summary
total_time = end_time - start_time
num_records = table.num_rows
throughput = num_records / total_time
mem_samples = [m for _, m, _ in performance_logs]
cpu_samples = [c for _, _, c in performance_logs]
n_samples = len(performance_logs)
# Fall back to NaN instead of raising if no sample was recorded.
peak_mem = max(mem_samples, default=float('nan'))
avg_mem = sum(mem_samples) / n_samples if n_samples else float('nan')
peak_cpu = max(cpu_samples, default=float('nan'))
avg_cpu = sum(cpu_samples) / n_samples if n_samples else float('nan')

print("\n===== PyArrow Performance Summary =====")
print(f"Execution time: {total_time:.2f} seconds")
print(f"Records processed: {num_records}")
print(f"Throughput: {throughput:.2f} records/second")
print(f"Average memory usage: {avg_mem:.2f} MB")
print(f"Peak memory usage: {peak_mem:.2f} MB")
print(f"Average CPU usage: {avg_cpu:.2f}%")
print(f"Peak CPU usage: {peak_cpu:.2f}%")
print("Ease of Processing: Fast, memory-efficient, with seamless Pandas integration.")
print("======================================")


===== Strict Pandas Cleaning with Sampling =====
Strictly cleaned and sampled chunk 1
Strictly cleaned and sampled chunk 2
Strictly cleaned and sampled chunk 3
Strictly cleaned and sampled chunk 4
Strictly cleaned and sampled chunk 5
Strictly cleaned and sampled chunk 6
Strictly cleaned and sampled chunk 7
Strictly cleaned and sampled chunk 8
Strictly cleaned and sampled chunk 9
Strictly cleaned and sampled chunk 10
===== PyArrow Processing Started =====
Loaded 50000 records.
Removed 2 duplicate rows.

===== PyArrow Performance Summary =====
Execution time: 1.68 seconds
Records processed: 49998
Throughput: 29707.18 records/second
Average memory usage: 4858.99 MB
Peak memory usage: 5846.79 MB
Average CPU usage: 63.10%
Peak CPU usage: 96.10%
Ease of Processing: Fast, memory-efficient, with seamless Pandas integration.
