### 1. Concatenate final_data_2.csv to final_data_8.csv

In [1]:
import pandas as pd

# Create a list of file names from 2 to 8
files = [f"final_data_{i}.csv" for i in range(2, 9)]

# Read and concatenate
df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

# Save result
df.to_csv("combined_2_to_8.csv", index=False)

#print("CSV files final_data_2.csv to final_data_8.csv concatenated successfully!")

In [2]:
import re

# Load the combined file
df_combined = pd.read_csv("combined_2_to_8.csv")

# Define regex patterns
link_pattern = r"(https?://\S+|www\.\S+)"       # http://, https://, www.
email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"  # emails
phone_pattern = r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{3,4}\b"  # phone numbers

# Combine into one pattern
combined_pattern = f"({link_pattern}|{email_pattern}|{phone_pattern})"

# Filter out rows that match
filtered_df = df_combined[~df_combined['text'].astype(str).str.contains(combined_pattern, regex=True)]

# Save result
filtered_df.to_csv("filtered_combined.csv", index=False)

print(f"Done! {len(df_combined) - len(filtered_df)} rows removed. {len(filtered_df)} rows kept.")

  filtered_df = df_combined[~df_combined['text'].astype(str).str.contains(combined_pattern, regex=True)]


Done! 2 rows removed. 65578 rows kept.


In [3]:
import pandas as pd
import math
import os

INPUT_FILE = "filtered_combined.csv"
CHUNK_SIZE = 10_000
START_IDX = 2   # first output index
END_IDX = 8     # last output index

# Load data
df = pd.read_csv(INPUT_FILE)

# Handle empty file
if df.empty:
    print("No rows in filtered_combined.csv — nothing to split.")
else:
    num_chunks = math.ceil(len(df) / CHUNK_SIZE)
    max_slots = END_IDX - START_IDX + 1

    if num_chunks > max_slots:
        print(f"Warning: You have {num_chunks} chunks but only {max_slots} filenames (2..8).")
        print("Only the first 7 files will be written with the requested names.")

    for i in range(num_chunks):
        start = i * CHUNK_SIZE
        end = min((i + 1) * CHUNK_SIZE, len(df))
        chunk = df.iloc[start:end]

        out_idx = START_IDX + i
        if out_idx > END_IDX:
            # Stop if you strictly only want 2..8
            # Break here to enforce the naming constraint:
            # break
            #
            # Or, if you'd prefer to keep writing additional files beyond 8, uncomment below:
            # pass  # out_idx can continue to 9, 10, ...
            print(f"Reached file index {END_IDX}. Skipping extra chunk {i+1}/{num_chunks}.")
            break

        out_name = f"final_data_{out_idx}_filtered.csv"
        chunk.to_csv(out_name, index=False)
        print(f"Wrote rows {start}-{end-1} to {out_name}")

    print("Done.")


Wrote rows 0-9999 to final_data_2_filtered.csv
Wrote rows 10000-19999 to final_data_3_filtered.csv
Wrote rows 20000-29999 to final_data_4_filtered.csv
Wrote rows 30000-39999 to final_data_5_filtered.csv
Wrote rows 40000-49999 to final_data_6_filtered.csv
Wrote rows 50000-59999 to final_data_7_filtered.csv
Wrote rows 60000-65577 to final_data_8_filtered.csv
Done.
