In [4]:
import pandas as pd

df = pd.read_csv("/home/veronica/Desktop/dod-unprocessed-data/Facebook_Data_batch_eg_20251006.csv")


In [16]:
import pandas as pd
import re
from datetime import datetime

# --- Config ---
INPUT_FILE = "/home/veronica/Desktop/dod-unprocessed-data/Facebook_Data_batch_eg_20251006.csv"
OUTPUT_VALID = "/home/veronica/Desktop/dod-unprocessed-data/process_file.csv"
OUTPUT_INVALID = "/home/veronica/Desktop/dod-unprocessed-data/invalid_rows.csv"

# --- Regex patterns ---
EMAIL_REGEX = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
PHONE_REGEX = re.compile(r"^\+?[\d\s()-]{6,}$")

def is_valid_email(value):
    return isinstance(value, str) and EMAIL_REGEX.match(value.strip())

def is_valid_phone(value):
    if not isinstance(value, str):
        value = str(value)
    value = value.strip()
    return bool(PHONE_REGEX.match(value))



In [17]:
# 🧩 Cell 2: Load and Inspect Input Data
df = pd.read_csv(INPUT_FILE)
print("Total rows:", len(df))
# df.head(10)


Total rows: 92622


In [18]:
# 🧩 Cell 3: Define Validation Function
def validate_row(row):
    errors = []

    # 1. input_url
    if not str(row.get("input_url", "")).strip():
        errors.append("input_url blank")

    # 2. time_stamp
    if not str(row.get("time_stamp", "")).strip():
        errors.append("time_stamp blank")

    # 3. fb_url
    fb_url = str(row.get("fb_url", "")).strip()
    if not fb_url:
        errors.append("fb_url blank")
    elif "category" in fb_url.lower():
        errors.append("fb_url contains 'category'")
    elif fb_url.startswith("https://l.facebook.com/l.php"):
        errors.append("fb_url invalid redirect")

    # 4. fb_company_name
    if not str(row.get("fb_company_name", "")).strip():
        errors.append("fb_company_name blank")

    # 5. fb_email_address
    email = str(row.get("fb_email_address", "")).strip()
    if email and not is_valid_email(email):
        errors.append("invalid email_address")
    elif not email:
        errors.append("fb_email_address blank")

    # 6. fb_url_type
    fb_type = str(row.get("fb_url_type", "")).strip().lower()
    if fb_type not in ["official page", "unofficial page", "official", "unofficial"]:
        errors.append("fb_url_type invalid")

    # 7. fb_number_of_followers
    followers = str(row.get("fb_number_of_followers", "")).strip()
    if followers == "":
        errors.append("fb_number_of_followers blank")

    # 8. fb_phone_number
    phone = str(row.get("fb_phone_number", "")).strip()
    if not phone or not is_valid_phone(phone):
        errors.append("invalid fb_phone_number")

    return errors


In [19]:
# 🧩 Cell 4: Apply Validation Rules
valid_rows = []
invalid_rows = []

for _, row in df.iterrows():
    record = row.to_dict()
    issues = validate_row(record)

    if not issues:
        valid_rows.append(record)
    else:
        record["validation_errors"] = "; ".join(issues)
        invalid_rows.append(record)

valid_df = pd.DataFrame(valid_rows)
invalid_df = pd.DataFrame(invalid_rows)

print(f"✅ Total Valid: {len(valid_df)} | ❌ Total Invalid: {len(invalid_df)}")


✅ Total Valid: 43792 | ❌ Total Invalid: 48830


In [11]:
# 🧩 Cell 5: Preview Top Rows
print("✅ VALID ROWS (Top 10):")
display(valid_df.head(10))

print("\n❌ INVALID ROWS (Top 10):")
display(invalid_df.head(10))


✅ VALID ROWS (Top 10):


Unnamed: 0,input_url,time_stamp,fb_last_post_date,fb_url,fb_url_type,fb_number_of_followers,fb_company_name,fb_company_intro,fb_category,fb_address,fb_phone_number,fb_email_address,fb_website,fb_website2,fb_website3
0,https://www.facebook.com/110820419571489,2025-10-03T22:30:40.394014,2025-09-30,https://www.facebook.com/luminousglambykrupa,Unofficial Page,2500.0,Luminous glam by krupa,specializing in all aspects of Hair and Makeup...,Makeup Artist,"21 ward street, schofields, Schofields, NSW, A...",+61 499 985 301,Enquiries@luminousglam.info,https://www.luminousglam.com,,
1,https://www.facebook.com/323649878022364,2025-10-03T22:30:41.749612,2019-08-05,https://www.facebook.com/angelicolors,Unofficial Page,88.0,Angelicolors,,Hair Salon,"591 N McKinley Street #104 , Corona, CA, Unite...",(909) 609-5569,Angelicakristinv@gmail.com,https://www.angelicolorshairstudio.com,,
2,https://www.facebook.com/293577300684222,2025-10-03T22:30:42.835170,2024-05-25,https://www.facebook.com/brewer.decorating,Unofficial Page,919.0,Brewer Decorating,Since 2003 Brewer Decorating has been family o...,Carpet and flooring shop,"105 Jay Bird Ln, Columbia, SC, United States, ...",+1 803-419-5292,info@brewerdecorating.com,https://www.brewerdecorating.com,,
3,https://www.facebook.com/112004355536028,2025-10-03T22:30:49.707224,2025-10-02,https://www.facebook.com/OliveBranchChristianS...,Unofficial Page,696.0,Olive Branch Christian School,"OBCS's mission is to love, live, and share Chr...",Elementary School,"7702 El Cerrito Rd, Corona, CA, United States,...",+1 951-279-9977,info@obcschool.org,https://www.obcschool.org,,
4,https://www.facebook.com/2216808388547258,2025-10-03T22:30:51.863068,2025-02-22,https://www.facebook.com/hpcreationshop,Unofficial Page,186.0,Hp Creations,"You cannot buy Love, but you can buy Handmade ...",Home decor,,(815) 382-6787,handpcreations@gmail.com,https://www.hpcreationsgifts.com,,
5,https://www.facebook.com/128554607194372,2025-10-03T22:30:56.459300,2025-10-02,https://www.facebook.com/SanctuaryKirklees,Unofficial Page,1900.0,Sanctuary Kirklees,The City of Sanctuary movement aims to build a...,Community Organization,"C/O Quaker Meeting House, Huddersfield, United...",+44 7952 810814,coshuddersfield@googlemail.com,https://www.kirklees.cityofsanctuary.org,,
6,https://www.facebook.com/538530473732573,2025-10-03T22:31:09.051679,2024-07-09,https://www.facebook.com/people/Russ-Hamlin-Al...,Unofficial Page,130.0,Russ Hamlin: Allstate Insurance,,Insurance Agent,"2119 Westmead Dr SW, Ste 7",(256) 350-3400,rhamlin@allstate.com,https://www.agents.allstate.com/russ-hamlin-de...,,
7,https://www.facebook.com/100775595303196,2025-10-03T22:31:14.019585,2025-09-30,https://www.facebook.com/apontes.resource,Unofficial Page,1300.0,Aponte's Resource,Planes de ahorro,Financial Consultant,"11999 Katy Fwy, Houston, TX 77079 Piso 2 Suite...",+1 832-938-0454,apontesresourcellc@gmail.com,https://www.apontesresource.com,,
8,https://www.facebook.com/1695483050732935,2025-10-03T22:31:20.483477,2025-05-19,https://www.facebook.com/southstauto,Unofficial Page,142.0,South St Auto Repairs,"South St Auto Repairs, is your local family ru...",Automotive Repair Shop,"2 South St , Forster, NSW, Australia, New Sout...",+61 2 6554 6320,southstautorepairs@gmail.com,,,
9,https://www.facebook.com/100299764716129,2025-10-03T22:31:29.196825,2025-08-13,https://www.facebook.com/people/Shape-of-Beaut...,Unofficial Page,158.0,Shape of Beauty,We are located in Vermont south shopping centr...,Beauty Salon,"Shop 35,495-511 Burwood Highway, Vermont South...",+61 469 667 750,Shapeofbeautybar@gmail.com,https://www.shape-of-beauty-lash-brow-bar.busi...,,



❌ INVALID ROWS (Top 10):


Unnamed: 0,input_url,time_stamp,fb_last_post_date,fb_url,fb_url_type,fb_number_of_followers,fb_company_name,fb_company_intro,fb_category,fb_address,fb_phone_number,fb_email_address,fb_website,fb_website2,fb_website3,validation_errors
0,https://www.facebook.com/1843080375972903,2025-10-03T22:30:40.147452,2025-10-03,https://www.facebook.com/FirstLightHomeCareHou...,Unofficial Page,501.0,FirstLight Home Care of Houston-Metro (Houston),As a leader in home care in the Houston-metro ...,Home Health Care Service,"5433 Westheimer Road, Suite 403, Houston, TX",(713) 714-5625,,https://www.houstonmetro.firstlighthomecare.com,,,invalid email_address
1,https://www.facebook.com/105120241896155,2025-10-03T22:30:42.906170,2025-10-02,https://www.facebook.com/lincolnparknutrition,Unofficial Page,2200.0,Lincoln Park Nutrition,shakes⁣🥤• t e a s ⁣⁣⁣⁣⁣☕️ • good v i b e s ✨ O...,Local business,"2617 Dix hwy, Lincoln Park, MI, United States,...",(313) 633-9056,,,,,invalid email_address
2,https://www.facebook.com/869217443222700,2025-10-03T22:30:47.100472,2025-10-02,https://www.facebook.com/ketlin.isufi,Unofficial Page,34.0,G Skin & Beauty Institute,,Spa,"1100 Sherman AveNaperville, IL 60563",,,http://www.gbeautyinstitutes.com/,,,invalid email_address; invalid fb_phone_number
3,https://www.facebook.com/301981730492017,2025-10-03T22:30:47.922273,2019-07-30,https://www.facebook.com/marlene.jante,Unofficial Page,0.0,"Coach Parking, Brisbane International Terminal",,Bus Station,"Brisbane, QLD, Australia",,,,,,invalid email_address; invalid fb_phone_number
4,https://www.facebook.com/102038254793077,2025-10-03T22:30:54.304707,2024-03-22,https://www.facebook.com/people/Preserve-Harve...,Unofficial Page,183.0,Preserve Harvest Gardens,We are a community of folks at the preserve in...,Community Garden,"15926 Grape Avenue, Chino, CA, United States, ...",,Harvest@farmscapegardens.com,,,,invalid fb_phone_number
5,https://www.facebook.com/624836304288343,2025-10-03T22:30:55.435220,2025-09-26,https://www.facebook.com/SmartStart404CrowellL...,Unofficial Page,234.0,Smart Start Ignition Interlock (404 Crowell Ln...,Smart Start is the Ignition Interlock leader o...,Automotive Parts Store,"404 Crowell Ln, Unit 1",(540) 317-4718,,https://www.smartstartinc.com/locations/igniti...,,,invalid email_address
6,https://www.facebook.com/112337888826756,2025-10-03T22:30:56.626504,2025-10-03,https://www.facebook.com/lakemacquariebmxclub,Unofficial Page,4600.0,Lake Macquarie BMX Club,𝕋𝕙𝕖 ℙ𝕝𝕒𝕔𝕖 𝕥𝕠 ℝ𝕒𝕔𝕖 BMX is for riders from ages ...,"Stadium, Arena & Sports Venue","Mary Street, Argenton, NSW, Australia, New Sou...",,lmbmx@hotmail.com.au,https://www.lake-macquarie-bmx.tidyhq.com,,,invalid fb_phone_number
7,https://www.facebook.com/298289803656494,2025-10-03T22:31:19.743634,2019-02-07,https://www.facebook.com/cassandrashub,Unofficial Page,231.0,Cassandra's Cafe,"Exciting and modern, new local cafe for Ardros...",Fast food restaurant,"46 Princes Street, Ardrossan, United Kingdom",+44 7786 497044,,,,,invalid email_address
8,https://www.facebook.com/102359424749945,2025-10-03T22:31:25.246050,2022-12-19,https://www.facebook.com/millercollc,Unofficial Page,80.0,Miller & Co LLC,Small Business Tax and Accounting Firm,Financial service,"Chicago, IL",,heather@millercollc.com,https://www.millerandcollc.com,,,invalid fb_phone_number
9,https://www.facebook.com/1564563767156379,2025-10-03T22:31:27.632090,,https://www.facebook.com/people/The-Shop/10005...,Unofficial Page,0.0,The Shop,,"Beauty, cosmetic & personal care","3004 Copper Kettle Cir, Pleasant View, TN, Uni...",(615) 944-1575,,,,,invalid email_address


In [12]:
# 🧩 Cell 6: (Optional) Save to CSVs for later comparison
valid_df.to_csv(OUTPUT_VALID, index=False)
invalid_df.to_csv(OUTPUT_INVALID, index=False)

print(f"Files saved → {OUTPUT_VALID}, {OUTPUT_INVALID}")


Files saved → /home/veronica/Desktop/dod-unprocessed-data/process_file.csv, /home/veronica/Desktop/dod-unprocessed-data/invalid_rows.csv
