In [3]:
import pandas as pd
import re

# I decided to gather information from the ACA (American Camp Association) about the overnight summer camps registered on their website.

- According to the ACA itself, there are around 20,000 camps, divided into three main categories: Day, Overnight, and Family/Adult camps. Some camps offer one or more of these categories throughout the year. I decided to use only ACA data because it is the most important association in this industry, with over 100 years of history.

In my first attempt, I used **Instant Data Scraper**, a Google Chrome extension, which retrieved data from more than 2,000 camps.

For my second attempt, I used **Octoparse** to collect more detailed information about each camp.

In [4]:
path1 = r"C:\Users\luism\Documents\Luis Code\CSV\Summer Camps USA - 2025.csv"
path2 = r"C:\Users\luism\Documents\Luis Code\CSV\find.csv"
path3 = r"C:\Users\luism\Documents\Luis Code\CSV\CAmps.2.csv"

csv2 = pd.read_csv(path2)
csv1 = pd.read_csv(path1)


**Due to the scraping method I used, the City and State were not in the first columns. Therefore, I had to search through the CSV file to retrieve this information.**



In [8]:

# Extract the 8th column (which contains the city and state information)
description_column = csv2["pio"]

# Display a sample of the 8th column to understand its structure
description_column.head()


0    (Great Barrington, MA)\t\t\t\t\t\t\tCamp Chaza...
1    (Merrill, NY)\t\t\t\t\t\t\tCamp Chateaugay is ...
2    (South Hero, VT)\t\t\t\t\t\t\tLocated on the b...
3    (Holly, MI)\t\t\t\t\t\t\tWhile discovering new...
4    (Warsaw, MO)\t\t\t\t\t\t\tSummer Bible Camp fo...
Name: pio, dtype: object

**Thanks to ChatGPT, it created two columns with City and State for each row.**

In [9]:

# Define a function to extract city and state using regex
def extract_city_state(description):
    match = re.search(r"\((.*?),\s*(.*?)\)", str(description))
    if match:
        return match.group(1), match.group(2)
    return None, None

# Apply the function to extract city and state
csv2["city"], csv2["state"] = zip(*description_column.apply(extract_city_state))

# Display the resulting dataset with the new columns
csv2[["city", "state"]].head()


Unnamed: 0,city,state
0,Great Barrington,MA
1,Merrill,NY
2,South Hero,VT
3,Holly,MI
4,Warsaw,MO


In [10]:

selected_columns = ["camp_name", "city", "state", "accredited-member", "col-sm-4 3"]
csv2 = csv2[selected_columns]


if 'col-sm-4 3' in csv2.columns:
    csv2['col-sm-4 3'] = csv2['col-sm-4 3'].str.replace(r'program\(s\)$', '', regex=True).str.strip()
else:
    print("Advertencia: La columna 'col-sm-4 3' no se encontró en csv1.")

if 'address' in csv1.columns:
    csv1['address'] = csv1['address'].str.replace(r'View Map »$', '', regex=True).str.strip()
else:
    print("Advertencia: La columna 'address' no se encontró en csv1.")

if 'Texto3' in csv1.columns:
    csv1['Texto3'] = csv1['Texto3'].str.replace(r'Director\(s\):', '', regex=True).str.strip()
    
if 'Texto4' in csv1.columns:
    csv1['Texto4'] = csv1['Texto4'].str.replace(r'Operator\(s\):', '', regex=True).str.strip()
   

if csv1["camp_name"].isnull().any() or csv2["camp_name"].isnull().any():
    print("Advertencia: Hay valores nulos en la columna 'Camp Name'.")

    

merged_csv = pd.merge(csv1, csv2, on="camp_name", how="left")
merged_csv['zip_code'] = merged_csv['address'].str.extract(r'(\d{5}(?:-\d{4})?)')
merged_csv['address'] = merged_csv['address'].str.replace(r'\d{5}(-\d{4})?$', '', regex=True).str.strip()

merged_csv[['address', 'city', 'state']] = merged_csv[['address', 'city', 'state']].fillna('')


merged_csv['address'] = merged_csv.apply(lambda row: row['address']
                               .replace(row['city'], '')
                               .replace(row['state'], '')
                               .replace(',,', ',')  
                               .strip(', ')         
                               .rstrip(','),        
                               axis=1)


if 'address' in merged_csv.columns:
    merged_csv['address'] = merged_csv['address'].str.strip().str.rstrip(',').str.strip()


merged_csv.head()




Advertencia: Hay valores nulos en la columna 'Camp Name'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csv2['col-sm-4 3'] = csv2['col-sm-4 3'].str.replace(r'program\(s\)$', '', regex=True).str.strip()


Unnamed: 0,camp_name,address,Texto2,Texto3,Texto4,city,state,accredited-member,col-sm-4 3,zip_code
0,URJ Eisner Camp,"53 Brookside Rd, 01230-2186View Map »","Jewish, Reform",,URJ Eisner and Crane Lake Camp (Operator),Great Barrington,MA,Accredited Member,4,01230-2186
1,Camp Chateaugay,"233 Gadway Rd, 12955-2307View Map »","Independent, For Profit",Mitch Goldman,"Chateaugay Campco, LLC (Operator)",Merrill,NY,Accredited Member,2,12955-2307
2,YWCA VT Camp Hochelaga,"34 Hochelaga Rd, 05486-4807View Map »",YWCA,Christine Perry,YWCA Vermont,South Hero,VT,Accredited Member,2,05486-4807
3,YMCA Camp Ohiyesa,"7300 Hickory Ridge Rd, 48442-8929View Map »",YMCA,Doug Grimm,YMCA Camping Services of Detroit,Holly,MI,Accredited Member,4,48442-8929
4,Camp CUMCITO,"13220 Mission Rd, 65355-5613View Map »",Nondenominational Christian,,City Union Mission,Warsaw,MO,Accredited Member,4,13220


In [5]:
merged = csv2.merge(csv1, on="camp_name", how="left", indicator=True)
missing_camps = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])



print(f"Se encontraron {len(missing_camps)} campamentos que no están en csv1.")


Se encontraron 128 campamentos que no están en csv1.
