# Falling Fruit Data Cleaning Survey

In [29]:
import pandas as pd
df = pd.read_csv('foraging_survey.csv')
df.head

<bound method NDFrame.head of               Timestamp How long have you been foraging?  \
0    6/17/2025 12:40:38                        3-4 years   
1    6/17/2025 12:42:08                        1-2 years   
2    6/17/2025 12:42:10                         5+ years   
3    6/17/2025 12:42:43                         5+ years   
4    6/17/2025 12:46:32                        3-4 years   
..                  ...                              ...   
145   6/19/2025 3:03:09                               15   
146   6/19/2025 4:41:35                         5+ years   
147   6/19/2025 5:00:53                     Under 1 year   
148   6/19/2025 6:58:50                         5+ years   
149   6/19/2025 7:20:36                        3-4 years   

    What kinds of environments do you forage in?  \
0                                Urban, Suburban   
1                                Urban, Suburban   
2                                Suburban, Rural   
3                   Urban, Suburban, Rura

In [30]:
df.rename(columns={'How did you find out about Falling Fruit?': 'FindFF', 
                   'If you are located in the US, what state?': 'State'}, inplace=True)

In [31]:
# filtering for the questions Edlyn is cleaning
new_df = df[['FindFF', 'State']]

In [32]:
new_df

Unnamed: 0,FindFF,State
0,"Word of mouth (friends, family, etc)",Florida
1,Google,New York
2,Reddit,North Carolina
3,"Word of mouth (friends, family, etc)",California
4,Google search and news articles about foraging,California
...,...,...
145,Tinterweb,
146,"Word of mouth (friends, family, etc)",IL
147,"Word of mouth (friends, family, etc)",
148,"Word of mouth (friends, family, etc)",Virginia


In [33]:
# filling NaaN values because those no bueno
new_df = new_df.fillna({
    'FindFF': 'Unknown',
    'State': 'Unknown'
    })
new_df

Unnamed: 0,FindFF,State
0,"Word of mouth (friends, family, etc)",Florida
1,Google,New York
2,Reddit,North Carolina
3,"Word of mouth (friends, family, etc)",California
4,Google search and news articles about foraging,California
...,...,...
145,Tinterweb,Unknown
146,"Word of mouth (friends, family, etc)",IL
147,"Word of mouth (friends, family, etc)",Unknown
148,"Word of mouth (friends, family, etc)",Virginia


In [34]:
# copy df to save original data
testing_df = new_df.copy()
testing_df

Unnamed: 0,FindFF,State
0,"Word of mouth (friends, family, etc)",Florida
1,Google,New York
2,Reddit,North Carolina
3,"Word of mouth (friends, family, etc)",California
4,Google search and news articles about foraging,California
...,...,...
145,Tinterweb,Unknown
146,"Word of mouth (friends, family, etc)",IL
147,"Word of mouth (friends, family, etc)",Unknown
148,"Word of mouth (friends, family, etc)",Virginia


In [35]:
# trimming capitalization

testing_df['FindFF'] = testing_df['FindFF'].str.strip().str.lower()   

testing_df['FindFF']

0                word of mouth (friends, family, etc)
1                                              google
2                                              reddit
3                word of mouth (friends, family, etc)
4      google search and news articles about foraging
                            ...                      
145                                         tinterweb
146              word of mouth (friends, family, etc)
147              word of mouth (friends, family, etc)
148              word of mouth (friends, family, etc)
149                                            reddit
Name: FindFF, Length: 150, dtype: object

## Cleaning 'How did you find out about Falling Fruit?'

In [36]:
# find unique answers
unique_categories = testing_df['FindFF'].unique()

print(unique_categories)

['word of mouth (friends, family, etc)' 'google' 'reddit'
 'google search and news articles about foraging' 'facebook' 'tiktok'
 "don't remember" 'google search'
 'i did an internet search years ago and got falling fruit as a result'
 'i’ve been using it for so many years that i don’t even remember how i first heard about it'
 'searching for a foraging tracking app'
 'probably a search engine but a long time ago' "i can't remember"
 'unknown' "i don't remember!" 'some sort of social media' 'internet'
 'google search for foraging apps'
 'i think google, i searched for an interactive map'
 'i forget honestly, but it was somewhere on the internet'
 'cannot remember' 'i think i did a google search'
 'i was searching for an app that would do exactly as i was looking for to look for inventory of free fruits and vegetables around the area'
 'internet search' 'instagram' "online, don't remember exactly"
 'app store' "i don't remember, i've had the app a long time"
 'googled fruit near me' 'app

In [37]:
for i, row in testing_df.iterrows():

    answer = testing_df.at[i,'FindFF']
    
    if isinstance(answer, str):
        
        if 'word of mouth' in answer:
            answer = 'word of mouth'
        
        elif any(keyword in answer for keyword in ['google', 'inter', 'search', 'online']):
            answer = 'internet search'
        
        elif any(keyword in answer for keyword in ['remember', "don't", 'not sure', 'social media', 'forgot']):
            answer = "don't remember"
        
        elif any(keyword in answer for keyword in ['app', 'store']):
            answer = 'app store'

        elif 'podcast' in answer:
            answer = 'podcast'

        # i did not realize tumblr was a good ad for FF
        elif 'tumblr' in answer:
            answer = 'tumblr'
        
        # grouping one of a kind answers while excluding NaN and former survey answers
        elif not any(keyword in answer for keyword in ['reddit', 'facebook', 'instagram', 'twitter (x)', 'unknown']):
            answer = 'unique'
    
    testing_df.at[i, 'FindFF'] = answer
    print(answer)

word of mouth
internet search
reddit
word of mouth
internet search
word of mouth
word of mouth
facebook
unique
internet search
don't remember
internet search
reddit
internet search
reddit
internet search
reddit
don't remember
facebook
internet search
internet search
internet search
reddit
don't remember
reddit
unknown
don't remember
reddit
word of mouth
don't remember
reddit
word of mouth
internet search
reddit
word of mouth
internet search
word of mouth
internet search
internet search
word of mouth
reddit
word of mouth
don't remember
word of mouth
word of mouth
word of mouth
don't remember
internet search
internet search
reddit
reddit
word of mouth
reddit
don't remember
facebook
word of mouth
don't remember
word of mouth
word of mouth
facebook
internet search
reddit
reddit
reddit
reddit
reddit
internet search
instagram
word of mouth
word of mouth
reddit
word of mouth
internet search
app store
don't remember
reddit
internet search
app store
word of mouth
word of mouth
internet search
i

In [38]:
# double checking for unique responses
unique = testing_df['FindFF'].unique()
print(unique)

['word of mouth' 'internet search' 'reddit' 'facebook' 'unique'
 "don't remember" 'unknown' 'instagram' 'app store' 'twitter (x)' 'tumblr'
 'podcast']


In [39]:
# can use this for data visualization
FindFF_counts = testing_df['FindFF'].value_counts()
FindFF_counts

FindFF
word of mouth      46
internet search    37
reddit             30
don't remember     16
facebook            7
unique              3
app store           3
unknown             2
tumblr              2
podcast             2
instagram           1
twitter (x)         1
Name: count, dtype: int64

# Cleaning 'If you are located in the US, what state?' 

In [40]:
# trimming capitalization

testing_df['State'] = testing_df['State'].str.strip().str.lower()   

testing_df['State']

0             florida
1            new york
2      north carolina
3          california
4          california
            ...      
145           unknown
146                il
147           unknown
148          virginia
149          virginia
Name: State, Length: 150, dtype: object

In [41]:
# setting up mapping abbreviations to proper state names
old_state_mapping = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
    "DC": "District of Columbia",
    "AS": "American Samoa",
    "GU": "Guam",
    "MP": "Northern Mariana Islands",
    "PR": "Puerto Rico",
    "UM": "United States Minor Outlying Islands",
    "VI": "Virgin Islands, U.S."
}

In [42]:
# accepted US States ONLY!!! also ensuring values in old_state_mapping are same in states list
us_states_list = list(old_state_mapping.values())
us_states_list

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming',
 'District of Columbia',
 'American Samoa',
 'Guam',
 'Northern Mariana Islands',
 'Puerto Rico',
 'United States Minor Outlying Islands',
 'Virgin Islands, U.S.']

In [43]:
# lower casing states list 
for index, item in enumerate(us_states_list):
    us_states_list[index] = item.lower()

us_states_list

['alabama',
 'alaska',
 'arizona',
 'arkansas',
 'california',
 'colorado',
 'connecticut',
 'delaware',
 'florida',
 'georgia',
 'hawaii',
 'idaho',
 'illinois',
 'indiana',
 'iowa',
 'kansas',
 'kentucky',
 'louisiana',
 'maine',
 'maryland',
 'massachusetts',
 'michigan',
 'minnesota',
 'mississippi',
 'missouri',
 'montana',
 'nebraska',
 'nevada',
 'new hampshire',
 'new jersey',
 'new mexico',
 'new york',
 'north carolina',
 'north dakota',
 'ohio',
 'oklahoma',
 'oregon',
 'pennsylvania',
 'rhode island',
 'south carolina',
 'south dakota',
 'tennessee',
 'texas',
 'utah',
 'vermont',
 'virginia',
 'washington',
 'west virginia',
 'wisconsin',
 'wyoming',
 'district of columbia',
 'american samoa',
 'guam',
 'northern mariana islands',
 'puerto rico',
 'united states minor outlying islands',
 'virgin islands, u.s.']

In [44]:
# lower casing abbreviation mapping dictionary
state_mapping = {
    k.lower(): v.lower() for k, v in old_state_mapping.items()
}

state_mapping

{'al': 'alabama',
 'ak': 'alaska',
 'az': 'arizona',
 'ar': 'arkansas',
 'ca': 'california',
 'co': 'colorado',
 'ct': 'connecticut',
 'de': 'delaware',
 'fl': 'florida',
 'ga': 'georgia',
 'hi': 'hawaii',
 'id': 'idaho',
 'il': 'illinois',
 'in': 'indiana',
 'ia': 'iowa',
 'ks': 'kansas',
 'ky': 'kentucky',
 'la': 'louisiana',
 'me': 'maine',
 'md': 'maryland',
 'ma': 'massachusetts',
 'mi': 'michigan',
 'mn': 'minnesota',
 'ms': 'mississippi',
 'mo': 'missouri',
 'mt': 'montana',
 'ne': 'nebraska',
 'nv': 'nevada',
 'nh': 'new hampshire',
 'nj': 'new jersey',
 'nm': 'new mexico',
 'ny': 'new york',
 'nc': 'north carolina',
 'nd': 'north dakota',
 'oh': 'ohio',
 'ok': 'oklahoma',
 'or': 'oregon',
 'pa': 'pennsylvania',
 'ri': 'rhode island',
 'sc': 'south carolina',
 'sd': 'south dakota',
 'tn': 'tennessee',
 'tx': 'texas',
 'ut': 'utah',
 'vt': 'vermont',
 'va': 'virginia',
 'wa': 'washington',
 'wv': 'west virginia',
 'wi': 'wisconsin',
 'wy': 'wyoming',
 'dc': 'district of columbia

In [45]:
# find unique answers
unique_states = testing_df['State'].unique()

print(unique_states)

['florida' 'new york' 'north carolina' 'california' 'or' 'oklahoma'
 'michigan' 'unknown' 'west virginia' 'iowa' 'maryland' 'delaware'
 'washington' 'va' 'massachusetts' 'oregon'
 'currently virginia, but florida and alabama before that' 'ut' 'virginia'
 'md' 'ohio' 'tn' 'tennessee' 'connecticut' 'ca' 'colorado' 'arizona'
 'illinois' 'wisconsin' 'missouri' 'western australia' 'new jersey'
 'north carolina, previously in boston massachusetts' 'pennsylvania'
 'mass' 'pa' 'co' 'utah' 'louisiana' 'adirondacks, ny' 'pacific nw'
 'indiana' 'wa' 'district of columbia' 'minnesota' 'south carolina' 'fl'
 'il']


In [46]:
# this is assuming everyone spelled correctly!!!!!
for i, row in testing_df.iterrows():

    answer = testing_df.at[i,'State']

    if isinstance(answer, str):
        # replace abbreviations with full state name
        for key, value in state_mapping:

            if key == answer:
                answer = value

        # handling special cases
        if answer == 'unknown':
            answer = answer

        elif 'currently virginia' in answer:
            answer = 'virginia'

        elif 'north carolina, previously in boston massachusetts' in answer:
            answer = 'north carolina'

        elif 'adirondacks, ny' in answer:
            answer = 'new york'

        elif 'mass' in answer:
            answer = 'massachusetts'
        
        elif answer not in us_states_list:
            answer = 'not in US'
    
    testing_df.at[i, 'State'] = answer
    print(answer)

florida
new york
north carolina
california
california
not in US
california
oklahoma
michigan
unknown
west virginia
new york
iowa
iowa
maryland
delaware
washington
not in US
massachusetts
oregon
california
virginia
not in US
unknown
virginia
not in US
california
unknown
ohio
virginia
maryland
not in US
maryland
tennessee
california
connecticut
not in US
unknown
new york
virginia
california
colorado
arizona
california
unknown
washington
california
maryland
not in US
michigan
illinois
unknown
wisconsin
massachusetts
missouri
colorado
unknown
unknown
unknown
unknown
not in US
unknown
maryland
not in US
california
arizona
washington
new jersey
illinois
california
north carolina
california
pennsylvania
california
oklahoma
north carolina
unknown
north carolina
unknown
unknown
massachusetts
unknown
unknown
pennsylvania
virginia
not in US
illinois
not in US
oregon
unknown
utah
unknown
louisiana
new york
oregon
new york
michigan
colorado
not in US
california
indiana
unknown
not in US
california


In [47]:
# checking for unique answers again
testing_df['State'].unique()

array(['florida', 'new york', 'north carolina', 'california', 'not in US',
       'oklahoma', 'michigan', 'unknown', 'west virginia', 'iowa',
       'maryland', 'delaware', 'washington', 'massachusetts', 'oregon',
       'virginia', 'ohio', 'tennessee', 'connecticut', 'colorado',
       'arizona', 'illinois', 'wisconsin', 'missouri', 'new jersey',
       'pennsylvania', 'utah', 'louisiana', 'indiana',
       'district of columbia', 'minnesota', 'south carolina'],
      dtype=object)

In [48]:
# checking proportion of answers
state_counts = testing_df['State'].value_counts()
state_counts

State
unknown                 32
california              17
not in US               16
maryland                 9
new york                 7
virginia                 7
washington               5
illinois                 5
colorado                 5
michigan                 5
oregon                   4
ohio                     4
north carolina           4
massachusetts            3
utah                     3
pennsylvania             2
iowa                     2
tennessee                2
indiana                  2
oklahoma                 2
arizona                  2
louisiana                2
district of columbia     1
minnesota                1
florida                  1
new jersey               1
missouri                 1
wisconsin                1
connecticut              1
delaware                 1
west virginia            1
south carolina           1
Name: count, dtype: int64

In [49]:
# double-checking if there really are 32 unknowns 
unknown_indices = testing_df[testing_df['State'] == 'unknown'].index

print(unknown_indices)

Index([  9,  23,  27,  37,  44,  51,  56,  57,  58,  59,  61,  76,  78,  79,
        81,  82,  89,  91, 101, 110, 112, 119, 121, 123, 129, 132, 134, 136,
       143, 144, 145, 147],
      dtype='int64')


In [50]:
rows = [9,  23,  27,  37,  44,  51,  56,  57,  58,  59,  61,  76,  78,  79,
        81,  82,  89,  91, 101, 110, 112, 119, 121, 123, 129, 132, 134, 136,
       143, 144, 145, 147]

testing_df.iloc[rows]

Unnamed: 0,FindFF,State
9,internet search,unknown
23,don't remember,unknown
27,reddit,unknown
37,internet search,unknown
44,word of mouth,unknown
51,word of mouth,unknown
56,don't remember,unknown
57,word of mouth,unknown
58,word of mouth,unknown
59,facebook,unknown


In [51]:
testing_df

Unnamed: 0,FindFF,State
0,word of mouth,florida
1,internet search,new york
2,reddit,north carolina
3,word of mouth,california
4,internet search,california
...,...,...
145,internet search,unknown
146,word of mouth,not in US
147,word of mouth,unknown
148,word of mouth,virginia


In [52]:
testing_df.to_csv("findFF_states_cleaned.csv", index=False)


In [53]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


df_com = pd.read_csv('2 questions - Sheet1.csv')
df_com



Unnamed: 0,"Are you involved in a foraging community? We define ""community"" as a group of people living in the same place that gives a sense of fellowship or belonging that often share similar interests or goals.","If yes to the question above, how did you get involved with your community?"
0,No,Family/friends
1,No,
2,No,
3,No,Not involved
4,No,Family/friends
...,...,...
145,Yes,Started it
146,No,N/a
147,No,
148,Yes,Me


In [54]:
df_com.columns = ['involved_in_community', 'how_got_involved']

# Unique values for community involvement
print(df_com['involved_in_community'].unique())

# Unique values for how people got involved
print(df_com['how_got_involved'].unique())


['No' 'Yes' 'I used to be. Then I moved. '
 "I recently moved from the city where I lived and volunteered. I'm in a more suburban/exurban area now so I am trying to explore my area a bit. "
 'Unsure' 'Kind of ' "No, none around me I'm aware of."
 'I went looking for pawpaws with mu daughter and a guy helped us figure out what to look for and then gave us a few to try. I teach people about mulberries and blackberries when I see them in public spaces who have never eaten "wild" food.  Does that count? '
 'I want to, but am scared to get into a group.'
 "I was when I lived on the east coast. I moved to the west a few years back and haven't found any other enthusiasts yet."
 'No, but I wish I could find one. Maybe the app can identify "communities near me?"'
 "I've founded a local project called Fruta na Rua BH, but I don't have time or money to invest so much and mobilize the community so much, so it's pretty much not doing anything."
 "Used to be, there's a big scene in the Rochester, NY

In [55]:

# Clean the 'how_got_involved' column
df_com['how_got_involved'] = df_com['how_got_involved'].str.strip().str.lower()

# Define all variants that should be grouped as "na"
na_variants = [
    'n/a', 'na', 'none', 'no', 'not involved', 'not applicable',
    '', '.', 'i put no', 'i answered no', 'i answered no.', 'i am not involved',
    'i am not involved in the foraging community.', 'i have not engaged in any communities',
    'answered no above', 'erroneously required question', 'n/a - should not be required',
    'non applicable', 'do not', 'there is no foraging commuunity.', 'x','no answer', 'not involved in the community', "i'm not in a foraging community",
    'i am not a member of a community but i’d be interested in doing this',

]

# Replace them with "na"
df_com['how_got_involved'] = df_com['how_got_involved'].replace(na_variants, np.nan)

# View unique values
print(df_com['how_got_involved'].unique())

['family/friends' nan
 'wildlife and fisheries graduate student community, lots of us like foraging and met through work/school'
 'i have built my own community' 'social media' 'medieval reenactment'
 'volunteering in local friends of east rock park to remove invasive species, trash, environmental outreach.'
 'falling fruit' 'found group on their website' 'friend making apps'
 'the gleaning community and working for a produce company of people who also loved foraging.'
 'just talking to people about common interests'
 'meeting them outside foraging' 'started it' 'me'
 'i do foraging activities with other people through my college']


In [56]:
# Define substrings that suggest the person started their own community
self_variants = [
    'started it',
    'i have built my own community',
    'me'
]

# Replace those entries with 'self'
df_com['how_got_involved'] = df_com['how_got_involved'].replace(self_variants, 'self')


# Define categories and map responses accordingly
social_media_variants = [
    'social media', 'found group on their website', 'friend making apps'
]

school_variants = [
    'wildlife and fisheries graduate student community, lots of us like foraging and met through work/school',
    'i do foraging activities with other people through my college'
]

# Apply mappings
df_com['how_got_involved'] = df_com['how_got_involved'].replace(social_media_variants, 'social media')
df_com['how_got_involved'] = df_com['how_got_involved'].replace(school_variants, 'school')



falling_fruit_variants = [
    "falling fruit", 'found group on their website'
]

df_com['how_got_involved'] = df_com['how_got_involved'].replace(falling_fruit_variants, 'falling fruit')

# View unique values
print(df_com['how_got_involved'].unique())

['family/friends' nan 'school' 'self' 'social media'
 'medieval reenactment'
 'volunteering in local friends of east rock park to remove invasive species, trash, environmental outreach.'
 'falling fruit'
 'the gleaning community and working for a produce company of people who also loved foraging.'
 'just talking to people about common interests'
 'meeting them outside foraging']


In [57]:
df_com['how_got_involved'] = df_com['how_got_involved'].replace({
    'volunteering in local friends of east rock park to remove invasive species, trash, environmental outreach.': 'volunteering',
    'just talking to people about common interests': 'informal socializing',
    'the gleaning community and working for a produce company of people who also loved foraging.':"local community and work",
    'meeting them outside foraging': 'outside foraging'
})

# View unique values
print(df_com['how_got_involved'].unique())

['family/friends' nan 'school' 'self' 'social media'
 'medieval reenactment' 'volunteering' 'falling fruit'
 'local community and work' 'informal socializing' 'outside foraging']


In [58]:
# Step 1: Strip whitespace and lowercase
df_com['involved_in_community'] = df_com['involved_in_community'].str.strip().str.lower()

# Step 2: Standardize common variants
yes_variants = ['yes', 'y', 'yeah', 'yep', 'kindof but not regular', 'not actively, sometimes i pick with friends and family', "i'm in a mycology group but mostly forage alone or occasionally with a few friends."]
no_variants = ['no', 'n', 'nope', "no, none around me i'm aware of.", 'no, but i wish i could find one. maybe the app can identify "communities near me?"',
               "i've founded a local project called fruta na rua bh, but i don't have time or money to invest so much and mobilize the community so much, so it's pretty much not doing anything.",
               'i went looking for pawpaws with mu daughter and a guy helped us figure out what to look for and then gave us a few to try. i teach people about mulberries and blackberries when i see them in public spaces who have never eaten "wild" food.  does that count?',
               'unsure', "i recently moved from the city where i lived and volunteered. i'm in a more suburban/exurban area now so i am trying to explore my area a bit.",
               "i want to, but am scared to get into a group.", 'kind of']
used_to_variants = [
    "i used to be. then i moved.", "i was when i lived on the east coast. i moved to the west a few years back and haven't found any other enthusiasts yet.",
    "i was when i lived on the east coast. i moved to the west a few years back and haven't found any other enthusiasts yet.",
    "used to be, there's a big scene in the rochester, ny region, but i moved to the adirondacks. there's barely any pin drops here."
]

df_com['involved_in_community'] = df_com['involved_in_community'].replace(yes_variants, 'yes')
df_com['involved_in_community'] = df_com['involved_in_community'].replace(no_variants, 'no')
df_com['involved_in_community'] = df_com['involved_in_community'].replace(used_to_variants, 'used to')

print(df_com['involved_in_community'].unique())

['no' 'yes' 'used to']


In [60]:
df_com


Unnamed: 0,involved_in_community,how_got_involved
0,no,family/friends
1,no,
2,no,
3,no,
4,no,family/friends
...,...,...
145,yes,self
146,no,
147,no,
148,yes,self


In [61]:
df_com.to_csv('cleaned_data.csv', index=False)