<img style="float: right;" src="meetup_logo.svg" width=200>

# Meetup - Data Cleaning (all 2018)


<i>Cleaning the contents of the NYC meetup data</i>

<u>Datasets:</u>

1. <a href='#events'>Meetup Events</a> (all 2018)
2. <a href='#groups'>Meetup Groups</a>
3. <a href='#members'>Meetup Members</a>
***

### Import libraries

In [35]:
from haversine import haversine 
import reverse_geocode
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import time
import re
from meetup_api_functions import clean_text, remove_special_chars, get_subway_distances
from datacleaning_functions import load_events, engineer_events_features
import requests

***
<a id='events'></a>
### 1. Meetup Events

#### Load Data

In [17]:
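# # NOTE: reference copy only -- the notebook calls the `load_events` imported from
# # datacleaning_functions in the imports cell, not this commented-out definition.
# # open all_events file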
# def load_events():
#     with open('monthly_events_2018/sepoct_events.pkl', 'rb') as f:
#         sepoct_events = pickle.load(f)

#     with open('monthly_events_2018/nov_events.pkl', 'rb') as f:
#         nov_events = pickle.load(f)

#     with open('monthly_events_2018/jan_events.pkl', 'rb') as f:
#         jan_events = pickle.load(f)

#     with open('monthly_events_2018/feb_events.pkl', 'rb') as f:
#         feb_events = pickle.load(f)

#     with open('monthly_events_2018/mar_events.pkl', 'rb') as f:
#         mar_events = pickle.load(f)

#     with open('monthly_events_2018/apr_events.pkl', 'rb') as f:
#         apr_events = pickle.load(f)

#     with open('monthly_events_2018/may_events.pkl', 'rb') as f:
#         may_events = pickle.load(f)

#     with open('monthly_events_2018/jun_events.pkl', 'rb') as f:
#         jun_events = pickle.load(f)

#     with open('monthly_events_2018/jul_events.pkl', 'rb') as f:
#         jul_events = pickle.load(f)

#     with open('monthly_events_2018/aug_events.pkl', 'rb') as f:
#         aug_events = pickle.load(f)

#     with open('monthly_events_2018/dec_events.pkl', 'rb') as f:
#         dec_events = pickle.load(f)
    
#     raw_2018 = sepoct_events+nov_events+jan_events+feb_events+mar_events+apr_events+may_events+jun_events+jul_events+aug_events+dec_events
    
#     return pd.DataFrame(raw_2018)



In [36]:
# load_events is imported from datacleaning_functions in the imports cell;
# presumably it matches the commented reference copy above (monthly 2018 pickles
# concatenated into one DataFrame) -- TODO confirm
df_events = load_events()

#### Data Cleaning

In [37]:
# share of missing values per column, expressed as a percentage of all rows
df_events.isnull().sum().div(len(df_events)).mul(100)

created              0.000000
description          0.831033
duration             7.386552
event_url            0.000000
fee                 84.621103
group                0.000000
headcount            0.000000
how_to_find_us      62.458504
id                   0.000000
maybe_rsvp_count     0.000000
name                 0.000000
photo_url           46.116816
rating               0.000000
rsvp_limit          75.924331
status               0.000000
time                 0.000000
updated              0.000000
utc_offset           0.000000
venue                9.424018
visibility           0.000000
waitlist_count       0.000000
why                 99.593684
yes_rsvp_count       0.000000
dtype: float64

Based on the information above, there is some data cleaning and handling of missing values to address:
- convert values in ```duration``` from milliseconds to minutes and fill in the missing values with the median
- drop ```utc_offset``` since that information is captured in ```time```
- drop ```why``` since most values are NaNs
- clean up text in ```description``` with regex
- label encode ```fee```, ```photo_url```, and ```how_to_find_us```
- ```venue``` fill missing values with 'None'

In [27]:
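# NOTE: reference copy only -- this definition is commented out, so the call to
# `clean_events_data` below needs the function to be provided from elsewhere
# (e.g. imported from datacleaning_functions).
# def clean_events_data(dataframe):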
#     new_df = dataframe
    
#     # convert values in duration column from milliseconds to minutes
#     new_df['duration'] = new_df['duration'].apply(lambda x: x/60000)
    
#     # label encode value for whether group's join-mode is open or not
#     new_df['group_is_open'] = new_df.group.apply(lambda x: 1 if x['join_mode'] == 'open' else 0)
    
#     # extract group_id from the group column which contains a dictionary of group details
#     new_df['group_id'] = new_df.group.apply(lambda x: x.get('id'))
    
#     # rename column to note time unit of the data
#     new_df.rename(columns={'duration':'duration_min'}, inplace=True)
    
#     # fill in NaNs, then label encode
#     new_df['how_to_find_us'].fillna(0, inplace = True)
#     new_df['has_how_to_find'] = new_df['how_to_find_us'].apply(lambda x: 1 if x != 0 else 0)
    
#     new_df['rsvp_limit'].fillna(0, inplace =True)
#     new_df['has_rsvp_limit'] = new_df['rsvp_limit'].apply(lambda x: 1 if x != 0 else 0)
    
#     # fill in Nans, then clean text using regex helper function
#     new_df.description.fillna(value = 'None', inplace = True)
#     new_df['description'] = new_df['description'].apply(lambda x: clean_text(x))
    
#     # remove special characters and get word count of event descriptions
#     new_df['event_num_words'] = new_df.description.apply(lambda x: len(remove_special_chars(x.split(' '))))
    
#     # replace missing values in duration to median value
#     new_df.duration_min.fillna(value = new_df.duration_min.median(), inplace = True)
#     # replace missing venue values with 'None'
#     new_df.venue.fillna(value = 'None', inplace = True)
#     # replace missing fee values with 0 (no fee)
#     new_df.fee.fillna(value = 0, inplace = True)
#     # replace missing photo_url values with 0 (no photo)
#     new_df.photo_url.fillna(value = 0, inplace = True)
    
#     # extract just the amount from the fee dictionary
#     new_df.fee = new_df.fee.apply(lambda x: x['amount'] if x!= 0 else 0)
    
#     # encode photo_url column
#     new_df['has_photo'] = new_df.photo_url.apply(lambda x: 0 if x == 0 else 1)
    
#     """
#     clean the venue column
#     """
#     # converting the 'venue' column into its own dataframe
#     df_venues = new_df['venue'].apply(pd.Series)
#     # create list of venue latitude & longitude
#     new_df['venue_latlon'] = list(zip(df_venues.lat, df_venues.lon))
    
#     # drop the 'venue' column from df_events
#     new_df.drop(columns =['venue', 'why'], inplace=True)
    
#     # rename id column to event_id for clarity
#     # NOTE(review): the result of rename() is not assigned back, so this line has no effect
#     new_df.rename(index=str, columns={"id": "event_id"})
    
#     return new_df   

In [38]:
# clean_events_data is not brought in by the imports cell and its in-notebook
# definition is commented out, so import it here to keep Restart & Run All working.
# NOTE(review): assumes datacleaning_functions exposes clean_events_data alongside
# load_events and engineer_events_features -- confirm.
from datacleaning_functions import clean_events_data

df_cleaned = clean_events_data(df_events)

  index = _union_indexes(indexes, sort=sort)
  result = result.union(other)


In [39]:
# preview the cleaned events to check the newly encoded columns
df_cleaned.head()

Unnamed: 0,created,description,duration_min,event_url,fee,group,headcount,how_to_find_us,id,maybe_rsvp_count,...,visibility,waitlist_count,yes_rsvp_count,group_is_open,group_id,has_how_to_find,has_rsvp_limit,event_num_words,has_photo,venue_latlon
0,1535391367000,Join us in person or tune in online! Livestrea...,120.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,Please come to second floor/ stream online at ...,254149786,0,...,public,0,42,1,21993357,1,0,137,0,"(40.723171, -73.997177)"
1,1535385547000,Get started now on challenges related to these...,150.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,https://zoom.us/j/417883916,254146381,0,...,public,0,64,1,21993357,1,0,131,0,"(40.74673, -73.98967)"
2,1535392484000,In this workshop we’ll get a clear sense of th...,150.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,Please come to second floor / Livestream at ht...,254150230,0,...,public,0,83,1,21993357,1,0,204,0,"(40.72317, -73.99718)"
3,1531947994000,The number of opportunities for software engin...,120.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,Please come to the 2nd floor,252915161,0,...,public,0,113,1,21993357,1,0,229,0,"(40.723171, -73.997177)"
4,1535383458000,Please tune into the stream here: https://zoom...,120.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,Please tune into the stream here: https://zoom...,254144933,0,...,public,0,21,1,21993357,1,0,165,0,"(40.746731, -73.98967)"


Let's engineer some additional event features. The function ```engineer_events_features``` will add the following:
- number of years the group has been around
- add day of week 
- number of stations within 0.5 miles
- number of events held in the month
- descriptiveness of event (determined by number of words in the description text)

In [30]:
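# NOTE: reference copy only -- the notebook calls the `engineer_events_features` imported
# from datacleaning_functions in the imports cell, not this commented-out definition.
# def engineer_events_features(dataframe):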
    
#     # convert time to a datetime datatype
#     new_df = dataframe
#     new_df['time_datetime'] = pd.to_datetime(new_df['time'], unit = 'ms')
#     # adding event date as Year/Month/Day
#     new_df['time_m_d_y'] = new_df['time_datetime'].apply(lambda x: x.strftime('%Y-%m-%d')) 
    
#     # add column with day of week
#     new_df['time_m_d_y'] = pd.to_datetime(new_df['time_m_d_y'])
#     new_df['day_of_week'] = new_df['time_m_d_y'].dt.day_name()
    
#     # create column called event_hour - get hour of event
#     new_df['event_hour'] = new_df['time_datetime'].dt.hour
#     new_df['event_hour'] = new_df['event_hour'].astype('category')
    
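#     # bin the event hour into 6 bins over the 24-hour day
#     # NOTE(review): the edges below use 21 rather than 20, so the bins are not all 4 hours wide -- confirm intent
#     bins = [0, 4,8,12,16,21,24]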
#     new_df['event_hour_group'] = pd.cut(new_df['event_hour'], bins, right =False)
    
#     # add count of subway stations within 0.5 miles from venue
#     # load subway station data
#     df_subway = pd.read_csv("NYC_Subway_Data.csv")
    
#     # dropping duplicate stations (file contains a location for each entry/exit point which is not what we need)
#     df_unique_subway = df_subway.drop_duplicates(subset=["Division", "Station Name"])
    
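#     # convert the latitude and longitude into floats for distance calculation
#     # NOTE(review): astype() returns a new Series and the results below are discarded,
#     # so the column dtypes are left unchanged -- assign them back if the conversion is needed
#     df_unique_subway['Station Latitude'].astype(float)
#     df_unique_subway['Station Longitude'].astype(float)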
    
#     # create a new column with the converted latitudes and longitudes in a tuple
#     df_unique_subway['latlon'] = list(zip(df_unique_subway['Station Latitude'],df_unique_subway['Station Longitude']))
    
#     # create a variable with a list of each station's (latitude, longitude)
#     subway_locations = list(df_unique_subway['latlon'])
    
#     # save the subway_locations variable
#     with open('subway_locations.pkl', 'wb') as f: 
#         pickle.dump(subway_locations, f)
        
#     # import function created to get the distances of each venue to each subway station
#     # apply/lambda function to every event
#     new_df['subway_distances'] = new_df['venue_latlon'].apply(lambda x: get_subway_distances(x, subway_locations))
    
#     # create a column with a count of subway stations less than 0.5 miles from each venue
#     new_df['num_close_subways'] = new_df['subway_distances'].apply(lambda x: len([i for i in x if i <=0.5]))
    
#     # create new column that notes whether there is a fee or no fee for the event
#     new_df['has_fee'] = new_df.fee.apply(lambda x: 0 if x == 0 else 1)
    
#     # get number of days from event creation to event date
#     new_df['created_to_event_days'] = (new_df['time'].astype(int)-new_df['created'].astype(int))/86400000
    
#     # create dataframe for total number of events held in 2018 by group
#     df_num_past_events = pd.DataFrame(new_df.group_id.value_counts()).reset_index()
#     df_num_past_events.columns = ['group_id', 'num_past_events']

#     ## merge multiple dataframes
    
#     # load group dataframe
#     df_groups = pd.read_pickle('df_all_groups_cleaned.pickle')
#     df_events_group = pd.merge(new_df, df_groups, how='left', on = 'group_id')
#     df_events_group_past = pd.merge(df_events_group, df_num_past_events, how= 'left', on = 'group_id')
    
#     # rename columns
#     df_events_group_past.rename(columns = {'created_x': 'event_created',
#                                          'description_x': 'event_description',
#                                          'duration_min': 'event_duration',
#                                          'headcount': 'event_headcount',
#                                          'id': 'event_id',
#                                          'name_x': 'event_name',
#                                          'rating': 'event_rating',
#                                          'status_x': 'event_status',
#                                          'time': 'event_time',
#                                          'updated': 'event_updated',
#                                          'visibility_x': 'event_visibility',
#                                          'descrip_tokens': 'event_descrip_tokens',
#                                          'descrip_num_words':'event_descrip_num_words',
#                                          'has_fee': 'has_event_fee',
#                                          'created_y': 'group_created',
#                                          'description_y': 'group_description',
#                                          'join_mode': 'group_join_mode',
#                                          'lat': 'group_lat',
#                                          'lon': 'group_lon',
#                                          'link': 'group_link',
#                                          'state': 'group_state',
#                                          'members': 'num_members',
#                                          'name_y': 'group_name',
#                                          'status_y': 'group_status',
#                                          'urlname': 'group_urlname',
#                                          'visibility_y': 'group_visibility',
#                                          'who': 'group_who',
#                                          'category_name': 'group_category',
#                                          'organizer_id': 'group_organizer_id',
#                                          'yrs_since_created': 'group_yrs_est',
#                                          'created_date':'group_created_date'
#                                         }, inplace =True)
#     # save the merged dataframe
#     df_events_group_past.to_pickle('df_2018_cleaned.pickle')
    
#     return df_events_group_past


In [40]:
# engineer_events_features is imported from datacleaning_functions.
# NOTE(review): if it matches the commented reference copy above, it reads
# 'NYC_Subway_Data.csv' and 'df_all_groups_cleaned.pickle' (the latter is written in
# section 2 of this notebook) and writes 'subway_locations.pkl' and
# 'df_2018_cleaned.pickle' -- on a fresh checkout run section 2 first; confirm.
df_cleaned_eng = engineer_events_features(df_cleaned)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  zip(df_unique_subway['Station Latitude'], df_unique_subway['Station Longitude']))


In [41]:
# preview the engineered frame (event rows with the group_* columns merged in)
df_cleaned_eng.head()

Unnamed: 0,event_created,event_description,event_duration,event_url,fee,group,event_headcount,how_to_find_us,event_id,maybe_rsvp_count,...,group_state,group_status,group_urlname,group_visibility,group_who,group_category,group_organizer_id,group_yrs_est,group_created_date,num_past_events
0,1535391367000,Join us in person or tune in online! Livestrea...,120.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,Please come to second floor/ stream online at ...,254149786,0,...,NY,active,Build-with-Code-New-York,public,Engineers,tech,218119162,2.276969,01/20/2017 01:45:02,236
1,1535385547000,Get started now on challenges related to these...,150.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,https://zoom.us/j/417883916,254146381,0,...,NY,active,Build-with-Code-New-York,public,Engineers,tech,218119162,2.276969,01/20/2017 01:45:02,236
2,1535392484000,In this workshop we’ll get a clear sense of th...,150.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,Please come to second floor / Livestream at ht...,254150230,0,...,NY,active,Build-with-Code-New-York,public,Engineers,tech,218119162,2.276969,01/20/2017 01:45:02,236
3,1531947994000,The number of opportunities for software engin...,120.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,Please come to the 2nd floor,252915161,0,...,NY,active,Build-with-Code-New-York,public,Engineers,tech,218119162,2.276969,01/20/2017 01:45:02,236
4,1535383458000,Please tune into the stream here: https://zoom...,120.0,https://www.meetup.com/Build-with-Code-New-Yor...,0.0,"{'join_mode': 'open', 'created': 1484876702000...",0,Please tune into the stream here: https://zoom...,254144933,0,...,NY,active,Build-with-Code-New-York,public,Engineers,tech,218119162,2.276969,01/20/2017 01:45:02,236


In [42]:
# list every column available after cleaning and feature engineering
df_cleaned_eng.columns

Index(['event_created', 'event_description', 'event_duration', 'event_url',
       'fee', 'group', 'event_headcount', 'how_to_find_us', 'event_id',
       'maybe_rsvp_count', 'event_name', 'photo_url', 'event_rating',
       'rsvp_limit', 'event_status', 'event_time', 'event_updated',
       'utc_offset', 'event_visibility', 'waitlist_count', 'yes_rsvp_count',
       'group_is_open', 'group_id', 'has_how_to_find', 'has_rsvp_limit',
       'event_num_words', 'has_photo', 'venue_latlon', 'time_datetime',
       'time_m_d_y', 'day_of_week', 'event_hour', 'event_hour_group',
       'subway_distances', 'num_close_subways', 'has_event_fee',
       'created_to_event_days', 'group_created', 'group_description',
       'group_join_mode', 'group_lat', 'group_link', 'localized_country_name',
       'localized_location', 'group_lon', 'num_members', 'group_name',
       'group_state', 'group_status', 'group_urlname', 'group_visibility',
       'group_who', 'group_category', 'group_organizer_id', 'g

***
<a id='groups'></a>
### 2. Meetup Groups

#### Load Data

In [200]:
# open all_groups file
# NOTE: pickle.load can execute arbitrary code while unpickling -- only open
# pickle files that were produced by this project
with open('all_groups.pkl', 'rb') as f:
    all_groups = pickle.load(f)

In [201]:
# convert the unpickled group records to a dataframe
df_groups = pd.DataFrame(all_groups)

In [202]:
# (rows, columns) of the raw groups table
df_groups.shape

(8632, 29)

In [203]:
# preview the raw group records
df_groups.head()

Unnamed: 0,category,city,country,created,description,group_photo,id,is_pro_hidden,join_mode,key_photo,...,organizer,pro_network,score,state,status,timezone,untranslated_city,urlname,visibility,who
0,"{'id': 34, 'name': 'Tech', 'shortname': 'tech'...",New York,US,1484876702000,<p>Build with Code hosts free weekly JavaScrip...,,21993357,,open,"{'id': 464860413, 'highres_link': 'https://sec...",...,"{'id': 218119162, 'name': 'Jenny Mith', 'bio':...",,1.0,NY,active,US/Eastern,New York,Build-with-Code-New-York,public,Engineers
1,"{'id': 2, 'name': 'Career & Business', 'shortn...",New York,US,1550615516000,<p>The TechDay New York team invites you to jo...,,31207091,,open,"{'id': 480306005, 'highres_link': 'https://sec...",...,"{'id': 263284450, 'name': 'Ana ', 'bio': '', '...",,1.0,NY,active,US/Eastern,New York,TechDayHQ,public,Members
2,"{'id': 34, 'name': 'Tech', 'shortname': 'tech'...",New York,US,1047953152000,<p>The NYC NoSQL &amp; NewSQL Group <br> (form...,"{'id': 460182357, 'highres_link': 'https://sec...",107592,,open,"{'id': 466506912, 'highres_link': 'https://sec...",...,"{'id': 6618661, 'name': 'Eric David Benari', '...",,1.0,NY,active,US/Eastern,New York,mysqlnyc,public,Data Enthusiasts
3,"{'id': 23, 'name': 'Outdoors & Adventure', 'sh...",New York,US,1548684384000,<p><span>The Awesome Events Meetup Group is th...,,31031999,,open,"{'id': 480057227, 'highres_link': 'https://sec...",...,"{'id': 236287112, 'name': 'Justin', 'bio': '',...",,1.0,NY,active,US/Eastern,New York,awesome-events,public,Awesome People
4,"{'id': 34, 'name': 'Tech', 'shortname': 'tech'...",New York,US,1321563802000,<p><span>Data Driven NYC (organized by FirstMa...,"{'id': 442920809, 'highres_link': 'https://sec...",2829432,,approval,"{'id': 442991280, 'highres_link': 'https://sec...",...,"{'id': 2369792, 'name': 'Matt Turck', 'bio': '...",,1.0,NY,active,US/Eastern,New York,DataDrivenNYC,public,Members


In [204]:
# use `group_id` as the key name instead of the generic `id`
df_groups = df_groups.rename(columns={'id': 'group_id'})

In [205]:
# number of missing values in each column
df_groups.isnull().sum()

category                     8
city                         0
country                      0
created                      0
description                  0
group_photo               4533
group_id                     0
is_pro_hidden             8628
join_mode                    0
key_photo                 1484
lat                          0
link                         0
localized_country_name       0
localized_location           0
lon                          0
members                      0
meta_category              154
name                         0
next_event                5852
organizer                    0
pro_network               8318
score                        0
state                        0
status                       0
timezone                     0
untranslated_city            0
urlname                      0
visibility                   0
who                          0
dtype: int64

We'll deal with most of the missing values in this dataset by dropping columns we won't need:
- ```is_pro_hidden```, ```pro_network```, ```next_event```, ```key_photo```, ```group_photo```, ```timezone```, ```untranslated_city```, ```score```, ```country```, ```city```, ```meta_category``` (contains the same info as ```category```)


In [206]:
# remove the columns listed above; the trimmed frame is bound back to df_groups
df_groups = df_groups.drop(
    columns=[
        'is_pro_hidden',
        'pro_network',
        'next_event',
        'key_photo',
        'group_photo',
        'timezone',
        'untranslated_city',
        'score',
        'country',
        'city',
        'meta_category',
    ]
)

In [207]:
# confirm the unused columns are gone
df_groups.head()

Unnamed: 0,category,created,description,group_id,join_mode,lat,link,localized_country_name,localized_location,lon,members,name,organizer,state,status,urlname,visibility,who
0,"{'id': 34, 'name': 'Tech', 'shortname': 'tech'...",1484876702000,<p>Build with Code hosts free weekly JavaScrip...,21993357,open,40.75,https://www.meetup.com/Build-with-Code-New-York/,USA,"New York, NY",-73.99,8050,Build with Code - New York City,"{'id': 218119162, 'name': 'Jenny Mith', 'bio':...",NY,active,Build-with-Code-New-York,public,Engineers
1,"{'id': 2, 'name': 'Career & Business', 'shortn...",1550615516000,<p>The TechDay New York team invites you to jo...,31207091,open,40.75,https://www.meetup.com/TechDayHQ/,USA,"New York, NY",-73.99,1361,TechDay Meetup,"{'id': 263284450, 'name': 'Ana ', 'bio': '', '...",NY,active,TechDayHQ,public,Members
2,"{'id': 34, 'name': 'Tech', 'shortname': 'tech'...",1047953152000,<p>The NYC NoSQL &amp; NewSQL Group <br> (form...,107592,open,40.75,https://www.meetup.com/mysqlnyc/,USA,"New York, NY",-73.99,24226,"🔥 SQL NYC, The NoSQL & NewSQL Database Big Dat...","{'id': 6618661, 'name': 'Eric David Benari', '...",NY,active,mysqlnyc,public,Data Enthusiasts
3,"{'id': 23, 'name': 'Outdoors & Adventure', 'sh...",1548684384000,<p><span>The Awesome Events Meetup Group is th...,31031999,open,40.78,https://www.meetup.com/awesome-events/,USA,"New York, NY",-73.96,1694,Awesome Events,"{'id': 236287112, 'name': 'Justin', 'bio': '',...",NY,active,awesome-events,public,Awesome People
4,"{'id': 34, 'name': 'Tech', 'shortname': 'tech'...",1321563802000,<p><span>Data Driven NYC (organized by FirstMa...,2829432,approval,40.76,https://www.meetup.com/DataDrivenNYC/,USA,"New York, NY",-73.97,17382,Data Driven NYC (a FirstMark Event),"{'id': 2369792, 'name': 'Matt Turck', 'bio': '...",NY,active,DataDrivenNYC,public,Members


In [208]:
# run every group description through the shared clean_text helper
df_groups['description'] = df_groups['description'].apply(clean_text)

In [209]:
# check the cleaned descriptions
df_groups.head()

Unnamed: 0,category,created,description,group_id,join_mode,lat,link,localized_country_name,localized_location,lon,members,name,organizer,state,status,urlname,visibility,who
0,"{'id': 34, 'name': 'Tech', 'shortname': 'tech'...",1484876702000,Build with Code hosts free weekly JavaScript a...,21993357,open,40.75,https://www.meetup.com/Build-with-Code-New-York/,USA,"New York, NY",-73.99,8050,Build with Code - New York City,"{'id': 218119162, 'name': 'Jenny Mith', 'bio':...",NY,active,Build-with-Code-New-York,public,Engineers
1,"{'id': 2, 'name': 'Career & Business', 'shortn...",1550615516000,The TechDay New York team invites you to join ...,31207091,open,40.75,https://www.meetup.com/TechDayHQ/,USA,"New York, NY",-73.99,1361,TechDay Meetup,"{'id': 263284450, 'name': 'Ana ', 'bio': '', '...",NY,active,TechDayHQ,public,Members
2,"{'id': 34, 'name': 'Tech', 'shortname': 'tech'...",1047953152000,The NYC NoSQL NewSQL Group (formerly known a...,107592,open,40.75,https://www.meetup.com/mysqlnyc/,USA,"New York, NY",-73.99,24226,"🔥 SQL NYC, The NoSQL & NewSQL Database Big Dat...","{'id': 6618661, 'name': 'Eric David Benari', '...",NY,active,mysqlnyc,public,Data Enthusiasts
3,"{'id': 23, 'name': 'Outdoors & Adventure', 'sh...",1548684384000,The Awesome Events Meetup Group is the real-li...,31031999,open,40.78,https://www.meetup.com/awesome-events/,USA,"New York, NY",-73.96,1694,Awesome Events,"{'id': 236287112, 'name': 'Justin', 'bio': '',...",NY,active,awesome-events,public,Awesome People
4,"{'id': 34, 'name': 'Tech', 'shortname': 'tech'...",1321563802000,"Data Driven NYC (organized by FirstMark), is a...",2829432,approval,40.76,https://www.meetup.com/DataDrivenNYC/,USA,"New York, NY",-73.97,17382,Data Driven NYC (a FirstMark Event),"{'id': 2369792, 'name': 'Matt Turck', 'bio': '...",NY,active,DataDrivenNYC,public,Members


Let's look at the ```category``` column in more detail and extract just the information we want in the main ```df_groups``` dataframe.

In [210]:
# expand each category dict into its own columns (one column per key)
# NOTE(review): rows with a missing category presumably account for the all-NaN
# column labelled 0 in the output below -- confirm
df_category = df_groups['category'].apply(pd.Series)
df_category.head()

  index = _union_indexes(indexes, sort=sort)
  result = result.union(other)


Unnamed: 0,id,name,shortname,sort_name,0
0,34.0,Tech,tech,Tech,
1,2.0,Career & Business,career-business,Career & Business,
2,34.0,Tech,tech,Tech,
3,23.0,Outdoors & Adventure,outdoors-adventure,Outdoors & Adventure,
4,34.0,Tech,tech,Tech,


In [211]:
# percentage of missing values in each expanded category column
df_category.isnull().sum().div(len(df_category)).mul(100)

id             0.092678
name           0.092678
shortname      0.092678
sort_name      0.092678
0            100.000000
dtype: float64

In [212]:
# replace missing category shortnames with the placeholder 'None'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is not guaranteed to update
# df_category and is a no-op under pandas Copy-on-Write.
df_category['shortname'] = df_category['shortname'].fillna('None')

In [213]:
# carry the category shortname over as `category_name`, then discard the raw dict column
df_groups = (
    df_groups
    .assign(category_name=df_category['shortname'])
    .drop(columns=['category'])
)

In [214]:
# check that category_name replaced the category column
df_groups.head()

Unnamed: 0,created,description,group_id,join_mode,lat,link,localized_country_name,localized_location,lon,members,name,organizer,state,status,urlname,visibility,who,category_name
0,1484876702000,Build with Code hosts free weekly JavaScript a...,21993357,open,40.75,https://www.meetup.com/Build-with-Code-New-York/,USA,"New York, NY",-73.99,8050,Build with Code - New York City,"{'id': 218119162, 'name': 'Jenny Mith', 'bio':...",NY,active,Build-with-Code-New-York,public,Engineers,tech
1,1550615516000,The TechDay New York team invites you to join ...,31207091,open,40.75,https://www.meetup.com/TechDayHQ/,USA,"New York, NY",-73.99,1361,TechDay Meetup,"{'id': 263284450, 'name': 'Ana ', 'bio': '', '...",NY,active,TechDayHQ,public,Members,career-business
2,1047953152000,The NYC NoSQL NewSQL Group (formerly known a...,107592,open,40.75,https://www.meetup.com/mysqlnyc/,USA,"New York, NY",-73.99,24226,"🔥 SQL NYC, The NoSQL & NewSQL Database Big Dat...","{'id': 6618661, 'name': 'Eric David Benari', '...",NY,active,mysqlnyc,public,Data Enthusiasts,tech
3,1548684384000,The Awesome Events Meetup Group is the real-li...,31031999,open,40.78,https://www.meetup.com/awesome-events/,USA,"New York, NY",-73.96,1694,Awesome Events,"{'id': 236287112, 'name': 'Justin', 'bio': '',...",NY,active,awesome-events,public,Awesome People,outdoors-adventure
4,1321563802000,"Data Driven NYC (organized by FirstMark), is a...",2829432,approval,40.76,https://www.meetup.com/DataDrivenNYC/,USA,"New York, NY",-73.97,17382,Data Driven NYC (a FirstMark Event),"{'id': 2369792, 'name': 'Matt Turck', 'bio': '...",NY,active,DataDrivenNYC,public,Members,tech


Let's look at the ```organizer``` column in more detail and extract just the information we want in the main ```df_groups``` dataframe.

In [215]:
# expand each organizer dict into its own columns (one column per key)
df_org = df_groups['organizer'].apply(pd.Series)
df_org.head()

Unnamed: 0,id,name,bio,photo
0,218119162,Jenny Mith,,"{'id': 262996470, 'highres_link': 'https://sec..."
1,263284450,Ana,,"{'id': 281661741, 'highres_link': 'https://sec..."
2,6618661,Eric David Benari,,"{'id': 4946659, 'highres_link': 'https://secur..."
3,236287112,Justin,,"{'id': 284561488, 'highres_link': 'https://sec..."
4,2369792,Matt Turck,"Managing Director, FirstMark Capital","{'id': 266918773, 'highres_link': 'https://sec..."


In [216]:
# percentage of missing values in each expanded organizer column
df_org.isnull().sum().div(len(df_org)).mul(100)

id       0.000000
name     0.000000
bio      0.000000
photo    6.452734
dtype: float64

In [217]:
# keep only the organizer's id (as `organizer_id`) and discard the raw organizer dict column
df_groups = (
    df_groups
    .assign(organizer_id=df_org['id'])
    .drop(columns=['organizer'])
)

In [218]:
# check that organizer_id replaced the organizer column
df_groups.head()

Unnamed: 0,created,description,group_id,join_mode,lat,link,localized_country_name,localized_location,lon,members,name,state,status,urlname,visibility,who,category_name,organizer_id
0,1484876702000,Build with Code hosts free weekly JavaScript a...,21993357,open,40.75,https://www.meetup.com/Build-with-Code-New-York/,USA,"New York, NY",-73.99,8050,Build with Code - New York City,NY,active,Build-with-Code-New-York,public,Engineers,tech,218119162
1,1550615516000,The TechDay New York team invites you to join ...,31207091,open,40.75,https://www.meetup.com/TechDayHQ/,USA,"New York, NY",-73.99,1361,TechDay Meetup,NY,active,TechDayHQ,public,Members,career-business,263284450
2,1047953152000,The NYC NoSQL NewSQL Group (formerly known a...,107592,open,40.75,https://www.meetup.com/mysqlnyc/,USA,"New York, NY",-73.99,24226,"🔥 SQL NYC, The NoSQL & NewSQL Database Big Dat...",NY,active,mysqlnyc,public,Data Enthusiasts,tech,6618661
3,1548684384000,The Awesome Events Meetup Group is the real-li...,31031999,open,40.78,https://www.meetup.com/awesome-events/,USA,"New York, NY",-73.96,1694,Awesome Events,NY,active,awesome-events,public,Awesome People,outdoors-adventure,236287112
4,1321563802000,"Data Driven NYC (organized by FirstMark), is a...",2829432,approval,40.76,https://www.meetup.com/DataDrivenNYC/,USA,"New York, NY",-73.97,17382,Data Driven NYC (a FirstMark Event),NY,active,DataDrivenNYC,public,Members,tech,2369792


In [219]:
# columns remaining after the drops and extractions above
df_groups.columns

Index(['created', 'description', 'group_id', 'join_mode', 'lat', 'link',
       'localized_country_name', 'localized_location', 'lon', 'members',
       'name', 'state', 'status', 'urlname', 'visibility', 'who',
       'category_name', 'organizer_id'],
      dtype='object')

In [220]:
# group age in years, measured from the `created` timestamp (epoch milliseconds) to the
# reference instant 1556683200000 ms = 2019-05-01 04:00:00 UTC (midnight US/Eastern on May 1st)
df_groups['yrs_since_created'] = (
    df_groups['created']
    .rsub(1556683200000)  # milliseconds elapsed up to the reference instant
    .div(86400000)        # milliseconds -> days
    .div(365)             # days -> years (leap days ignored)
)

In [221]:
# human-readable UTC creation timestamp (MM/DD/YYYY HH:MM:SS) from the epoch-millisecond value
df_groups['created_date'] = df_groups['created'].map(
    lambda created_ms: time.strftime('%m/%d/%Y %H:%M:%S', time.gmtime(created_ms / 1000.))
)

In [222]:
# check the two derived columns: yrs_since_created and created_date
df_groups.head()

Unnamed: 0,created,description,group_id,join_mode,lat,link,localized_country_name,localized_location,lon,members,name,state,status,urlname,visibility,who,category_name,organizer_id,yrs_since_created,created_date
0,1484876702000,Build with Code hosts free weekly JavaScript a...,21993357,open,40.75,https://www.meetup.com/Build-with-Code-New-York/,USA,"New York, NY",-73.99,8050,Build with Code - New York City,NY,active,Build-with-Code-New-York,public,Engineers,tech,218119162,2.276969,01/20/2017 01:45:02
1,1550615516000,The TechDay New York team invites you to join ...,31207091,open,40.75,https://www.meetup.com/TechDayHQ/,USA,"New York, NY",-73.99,1361,TechDay Meetup,NY,active,TechDayHQ,public,Members,career-business,263284450,0.192405,02/19/2019 22:31:56
2,1047953152000,The NYC NoSQL NewSQL Group (formerly known a...,107592,open,40.75,https://www.meetup.com/mysqlnyc/,USA,"New York, NY",-73.99,24226,"🔥 SQL NYC, The NoSQL & NewSQL Database Big Dat...",NY,active,mysqlnyc,public,Data Enthusiasts,tech,6618661,16.131724,03/18/2003 02:05:52
3,1548684384000,The Awesome Events Meetup Group is the real-li...,31031999,open,40.78,https://www.meetup.com/awesome-events/,USA,"New York, NY",-73.96,1694,Awesome Events,NY,active,awesome-events,public,Awesome People,outdoors-adventure,236287112,0.253641,01/28/2019 14:06:24
4,1321563802000,"Data Driven NYC (organized by FirstMark), is a...",2829432,approval,40.76,https://www.meetup.com/DataDrivenNYC/,USA,"New York, NY",-73.97,17382,Data Driven NYC (a FirstMark Event),NY,active,DataDrivenNYC,public,Members,tech,2369792,7.455587,11/17/2011 21:03:22


In [223]:
# persist the cleaned groups dataframe (including the derived age/date columns) as a pickle
df_groups.to_pickle('df_all_groups_cleaned.pickle')

***
<a id='members'></a>
### 3. Meetup Members

Here we will merge two dataframes containing information on members. The first is information scraped from member profile pages and the other is member info obtained from the members API endpoint.


#### Scraped data

In [351]:
# load the scraped member profiles from their pickle file
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself
with open('member_profiles_16000.pkl', 'rb') as f:
    member_profiles = pickle.load(f)

In [352]:
# report how many scraped profiles were loaded
print(f"Scraped {len(member_profiles)} profiles")

Scraped 15990 profiles


In [353]:
# build a dataframe from the scraped profiles and preview the first rows
df_members = pd.DataFrame(member_profiles)
df_members.head()

Unnamed: 0,groups,interests,member_url
0,"[Closing Deals in 6 Inch Heels NYC, Entreprene...","[Professional Development, Professional Women,...",http://www.meetup.com/members/57678912
1,"[Ann Arbor Web Accessibility, Data Driven NYC ...","[Adventure, Language & Culture, Nightlife, Bac...",http://www.meetup.com/members/230923603
2,"[ArtForward, Central Park Sketching & Art Meet...","[Theater, Performing Arts, Walking, Writing, A...",http://www.meetup.com/members/24427602
3,"[#Resist: Danbury, Adult Day Camp, Black Nonbe...","[Museum, Cooking Dinner Parties, Wine, Healthy...",http://www.meetup.com/members/75979532
4,['NYC- Small Business and Entrepreneurs Networ...,"[Hip Hop, Wine, Business Strategy, Dining Out,...",http://www.meetup.com/members/279891863


In [354]:
# Record how many entries each member has in 'groups' and in 'interests'; the
# counts are used below to drop members lacking group or interest information.
for list_col, count_col in (('groups', 'num_groups'), ('interests', 'num_interests')):
    df_members[count_col] = df_members[list_col].map(len)

In [355]:
# Select the members that have neither groups nor interests, then remove those
# rows from df_members by index label.
has_no_groups = df_members['num_groups'].eq(0)
has_no_interests = df_members['num_interests'].eq(0)
missing_groups_ints = df_members.loc[has_no_groups & has_no_interests]
df_members.drop(index=missing_groups_ints.index, inplace=True)

In [356]:
# now let's also drop members missing either groups or interests (1,105 in total) so that we
# only work with users with full info
missing_groups = df_members[df_members['num_groups'] == 0]
missing_ints = df_members[df_members['num_interests'] == 0]

# Drop both selections in a single call. Two sequential drops raise KeyError when a
# row appears in both selections (its label is already gone by the second drop),
# which made this cell work only if such rows had been removed beforehand.
incomplete_idx = missing_groups.index.union(missing_ints.index)
df_members.drop(index = incomplete_idx, inplace = True)

In [357]:
# check how many members remain after dropping incomplete profiles
df_members.shape

(14879, 5)

In [358]:
# preview the updated dataframe (now includes the num_groups and num_interests counts)
df_members.head()

Unnamed: 0,groups,interests,member_url,num_groups,num_interests
0,"[Closing Deals in 6 Inch Heels NYC, Entreprene...","[Professional Development, Professional Women,...",http://www.meetup.com/members/57678912,7,4
1,"[Ann Arbor Web Accessibility, Data Driven NYC ...","[Adventure, Language & Culture, Nightlife, Bac...",http://www.meetup.com/members/230923603,8,23
2,"[ArtForward, Central Park Sketching & Art Meet...","[Theater, Performing Arts, Walking, Writing, A...",http://www.meetup.com/members/24427602,3,9
3,"[#Resist: Danbury, Adult Day Camp, Black Nonbe...","[Museum, Cooking Dinner Parties, Wine, Healthy...",http://www.meetup.com/members/75979532,12,51
4,['NYC- Small Business and Entrepreneurs Networ...,"[Hip Hop, Wine, Business Strategy, Dining Out,...",http://www.meetup.com/members/279891863,9,14


In [360]:
# save the cleaned scraped-profile dataframe to both json and pickle
# NOTE(review): the json name says '1600' while the input file was
# 'member_profiles_16000.pkl' -- possibly a typo; left unchanged in case other
# code reads this path
df_members.to_json("member_profiles_1600_cleaned.json")
df_members.to_pickle("df_scraped_profiles_cleaned.pickle")

#### API data

In [361]:
# load the pickled dataframe holding the member info obtained from the API
df_membersapi = pd.read_pickle('df_unique_members.pickle')

In [362]:
# size of the API member table (rows, columns)
df_membersapi.shape

(234609, 17)

In [363]:
# preview the API member data
df_membersapi.head()

Unnamed: 0,bio,city,country,hometown,id,joined,lat,link,lon,name,other_services,photo,self,state,status,topics,visited
0,,Bronx,us,,276413419,1552398000000.0,40.82,http://www.meetup.com/members/276413419,-73.92,Charisse,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'newtech', 'name': 'New Technology...",1552398000000.0
1,,New York,us,,245744462,1515612000000.0,40.75,http://www.meetup.com/members/245744462,-73.99,Ibrahima Diallo,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'newtech', 'name': 'New Technology...",1515612000000.0
2,,New York,us,,273936256,1549559000000.0,40.75,http://www.meetup.com/members/273936256,-73.99,Victoria Read,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,[],1549559000000.0
3,,New York,us,,258398074,1531030000000.0,40.75,http://www.meetup.com/members/258398074,-73.99,+V信feng4343注册得99链接186053.com,{},,{'common': {}},NY,active,[],1531030000000.0
4,,New York,us,,259737701,1552287000000.0,40.75,http://www.meetup.com/members/259737701,-73.99,¥en,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,[],1552287000000.0


In [364]:
# Align the API table's key with the scraped table: its 'link' column becomes
# 'member_url', the column the two dataframes are merged on.
merge_key_rename = {'link': 'member_url'}
df_membersapi.rename(columns=merge_key_rename, inplace=True)
df_membersapi.head()

Unnamed: 0,bio,city,country,hometown,id,joined,lat,member_url,lon,name,other_services,photo,self,state,status,topics,visited
0,,Bronx,us,,276413419,1552398000000.0,40.82,http://www.meetup.com/members/276413419,-73.92,Charisse,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'newtech', 'name': 'New Technology...",1552398000000.0
1,,New York,us,,245744462,1515612000000.0,40.75,http://www.meetup.com/members/245744462,-73.99,Ibrahima Diallo,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'newtech', 'name': 'New Technology...",1515612000000.0
2,,New York,us,,273936256,1549559000000.0,40.75,http://www.meetup.com/members/273936256,-73.99,Victoria Read,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,[],1549559000000.0
3,,New York,us,,258398074,1531030000000.0,40.75,http://www.meetup.com/members/258398074,-73.99,+V信feng4343注册得99链接186053.com,{},,{'common': {}},NY,active,[],1531030000000.0
4,,New York,us,,259737701,1552287000000.0,40.75,http://www.meetup.com/members/259737701,-73.99,¥en,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,[],1552287000000.0


#### Merged data

In [365]:
# Attach the API columns to every scraped member: a left join keyed on
# member_url, with the scraped dataframe on the left.
full_df_members = df_members.merge(df_membersapi, how='left', on='member_url')

In [366]:
# check the merged size; a left merge keeps every row of df_members
full_df_members.shape

(14879, 21)

In [367]:
# preview the merged dataframe (scraped columns followed by the API columns)
full_df_members.head()

Unnamed: 0,groups,interests,member_url,num_groups,num_interests,bio,city,country,hometown,id,...,lat,lon,name,other_services,photo,self,state,status,topics,visited
0,"[Closing Deals in 6 Inch Heels NYC, Entreprene...","[Professional Development, Professional Women,...",http://www.meetup.com/members/57678912,7,4,,Secaucus,us,secaucus,57678912,...,40.79,-74.06,Dee,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NJ,active,"[{'urlkey': 'business-referral-networking', 'n...",1466428000000.0
1,"[Ann Arbor Web Accessibility, Data Driven NYC ...","[Adventure, Language & Culture, Nightlife, Bac...",http://www.meetup.com/members/230923603,8,23,,New York,us,"St. Gallen, Switzerland",230923603,...,40.72,-73.98,Alistair Barrell,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'foodie', 'name': 'Foodie', 'id': ...",1554763000000.0
2,"[ArtForward, Central Park Sketching & Art Meet...","[Theater, Performing Arts, Walking, Writing, A...",http://www.meetup.com/members/24427602,3,9,,New York,us,,24427602,...,40.72,-74.0,Beth Barber,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'visual-studio', 'name': 'Visual S...",1447760000000.0
3,"[#Resist: Danbury, Adult Day Camp, Black Nonbe...","[Museum, Cooking Dinner Parties, Wine, Healthy...",http://www.meetup.com/members/75979532,12,51,,New Haven,us,New Haven,75979532,...,41.33,-72.97,Kathy,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},CT,active,"[{'urlkey': 'coffee', 'name': 'Coffee', 'id': ...",1514860000000.0
4,['NYC- Small Business and Entrepreneurs Networ...,"[Hip Hop, Wine, Business Strategy, Dining Out,...",http://www.meetup.com/members/279891863,9,14,,West Hempstead,us,,279891863,...,40.69,-73.65,Karen White Kelly,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'hiphop', 'name': 'Hip Hop', 'id':...",1556335000000.0


In [368]:
# list all columns of the merged dataframe to decide which ones to keep
full_df_members.columns

Index(['groups', 'interests', 'member_url', 'num_groups', 'num_interests',
       'bio', 'city', 'country', 'hometown', 'id', 'joined', 'lat', 'lon',
       'name', 'other_services', 'photo', 'self', 'state', 'status', 'topics',
       'visited'],
      dtype='object')

In [370]:
# check what the 'self' column contains; if every row holds the same empty
# value it carries no information and can be dropped
full_df_members.self.value_counts()

{'common': {}}    14879
Name: self, dtype: int64

In [None]:
# drop the uninformative 'self' column
# NOTE(review): this cell has no execution count and the saved outputs further
# down still list a 'self' column, so it appears not to have been run when the
# notebook was last saved -- re-run the notebook top to bottom to apply it
full_df_members.drop(columns = ['self'], inplace = True)

In [371]:
# the other_services column holds a dict of the member's linked social media
# accounts (e.g. twitter, linkedin); inspect which values occur
full_df_members.other_services.value_counts()

{}                                                                                                                                                                                                                                                                                               13778
{'twitter': {'identifier': 'http://'}}                                                                                                                                                                                                                                                               3
{'twitter': {'identifier': '@redvioletdar'}}                                                                                                                                                                                                                                                         1
{'twitter': {'identifier': '@HarlemFund'}, 'linkedin': {'identifier': 'http://www.linkedin.com/in/thomas-lopez-pier

In [380]:
# Number of connected social media accounts, i.e. the number of entries in
# each member's other_services value.
full_df_members['num_sm_accounts'] = full_df_members['other_services'].map(len)

In [381]:
# preview the dataframe with the new num_sm_accounts column
full_df_members.head()

Unnamed: 0,groups,interests,member_url,num_groups,num_interests,bio,city,country,hometown,id,...,lon,name,other_services,photo,self,state,status,topics,visited,num_sm_accounts
0,"[Closing Deals in 6 Inch Heels NYC, Entreprene...","[Professional Development, Professional Women,...",http://www.meetup.com/members/57678912,7,4,,Secaucus,us,secaucus,57678912,...,-74.06,Dee,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NJ,active,"[{'urlkey': 'business-referral-networking', 'n...",1466428000000.0,0
1,"[Ann Arbor Web Accessibility, Data Driven NYC ...","[Adventure, Language & Culture, Nightlife, Bac...",http://www.meetup.com/members/230923603,8,23,,New York,us,"St. Gallen, Switzerland",230923603,...,-73.98,Alistair Barrell,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'foodie', 'name': 'Foodie', 'id': ...",1554763000000.0,0
2,"[ArtForward, Central Park Sketching & Art Meet...","[Theater, Performing Arts, Walking, Writing, A...",http://www.meetup.com/members/24427602,3,9,,New York,us,,24427602,...,-74.0,Beth Barber,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'visual-studio', 'name': 'Visual S...",1447760000000.0,0
3,"[#Resist: Danbury, Adult Day Camp, Black Nonbe...","[Museum, Cooking Dinner Parties, Wine, Healthy...",http://www.meetup.com/members/75979532,12,51,,New Haven,us,New Haven,75979532,...,-72.97,Kathy,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},CT,active,"[{'urlkey': 'coffee', 'name': 'Coffee', 'id': ...",1514860000000.0,0
4,['NYC- Small Business and Entrepreneurs Networ...,"[Hip Hop, Wine, Business Strategy, Dining Out,...",http://www.meetup.com/members/279891863,9,14,,West Hempstead,us,,279891863,...,-73.65,Karen White Kelly,{},{'highres_link': 'https://secure.meetupstatic....,{'common': {}},NY,active,"[{'urlkey': 'hiphop', 'name': 'Hip Hop', 'id':...",1556335000000.0,0


In [382]:
# drop the raw other_services column now that num_sm_accounts summarises it
full_df_members.drop(columns = ['other_services'], inplace =True)

In [396]:
# Percentage of missing values in each column.
# NOTE: the saved output of this cell already lists has_photo, which is only
# created further down, so it was produced after the cells below were run.
missing_counts = full_df_members.isna().sum()
missing_counts.div(len(full_df_members)).mul(100)

groups             0.0
interests          0.0
member_url         0.0
num_groups         0.0
num_interests      0.0
bio                0.0
city               0.0
country            0.0
hometown           0.0
id                 0.0
joined             0.0
lat                0.0
lon                0.0
name               0.0
self               0.0
state              0.0
status             0.0
topics             0.0
visited            0.0
num_sm_accounts    0.0
has_photo          0.0
dtype: float64

In [395]:
# fill in the state, bio, and hometown NaN values with the string 'None'
# The filled columns are assigned back to the dataframe. Calling
# fillna(inplace=True) on an attribute-accessed column is chained assignment:
# newer pandas versions warn about it and, with Copy-on-Write enabled, it no
# longer modifies full_df_members at all.
na_fill_cols = ['state', 'bio', 'hometown']
full_df_members[na_fill_cols] = full_df_members[na_fill_cols].fillna('None')

In [390]:
# create a new column indicating whether member has a photo (1) or not (0) to replace the 'photo' column
# notna() treats both None and NaN as "no photo". The previous `x == None` test
# only matched None, so a NaN entry (pandas' usual missing-value marker) would
# have been counted as having a photo.
full_df_members['has_photo'] = full_df_members['photo'].notna().astype(int)

In [392]:
# drop the raw 'photo' column; the has_photo flag replaces it
full_df_members.drop(columns = ['photo'], inplace = True)

In [393]:
# preview the dataframe with has_photo in place of the photo column
full_df_members.head()

Unnamed: 0,groups,interests,member_url,num_groups,num_interests,bio,city,country,hometown,id,...,lat,lon,name,self,state,status,topics,visited,num_sm_accounts,has_photo
0,"[Closing Deals in 6 Inch Heels NYC, Entreprene...","[Professional Development, Professional Women,...",http://www.meetup.com/members/57678912,7,4,,Secaucus,us,secaucus,57678912,...,40.79,-74.06,Dee,{'common': {}},NJ,active,"[{'urlkey': 'business-referral-networking', 'n...",1466428000000.0,0,1
1,"[Ann Arbor Web Accessibility, Data Driven NYC ...","[Adventure, Language & Culture, Nightlife, Bac...",http://www.meetup.com/members/230923603,8,23,,New York,us,"St. Gallen, Switzerland",230923603,...,40.72,-73.98,Alistair Barrell,{'common': {}},NY,active,"[{'urlkey': 'foodie', 'name': 'Foodie', 'id': ...",1554763000000.0,0,1
2,"[ArtForward, Central Park Sketching & Art Meet...","[Theater, Performing Arts, Walking, Writing, A...",http://www.meetup.com/members/24427602,3,9,,New York,us,,24427602,...,40.72,-74.0,Beth Barber,{'common': {}},NY,active,"[{'urlkey': 'visual-studio', 'name': 'Visual S...",1447760000000.0,0,1
3,"[#Resist: Danbury, Adult Day Camp, Black Nonbe...","[Museum, Cooking Dinner Parties, Wine, Healthy...",http://www.meetup.com/members/75979532,12,51,,New Haven,us,New Haven,75979532,...,41.33,-72.97,Kathy,{'common': {}},CT,active,"[{'urlkey': 'coffee', 'name': 'Coffee', 'id': ...",1514860000000.0,0,1
4,['NYC- Small Business and Entrepreneurs Networ...,"[Hip Hop, Wine, Business Strategy, Dining Out,...",http://www.meetup.com/members/279891863,9,14,,West Hempstead,us,,279891863,...,40.69,-73.65,Karen White Kelly,{'common': {}},NY,active,"[{'urlkey': 'hiphop', 'name': 'Hip Hop', 'id':...",1556335000000.0,0,1


In [405]:
# can drop the 'topics' column as it contains the same info as 'interests'
full_df_members.drop(columns= ['topics'], inplace = True)

In [406]:
# final preview of the cleaned, merged member dataframe
full_df_members.head()

Unnamed: 0,groups,interests,member_url,num_groups,num_interests,bio,city,country,hometown,id,joined,lat,lon,name,self,state,status,visited,num_sm_accounts,has_photo
0,"[Closing Deals in 6 Inch Heels NYC, Entreprene...","[Professional Development, Professional Women,...",http://www.meetup.com/members/57678912,7,4,,Secaucus,us,secaucus,57678912,1459463000000.0,40.79,-74.06,Dee,{'common': {}},NJ,active,1466428000000.0,0,1
1,"[Ann Arbor Web Accessibility, Data Driven NYC ...","[Adventure, Language & Culture, Nightlife, Bac...",http://www.meetup.com/members/230923603,8,23,,New York,us,"St. Gallen, Switzerland",230923603,1537537000000.0,40.72,-73.98,Alistair Barrell,{'common': {}},NY,active,1554763000000.0,0,1
2,"[ArtForward, Central Park Sketching & Art Meet...","[Theater, Performing Arts, Walking, Writing, A...",http://www.meetup.com/members/24427602,3,9,,New York,us,,24427602,1436840000000.0,40.72,-74.0,Beth Barber,{'common': {}},NY,active,1447760000000.0,0,1
3,"[#Resist: Danbury, Adult Day Camp, Black Nonbe...","[Museum, Cooking Dinner Parties, Wine, Healthy...",http://www.meetup.com/members/75979532,12,51,,New Haven,us,New Haven,75979532,1468890000000.0,41.33,-72.97,Kathy,{'common': {}},CT,active,1514860000000.0,0,1
4,['NYC- Small Business and Entrepreneurs Networ...,"[Hip Hop, Wine, Business Strategy, Dining Out,...",http://www.meetup.com/members/279891863,9,14,,West Hempstead,us,,279891863,1556335000000.0,40.69,-73.65,Karen White Kelly,{'common': {}},NY,active,1556335000000.0,0,1


In [407]:
# save the cleaned, merged member dataframe
# NOTE(review): the preview output just above still lists a 'self' column --
# confirm the cell that drops 'self' was run before relying on this pickle
full_df_members.to_pickle("full_df_members.pickle")