In [293]:
# Members who visited the gym most often earned an average income of $80,300.
# Fifty percent of members surveyed said that they stayed with their current gym based on the convenience of its location, while 38 percent agreed that the equipment was worth sticking around for.
# Approximately 44 percent of gym-goers are exercising with one other person, according to the report.
# In fact, the number of members who use trainers is just 12.5 percent.
# 

In [294]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [295]:
affiliates = pd.read_csv('affiliates.csv')
cfprices = pd.read_csv('cfprices.csv')
income = pd.read_csv('data.csv')
abbv = pd.read_csv('statesabv.csv')

Affiliates has only two columns but the 'LOCATION' column needs to be separated to create 'city' and 'state'. 

In [296]:
affiliates['city'] = affiliates['LOCATION'].apply(lambda x: x.split(',')[0])
affiliates['state'] = affiliates['LOCATION'].apply(lambda x: x.split(',')[1])
affiliates.drop(columns='LOCATION', inplace=True)

The columns in the income dataframe must be adjusted and renamed. This will allow a merge and standaridized names. 

In [297]:
income.rename(columns={'State': 'state', 'Population': 'population', 'HouseholdIncome': 'income'}, inplace=True)
# standardizing the capitalization of the dataframes below to allow a merge
income['state'] = income['state'].apply(lambda x: x.lower())
abbv['state'] = abbv['state'].apply(lambda x: x.lower())
abbv['state_id'] = abbv['state'].index
# merged the two and dropped the unnecessary columns
income = income.merge(abbv, on ='state', how='left').drop(columns=['rank', 'state'])
# renaming the new abbreviated state name column as 'state' and dropping all NaN values were DC
income.rename(columns={'abbv': 'state'}, inplace=True)
# update the missing value for District of Columbia as DC
income.iloc[0, 2] = 'DC'


In [298]:
# make all column names lower case
cfprices.columns = map(str.lower, cfprices.columns)
cfprices = cfprices.drop(columns=['currency', 'email', 'box name'])
workdf = cfprices.merge(income, how='left', on='state')

The cfprices dataset has a significant amount of data we do not need and also

In [299]:
# only interested in the US population
workdf = workdf[workdf['country']=='United States']
#  
workdf['price'] = workdf['price'].apply(lambda x: x.replace('$',"").strip())
workdf['price'] = workdf["price"].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x])).astype('float64')
# 
workdf = workdf[pd.notna(workdf['population'])]
workdf = workdf[(workdf['type']!='Other') & (workdf['type']!='Weightlifting') & (workdf['type']!='Open Gym')]

In [300]:
workdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 937 entries, 0 to 1387
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   city        934 non-null    object 
 1   state       937 non-null    object 
 2   price       937 non-null    float64
 3   type        937 non-null    object 
 4   country     937 non-null    object 
 5   population  937 non-null    float64
 6   income      937 non-null    float64
 7   state_id    936 non-null    float64
dtypes: float64(4), object(4)
memory usage: 65.9+ KB


In [301]:
# standardizes the values for the type
def change_type(x):
    if x == 'Unlimited':
        return 'Unlimited'
    elif x == '3 days/week':
        return '3xpw'
    elif x == 'unlimited':
        return 'Unlimited'
    elif x == '4 days/week':
        return '4xpw'
    elif x == '2 days/week':
        return '2xpw'
    elif x == '5 days/week':
        return '5xpw'
    elif x == 'Unlimited Couple':
        return 'Unlimited'
    elif x == 'Unlimited Family':
        return 'Unlimited'
    elif x == '3x per week':
        return '3xpw'
    elif x == 'Unlimited (grandfathered in at this price)':
        return 'Unlimited - Exception'
    elif x == 'Punch Card (10 classes)':
        return '10 Classes'
    elif x == '3x a week':
        return '3xpw'

workdf['type'] = workdf['type'].apply(lambda x: change_type(x))


In [302]:
# create a numerical id for the different types
type_ids = workdf['type'].value_counts().reset_index()
type_ids.drop(columns='type', inplace=True)
type_ids.rename(columns=({'index':'type'}), inplace=True)
type_ids['member_type_id'] = type_ids.index


In [303]:

workdf = workdf.merge(type_ids, how='left', on='type')


In [304]:
# yearly cost of a membership
workdf['yearly_cost'] = workdf['price'].apply(lambda x: x*12)
# percent of income
workdf['income_percent'] = round((workdf['yearly_cost']/workdf['income'])*100,2)
# if gym membership is below 2.75% the person will be likely to join given an average salary state; 0 = No; 1 = Yes
workdf['join_probability'] = workdf['income_percent'].apply(lambda x: 1 if x <= 2.75 else 0)
workdf = workdf[pd.notna(workdf['state_id'])]
workdf = workdf[pd.notna(workdf['member_type_id'])]


In [305]:
# creating our data selection
X = np.array(workdf[['state_id', 'member_type_id','price','income']])
y = np.array(workdf['join_probability'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(X_train, y_train)
clf.predict(X_test)
print('Train Score (n=3):', clf.score(X_train, y_train))
print('Test Score(n=3):', clf.score(X_test, y_test))

Train Score (n=3): 0.9771754636233951
Test Score(n=3): 0.9358974358974359


In [312]:

workdf[['state_id','type', 'member_type_id', 'income']]

Unnamed: 0,state_id,type,member_type_id,income
0,41.0,Unlimited,0.0,50972.0
1,20.0,Unlimited,0.0,77378.0
2,5.0,Unlimited,0.0,68811.0
3,29.0,3xpw,1.0,79363.0
4,20.0,Unlimited,0.0,77378.0
...,...,...,...,...
932,4.0,Unlimited,0.0,71228.0
933,42.0,Unlimited,0.0,59570.0
934,4.0,Unlimited,0.0,71228.0
935,31.0,Unlimited,0.0,65323.0


In [328]:
# person lives in Florida (8.0), attends 3x per week, price is 180 for membership, and their income is 75000
test = [np.array([8.0, 1.0, 180.0, 150000 ])]
if clf.predict((test)) == [0]:
    print('Will not join')
else:
    print('Will join!')

Will join!


<br>Training Scores for differnt values of n on the classifier method
<br>
<br>Overfitting
<br>Train Score (n=1): 1.0
<br>Test Score(n=1): 0.9743589743589743
<br>
<br>Chosen
<br>Train Score (n=2): 0.9757489300998573
<br>Test Score(n=2): 0.9487179487179487
<br>
<br>Second Best
<br>Train Score (n=3): 0.9771754636233951
<br>Test Score(n=3): 0.9358974358974359
<br>
<br>Underfitting
<br>Train Score (n=5): 0.9529243937232525
<br>Test Score(n=5): 0.9188034188034188
<br>
<br>Underfitting
<br>Train Score (n=7): 0.948644793152639
<br>Test Score(n=7): 0.9188034188034188

# Add column which says the average age of each state use below to classify state age
# add column which adds obesity rate 
https://rallyfitness.com/blogs/news/16063884-latest-crossfit-market-research-data
Under 18: 18%

18 - 24   :  6%

25 - 34   : 40%

35 - 44   : 20%

45 - 54   :  8%

55 - 64   :  2%

65+        :  5%