## Python Mini-Project: Data Clean-Up, Pt. 1

Pay close attention to the prompts to help guide you through this task.

In [234]:
# Dependencies
import pandas as pd

In [235]:
# Load the raw survey responses from the CSV export into a DataFrame
closedfile = "Resources/2016-FCC-New-Coders-Survey-Data.csv"
df = pd.read_csv(filepath_or_buffer=closedfile)

In [236]:
# Preview the first five rows of the frame loaded in the previous cell
df.head(5)

Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampMonthsAgo,BootcampName,BootcampPostSalary,BootcampRecommend,ChildrenNumber,...,ResourceSoloLearn,ResourceStackOverflow,ResourceTreehouse,ResourceUdacity,ResourceUdemy,ResourceW3Schools,ResourceYouTube,SchoolDegree,SchoolMajor,StudentDebtOwe
0,28.0,0.0,,,,,,,,,...,,,,,,,,"some college credit, no degree",,20000.0
1,22.0,0.0,,,,,,,,,...,,,,,1.0,,,"some college credit, no degree",,
2,19.0,0.0,,,,,,,,,...,,,,,,,,high school diploma or equivalent (GED),,
3,26.0,0.0,,,,,,,,,...,,,,,,,,bachelor's degree,Cinematography And Film,7000.0
4,20.0,0.0,,,,,,,,,...,,,,,,,,"some college credit, no degree",,


In [237]:
# Inspect all columns
# (pandas abbreviates the repr of a long Index with "...", so only the first and last names are displayed)
df.columns

Index(['Age', 'AttendedBootcamp', 'BootcampFinish', 'BootcampFullJobAfter',
       'BootcampLoanYesNo', 'BootcampMonthsAgo', 'BootcampName',
       'BootcampPostSalary', 'BootcampRecommend', 'ChildrenNumber',
       ...
       'ResourceSoloLearn', 'ResourceStackOverflow', 'ResourceTreehouse',
       'ResourceUdacity', 'ResourceUdemy', 'ResourceW3Schools',
       'ResourceYouTube', 'SchoolDegree', 'SchoolMajor', 'StudentDebtOwe'],
      dtype='object', length=113)

In [238]:
# Keep only the 20 survey columns needed for this analysis, selected by position:
# 0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 29, 30, 32, 36, 37, 45, 48, 56, 110, 111
# iloc takes (rows, columns); the bare `:` keeps every row.
# NOTE: this cell rebinds `df`, so it cannot be re-run on its own -- positions such as 110
# no longer exist once the frame has been trimmed. Re-run the load cell first.
keep_positions = [
    0, 1, 2, 3, 4, 7, 8, 9, 10, 11,
    29, 30, 32, 36, 37, 45, 48, 56, 110, 111,
]
df = df.iloc[:, keep_positions]

df.columns

Index(['Age', 'AttendedBootcamp', 'BootcampFinish', 'BootcampFullJobAfter',
       'BootcampLoanYesNo', 'BootcampPostSalary', 'BootcampRecommend',
       'ChildrenNumber', 'CityPopulation', 'CodeEventBootcamp', 'CountryLive',
       'EmploymentField', 'EmploymentStatus', 'Gender', 'HasChildren',
       'HoursLearning', 'Income', 'JobRoleInterest', 'SchoolDegree',
       'SchoolMajor'],
      dtype='object')

In [239]:
# Recode the 0/1 answers as "No"/"Yes" using replace
# NOTE(review): only AttendedBootcamp is recoded here; the other 0/1 response columns
# keep their numeric values -- confirm whether they should be recoded as well.
yes_no_labels = {0: 'No', 1: 'Yes'}
df = df.assign(AttendedBootcamp=df['AttendedBootcamp'].replace(yes_no_labels))

df.head()

Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampPostSalary,BootcampRecommend,ChildrenNumber,CityPopulation,CodeEventBootcamp,CountryLive,EmploymentField,EmploymentStatus,Gender,HasChildren,HoursLearning,Income,JobRoleInterest,SchoolDegree,SchoolMajor
0,28.0,No,,,,,,,"between 100,000 and 1 million",,United States of America,office and administrative support,Employed for wages,male,0.0,30.0,32000.0,,"some college credit, no degree",
1,22.0,No,,,,,,,"between 100,000 and 1 million",,United States of America,food and beverage,Employed for wages,male,,30.0,15000.0,Front-End Web Developer,"some college credit, no degree",
2,19.0,No,,,,,,,more than 1 million,,United States of America,finance,Employed for wages,male,,20.0,48000.0,,high school diploma or equivalent (GED),
3,26.0,No,,,,,,,more than 1 million,,United States of America,"arts, entertainment, sports, or media",Employed for wages,female,,20.0,43000.0,Front-End Web Developer,bachelor's degree,Cinematography And Film
4,20.0,No,,,,,,,"between 100,000 and 1 million",,United States of America,education,Employed for wages,female,,25.0,6000.0,Full-Stack Web Developer,"some college credit, no degree",


In [240]:
# Total number of survey respondents = number of rows in the frame
respondents = df.shape[0]
respondents

15620

In [241]:
# Keep only the respondents who attended a bootcamp:
# build a boolean mask on AttendedBootcamp, then select the matching rows with df.loc
attended_mask = df['AttendedBootcamp'].eq('Yes')
bootcamp_attendee_df = df.loc[attended_mask]

bootcamp_attendee_df.head()

Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampPostSalary,BootcampRecommend,ChildrenNumber,CityPopulation,CodeEventBootcamp,CountryLive,EmploymentField,EmploymentStatus,Gender,HasChildren,HoursLearning,Income,JobRoleInterest,SchoolDegree,SchoolMajor
93,32.0,Yes,1.0,0.0,0.0,,0.0,,"between 100,000 and 1 million",,United States of America,"arts, entertainment, sports, or media",Self-employed business owner,male,,20.0,67000.0,,bachelor's degree,Biology
97,26.0,Yes,1.0,1.0,0.0,45000.0,0.0,,more than 1 million,,United States of America,software development,Employed for wages,male,0.0,10.0,40000.0,,master's degree (non-professional),Music
130,41.0,Yes,1.0,1.0,1.0,75000.0,1.0,3.0,"less than 100,000",,United States of America,software development,Employed for wages,male,1.0,30.0,75000.0,,"some college credit, no degree",
159,26.0,Yes,1.0,0.0,0.0,,0.0,,"between 100,000 and 1 million",,United States of America,,Not working and not looking for work,female,,30.0,,Full-Stack Web Developer,"some college credit, no degree",
188,24.0,Yes,0.0,,1.0,,0.0,,"between 100,000 and 1 million",,Canada,,Not working but looking for work,female,,60.0,,,"some college credit, no degree",


In [242]:
# Average age of bootcamp attendees: pull out the `Age` series, then take its mean
attendee_ages = bootcamp_attendee_df['Age']
attendee_age = attendee_ages.mean()
attendee_age

31.066014669926652

In [243]:
# Number of people who attended a bootcamp:
# count the non-missing entries of the AttendedBootcamp column in the attendee subset
attended_answers = bootcamp_attendee_df['AttendedBootcamp']
attendee_count = attended_answers.notna().sum()
attendee_count

953

In [244]:
# Tally attendees by highest level of schooling
# `value_counts()` on the SchoolDegree column gives one count per distinct answer
degree_answers = bootcamp_attendee_df['SchoolDegree']
attendee_deg = degree_answers.value_counts()
attendee_deg

bachelor's degree                           462
some college credit, no degree              116
master's degree (non-professional)           96
professional degree (MBA, MD, JD, etc.)      39
high school diploma or equivalent (GED)      38
associate's degree                           32
trade, technical, or vocational training     24
some high school                             10
Ph.D.                                         8
no high school (secondary school)             7
Name: SchoolDegree, dtype: int64

In [245]:
# Count the number of records where the person is a degree holder
# There are several ways to approach this. You can look for people who have degrees
# or for people who don't have degrees depending on the value of the SchoolDegree column




In [246]:
# Number of attendees per self-identified gender (male, female, and non-binary identifications)
gender_answers = bootcamp_attendee_df['Gender']
attendee_gender = gender_answers.value_counts()
attendee_gender

male           496
female         326
genderqueer      6
trans            3
agender          2
Name: Gender, dtype: int64

In [247]:
# Share of all respondents who attended a bootcamp, expressed as a percentage
attendee_rows = bootcamp_attendee_df.shape[0]
total_rows = df.shape[0]
bootcamp_pct = (attendee_rows / total_rows) * 100

bootcamp_pct

6.101152368758003

In [248]:
# Calculate percentage of respondents belonging to each gender

# Number of respondents per self-reported gender, indexed by the gender label.
res_gender = df['Gender'].value_counts()

# Look each count up by its LABEL rather than by position: integer keys on a
# label-indexed Series rely on a deprecated positional fallback, and positions
# depend on the sort order of the counts, which would silently mislabel the
# categories if the data changed. A label that never occurs counts as 0.
male = res_gender.get('male', 0)
female = res_gender.get('female', 0)
genderqueer = res_gender.get('genderqueer', 0)
agender = res_gender.get('agender', 0)
trans = res_gender.get('trans', 0)

# Percentages are relative to ALL respondents (rows of df), not only to those
# who answered the gender question.
male_pct = (male / respondents) * 100
female_pct = (female / respondents) * 100
genderqueer_pct = (genderqueer / respondents) * 100
agender_pct = (agender / respondents) * 100
trans_pct = (trans / respondents) * 100

print(respondents)
print(res_gender)
for pct in (male_pct, female_pct, genderqueer_pct, agender_pct, trans_pct):
    print(pct)


15620
male           10766
female          2840
genderqueer       66
agender           38
trans             36
Name: Gender, dtype: int64
68.92445582586427
18.181818181818183
0.42253521126760557
0.24327784891165172
0.23047375160051217


In [249]:
# Calculate percentage of attendees with a college degree



In [250]:
# Average salary reported after finishing a bootcamp (mean ignores missing values)
# NOTE(review): the mean is taken over every respondent's row in `df`, not over
# `bootcamp_attendee_df`, and despite the `_df` suffix the result is a single number.
post_salaries = df['BootcampPostSalary']
camp_salary_df = post_salaries.mean()
camp_salary_df

63740.50606060606

In [251]:
# Create a new table consolidating above calculations



In [252]:
# Improve formatting before outputting spreadsheet



In [2]:
# Export to Excel
# Use df.to_excel to export to Excel. Don't include the index.

