## Python Mini-Project: Data Clean-Up, Pt. 2

Pay close attention to the prompts to help guide you through this task.

In [1]:
# Dependencies
import pandas as pd
import numpy as np

In [2]:
# load CSV
csv_path = "Resources/2016-FCC-New-Coders-Survey-Data.csv"

In [3]:
# Read with pandas--low_memory required to suppress errors about mixed data types
# You may need to encode this file
coders_df = pd.read_csv(csv_path, low_memory = False)
coders_df.head()

Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampMonthsAgo,BootcampName,BootcampPostSalary,BootcampRecommend,ChildrenNumber,...,ResourceSoloLearn,ResourceStackOverflow,ResourceTreehouse,ResourceUdacity,ResourceUdemy,ResourceW3Schools,ResourceYouTube,SchoolDegree,SchoolMajor,StudentDebtOwe
0,28.0,0.0,,,,,,,,,...,,,,,,,,"some college credit, no degree",,20000.0
1,22.0,0.0,,,,,,,,,...,,,,,1.0,,,"some college credit, no degree",,
2,19.0,0.0,,,,,,,,,...,,,,,,,,high school diploma or equivalent (GED),,
3,26.0,0.0,,,,,,,,,...,,,,,,,,bachelor's degree,Cinematography And Film,7000.0
4,20.0,0.0,,,,,,,,,...,,,,,,,,"some college credit, no degree",,


In [4]:
# Take only columns 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 29, 30, 32, 36, 37, 45, 48, 56, 110, 111
# Consider using iloc
coders_short_df = coders_df.iloc[:, [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 29, 30, 32, 36, 37, 45, 48, 56, 110, 111]]
coders_short_df.head()


Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampName,BootcampPostSalary,BootcampRecommend,ChildrenNumber,CityPopulation,...,CountryLive,EmploymentField,EmploymentStatus,Gender,HasChildren,HoursLearning,Income,JobRoleInterest,SchoolDegree,SchoolMajor
0,28.0,0.0,,,,,,,,"between 100,000 and 1 million",...,United States of America,office and administrative support,Employed for wages,male,0.0,30.0,32000.0,,"some college credit, no degree",
1,22.0,0.0,,,,,,,,"between 100,000 and 1 million",...,United States of America,food and beverage,Employed for wages,male,,30.0,15000.0,Front-End Web Developer,"some college credit, no degree",
2,19.0,0.0,,,,,,,,more than 1 million,...,United States of America,finance,Employed for wages,male,,20.0,48000.0,,high school diploma or equivalent (GED),
3,26.0,0.0,,,,,,,,more than 1 million,...,United States of America,"arts, entertainment, sports, or media",Employed for wages,female,,20.0,43000.0,Front-End Web Developer,bachelor's degree,Cinematography And Film
4,20.0,0.0,,,,,,,,"between 100,000 and 1 million",...,United States of America,education,Employed for wages,female,,25.0,6000.0,Full-Stack Web Developer,"some college credit, no degree",


In [5]:
# Change reading 0 and 1 to No and Yes, respectively
coders_short_new_df = coders_short_df.replace({0.0: "No", 1.0: "Yes", "NA": np.nan})
coders_short_new_df.head()


Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampName,BootcampPostSalary,BootcampRecommend,ChildrenNumber,CityPopulation,...,CountryLive,EmploymentField,EmploymentStatus,Gender,HasChildren,HoursLearning,Income,JobRoleInterest,SchoolDegree,SchoolMajor
0,28,No,,,,,,,,"between 100,000 and 1 million",...,United States of America,office and administrative support,Employed for wages,male,No,30,32000,,"some college credit, no degree",
1,22,No,,,,,,,,"between 100,000 and 1 million",...,United States of America,food and beverage,Employed for wages,male,,30,15000,Front-End Web Developer,"some college credit, no degree",
2,19,No,,,,,,,,more than 1 million,...,United States of America,finance,Employed for wages,male,,20,48000,,high school diploma or equivalent (GED),
3,26,No,,,,,,,,more than 1 million,...,United States of America,"arts, entertainment, sports, or media",Employed for wages,female,,20,43000,Front-End Web Developer,bachelor's degree,Cinematography And Film
4,20,No,,,,,,,,"between 100,000 and 1 million",...,United States of America,education,Employed for wages,female,,25,6000,Full-Stack Web Developer,"some college credit, no degree",


In [6]:
# Extract rows for only those who attended a bootcamp
coders_short_attended_df = coders_short_new_df.loc[coders_short_new_df["AttendedBootcamp"] == "Yes"]
coders_short_attended_df.head()


Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampName,BootcampPostSalary,BootcampRecommend,ChildrenNumber,CityPopulation,...,CountryLive,EmploymentField,EmploymentStatus,Gender,HasChildren,HoursLearning,Income,JobRoleInterest,SchoolDegree,SchoolMajor
93,32,Yes,Yes,No,No,Codify Academy,,No,,"between 100,000 and 1 million",...,United States of America,"arts, entertainment, sports, or media",Self-employed business owner,male,,20,67000.0,,bachelor's degree,Biology
97,26,Yes,Yes,Yes,No,DaVinci Coders,45000.0,No,,more than 1 million,...,United States of America,software development,Employed for wages,male,No,10,40000.0,,master's degree (non-professional),Music
130,41,Yes,Yes,Yes,Yes,Coder Foundry,75000.0,Yes,3.0,"less than 100,000",...,United States of America,software development,Employed for wages,male,Yes,30,75000.0,,"some college credit, no degree",
159,26,Yes,Yes,No,No,General Assembly,,No,,"between 100,000 and 1 million",...,United States of America,,Not working and not looking for work,female,,30,,Full-Stack Web Developer,"some college credit, no degree",
188,24,Yes,No,,Yes,,,No,,"between 100,000 and 1 million",...,Canada,,Not working but looking for work,female,,60,,,"some college credit, no degree",


In [48]:
# Create DataFrame of the various boot camps along with the number of participants (enrollees)
# Create a DataFrame from using value_counts
names_df = pd.DataFrame(coders_short_attended_df["BootcampName"].value_counts())
names_df.reset_index(inplace = True)
names_df.columns = ["BootcampName", "Count"]
names_df

Unnamed: 0,BootcampName,Count
0,General Assembly,90
1,Flatiron School,54
2,Dev Bootcamp,48
3,The Iron Yard,40
4,Prime Digital Academy,30
...,...,...
123,Stackademy,1
124,DESIGNATION,1
125,AngelHack Education,1
126,Microsoft Research Data Science Summer School,1


In [42]:
# Count how many attendees of each bootcamp would recommend the bootcamp
recommend_bootcamp_df = coders_short_attended_df.loc[(coders_short_attended_df['BootcampRecommend'] == "Yes"), :]
recommend_bootcamp_df = pd.DataFrame(recommend_bootcamp_df['BootcampName'].value_counts())
recommend_bootcamp_df.reset_index(inplace = True)
recommend_bootcamp_df.columns = ["BootcampName", "Recommenders"]
recommend_bootcamp_df

Unnamed: 0,BootcampName,Recommenders
0,General Assembly,70
1,Flatiron School,50
2,Dev Bootcamp,41
3,The Iron Yard,31
4,Hack Reactor,27
...,...,...
111,Code 42,1
112,Betamore,1
113,Coding House,1
114,Launch School (formerly Tealeaf Academy),1


In [43]:
# Merge the two created data frames on the name of tbe bootcamp
name_recommend_df = pd.merge(names_df, recommend_bootcamp_df, how = "outer", on = "BootcampName")
name_recommend_df['Recommenders'] = name_recommend_df['Recommenders'].fillna(0)
name_recommend_df

Unnamed: 0,BootcampName,Count,Recommenders
0,General Assembly,90,70.0
1,Flatiron School,54,50.0
2,Dev Bootcamp,48,41.0
3,The Iron Yard,40,31.0
4,Prime Digital Academy,30,25.0
...,...,...,...
123,Stackademy,1,1.0
124,DESIGNATION,1,1.0
125,AngelHack Education,1,1.0
126,Microsoft Research Data Science Summer School,1,1.0


In [44]:
# Calculate percentage of each bootcamp's students who are recommenders
# Think of what should be the numerator and the demoninator
name_recommend_df["% Recommend"] = (name_recommend_df["Recommenders"] / name_recommend_df["Count"])*100
# Sort results in descending order
name_recommend_df = name_recommend_df.sort_values(['% Recommend'], ascending = False).round(2)
# Format for percentages
name_recommend_df["% Recommend"] = name_recommend_df["% Recommend"].map("{0:,.2f}%".format)
name_recommend_df

Unnamed: 0,BootcampName,Count,Recommenders,% Recommend
64,Grand Circus,3,3.0,100.00%
47,devCodeCamp,4,4.0,100.00%
95,SeedPaths,1,1.0,100.00%
94,Le Wagon,1,1.0,100.00%
92,Coder's Lab,1,1.0,100.00%
...,...,...,...,...
101,Atlanta Code,1,0.0,0.00%
109,TalentBuddy,1,0.0,0.00%
102,Data Science Dojo,1,0.0,0.00%
104,Academic Work Academy,1,0.0,0.00%


In [45]:
# Export to excel and remove index
name_recommend_df.to_excel('output/Bootcamppart2_CJM.xlsx', index = False)