### PayScale Data

Data found on PayScale include the target variable once engineered (expected income > 60,000), and additional features including meaning percentage, STEM percentage, and school types.  

In [1]:
# Import data from PayScale 

#Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd 

#Extract College Salary Report from PayScale
url= 'https://www.payscale.com/college-salary-report/bachelors/page/1'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.content, 'html.parser')

#Create list with school name 
table_container = soup.find('table')
school_name = table_container.findAll('a')
school_name_list = [school.text for school in school_name]

#Create urls to scrape data for all 64 pages 
payscale_pages = []
for i in range(1,64):
    url = 'https://www.payscale.com/college-salary-report/bachelors/page/' + str(i)
    payscale_pages.append(url)

#Create lists so we can create dataframe
school_names = []
school_types = []
early_pay_list = []
mid_pay_list = []
meaning_percent_list = []
stem_percent_list = []
    
#Create function that scrapes data from Payscale for each webpage 
def scrape_data(url):
    #access html content 
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    
    #access school names and add to list 
    table_container = soup.find('table')
    for school in table_container.findAll('td', class_='data-table__cell csr-col--school-name'):
        school_name = school.text[12:]
        school_names.append(school_name)
        
    #access school types and add to list 
        school_type = school.nextSibling.text[12:]
        school_types.append(school_type)
        
        #access early career pay and add to list 
        early_pay = school.nextSibling.nextSibling.text[18:]
        early_pay = int(early_pay.replace(',', ''))
        early_pay_list.append(early_pay)
        
        #access mid career pay and add to list 
        mid_pay = school.nextSibling.nextSibling.nextSibling.text[16:]
        mid_pay = int(mid_pay.replace(',',''))
        mid_pay_list.append(mid_pay)
        
        #access meaning percentage and add to list 
        meaning = school.nextSibling.nextSibling.nextSibling.nextSibling.text[15:17]
        meaning = int(meaning.replace('-', '0'))
        meaning_percent_list.append(meaning)
        
        #access STEM percentage and add to list 
        stem = school.nextSibling.nextSibling.nextSibling.nextSibling.nextSibling.text[15:17]
        stem = int(stem.replace('%', ''))
        stem_percent_list.append(stem)
        

#Scrape Data for each webpage 
for url in payscale_pages:
    scrape_data(url)
    
#Create dataframe from lists 
schools_dict = {'school_name': school_names,
               'school_types': school_types,
               'early_career_pay': early_pay_list,
               'mid_career_pay': mid_pay_list,
               'meaning_percentage': meaning_percent_list,
               'stem_percentage': stem_percent_list}

# Turn dictionary into a dataframe
df = pd.DataFrame(schools_dict)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554 entries, 0 to 1553
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   school_name         1554 non-null   object
 1   school_types        1554 non-null   object
 2   early_career_pay    1554 non-null   int64 
 3   mid_career_pay      1554 non-null   int64 
 4   meaning_percentage  1554 non-null   int64 
 5   stem_percentage     1554 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 73.0+ KB


In [2]:
# Remove characters from school name so school names are able to match properly
df['school_name'] = df['school_name'].str.replace('-',' ')
df['school_name'] = df['school_name'].str.replace(':',' ')
df.head()

Unnamed: 0,school_name,school_types,early_career_pay,mid_career_pay,meaning_percentage,stem_percentage
0,Harvey Mudd College,"Engineering, Liberal Arts School, Private School",91400,162500,56,85
1,Massachusetts Institute of Technology,"Engineering, Private School, Research University",88300,158100,52,69
2,United States Naval Academy,"Engineering, Liberal Arts School, Sober School...",79600,152600,61,58
3,Princeton University,"Ivy League, Private School, Research Universit...",77300,150500,49,48
4,California Institute of Technology,"Engineering, Private School, Research University",87600,150300,56,97


In [6]:
# Determine how many unique school types there are, clean out to allow for categorical variable
df['school_types'].unique()

array(['Engineering, Liberal Arts School, Private School',
       'Engineering, Private School, Research University',
       'Engineering, Liberal Arts School, Sober School, For Sports Fans, State School',
       'Ivy League, Private School, Research University, For Sports Fans',
       'Engineering, Private School, Research University, For Sports Fans',
       'Private School, Religious, For Sports Fans',
       'Liberal Arts School, Sober School, For Sports Fans, State School',
       'Business, Private School',
       'Liberal Arts School, Party School, Private School, For Sports Fans',
       'State School', 'Engineering, Research University, State School',
       'Liberal Arts School, Private School', 'Private School',
       'Research University, For Sports Fans, State School',
       'Engineering, State School', 'Engineering, Private School',
       'Engineering, Research University, For Sports Fans, State School',
       'Private School, Religious, Research University, For Spor

In [8]:
# Separate school types into individual types 
school_type = list(df['school_types'].unique())

# Create list of unique school types by separating by comma 
unique_type = []
for types in school_type:
    for t in types.split(','):
        if t not in unique_type:
            t= t.lower()
            t= t.replace(' ', '_')
            unique_type.append(t)
        else:
            None 

# Remove duplicate school types
unique_types = []
for t in set(unique_type):
    if t[0] == '_':
        new_t = t[1:]
        unique_types.append(new_t)
    else:
        unique_types.append(t)

unique_types = list(set(unique_types))
unique_types

['ivy_league',
 'party_school',
 'private_school',
 'state_school',
 'art',
 'research_university',
 'engineering',
 'liberal_arts_school',
 'business',
 'religious',
 'sober_school',
 'for_sports_fans']

There are 12 different types of schools. We'll need to create separate columns for each.

In [12]:
# Create 12 new columns in DataFrame that signify whether school falls under each school type 
df['engineering'] = df.school_types.str.contains('Engineering')
df['private_school'] = df.school_types.str.contains('Private School')
df['religious'] = df.school_types.str.contains('Religious')
df['art'] = df.school_types.str.contains('Art')
df['for_sports_fans'] = df.school_types.str.contains('For Sports Fans')
df['party_school'] = df.school_types.str.contains('Party')
df['liberal_arts_school'] = df.school_types.str.contains('Liberal Arts')
df['state_school'] = df.school_types.str.contains('State')
df['research_university'] = df.school_types.str.contains('Research')
df['business'] = df.school_types.str.contains('Business')
df['sober_school'] = df.school_types.str.contains('Sober')
df['ivy_league'] = df.school_types.str.contains('Ivy League')
df.drop(columns=['school_types'], inplace=True, axis=1)
df.head()

Unnamed: 0,school_name,early_career_pay,mid_career_pay,meaning_percentage,stem_percentage,engineering,private_school,religious,art,for_sports_fans,party_school,liberal_arts_school,state_school,research_university,business,sober_school,ivy_league
0,Harvey Mudd College,91400,162500,56,85,True,True,False,True,False,False,True,False,False,False,False,False
1,Massachusetts Institute of Technology,88300,158100,52,69,True,True,False,False,False,False,False,False,True,False,False,False
2,United States Naval Academy,79600,152600,61,58,True,False,False,True,True,False,True,True,False,False,True,False
3,Princeton University,77300,150500,49,48,False,True,False,False,True,False,False,False,True,False,False,True
4,California Institute of Technology,87600,150300,56,97,True,True,False,False,False,False,False,False,True,False,False,False


### Tuition Data from data.world 

DataFrame contains tuition costs (in-state, out-of-state, room and board), as well as state and school type.

In [4]:
tuition_cost_df = pd.read_csv('/Users/christianmoya/Documents/Flatiron/Phase_3/Phase_3_Project/tuition_data/tuition_cost.csv')

# Remove characters from school name to match properly 
tuition_cost_df['name'] = tuition_cost_df['name'].str.replace('-',' ')
tuition_cost_df['name'] = tuition_cost_df['name'].str.replace(':', ' ')

tuition_cost_df.head()

Unnamed: 0,name,state,state_code,type,degree_length,room_and_board,in_state_tuition,in_state_total,out_of_state_tuition,out_of_state_total
0,Aaniiih Nakoda College,Montana,MT,Public,2 Year,,2380,2380,2380,2380
1,Abilene Christian University,Texas,TX,Private,4 Year,10350.0,34850,45200,34850,45200
2,Abraham Baldwin Agricultural College,Georgia,GA,Public,2 Year,8474.0,4128,12602,12550,21024
3,Academy College,Minnesota,MN,For Profit,2 Year,,17661,17661,17661,17661
4,Academy of Art University,California,CA,For Profit,4 Year,16648.0,27810,44458,27810,44458


### Diversity Enrollment from data.world 

DataFrame contains enrollment for minority groups and total enrollment. 

In [13]:
# Attain diversity information
diversity_df = pd.read_csv('/Users/christianmoya/Documents/Flatiron/Phase_3/Phase_3_Project/tuition_data/diversity_school.csv')
diversity_df.head()

Unnamed: 0,name,total_enrollment,state,category,enrollment
0,University of Phoenix-Arizona,195059,Arizona,Women,134722
1,University of Phoenix-Arizona,195059,Arizona,American Indian / Alaska Native,876
2,University of Phoenix-Arizona,195059,Arizona,Asian,1959
3,University of Phoenix-Arizona,195059,Arizona,Black,31455
4,University of Phoenix-Arizona,195059,Arizona,Hispanic,13984


In [15]:
# Pivot table to get columns for each category for each college
diversity_table = pd.pivot_table(diversity_df, index='name', columns='category')
diversity_table.columns = diversity_table.columns.map('_'.join)
diversity_table.columns = diversity_table.columns.str.replace(' / ', '_')
diversity_table.columns = diversity_table.columns.str.replace(' ', '_')
diversity_table.columns = diversity_table.columns.str.replace('-', '_')
diversity_table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4574 entries, A.T. Still University of Health Sciences to Zion Bible College
Data columns (total 22 columns):
 #   Column                                             Non-Null Count  Dtype
---  ------                                             --------------  -----
 0   enrollment_American_Indian_Alaska_Native           4574 non-null   int64
 1   enrollment_Asian                                   4574 non-null   int64
 2   enrollment_Black                                   4574 non-null   int64
 3   enrollment_Hispanic                                4574 non-null   int64
 4   enrollment_Native_Hawaiian_Pacific_Islander        4574 non-null   int64
 5   enrollment_Non_Resident_Foreign                    4574 non-null   int64
 6   enrollment_Total_Minority                          4574 non-null   int64
 7   enrollment_Two_Or_More_Races                       4574 non-null   int64
 8   enrollment_Unknown                                 4574 non-

In [16]:
# Remove excess columns and rename appropriately 
diversity_table = diversity_table[['enrollment_American_Indian_Alaska_Native', 
                                   'enrollment_Asian',
                                  'enrollment_Black', 
                                  'enrollment_Hispanic', 
                                  'enrollment_Native_Hawaiian_Pacific_Islander', 
                                  'enrollment_Non_Resident_Foreign',
                                  'enrollment_Total_Minority',
                                  'enrollment_Two_Or_More_Races',
                                  'enrollment_Unknown',
                                  'enrollment_White',
                                  'enrollment_Women', 
                                  'total_enrollment_American_Indian_Alaska_Native']]

diversity_table.columns = ['AIAN_enrollment',
                          'Asian_enrollment',
                          'Black_enrollment', 
                          'Hispanic_enrollment',
                          'NHPI_enrollment',
                          'non_resident_enrollment',
                          'total_minority_enrollment',
                          'two_or_more_races_enrollment',
                          'unknown_enrollment',
                          'White_enrollment',
                          'women_enrollment',
                          'total_enrollment']

diversity_table = diversity_table.reset_index()
diversity_table.head()

Unnamed: 0,name,AIAN_enrollment,Asian_enrollment,Black_enrollment,Hispanic_enrollment,NHPI_enrollment,non_resident_enrollment,total_minority_enrollment,two_or_more_races_enrollment,unknown_enrollment,White_enrollment,women_enrollment,total_enrollment
0,A.T. Still University of Health Sciences,19,453,170,189,10,53,1042,201,153,1978,1803,3226
1,AIB College of Business,3,8,38,49,5,20,116,13,421,457,533,1014
2,AOMA Graduate School of Integrative Medicine,0,13,10,17,0,0,40,0,18,138,146,196
3,ASA Institute of Business and Computer Technology,4,345,1500,1788,6,710,3704,61,16,194,2897,4624
4,ATA Career Education,0,1,21,32,0,0,56,2,34,116,187,206


In [18]:
# Calculate percentage of enrollment for each minority group 
minority_enrollment_list = ['AIAN_enrollment',
                          'Asian_enrollment',
                          'Black_enrollment', 
                          'Hispanic_enrollment',
                          'NHPI_enrollment',
                          'non_resident_enrollment',
                          'total_minority_enrollment',
                          'two_or_more_races_enrollment',
                          'unknown_enrollment',
                          'White_enrollment',
                          'women_enrollment']

def calculate_percentage(col):
    new_col_name = col + '_percentage'
    diversity_table[new_col_name] = diversity_table[col] / diversity_table['total_enrollment']
    
for col in minority_enrollment_list: 
    calculate_percentage(col)
    
# Remove extra characters so college names match more 
diversity_table['name'] = diversity_table['name'].str.replace('-',' ')
diversity_table['name'] = diversity_table['name'].str.replace(':', ' ')
diversity_table.info()
diversity_table.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4574 entries, 0 to 4573
Data columns (total 24 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   name                                     4574 non-null   object 
 1   AIAN_enrollment                          4574 non-null   int64  
 2   Asian_enrollment                         4574 non-null   int64  
 3   Black_enrollment                         4574 non-null   int64  
 4   Hispanic_enrollment                      4574 non-null   int64  
 5   NHPI_enrollment                          4574 non-null   int64  
 6   non_resident_enrollment                  4574 non-null   int64  
 7   total_minority_enrollment                4574 non-null   int64  
 8   two_or_more_races_enrollment             4574 non-null   int64  
 9   unknown_enrollment                       4574 non-null   int64  
 10  White_enrollment                         4574 no

Unnamed: 0,name,AIAN_enrollment,Asian_enrollment,Black_enrollment,Hispanic_enrollment,NHPI_enrollment,non_resident_enrollment,total_minority_enrollment,two_or_more_races_enrollment,unknown_enrollment,...,Asian_enrollment_percentage,Black_enrollment_percentage,Hispanic_enrollment_percentage,NHPI_enrollment_percentage,non_resident_enrollment_percentage,total_minority_enrollment_percentage,two_or_more_races_enrollment_percentage,unknown_enrollment_percentage,White_enrollment_percentage,women_enrollment_percentage
0,A.T. Still University of Health Sciences,19,453,170,189,10,53,1042,201,153,...,0.140422,0.052697,0.058586,0.0031,0.016429,0.323001,0.062306,0.047427,0.613143,0.558896
1,AIB College of Business,3,8,38,49,5,20,116,13,421,...,0.00789,0.037475,0.048323,0.004931,0.019724,0.114398,0.012821,0.415187,0.45069,0.525641
2,AOMA Graduate School of Integrative Medicine,0,13,10,17,0,0,40,0,18,...,0.066327,0.05102,0.086735,0.0,0.0,0.204082,0.0,0.091837,0.704082,0.744898
3,ASA Institute of Business and Computer Technology,4,345,1500,1788,6,710,3704,61,16,...,0.074611,0.324394,0.386678,0.001298,0.153547,0.801038,0.013192,0.00346,0.041955,0.626514
4,ATA Career Education,0,1,21,32,0,0,56,2,34,...,0.004854,0.101942,0.15534,0.0,0.0,0.271845,0.009709,0.165049,0.563107,0.907767


In [19]:
# Remove excess columns
diversity_table_percentage = diversity_table[['name', 'total_enrollment', 'AIAN_enrollment_percentage', 'Asian_enrollment_percentage', 'Black_enrollment_percentage', 'Hispanic_enrollment_percentage', 'NHPI_enrollment_percentage', 'non_resident_enrollment_percentage', 'total_minority_enrollment_percentage', 'unknown_enrollment_percentage', 'White_enrollment_percentage', 'women_enrollment_percentage']]
diversity_table_percentage.head()

Unnamed: 0,name,total_enrollment,AIAN_enrollment_percentage,Asian_enrollment_percentage,Black_enrollment_percentage,Hispanic_enrollment_percentage,NHPI_enrollment_percentage,non_resident_enrollment_percentage,total_minority_enrollment_percentage,unknown_enrollment_percentage,White_enrollment_percentage,women_enrollment_percentage
0,A.T. Still University of Health Sciences,3226,0.00589,0.140422,0.052697,0.058586,0.0031,0.016429,0.323001,0.047427,0.613143,0.558896
1,AIB College of Business,1014,0.002959,0.00789,0.037475,0.048323,0.004931,0.019724,0.114398,0.415187,0.45069,0.525641
2,AOMA Graduate School of Integrative Medicine,196,0.0,0.066327,0.05102,0.086735,0.0,0.0,0.204082,0.091837,0.704082,0.744898
3,ASA Institute of Business and Computer Technology,4624,0.000865,0.074611,0.324394,0.386678,0.001298,0.153547,0.801038,0.00346,0.041955,0.626514
4,ATA Career Education,206,0.0,0.004854,0.101942,0.15534,0.0,0.0,0.271845,0.165049,0.563107,0.907767


### Rankings DataFrame from usnews.com (DF found on data.world) 

DataFrame contains rankings for top 250 colleges and universites in the US, will need to bin ranks later.

In [20]:
rankings_df = pd.read_csv('/Users/christianmoya/Documents/Flatiron/Phase_3/Phase_3_Project/tuition_data/National Universities Rankings.csv', encoding='latin-1')
rankings_df['Name'] = rankings_df['Name'].str.replace('-',' ')
rankings_df['Name'] = rankings_df['Name'].str.replace(':', ' ')
rankings_df.head()

Unnamed: 0,Name,Location,Rank,Description,Tuition and fees,In-state,Undergrad Enrollment
0,Princeton University,"Princeton, NJ",1,"Princeton, the fourth-oldest college in the Un...","$45,320",,5402
1,Harvard University,"Cambridge, MA",2,"Harvard is located in Cambridge, Massachusetts...","$47,074",,6699
2,University of Chicago,"Chicago, IL",3,"The University of Chicago, situated in Chicago...","$52,491",,5844
3,Yale University,"New Haven, CT",3,"Yale University, located in New Haven, Connect...","$49,480",,5532
4,Columbia University,"New York, NY",5,"Columbia University, located in Manhattan's Mo...","$55,056",,6102


### Merge DataFrames Together

We now have data from 4 different DataFrames, must join together to use for model and EDA. 

In [28]:
colleges_df = pd.merge(df, tuition_cost_df, how='left', left_on='school_name', right_on='name')
colleges_df = pd.merge(colleges_df, diversity_table_percentage, how='left', left_on='school_name', right_on='name')
colleges_df = pd.merge(colleges_df, rankings_df, how='left', left_on='school_name', right_on='Name')
colleges_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1558 entries, 0 to 1557
Data columns (total 46 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   school_name                           1558 non-null   object 
 1   early_career_pay                      1558 non-null   int64  
 2   mid_career_pay                        1558 non-null   int64  
 3   meaning_percentage                    1558 non-null   int64  
 4   stem_percentage                       1558 non-null   int64  
 5   engineering                           1558 non-null   bool   
 6   private_school                        1558 non-null   bool   
 7   religious                             1558 non-null   bool   
 8   art                                   1558 non-null   bool   
 9   for_sports_fans                       1558 non-null   bool   
 10  party_school                          1558 non-null   bool   
 11  liberal_arts_scho

In [29]:
# Bin ranks into 50, fill missing rank with 300 since not on Top 250 list 
colleges_df['Rank'].fillna(300.00, inplace=True)
bins = [0, 50, 100, 150, 200, 250, 300]
group_names = ['top_50', 'top_100', 'top_150', 'top_200', 'top_250', 'over_250']
colleges_df['school_rank'] = pd.cut(colleges_df['Rank'], bins, labels=group_names)
colleges_df.info()
colleges_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1558 entries, 0 to 1557
Data columns (total 47 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   school_name                           1558 non-null   object  
 1   early_career_pay                      1558 non-null   int64   
 2   mid_career_pay                        1558 non-null   int64   
 3   meaning_percentage                    1558 non-null   int64   
 4   stem_percentage                       1558 non-null   int64   
 5   engineering                           1558 non-null   bool    
 6   private_school                        1558 non-null   bool    
 7   religious                             1558 non-null   bool    
 8   art                                   1558 non-null   bool    
 9   for_sports_fans                       1558 non-null   bool    
 10  party_school                          1558 non-null   bool    
 11  libe

Unnamed: 0,school_name,early_career_pay,mid_career_pay,meaning_percentage,stem_percentage,engineering,private_school,religious,art,for_sports_fans,...,White_enrollment_percentage,women_enrollment_percentage,Name,Location,Rank,Description,Tuition and fees,In-state,Undergrad Enrollment,school_rank
0,Harvey Mudd College,91400,162500,56,85,True,True,False,True,False,...,0.437811,0.46393,,,300.0,,,,,over_250
1,Massachusetts Institute of Technology,88300,158100,52,69,True,True,False,False,False,...,0.331125,0.373355,Massachusetts Institute of Technology,"Cambridge, MA",7.0,"MIT is located in Cambridge, Massachusetts, ac...","$48,452",,4527.0,top_50
2,United States Naval Academy,79600,152600,61,58,True,False,False,True,True,...,,,,,300.0,,,,,over_250
3,Princeton University,77300,150500,49,48,False,True,False,False,True,...,0.431998,0.453635,Princeton University,"Princeton, NJ",1.0,"Princeton, the fourth-oldest college in the Un...","$45,320",,5402.0,top_50
4,California Institute of Technology,87600,150300,56,97,True,True,False,False,False,...,0.336351,0.307379,California Institute of Technology,"Pasadena, CA",12.0,"Caltech, which focuses on science and engineer...","$47,577",,1001.0,top_50


In [31]:
# Remove unnecessary columns 
colleges_df = colleges_df.drop(columns=['name_x', 'state', 'name_y', 'Name', 'Location', 'Rank', 'Description', 'Tuition and fees', 'In-state', 'Undergrad Enrollment'])
colleges_df.info()
colleges_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1558 entries, 0 to 1557
Data columns (total 37 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   school_name                           1558 non-null   object  
 1   early_career_pay                      1558 non-null   int64   
 2   mid_career_pay                        1558 non-null   int64   
 3   meaning_percentage                    1558 non-null   int64   
 4   stem_percentage                       1558 non-null   int64   
 5   engineering                           1558 non-null   bool    
 6   private_school                        1558 non-null   bool    
 7   religious                             1558 non-null   bool    
 8   art                                   1558 non-null   bool    
 9   for_sports_fans                       1558 non-null   bool    
 10  party_school                          1558 non-null   bool    
 11  libe

Unnamed: 0,school_name,early_career_pay,mid_career_pay,meaning_percentage,stem_percentage,engineering,private_school,religious,art,for_sports_fans,...,Asian_enrollment_percentage,Black_enrollment_percentage,Hispanic_enrollment_percentage,NHPI_enrollment_percentage,non_resident_enrollment_percentage,total_minority_enrollment_percentage,unknown_enrollment_percentage,White_enrollment_percentage,women_enrollment_percentage,school_rank
0,Harvey Mudd College,91400,162500,56,85,True,True,False,True,False,...,0.207711,0.016169,0.099502,0.0,0.126866,0.390547,0.044776,0.437811,0.46393,over_250
1,Massachusetts Institute of Technology,88300,158100,52,69,True,True,False,False,False,...,0.161852,0.030391,0.096033,8.8e-05,0.291722,0.326619,0.050534,0.331125,0.373355,top_50
2,United States Naval Academy,79600,152600,61,58,True,False,False,True,True,...,,,,,,,,,,over_250
3,Princeton University,77300,150500,49,48,False,True,False,False,True,...,0.158506,0.058605,0.070598,0.000865,0.209199,0.324555,0.034248,0.431998,0.453635,top_50
4,California Institute of Technology,87600,150300,56,97,True,True,False,False,False,...,0.262562,0.012223,0.078316,0.000453,0.272974,0.386148,0.004527,0.336351,0.307379,top_50


In [32]:
# Save DataFrame 
colleges_df.to_csv('colleges.csv')