In [1]:
#show all output in each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
#read in dataset
df = pd.read_csv('hbcu.csv')
#print out column names
print(df.columns)

Index(['Name', 'Address', 'Website', 'Type', 'Awards offered',
       'Campus setting', 'Campus housing', 'Student population',
       'Undergraduate students', 'Graduation Rate', 'Transfer-Out Rate',
       'Cohort Year *', 'Net Price **', 'Largest Program', 'IPEDS ID',
       'OPE ID'],
      dtype='object')


In [2]:
#get data type for each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Name                    102 non-null    object 
 1   Address                 105 non-null    object 
 2   Website                 102 non-null    object 
 3   Type                    102 non-null    object 
 4   Awards offered          102 non-null    object 
 5   Campus setting          101 non-null    object 
 6   Campus housing          102 non-null    object 
 7   Student population      100 non-null    float64
 8   Undergraduate students  97 non-null     float64
 9   Graduation Rate         96 non-null     object 
 10  Transfer-Out Rate       63 non-null     object 
 11  Cohort Year *           102 non-null    object 
 12  Net Price **            96 non-null     object 
 13  Largest Program         102 non-null    object 
 14  IPEDS ID                102 non-null    fl

In [3]:
#check Name column for missing data
df['Name'].isna().sum()
#remove rows with missing Names
df = df[df['Name'].notna()]
df['Name'].isna().sum()

3

0

In [4]:
#get information on dataframe after rows with missing Names are removed
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102 entries, 0 to 101
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Name                    102 non-null    object 
 1   Address                 102 non-null    object 
 2   Website                 102 non-null    object 
 3   Type                    102 non-null    object 
 4   Awards offered          102 non-null    object 
 5   Campus setting          101 non-null    object 
 6   Campus housing          102 non-null    object 
 7   Student population      100 non-null    float64
 8   Undergraduate students  97 non-null     float64
 9   Graduation Rate         96 non-null     object 
 10  Transfer-Out Rate       63 non-null     object 
 11  Cohort Year *           102 non-null    object 
 12  Net Price **            96 non-null     object 
 13  Largest Program         102 non-null    object 
 14  IPEDS ID                102 non-null    float64

In [5]:
#create new highest degree column
#get unique values for 'Awards Offered' column
df['Awards offered'].unique()

array(["Bachelor's degree|Master's degree|Post-master's certificate|Doctor's degree - research/scholarship",
       "Bachelor's degree|Postbaccalaureate certificate|Master's degree|Post-master's certificate|Doctor's degree - research/scholarship|Doctor's degree - professional practice",
       "Less than one year certificate|One but less than two years certificate|Associate's degree|Bachelor's degree|Master's degree|Post-master's certificate",
       "Associate's degree|Bachelor's degree|Master's degree|Post-master's certificate|Doctor's degree - professional practice",
       "Associate's degree|Bachelor's degree|Master's degree|Doctor's degree - research/scholarship",
       "Associate's degree|Bachelor's degree",
       "Bachelor's degree|Master's degree", "Bachelor's degree",
       "Bachelor's degree|Postbaccalaureate certificate|Master's degree",
       "Less than one year certificate|One but less than two years certificate|Associate's degree",
       "Less than one year certific

In [6]:
#find NaN values
df['Awards offered'].isna().sum()

0

In [7]:
#creating binary variables for each degree type
df['Doctor'] = df['Awards offered'].str.contains('Doctor')
df['Master'] = df['Awards offered'].str.contains('Master')
df['Bachelor'] = df['Awards offered'].str.contains('Bachelor')
df['Associate'] = df['Awards offered'].str.contains('Associate')

In [8]:
#create highest degree column
df.loc[df['Associate'] == True, 'Highest Degree'] = 'Associate\'s degree' 
df.loc[df['Bachelor'] == True, 'Highest Degree'] = 'Bachelor\'s degree' 
df.loc[df['Master'] == True, 'Highest Degree'] = 'Master\'s degree' 
df.loc[df['Doctor'] == True, 'Highest Degree'] = 'Doctoral degree' 
df['Highest Degree'].value_counts()

Highest Degree
Doctoral degree       40
Master's degree       33
Bachelor's degree     18
Associate's degree    11
Name: count, dtype: int64

In [9]:
#bar plot for number of schools by highest degree offered

fig = px.histogram(df, x=df['Highest Degree'],
                   title='Number of HBCUs by the Highest Degree Offered',
                   color_discrete_sequence=['navy'])
fig.show()

In [10]:
#chart for student population
#create private/public variable
#get unique values for Type column
df['Type'].unique()
#create mapping from Type variable to Public/Private variable
type_mapping = {'4-year, Public':'Public HBCU',
                '2-year, Public': 'Public HBCU', 
                '4-year, Private not-for-profit':'Private HBCU',
                '2-year, Private not-for-profit':'Private HBCU',
                "4-year, primarily associate's, Private not-for-profit": 'Private HBCU'}
#recode Type column into Public/Private column
df = df.assign(PublicPrivate=df.Type.map(type_mapping))
#crosstab to see if recode went correctly
pd.crosstab(index= df['Type'], columns=df['PublicPrivate'])

array(['4-year, Public', '4-year, Private not-for-profit',
       '2-year, Public', '2-year, Private not-for-profit',
       "4-year, primarily associate's, Private not-for-profit"],
      dtype=object)

PublicPrivate,Private HBCU,Public HBCU
Type,Unnamed: 1_level_1,Unnamed: 2_level_1
"2-year, Private not-for-profit",1,0
"2-year, Public",0,10
"4-year, Private not-for-profit",48,0
"4-year, Public",0,42
"4-year, primarily associate's, Private not-for-profit",1,0


In [11]:
#get Highest degree numbers for public and private HBCUs

fig = px.histogram(df, df['Highest Degree'],
                   title='Number of HBCUs by the Highest Degree Offered',
                   color=df['PublicPrivate'],color_discrete_sequence=px.colors.qualitative.Pastel
                   )
fig.show()

In [12]:
#student population
#look for schools with missing data on student population
df['Name'][df['Student population'].isna()]
#fill in  missing information
df.loc[df['Name']=='Southern University Law Center', 'Student population']=843
df.loc[df['Name']=='University of the Virgin Islands-Albert A. Sheen', 'Student population']=2138
df['Name'][df['Student population'].isna()]

77                      Southern University Law Center
92    University of the Virgin Islands-Albert A. Sheen
Name: Name, dtype: object

Series([], Name: Name, dtype: object)

In [13]:
#create scatterplot
fig = px.scatter(df,x=df['Student population'],
                 size=df['Student population'],
                 hover_name=df['Name'],
                 title='Student population of HBCUs',color_discrete_sequence=px.colors.qualitative.Bold,
                  color=df["PublicPrivate"])
fig.show()

In [14]:
#geocode Address column
#get longitude and latitude coordinates for school addresses
from geopy.geocoders import Nominatim
addresses = df["Address"].to_list()
names = df['Name'].to_list()

geo=Nominatim(user_agent="hbcu")
Latitude=[]
Longitude=[]
for index,x in enumerate(names):
    l1 = geo.geocode(x)
    if l1 is None:
        l1=geo.geocode(addresses[index])
    Latitude.append(l1.latitude)
    Longitude.append(l1.longitude)


AttributeError: 'NoneType' object has no attribute 'latitude'