# <font color=deeppink>**Combining All the Upward Bound Data Files**</font>

## <font color=royalblue>Load Necessary Libraries</font>

In [1]:
import pandas as pd
import numpy as np
import datetime
import csv
!pip install xlrd==1.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## <font color=royalblue>Pertinent Functions and Classes</font>

#### <font color=darkviolet>Functions for Reformatting the Data</font>

In [2]:
# Avoid Dropping Leading Zeros from Numerical Codes
def avoid_dropping_zeros(row_entry, character, length):
    """Return the `length` characters that follow the first `character` in `row_entry`.

    The entry is converted to text first, so codes that carry a marker
    character keep their leading zeros instead of being read as numbers.

    Parameters:
        row_entry: Value to search; converted with `str()`.
        character: Marker that immediately precedes the wanted text.
        length:    Number of characters to keep after the marker.

    Returns:
        The extracted substring. When the marker is absent, `str.find`
        returns -1, so the first `length` characters of the text are returned.
    """
    text = str(row_entry)

    # Position of the First Character after the Marker
    begin = text.find(character) + 1

    return text[begin:(begin + length)]

# Create a Consistent Date Format
def fix_date(value):
    """Parse a date embedded in `value` into a `(year, month, day)` tuple of ints.

    Two layouts are recognised, located relative to the first separator found
    in the text form of `value`:
      * MM/DD/YYYY - used when the text contains a "/"
      * YYYY-MM-DD - used otherwise, anchored on the first "-"

    A plain tuple is returned rather than a `datetime.date` because the
    college data uses placeholder dates (e.g. 99/99/9999) that are not valid
    calendar dates.

    Raises:
        ValueError: If a sliced field is empty or not numeric.
    """
    text = str(value)
    anchor = text.find("/")

    if anchor >= 0:
        ## Data is in MM/DD/YYYY
        month = int(text[(anchor - 2):anchor])
        day = int(text[(anchor + 1):(anchor + 3)])
        year = int(text[(anchor + 4):(anchor + 8)])
    else:
        ## Data is in YYYY-MM-DD
        anchor = text.find("-")
        year = int(text[(anchor - 4):anchor])
        month = int(text[(anchor + 1):(anchor + 3)])
        day = int(text[(anchor + 4):(anchor + 6)])

    # Year, Month, Day Form
    return (year, month, day)

# Convert Categorical Values to More Sensical Values of 1, 0, and -1
def make_categorical(value):
    """Recode a Yes/No/Unknown report code.

    Mapping (after `int()` conversion):
        2 -> 0   (No)
        0 -> -1  (Unknown)
        anything else is returned unchanged (1 means Yes)
    """
    code = int(value)
    recoded = {2: 0, 0: -1}

    return recoded.get(code, code)
  
# Convert Unknown Values from 0 to -1
def fix_unknown(value):
    """Return `value` as an int, with the code 0 (Unknown) replaced by -1."""
    code = int(value)

    return -1 if code == 0 else code

# Convert Categorical Values to More Sensical Values, Including Not Applicable
def fix_categorical_values(value):
    """Recode a Yes/No/Unknown report code, sending every other code to Not Applicable.

    Mapping (after `int()` conversion):
        1 -> 1   (Yes)
        2 -> 0   (No)
        0 -> -1  (Unknown)
        anything else -> 9 (Not Applicable)
    """
    code = int(value)

    if code == 1:
        # Yes
        return code
    if code == 2:
        # No
        return 0
    if code == 0:
        # Unknown
        return -1

    # Not Applicable
    return 9

# Convert Proficiency Values to the Categorical Numbering Format 
def fix_proficiency(value):
    """Recode a proficiency code into the notebook's categorical numbering.

    Mapping (after `int()` conversion):
        3 -> 1   (Yes)
        4 -> 0   (No)
        0 -> -1  (Unknown)
        any other code is returned unchanged (treated as Not Applicable)
    """
    code = int(value)
    recoded = {3: 1, 4: 0, 0: -1}

    if code in recoded:
        return recoded[code]

    # Not Applicable
    return code

# Convert Not Applicable and Unknown Values to 9 and -1, Respectively
def fix_na_and_unknown(value):
    """Normalise the Not Applicable and Unknown codes of a categorical field.

    Mapping (after `int()` conversion):
        8 or 9 -> 9   (Not Applicable)
        0      -> -1  (Unknown)
        any other code is returned unchanged
    """
    code = int(value)

    if code in (8, 9):
        # Not Applicable
        return 9
    if code == 0:
        # Make Unknown Values a -1
        return -1

    # Categorical Values
    return code

# Create a Consistent Date Format That Incorporates Not Applicable
def fix_date_with_na(value):
    """Parse a date like `fix_date`, folding the 88/88/8888 placeholder into 99/99/9999.

    Both placeholders stand for Not Applicable, so they are unified to a
    single value.

    The previous implementation compared one character of the text with the
    integer 8; a str never equals an int, so the replacement never happened.
    The whole parsed date is compared here instead, which also avoids
    mistaking a real date whose year merely contains an 8 (e.g. 1989-11-08)
    for the placeholder.

    Parameters:
        value: Date in MM/DD/YYYY or YYYY-MM-DD form (converted to text by `fix_date`).

    Returns:
        `(9999, 99, 99)` for the 88/88/8888 placeholder, otherwise the
        `(year, month, day)` tuple produced by `fix_date`.

    Raises:
        ValueError: Propagated from `fix_date` when the text holds no parsable date.
    """
    date = fix_date(value)

    # All 88/88/8888 Dates Must be Changed to 99/99/9999
    if date == (8888, 88, 88):
        return (9999, 99, 99)

    return date

# Extract the Desired Text from the String Entry
def extract_string(row_entry, character):
    """Return the text between the first and last `character` in `row_entry`.

    The entry is converted with `str()` first. The delimiters themselves are
    not included. When the delimiter is absent, both searches return -1 and
    the result is the text without its final character.
    """
    text = str(row_entry)

    # Delimiters That Open and Close the Wanted Text
    opening = text.find(character)
    closing = text.rfind(character)

    return text[(opening + 1):closing]

# Separate the City and State
def extract_citystate(row_entry, character):
    """Split a delimited "City, State" entry on its commas.

    The text between the first and last `character` is taken with
    `extract_string` and split on ",". The pieces are not stripped, so callers
    are expected to trim surrounding whitespace themselves.

    Returns:
        A list of the form [City, State].
    """
    return extract_string(row_entry, character).split(",")

#### <font color=darkviolet>Student Data Classes</font>

In [3]:
# High School
class HS_Student:
    """High school record of one graduating Upward Bound senior.

    Parameters:
        row:  The report data for a single student, indexed by report column
              name (the reading loop passes a one-row DataFrame slice).
        year: Starting calendar year of the report (e.g. 2011 for "2011-12").
              Reports with `year <= 2011` use the older column layout.
    """

    # Attribute Name -> Report Column for the Yes/No Demographic Flags
    _FLAG_COLUMNS = (
        ("Hispanic", "Ethnic"),
        ("AmerInd", "Race1"),
        ("Asian", "Race2"),
        ("Black", "Race3"),
        ("White", "Race4"),
        ("Hawaiian", "Race5"),
        ("LimitedEnglish", "LEPEntry"),
    )

    # Attribute Name -> Report Column for the Non-Program Activity Fields
    _ACTIVITY_COLUMNS = (
        ("CulturalAct", "CultAct"),
        ("CommServ", "CmtySer"),
        ("LEPServs", "LEPInstruct"),
    )

    # Fields Exported by create_dictionary, in Output Order (the ID Is the Dictionary Key, Not a Field)
    _DICT_FIELDS = (
        "DOB", "Gender", "Hispanic", "AmerInd", "Asian", "Black", "White", "Hawaiian",
        "LimitedEnglish", "Eligibility", "AcademNeed", "Disconnected", "HS_SchoolID",
        "Date_EnteredUB", "Grade_EnteredUB", "Participation", "StartGradeLevel",
        "EndGradeLevel", "GPA", "GradStatus", "GraduationDate", "HSGrad_Age",
        "AcademAch_ELA", "AcademAch_Math", "Employed", "CulturalAct", "CommServ",
        "LEPServs", "Student_FullAddress", "Student_ZIP", "Absences", "SAT_Reading",
        "SAT_Math", "SAT_Writing", "AP", "Honors",
    )

    def __init__(self, row, year):
        # Reports through 2011-12 Use the Older Column Layout
        old_layout = year <= 2011

        ## Collect the Relevant Information on Each Graduating Senior
        # 9-Digit Value
        self.ID = avoid_dropping_zeros(row["SSN"], "S", 9)

        # Date of Birth, Stored as a "Date" Datatype
        self.DOB = datetime.date(*fix_date(row["DOB"]))

        # Reformat Data so 0 Indicates "Male", 1 Indicates "Female", and -1 Indicates "Unknown"
        self.Gender = int(row["GenderCD"]) - 1

        # For the Categorical Variables, 0 Indicates "No" and 1 Indicates "Yes"
        for attribute, column in self._FLAG_COLUMNS:
            setattr(self, attribute, make_categorical(row[column]))

        # Eligibility      1: Low Income and First Generation
        #                  2: Low Income Only
        #                  3: First Generation Only
        #                  4: At Risk for Academic Failure Only
        #                  5: Low Income and At High Risk for Academic Failure
        #                  6: First Generation and At High Risk for Academic Failure
        #                  7: Low Income, First Generation, and At High Risk for Academic Failure
        #                 -1: Unknown
        self.Eligibility = fix_unknown(row["EligibilityCD"])

        # Academic Need    1: Low GPA
        #                  2: Low Achievement Test Scores
        #                  3: Low Educational Aspirations
        #                  4: Low GPA and Low Educational Aspirations (Old Layout Only)
        #                  5: Low GPA and Low Achievement Test Scores
        #                  6: Low Achievement Test Scores and Low Educational Aspirations (Old Layout Only)
        #                  7: Lack of Opportunity/Support
        #                  8: Lack of Career Goals
        #                  9: Limited English Proficiency
        #                  10: Lack of Confidence/Self Esteem/Social Skills
        #                  11: Predominantly Low Income Community
        #                  12: Rural Isolation
        #                  13: Interest in Careers in Math and Science
        #                  14: Other
        #                  15: Diagnosed Learning Disability
        #                  16: Pre-Algebra or Algebra Course Not Successfully Completed by 10th Grade (New Layout Only)
        #                  -1: Unknown
        if old_layout:
            self.AcademNeed = fix_unknown(row["NeedCD1"])
        else:
            # The New Layout Reports Low Test Scores and Low GPA in Their Own Columns;
            # Either One Overrides the "OtherNeed" Code
            need = fix_unknown(row["OtherNeed"])

            if int(row["AtRiskStdTest"]) == 1:
                ## Low Achievement Test Scores
                need = 2

            if int(row["AtRiskLowGPA"]) == 1:
                ## Low GPA and Low Achievement Test Scores (5) or Low GPA Alone (1)
                need = 5 if need == 2 else 1

            self.AcademNeed = need

        # The "Disconnected" Field Didn't Exist Yet in the Old Layout
        self.Disconnected = -1 if old_layout else fix_categorical_values(row["Disconnected"])

        # 12-Digit Value
        self.HS_SchoolID = avoid_dropping_zeros(row["NCESSchID"], "H", 12)

        # Project Entry Date, Stored as a "Date" Datatype
        self.Date_EnteredUB = datetime.date(*fix_date(row["ProjEntryDT"]))

        self.Grade_EnteredUB = int(row["EnterGradeLV"])

        # Participation    1: Academic Year And Summer Components
        #                  2: Academic Year And Summer Bridge
        #                  3: Academic Year Only
        #                  4: Summer Component Only
        #                  5: Summer Bridge Only
        #                  6: Prior-Year Participant
        #                 -1: Unknown
        self.Participation = fix_unknown(row["PartLV"])

        # Start and End Refer to the Start and End of the Reported School Year
        self.StartGradeLevel = int(row["StartGradeLV"])
        self.EndGradeLevel = int(row["EndGradeLV"])

        # The GPA Column Was Renamed in the New Layout
        self.GPA = float(row["HSGPA1" if old_layout else "HSGPA"])

        # Graduation Status    1: Currently Enrolled in High School
        #                      2: High School Dropout
        #                      3: Received High School Diploma
        #                      4: Received High School Equivalency Certificate
        #                      5: Other
        #                      6: Completed 12th Grade but Failed to Meet State/Local Graduation Requirements
        #                     -1: Unknown
        self.GradStatus = fix_unknown(row["HsGRAD"])

        # Graduation Date, Stored as a "Date" Datatype
        self.GraduationDate = datetime.date(*fix_date(row["HsgradDT"]))

        # Compute the Student's Age at Graduation (Expressed as the Number of Days)
        self.HSGrad_Age = (self.GraduationDate - self.DOB).days

        # Proficiency      1: Yes
        #                  0: No
        #                  9: Not Applicable
        #                 -1: Unknown
        self.AcademAch_ELA = fix_proficiency(row["HSProficientRLA"])
        self.AcademAch_Math = fix_proficiency(row["HSProficientMATH"])

        # Non-Program Info    1: Yes
        #                     0: No
        #                     9: Not Applicable
        #                    -1: Unknown
        if old_layout:
            self.Employed = fix_categorical_values(row["Employ"])
        else:
            employ_code = int(row["Employ"])

            if employ_code == 2:
                ## Yes; Job Secured by Student without Assistance
                self.Employed = 1
            elif employ_code == 3:
                self.Employed = 0
            else:
                self.Employed = fix_categorical_values(row["Employ"])

        for attribute, column in self._ACTIVITY_COLUMNS:
            setattr(self, attribute, fix_categorical_values(row[column]))

        # Extract the Student's Home Address, City, and State
        street = extract_string(row["StreetAddress"], "*")
        city_and_state = extract_citystate(row["City_State"], "*")
        city = city_and_state[0].strip()
        state = city_and_state[1].strip()

        # Create a String of the Student's Entire Address
        self.Student_FullAddress = ", ".join((street, city, state))

        # 5-Digit Value (the "#" Prefix Is Kept in Front of the Digits)
        self.Student_ZIP = "#" + avoid_dropping_zeros(row["ZipCode"], "#", 5)

        # A Value of -1 Indicates an Unknown Value
        #
        # AP Courses           1: Student Took at Least One AP Course
        #                      0: Student Did Not Take an AP Course
        #
        # Honors Courses       1: Student Took at Least One Honors Course
        #                      0: Student Did Not Take an Honors Course
        for column in ("Absences", "SAT_Reading", "SAT_Math", "SAT_Writing", "AP", "Honors"):
            setattr(self, column, int(row[column]))

    # Retrieve the Student's Unique ID Number
    def getID(self):
        """Return the student's ID number."""
        return self.ID

    # Add the Student's Info to the Dictionary of Dictionaries
    def create_dictionary(self):
        """Return a new dict mapping each exported field name to this student's value."""
        return {field: getattr(self, field) for field in self._DICT_FIELDS}

In [4]:
# College
class College_Student:
    """Postsecondary record of one Upward Bound student.

    Parameters:
        row:  The report data for a single student, indexed by report column
              name (the reading loop passes a one-row DataFrame slice).
        year: Starting calendar year of the report; accepted for symmetry with
              HS_Student but not used by this class.
    """

    # (Status Attribute, Status Column, Date Attribute, Date Column) for Each Credential
    _CREDENTIAL_COLUMNS = (
        ("Cert_or_Diploma", "CertificateCD", "Cert_Date", "CertificateDT"),
        ("AssocDeg", "AssocDegreeCD", "AssocDeg_Date", "AssocDegreeDT"),
        ("BachDeg", "BachDegreeCD", "BachDeg_Date", "BachDegreeDT"),
    )

    # Fields Copied by add_to_dictionary, in Output Order
    _DICT_FIELDS = (
        "PS_DateEnrolled", "PS_Cohort", "PS_SchoolID", "PS_GradeLevel",
        "Cert_or_Diploma", "Cert_Date", "AssocDeg", "AssocDeg_Date",
        "BachDeg", "BachDeg_Date", "PS_Graduated", "STEMDeg",
    )

    def __init__(self, row, year):
        ## Collect the Relevant Information on Each College Student
        # 9-Digit Value
        self.ID = avoid_dropping_zeros(row["SSN"], "S", 9)

        # 1st Enrollment Date    66/66/6666: Accepted but Deferred to Next Semester
        #                        88/88/8888: Still in High School
        #                        99/99/9999: Graduated but Not Yet Enrolled in Postsecondary Program
        #                        00/00/0000: Unknown
        ## Kept as a (Year, Month, Day) Tuple Because the Placeholders Are Not Valid Dates
        self.PS_DateEnrolled = fix_date(row["FirstEnrollDT"])

        self.PS_Cohort = int(row["PSECohort"])

        # 6-Digit Value (Stored with a "#" Prefix)
        self.PS_SchoolID = "#" + avoid_dropping_zeros(row["PSECDFE"], "C", 6)

        # Grade Level      1: Accepted but Not Yet Enrolled
        #                  2: First Year Postsecondary Student
        #                  3: Continuing Postsecondary Student/Still Enrolled
        #                  4: Completed a Postsecondary Program but Continuing in Postsecondary
        #                  5: Not Enrolled; Completed a Postsecondary Program Prior to Reporting Year
        #                  7: Left Postsecondary Program without Completing It
        #                  9: Not Applicable
        #                 10: Other
        #                 11: Deferred Enrollment
        #                 -1: Unknown
        self.PS_GradeLevel = fix_na_and_unknown(row["PSEGradeLV"])

        # Credential Status (Certificate or Diploma, Associate Degree, Bachelor's Degree)
        #                           1: Yes
        #                           2: No; Pursued Another Kind of Postsecondary Credential
        #                           3: Deferred Enrollment
        #                           5: Transferred to Four-Year Without Completing Associate (Associate Degree Only)
        #                           6: Currently Enrolled
        #                           7: Left Postsecondary Program without Completing It
        #                           9: Not Applicable
        #                          -1: Unknown
        #
        # Credential Date           22/22/2222: Pursued Another Kind of Postsecondary Credential
        #                           33/33/3333: Deferred Enrollment
        #                           55/55/5555: Transferred to Four-Year Without Completing Associate (Associate Degree Only)
        #                           66/66/6666: Still Enrolled
        #                           77/77/7777: Left Postsecondary Program without Completing It
        #                           99/99/9999: Not Applicable
        #                           00/00/0000: Unknown
        for status_attr, status_column, date_attr, date_column in self._CREDENTIAL_COLUMNS:
            setattr(self, status_attr, fix_na_and_unknown(row[status_column]))
            setattr(self, date_attr, fix_date_with_na(row[date_column]))

        # Graduated College      1: Yes
        #                        0: No
        #                        9: Not Applicable
        #                       -1: Unknown
        self.PS_Graduated = fix_categorical_values(row["PSCompleteNum"])

        # STEM Degree            1: Yes; "Hard" Science
        #                        2: Yes; Psychology or Social Science
        #                        0: No
        #                        9: Not Applicable
        #                       -1: Unknown
        ## A Reported Code of 3 Means "No"
        stem_code = int(row["STEMDegree"])
        self.STEMDeg = 0 if stem_code == 3 else fix_na_and_unknown(row["STEMDegree"])

    # Retrieve the Student's Unique ID Number
    def getID(self):
        """Return the student's ID number."""
        return self.ID

    # Add the College Information to the Student's Dictionary
    def add_to_dictionary(self, dict_for_student):
        """Copy every college field into `dict_for_student` in place; returns None."""
        for field in self._DICT_FIELDS:
            dict_for_student[field] = getattr(self, field)

## <font color=royalblue>Read In Data</font>

In [5]:
# Information Needed to Effectively Access and Extract from the Data Files
years = ["2007-08", "2008-09", "2009-10", "2010-11", "2011-12", "2012-13", "2013-14", "2014-15", 
         "2015-16", "2016-17", "2017-18", "2018-19", "2019-20", "2020-21"]

# Uses the Number of Rows in the File Minus 1
## These Values Have Already Been Subtracted
file_nrows = [186, 209, 230, 245, 274, 293, 315, 254, 
              277, 302, 224, 227, 213, 208]

file_cols = ["A:BL", "A:BL", "A:BL", "A:BL", "A:BN", "A:CA", "A:CE", "A:CE",
             "A:BU", "A:BU", "A:BU", "A:BU", "A:BU", "A:BU"]

# The Three Lists Above Are Parallel: One Entry per Report Year
assert len(years) == len(file_nrows) == len(file_cols), "one row count and one column range are needed per year"

# Folder Holding the Yearly "Final Report <year>.xlsx" Workbooks
ub_data_dir = "/content/drive/MyDrive/Thesis/Codes/UB Datasets/"

# Columns Used for High School Data
## Years 2007-08 to 2011-12
hs_columns_1 = ["SSN", "DOB", "GenderCD", "Ethnic", "Race1", "Race2", "Race3", "Race4", "Race5", "LEPEntry", "EligibilityCD", 
                "NeedCD1", "NCESSchID", "ProjEntryDT", "EnterGradeLV", "PartLV", "StartGradeLV", "EndGradeLV", "HSGPA1", 
                "HsGRAD", "HsgradDT", "HSProficientRLA", "HSProficientMATH", "Employ", "CultAct", "CmtySer", "LEPInstruct", 
                "StreetAddress", "City_State", "ZipCode", "Absences", "SAT_Reading", "SAT_Math", "SAT_Writing", "AP", "Honors"]
## Years 2012-13 to 2014-15
hs_columns_2 = ["SSN", "DOB", "GenderCD", "Ethnic", "Race1", "Race2", "Race3", "Race4", "Race5", "EligibilityCD", 
                "AtRiskStdTest", "AtRiskLowGPA", "OtherNeed", "LEPEntry", "Disconnected", "NCESSchID", "ProjEntryDT", 
                "EnterGradeLV", "PartLV", "StartGradeLV", "EndGradeLV", "HSGPA", "HsGRAD", "HsgradDT", "HSProficientRLA", 
                "HSProficientMATH", "Employ", "CultAct", "CmtySer", "LEPInstruct", "StreetAddress", "City_State", "ZipCode", 
                "Absences", "SAT_Reading", "SAT_Math", "SAT_Writing", "AP", "Honors"]

# Columns Used for College Data
college_columns = ["SSN", "FirstEnrollDT", "PSECohort", "PSECDFE", "PSEGradeLV", "CertificateCD", "CertificateDT", 
                   "AssocDegreeCD", "AssocDegreeDT", "BachDegreeCD", "BachDegreeDT", "PSCompleteNum", "STEMDegree"]

# Create a Dictionary to Store All the Information for Each Student
## Keys Are the Student IDs; Values Are the Per-Student Field Dictionaries
students_dict = {}

# Extract Data from Each File
for file_index, year in enumerate(years):
    start_year = int(year[:4])

    # Every Report Year Feeds the High School Pass (<= 2014), the College Pass (>= 2013), or Both,
    # so the Workbook Is Read Once and Shared by the Two Passes
    df = pd.read_excel(ub_data_dir + "Final Report " + year + ".xlsx",
                       nrows=file_nrows[file_index], usecols=file_cols[file_index])

    # High School
    ## Only Use the Years: 2007-08, 2008-09, 2009-10, 2010-11, 2011-12, 2012-13, 2013-14, 2014-15
    if start_year <= 2014:
        # Keep Only High School Seniors Who Attended College the Following Year
        graduating_seniors = df[(df.StartGradeLV == 12) & (df.EndGradeLV == 13)]

        # Determine Which Column Set to Use
        ## New Columns Were Added in 2012-13
        hs_columns = hs_columns_1 if start_year <= 2011 else hs_columns_2

        # Drop Unnecessary Columns
        graduating_seniors = graduating_seniors.loc[:, hs_columns]

        # Iterate through the Students
        for ssn in graduating_seniors.SSN:
            # Skip over Students Already in the Dictionary
            ## Accounts for the Errors in the Data Where Students from Previous Years Were Just Pasted to the Bottom of the
            ## Document without Updating Their Grade Levels
            ## (the First Character of the SSN Entry Is Dropped to Match the Dictionary Keys)
            if str(ssn)[1:] in students_dict:
                continue

            student_row = graduating_seniors[graduating_seniors.SSN == ssn]

            # Make Each Graduating Senior an Instance of the HS_Student Class
            hs_student = HS_Student(student_row, start_year)

            # Add the Student to the Student Dictionary
            students_dict[hs_student.getID()] = hs_student.create_dictionary()

    # College
    ## Only Use the Years: 2013-14, 2014-15, 2015-16, 2016-17, 2017-18, 2018-19, 2019-20, 2020-21
    if start_year >= 2013:
        # Determine the Report's Graduation Cohort (File's End Year - 6)
        ## Upward Bound Students Are Expected to Graduate from College within Six Years of Graduating High School
        cohort = int("20" + year[5:]) - 6

        # Keep Only College Students in the Year's Graduation Cohort
        college_students = df[df.PSECohort == cohort]

        # Drop Unnecessary Columns
        college_students = college_students.loc[:, college_columns]

        # Iterate through the Students
        for ssn in college_students.SSN:
            # Ensure the Student Is Already in the Dictionary
            ## A Student without a High School Record Is Reported and Skipped; Falling Through to the
            ## Dictionary Lookup Below Would Raise a KeyError and Abort the Whole Run
            ## NOTE(review): This Prints a Raw SSN Entry into the Cell Output - Clear It before Sharing
            if str(ssn)[1:] not in students_dict:
                print("New Student in year", year + "!")
                print(ssn)
                continue

            student_row = college_students[college_students.SSN == ssn]

            # Make Each College Student an Instance of the College_Student Class
            college_student = College_Student(student_row, start_year)

            # Add the College Fields to the Student's Existing Dictionary
            college_student.add_to_dictionary(students_dict[college_student.getID()])

In [6]:
# Number of Unique Student IDs Collected (students_dict Has One Entry per Student)
len(students_dict)

150

## <font color=royalblue>Create a DataFrame</font>


In [7]:
# Create a DataFrame with All the Students and Their Relevant Upward Bound Data
## Each Row is a Student and Each Column is a Predictor
## The Outer Dictionary Keys (Student IDs) Start Out as Columns, so the Frame Is Transposed to Put Them in the Index
## Students Whose Dictionary Never Received the College Fields Get Null Values in Those Columns
students_df = pd.DataFrame(students_dict).T

In [8]:
# Summarize the Columns, Non-Null Counts, and Datatypes
## NOTE(review): The Index Holds the SSN-Derived Student IDs, and the "Index: ... entries, X to Y" Line of This
## Output Prints Two of Them - Clear This Cell's Output before Sharing the Notebook
students_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150 entries, 138156102 to 135046565
Data columns (total 48 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   DOB                  150 non-null    object
 1   Gender               150 non-null    object
 2   Hispanic             150 non-null    object
 3   AmerInd              150 non-null    object
 4   Asian                150 non-null    object
 5   Black                150 non-null    object
 6   White                150 non-null    object
 7   Hawaiian             150 non-null    object
 8   LimitedEnglish       150 non-null    object
 9   Eligibility          150 non-null    object
 10  AcademNeed           150 non-null    object
 11  Disconnected         150 non-null    object
 12  HS_SchoolID          150 non-null    object
 13  Date_EnteredUB       150 non-null    object
 14  Grade_EnteredUB      150 non-null    object
 15  Participation        150 non-null    object
 16 

In [9]:
# Determine Which Students Are Missing College Data
## A Null Enrollment Date Means No College Fields Were Ever Added for the Student
students_df.loc[students_df["PS_DateEnrolled"].isna()].shape

(3, 48)

In [10]:
# Determine the Graduation Years of the Students to Verify the Data Is Not Mislabeled
students_df.loc[students_df["PS_DateEnrolled"].isna(), "GraduationDate"].values

array([datetime.date(2008, 6, 15), datetime.date(2014, 6, 15),
       datetime.date(2015, 6, 15)], dtype=object)

## <font color=royalblue>Write the Final Dataset to a .csv File</font>

In [11]:
# Drop the 3 Students Who Are Missing All College Data
## Keep Only the Rows Whose Enrollment Date Is Present
final_df = students_df.loc[students_df["PS_DateEnrolled"].notna()]

In [12]:
# Summarize the Filtered Dataset
## NOTE(review): The Index Still Holds the SSN-Derived Student IDs at This Point, and the "Index: ..." Line of This
## Output Prints Two of Them - Clear This Cell's Output before Sharing the Notebook
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 147 entries, 138156102 to 135046565
Data columns (total 48 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   DOB                  147 non-null    object
 1   Gender               147 non-null    object
 2   Hispanic             147 non-null    object
 3   AmerInd              147 non-null    object
 4   Asian                147 non-null    object
 5   Black                147 non-null    object
 6   White                147 non-null    object
 7   Hawaiian             147 non-null    object
 8   LimitedEnglish       147 non-null    object
 9   Eligibility          147 non-null    object
 10  AcademNeed           147 non-null    object
 11  Disconnected         147 non-null    object
 12  HS_SchoolID          147 non-null    object
 13  Date_EnteredUB       147 non-null    object
 14  Grade_EnteredUB      147 non-null    object
 15  Participation        147 non-null    object
 16 

In [13]:
# Remove the SSNs (Used as the Index) from the DataFrame
## Replace the Index with Non-Identifying Row Numbers
final_df = final_df.reset_index(drop=True)
final_df

Unnamed: 0,DOB,Gender,Hispanic,AmerInd,Asian,Black,White,Hawaiian,LimitedEnglish,Eligibility,...,PS_SchoolID,PS_GradeLevel,Cert_or_Diploma,Cert_Date,AssocDeg,AssocDeg_Date,BachDeg,BachDeg_Date,PS_Graduated,STEMDeg
0,1989-11-08,1,0,0,1,0,0,0,1,1,...,#009344,5,2,"(2222, 22, 22)",2,"(2222, 22, 22)",1,"(2012, 5, 16)",1,1
1,1990-10-28,1,0,0,0,1,0,0,0,1,...,#002629,7,2,"(2222, 22, 22)",2,"(2222, 22, 22)",7,"(7777, 77, 77)",0,0
2,1990-12-07,1,1,0,0,0,0,0,0,1,...,#001397,5,2,"(2222, 22, 22)",2,"(2222, 22, 22)",1,"(2012, 5, 15)",1,-1
3,1989-12-27,1,0,0,0,1,0,0,0,1,...,#007502,5,2,"(2222, 22, 22)",2,"(2222, 22, 22)",1,"(2013, 1, 13)",9,9
4,1990-04-10,1,0,0,0,1,0,0,0,2,...,#002629,3,2,"(2222, 22, 22)",2,"(2222, 22, 22)",6,"(6666, 66, 66)",0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,1996-08-21,0,0,0,0,1,0,0,0,1,...,#002617,7,2,"(2222, 22, 22)",7,"(7777, 77, 77)",7,"(7777, 77, 77)",0,0
143,1997-05-07,1,1,0,0,0,1,0,0,1,...,#002629,5,2,"(2222, 22, 22)",2,"(2222, 22, 22)",1,"(2021, 5, 1)",1,1
144,1997-05-02,0,1,0,0,0,1,0,0,1,...,#009344,5,2,"(2222, 22, 22)",2,"(2222, 22, 22)",1,"(2019, 12, 21)",1,0
145,1997-06-19,1,1,0,0,0,0,0,0,2,...,#009344,7,2,"(2222, 22, 22)",7,"(7777, 77, 77)",7,"(7777, 77, 77)",0,0


In [None]:
# Write the DataFrame to a .csv File
## index_label="ID" Names the Index Column in the File; after the Index Reset These Are Row Numbers, Not SSNs
final_df.to_csv("/content/drive/MyDrive/Thesis/Codes/Combining Datasets/ub_data.csv", index_label="ID")