# <font color=deeppink>**Create a Dataset to Use for EDA**</font>

## <font color=royalblue>Load Necessary Libraries</font>

In [None]:
import pandas as pd
import numpy as np
import datetime
import csv

## <font color=royalblue>Pertinent Functions</font>

In [None]:
# The Data Was Formatted So That 0 Indicates "Male", 1 Indicates "Female", and -1 Indicates "Unknown"
def gender_labels(numericalValue):
    """Translate a numeric gender code into its display label.

    Codes: 0 -> "Male", 1 -> "Female", -1 -> "Unknown".
    Any other value yields "Error".
    """
    code_to_label = (
        (0, "Male"),
        (1, "Female"),
        (-1, "Unknown"),
    )

    # Compare Against Each Known Code in Order; the First Match Wins
    for code, label in code_to_label:
        if numericalValue == code:
            return label

    return "Error"

# The Data Was Formatted So That 0 Indicates "No" and 1 Indicates "Yes"
def yes_no_labels(numericalValue):
    """Translate a 0/1 flag into "No"/"Yes".

    Any value other than 0 or 1 yields "Error".
    """
    if numericalValue == 0:
        return "No"
    if numericalValue == 1:
        return "Yes"
    return "Error"

def eligibility_labels(numericalValue):
    """Translate a numeric eligibility code into its display label.

    Codes 1-7 map to the combinations of low income, first generation,
    and at risk for academic failure listed below; -1 maps to "Unknown".
    Any other value yields "Error".
    """
    code_to_label = (
        (1, "Low Income & First Generation"),
        (2, "Low Income"),
        (3, "First Generation"),
        (4, "At Risk for Academic Failure"),
        (5, "Low Income & At Risk for Academic Failure"),
        (6, "First Generation & At Risk for Academic Failure"),
        (7, "Low Income, First Generation, & At Risk for Academic Failure"),
        (-1, "Unknown"),
    )

    # First Matching Code Wins; Fall Back to "Error" When Nothing Matches
    return next(
        (label for code, label in code_to_label if numericalValue == code),
        "Error",
    )

def academic_need_labels(numericalValue):
    """Translate a numeric academic-need code into its display label.

    Codes 1-16 map to the labels listed below; -1 maps to "Unknown".
    Any other value yields "Error".
    """
    code_to_label = (
        (1, "Low GPA"),
        (2, "Low Test Scores"),
        (3, "Low Educational Aspirations"),
        (4, "Low GPA & Low Educational Aspirations"),
        (5, "Low GPA & Low Test Scores"),
        (6, "Low Test Scores & Low Educational Aspirations"),
        (7, "Lack of Opportunity/Support"),
        (8, "Lack of Career Goals"),
        (9, "Limited English Proficiency"),
        (10, "Lack of Confidence/Self Esteem/Social Skills"),
        (11, "Predominately Low Income Community"),
        (12, "Rural Isolation"),
        (13, "Interested in Math/Science"),
        (14, "Other"),
        (15, "Diagnosed Learning Disability"),
        (16, "Pre-Algebra or Algebra Not Completed"),
        (-1, "Unknown"),
    )

    # Compare Against Each Known Code in Order; the First Match Wins
    for code, label in code_to_label:
        if numericalValue == code:
            return label

    return "Error"

# The Data Was Formatted So That 0 or 2 Indicates "No", 1 Indicates "Yes", 9 Indicates "Not Applicable", and -1 Indicates "Unknown"
def categorical_labels(numericalValue):
    """Translate a numeric categorical code into its display label.

    Codes: 0 or 2 -> "No", 1 -> "Yes", 9 -> "Not Applicable",
    -1 -> "Unknown". Any other value yields "Error".
    """
    # Both 0 and 2 Are Treated as "No"
    if numericalValue == 0 or numericalValue == 2:
        return "No"
    if numericalValue == 1:
        return "Yes"
    if numericalValue == 9:
        return "Not Applicable"
    if numericalValue == -1:
        # Spelled "Unknown" (Previously Misspelled "Unkown")
        return "Unknown"
    return "Error"

def grade_entered_labels(numericalValue):
    """Translate a grade number (9-12) into its class-year name.

    9 -> "Freshman", 10 -> "Sophomore", 11 -> "Junior", 12 -> "Senior".
    Any other value yields "Error".
    """
    grade_to_label = (
        (9, "Freshman"),
        (10, "Sophomore"),
        (11, "Junior"),
        (12, "Senior"),
    )

    # First Matching Grade Wins; Fall Back to "Error" When Nothing Matches
    return next(
        (label for grade, label in grade_to_label if numericalValue == grade),
        "Error",
    )

def participation_labels(numericalValue):
    """Translate a numeric participation code into its display label.

    Codes 1-6 map to the labels listed below; -1 maps to "Unknown".
    Any other value yields "Error".
    """
    code_to_label = (
        # 1: Academic Year And Summer Components
        (1, "Full Year"),
        (2, "Academic Year And Summer Bridge"),
        (3, "Academic Year Only"),
        (4, "Summer Component Only"),
        (5, "Summer Bridge Only"),
        # 6: Prior-Year Participant
        (6, "Not Active This Year"),
        (-1, "Unknown"),
    )

    # Compare Against Each Known Code in Order; the First Match Wins
    for code, label in code_to_label:
        if numericalValue == code:
            return label

    return "Error"

def course_labels(numericalValue):
    """Translate a course-taking code into its display label.

    1 -> "Took At Least One", 0 -> "Took No Such Courses",
    -1 -> "Unknown". Any other value yields "Error".
    """
    # 1: Student Took at Least One Course of This Level
    if numericalValue == 1:
        return "Took At Least One"
    # 0: Student Did Not Take Any Courses of This Level
    if numericalValue == 0:
        return "Took No Such Courses"
    if numericalValue == -1:
        return "Unknown"
    return "Error"


def degree_labels(numericalValue):
    """Translate a numeric degree-status code into its display label.

    Recognized codes are 1, 2, 3, 5, 6, 7, 9, and -1 (see the table below).
    Any other value yields "Error".
    """
    code_to_label = (
        (1, "Yes"),
        # 2: No; Pursued Another Kind of Postsecondary Credential
        (2, "Pursued Another Kind of Postsecondary Credential"),
        (3, "Deferred Enrollment"),
        # 5: Transferred to Four-Year Without Completing Associate
        (5, "Transferred to Four-Year"),
        (6, "Currently Enrolled"),
        # 7: Left Postsecondary Program without Completing It
        (7, "Dropped Out"),
        (9, "Not Applicable"),
        (-1, "Unknown"),
    )

    # First Matching Code Wins; Fall Back to "Error" When Nothing Matches
    return next(
        (label for code, label in code_to_label if numericalValue == code),
        "Error",
    )

def stem_degree_labels(numericalValue):
    """Translate a numeric STEM-degree code into its display label.

    1 and 2 are the two "Yes" variants handled here; every other value is
    delegated to categorical_labels.
    """
    # 1: Yes; "Hard" Science
    if numericalValue == 1:
        return "Yes; 'Hard' Science"
    # 2: Yes; Psychology or Social Science
    if numericalValue == 2:
        return "Yes; Psychology or Social Science"
    # Everything Else Uses the Generic Categorical Labels
    return categorical_labels(numericalValue)

## <font color=royalblue>Read In Data</font>

In [None]:
# Load the Final Dataset, Using the "ID" Column as the Row Index
eda_df = pd.read_csv(
    "/content/drive/MyDrive/Thesis/Codes/Final Datasets/cjb_final_dataset.csv",
    index_col="ID",
)
eda_df

Unnamed: 0_level_0,Gender,Hispanic,AmerInd,Asian,Black,White,Hawaiian,Multinational,LimitedEnglish,Eligibility,...,Student_RapesCateg,Student_RobberiesCateg,Student_AssaultsCateg,Student_PropertyCrimesCateg,Student_BurglariesCateg,Student_LarceniesCateg,Student_VehicleTheftsCateg,Distance_to_HS,Distance_to_PS,PS_Graduated
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,1,0,0,0,0,1,...,Low,High,High,Low,High,Low,High,1.527317,2.651641,0
1,1,1,0,0,0,0,0,0,0,2,...,Low,High,High,High,High,Low,High,3.227670,27.026511,1
2,0,1,0,0,0,0,0,0,0,3,...,Low,High,High,High,High,Low,High,1.803466,5.922479,0
3,0,0,0,0,1,0,0,0,0,1,...,Low,High,High,Low,High,Low,High,1.287508,20.718053,0
4,1,1,0,0,0,0,0,0,0,3,...,Low,High,Low,Low,High,Low,High,4.410934,2.378576,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,0,1,0,0,0,0,0,0,0,1,...,Low,High,Low,Low,Low,Low,High,19.355286,20.589324,0
120,1,0,0,0,0,1,0,0,0,1,...,Low,High,Low,Low,High,Low,High,22.010710,161.404493,0
121,0,1,0,0,0,0,0,0,0,1,...,Low,High,Low,Low,Low,Low,High,2.954628,18.060702,1
122,1,1,0,0,0,0,0,0,1,1,...,Low,High,High,High,High,Low,High,21.245041,88.935781,1


## <font color=royalblue>Replace the Numerical Values with Clear Labels</font>

#### <font color=darkviolet>Gender</font>

In [None]:
# Relabel the Numeric Gender Codes, Then Print the Count of Each Label
eda_df["Gender"] = eda_df["Gender"].transform(gender_labels)

print(eda_df["Gender"].value_counts())

Female    69
Male      55
Name: Gender, dtype: int64


#### <font color=darkviolet>Ethnicity and Race</font>

In [None]:
races = ["Hispanic", "AmerInd", "Asian", "Black", "White", "Hawaiian", "Multinational"]

# Relabel Every Race/Ethnicity Column with yes_no_labels and Print Its Label Counts
for race in races:
    eda_df[race] = eda_df[race].transform(yes_no_labels)

    print(race)
    print(eda_df[race].value_counts(), "\n")

Hispanic
No     71
Yes    53
Name: Hispanic, dtype: int64 

AmerInd
No    124
Name: AmerInd, dtype: int64 

Asian
No     102
Yes     22
Name: Asian, dtype: int64 

Black
No     78
Yes    46
Name: Black, dtype: int64 

White
No     105
Yes     19
Name: White, dtype: int64 

Hawaiian
No     123
Yes      1
Name: Hawaiian, dtype: int64 

Multinational
No     106
Yes     18
Name: Multinational, dtype: int64 



#### <font color=darkviolet>Limited English Proficiency</font>

In [None]:
# Relabel the Limited English Flag, Then Print the Count of Each Label
eda_df["LimitedEnglish"] = eda_df["LimitedEnglish"].transform(yes_no_labels)

print(eda_df["LimitedEnglish"].value_counts(), "\n")

No     117
Yes      7
Name: LimitedEnglish, dtype: int64 



#### <font color=darkviolet>Eligibility</font>

In [None]:
# Relabel the Eligibility Codes, Then Print the Count of Each Label
eda_df["Eligibility"] = eda_df["Eligibility"].transform(eligibility_labels)

print(eda_df["Eligibility"].value_counts(), "\n")

Low Income & First Generation    96
Low Income                       15
First Generation                 13
Name: Eligibility, dtype: int64 



#### <font color=darkviolet>Academic Need</font>

In [None]:
# Relabel the Academic Need Codes, Then Print the Count of Each Label
eda_df["AcademNeed"] = eda_df["AcademNeed"].transform(academic_need_labels)

print(eda_df["AcademNeed"].value_counts())

Predominately Low Income Community               50
Interested in Math/Science                       34
Lack of Career Goals                             17
Lack of Opportunity/Support                      10
Low GPA                                           5
Lack of Confidence/Self Esteem/Social Skills      4
Low Test Scores                                   2
Low GPA & Low Test Scores                         1
Low Test Scores & Low Educational Aspirations     1
Name: AcademNeed, dtype: int64


#### <font color=darkviolet>Upward Bound Entry Grade Level</font>

In [None]:
# Relabel the Entry Grade Numbers, Then Print the Count of Each Label
eda_df["Grade_EnteredUB"] = eda_df["Grade_EnteredUB"].transform(grade_entered_labels)

print(eda_df["Grade_EnteredUB"].value_counts())

Sophomore    67
Freshman     51
Senior        3
Junior        3
Name: Grade_EnteredUB, dtype: int64


#### <font color=darkviolet>Participation</font>

In [None]:
# Relabel the Participation Codes, Then Print the Count of Each Label
eda_df["Participation"] = eda_df["Participation"].transform(participation_labels)

print(eda_df["Participation"].value_counts())

Academic Year Only      60
Not Active This Year    51
Full Year               13
Name: Participation, dtype: int64


#### <font color=darkviolet>ELA Academic Achievement/Proficiency</font>

In [None]:
# Relabel the ELA Achievement Codes, Then Print the Count of Each Label
eda_df["AcademAch_ELA"] = eda_df["AcademAch_ELA"].transform(categorical_labels)

print(eda_df["AcademAch_ELA"].value_counts())

Yes               121
No                  2
Not Applicable      1
Name: AcademAch_ELA, dtype: int64


#### <font color=darkviolet>Math Academic Achievement/Proficiency</font>

In [None]:
# Relabel the Math Achievement Codes, Then Print the Count of Each Label
eda_df["AcademAch_Math"] = eda_df["AcademAch_Math"].transform(categorical_labels)

print(eda_df["AcademAch_Math"].value_counts())

Yes               118
No                  5
Not Applicable      1
Name: AcademAch_Math, dtype: int64


#### <font color=darkviolet>Employed</font>

In [None]:
# Relabel the Employment Codes, Then Print the Count of Each Label
eda_df["Employed"] = eda_df["Employed"].transform(categorical_labels)

print(eda_df["Employed"].value_counts())

No                69
Not Applicable    43
Yes               12
Name: Employed, dtype: int64


#### <font color=darkviolet>Participated in Cultural Activities</font>



In [None]:
# Relabel the Cultural Activity Codes, Then Print the Count of Each Label
eda_df["CulturalAct"] = eda_df["CulturalAct"].transform(categorical_labels)

print(eda_df["CulturalAct"].value_counts())

Yes               90
Not Applicable    32
No                 2
Name: CulturalAct, dtype: int64


#### <font color=darkviolet>Community Service</font>

In [None]:
# Relabel the Community Service Codes, Then Print the Count of Each Label
eda_df["CommServ"] = eda_df["CommServ"].transform(categorical_labels)

print(eda_df["CommServ"].value_counts())

Yes               47
Not Applicable    46
No                31
Name: CommServ, dtype: int64


#### <font color=darkviolet>Received Limited English Proficiency Instruction</font>

In [None]:
# Relabel the LEP Services Codes, Then Print the Count of Each Label
eda_df["LEPServs"] = eda_df["LEPServs"].transform(categorical_labels)

print(eda_df["LEPServs"].value_counts())

Not Applicable    124
Name: LEPServs, dtype: int64


#### <font color=darkviolet>Took AP Courses</font>

In [None]:
# Relabel the AP Course Codes, Then Print the Count of Each Label
eda_df["AP"] = eda_df["AP"].transform(course_labels)

print(eda_df["AP"].value_counts())

Took At Least One       82
Took No Such Courses    42
Name: AP, dtype: int64


#### <font color=darkviolet>Took Honors Courses</font>

In [None]:
# Relabel the Honors Course Codes, Then Print the Count of Each Label
eda_df["Honors"] = eda_df["Honors"].transform(course_labels)

print(eda_df["Honors"].value_counts())

Took At Least One       111
Took No Such Courses     13
Name: Honors, dtype: int64


#### <font color=darkviolet>Received Certificate or Diploma</font>

In [None]:
# Relabel the Certificate/Diploma Codes, Then Print the Count of Each Label
eda_df["Cert_or_Diploma"] = eda_df["Cert_or_Diploma"].transform(degree_labels)

print(eda_df["Cert_or_Diploma"].value_counts())

Pursued Another Kind of Postsecondary Credential    119
Yes                                                   5
Name: Cert_or_Diploma, dtype: int64


#### <font color=darkviolet>Associate Degree</font>

In [None]:
# Relabel the Associate Degree Codes, Then Print the Count of Each Label
eda_df["AssocDeg"] = eda_df["AssocDeg"].transform(degree_labels)

print(eda_df["AssocDeg"].value_counts())

Pursued Another Kind of Postsecondary Credential    87
Dropped Out                                         18
Currently Enrolled                                   8
Yes                                                  6
Transferred to Four-Year                             5
Name: AssocDeg, dtype: int64


#### <font color=darkviolet>Bachelor's Degree</font>

In [None]:
# Relabel the Bachelor's Degree Codes, Then Print the Count of Each Label
eda_df["BachDeg"] = eda_df["BachDeg"].transform(degree_labels)

print(eda_df["BachDeg"].value_counts())

Yes                                                 67
Dropped Out                                         34
Pursued Another Kind of Postsecondary Credential    12
Currently Enrolled                                  11
Name: BachDeg, dtype: int64


#### <font color=darkviolet>Graduated College</font>

In [None]:
# Relabel the Postsecondary Graduation Codes, Then Print the Count of Each Label
eda_df["PS_Graduated"] = eda_df["PS_Graduated"].transform(categorical_labels)

print(eda_df["PS_Graduated"].value_counts())

Yes    69
No     55
Name: PS_Graduated, dtype: int64


#### <font color=darkviolet>Pursued a STEM Degree</font>

In [None]:
# Relabel the STEM Degree Codes, Then Print the Count of Each Label
eda_df["STEMDeg"] = eda_df["STEMDeg"].transform(stem_degree_labels)

print(eda_df["STEMDeg"].value_counts())

No                                   73
Yes; 'Hard' Science                  32
Yes; Psychology or Social Science    11
Not Applicable                        8
Name: STEMDeg, dtype: int64


#### <font color=darkviolet>Attended a Charter High School</font>

In [None]:
# Relabel the Charter School Codes, Then Print the Count of Each Label
eda_df["Charter"] = eda_df["Charter"].transform(categorical_labels)

print(eda_df["Charter"].value_counts())

No    124
Name: Charter, dtype: int64


#### <font color=darkviolet>Attended a Magnet High School</font>

In [None]:
# Relabel the Magnet School Codes, Then Print the Count of Each Label
eda_df["Magnet"] = eda_df["Magnet"].transform(categorical_labels)

print(eda_df["Magnet"].value_counts())

Not Applicable    124
Name: Magnet, dtype: int64


#### <font color=darkviolet>Attended a Title I High School</font>

In [None]:
# Relabel the Title I School Codes, Then Print the Count of Each Label
eda_df["Title1_School"] = eda_df["Title1_School"].transform(categorical_labels)

print(eda_df["Title1_School"].value_counts())

Yes    123
No       1
Name: Title1_School, dtype: int64


#### <font color=darkviolet>High School Had a Title I School-Wide Program</font>

In [None]:
# Relabel the Title I School-Wide Program Codes, Then Print the Count of Each Label
eda_df["Title1_SchoolWide"] = eda_df["Title1_SchoolWide"].transform(categorical_labels)

print(eda_df["Title1_SchoolWide"].value_counts())

Yes               72
No                51
Not Applicable     1
Name: Title1_SchoolWide, dtype: int64


#### <font color=darkviolet>Student's City</font>

In [None]:
# Take the Second-to-Last Space-Separated Token of the Full Address as the City,
# Then Drop Its Final Character (the Trailing Comma)
# NOTE(review): Presumably a Multi-Word City Name Would Keep Only Its Last Word -- Confirm Against the Address Format
eda_df["Student_City"] = (
    eda_df["Student_FullAddress"]
    .transform(lambda address: address.split(" ")[-2])
    .transform(lambda token: token[:-1])
)

eda_df["Student_City"]

ID
0      Paterson
1      Paterson
2      Paterson
3      Paterson
4      Paterson
         ...   
119    Paterson
120    Paterson
121    Paterson
122    Paterson
123      Newark
Name: Student_City, Length: 124, dtype: object

#### <font color=darkviolet>High School's City</font>

In [None]:
# Step 1: Take the Second-to-Last Space-Separated Token of the Full Address as the City
# Step 2: Drop Its Final Character (the Trailing Comma) and Lowercase the Rest
# Step 3: Uppercase Only the First Letter
# NOTE(review): Presumably a Multi-Word City Name Would Keep Only Its Last Word -- Confirm Against the Address Format
eda_df["HS_City"] = (
    eda_df["HS_FullAddress"]
    .transform(lambda address: address.split(" ")[-2])
    .transform(lambda token: token[:-1].lower())
    .transform(lambda city: str(city[0]).upper() + str(city[1:]))
)

eda_df["HS_City"]

ID
0      Paterson
1      Paterson
2      Paterson
3      Paterson
4      Paterson
         ...   
119      Newark
120      Newark
121    Paterson
122      Newark
123      Newark
Name: HS_City, Length: 124, dtype: object

## <font color=royalblue>Write the Final Dataset to a .csv File</font>



In [None]:
# Display the DataFrame as the Cell's Output
eda_df

Unnamed: 0_level_0,Gender,Hispanic,AmerInd,Asian,Black,White,Hawaiian,Multinational,LimitedEnglish,Eligibility,...,Student_AssaultsCateg,Student_PropertyCrimesCateg,Student_BurglariesCateg,Student_LarceniesCateg,Student_VehicleTheftsCateg,Distance_to_HS,Distance_to_PS,PS_Graduated,Student_City,HS_City
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Male,No,No,No,Yes,No,No,No,No,Low Income & First Generation,...,High,Low,High,Low,High,1.527317,2.651641,No,Paterson,Paterson
1,Female,Yes,No,No,No,No,No,No,No,Low Income,...,High,High,High,Low,High,3.227670,27.026511,Yes,Paterson,Paterson
2,Male,Yes,No,No,No,No,No,No,No,First Generation,...,High,High,High,Low,High,1.803466,5.922479,No,Paterson,Paterson
3,Male,No,No,No,Yes,No,No,No,No,Low Income & First Generation,...,High,Low,High,Low,High,1.287508,20.718053,No,Paterson,Paterson
4,Female,Yes,No,No,No,No,No,No,No,First Generation,...,Low,Low,High,Low,High,4.410934,2.378576,Yes,Paterson,Paterson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,Male,Yes,No,No,No,No,No,No,No,Low Income & First Generation,...,Low,Low,Low,Low,High,19.355286,20.589324,No,Paterson,Newark
120,Female,No,No,No,No,Yes,No,No,No,Low Income & First Generation,...,Low,Low,High,Low,High,22.010710,161.404493,No,Paterson,Newark
121,Male,Yes,No,No,No,No,No,No,No,Low Income & First Generation,...,Low,Low,Low,Low,High,2.954628,18.060702,Yes,Paterson,Paterson
122,Female,Yes,No,No,No,No,No,No,Yes,Low Income & First Generation,...,High,High,High,Low,High,21.245041,88.935781,Yes,Paterson,Newark


In [None]:
# Save the DataFrame as a .csv File, Labeling the Index Column "ID"
eda_df.to_csv(
    "/content/drive/MyDrive/Thesis/Codes/Final Datasets/cjb_final_dataset_eda.csv",
    index_label="ID",
)