# GCS Driveline Sorting

## 1. Load Data:

In [2]:
# Load in libraries
import pandas as pd

# CHANGE EACH YEAR: name of the Excel export to read.
FILENAME = 'DrivelineData_2023.xls'

# Read the first worksheet and rename its columns to student / grade / parent.
# Opening the workbook in a `with` block closes the file handle as soon as
# parsing finishes, instead of leaving it open for the life of the kernel.
with pd.ExcelFile(FILENAME) as xl:
    df = xl.parse(sheet_name=0, names=['student', 'grade', 'parent'])

# Order the rows by student, then by parent.
df = df.sort_values(['student', 'parent'])
df.head()

Unnamed: 0,student,grade,parent
1,"Abbott, Mercy",8,"Abbott, Joshua and Michelle"
2,"Akers, Nash",6,"Akers, Trent and Necole"
3,"Allen, Brody",7,"Allen, Joshua and Carissa"
4,"Allen, Neveya",5,"Allen, Joshua and Carissa"
5,"Allison, Bennett",1,"Allison, Brian and Lily"


## 2. Clean and Format Data:

We need to do a few things before we can assign release groups:
1. Remove duplicate students. 
2. Convert 'PK4' and 'K5' to numbers that we can sort on. We'll make K5 equal to 0
and PK4 equal to -1. We'll convert them back at the end.
3. Convert grade to an integer. Right now they are all stored as string objects.

In [3]:
# --- Remove repeated students ---
# Only the first row for each student name is kept.
# NOTE(review): this runs before the missing-grade filter below, so a student
# whose first row has no grade is removed entirely even if a later duplicate
# row has one -- confirm that is intended.
starting_len = df.shape[0]
df.drop_duplicates(subset='student', inplace=True)
ending_len = df.shape[0]
print(f'Dropped {starting_len - ending_len} duplicate students.')

# --- Remove students without a grade ---
df.dropna(subset=['grade'], inplace=True)
no_grade_len = df.shape[0]
print(f'Dropped {ending_len - no_grade_len} student(s) with missing grades.')

# --- Build a sortable numeric grade in 'grade_new' ---
# 'PK4' becomes -1 and 'K5' becomes 0; every other grade goes through int().
special_grades = {'PK4': -1, 'K5': 0}
df['grade_new'] = df['grade'].apply(
    lambda grade: special_grades[grade] if grade in special_grades else int(grade)
)
df.head(10)

Dropped 149 duplicate students.
Dropped 102 student(s) with missing grades.


Unnamed: 0,student,grade,parent,grade_new
1,"Abbott, Mercy",08,"Abbott, Joshua and Michelle",8
2,"Akers, Nash",06,"Akers, Trent and Necole",6
3,"Allen, Brody",07,"Allen, Joshua and Carissa",7
4,"Allen, Neveya",05,"Allen, Joshua and Carissa",5
5,"Allison, Bennett",01,"Allison, Brian and Lily",1
6,"Allison, Donovan",PK4,"Allison, Brian and Lily",-1
7,"Allison, Mila",03,"Allison, Brian and Lily",3
12,"Arredondo, Peyton",05,"Arredondo, Krista and Benjamin",5
13,"Arreola, Elias",01,"Arreola, Elias and Arlene",1
14,"Aston, Ryken",08,"Aston, Ian and Kimberly",8


In [4]:
# Give every student in a family the same release grade: the family's lowest
# grade when transform is 'min' (youngest child), or its highest grade when
# transform is 'max' (oldest child). Families are identified by 'parent'.
transform = 'min'
df['release_grade'] = df['grade_new'].groupby(df['parent']).transform(transform)
df.head(10)

Unnamed: 0,student,grade,parent,grade_new,release_grade
1,"Abbott, Mercy",08,"Abbott, Joshua and Michelle",8,8
2,"Akers, Nash",06,"Akers, Trent and Necole",6,6
3,"Allen, Brody",07,"Allen, Joshua and Carissa",7,5
4,"Allen, Neveya",05,"Allen, Joshua and Carissa",5,5
5,"Allison, Bennett",01,"Allison, Brian and Lily",1,-1
6,"Allison, Donovan",PK4,"Allison, Brian and Lily",-1,-1
7,"Allison, Mila",03,"Allison, Brian and Lily",3,-1
12,"Arredondo, Peyton",05,"Arredondo, Krista and Benjamin",5,5
13,"Arreola, Elias",01,"Arreola, Elias and Arlene",1,1
14,"Aston, Ryken",08,"Aston, Ian and Kimberly",8,8


In [5]:
# Restore the original labels for the two placeholder values: -1 is 'PK4' and
# 0 is 'K5'. Every other release grade passes through unchanged.
grade_labels = {-1: 'PK4', 0: 'K5'}
df['release_grade'] = df['release_grade'].apply(lambda grade: grade_labels.get(grade, grade))
# grade_new only existed to make the grades comparable, so remove it.
del df['grade_new']

In [6]:
# Count the students listed under each parent entry, then flag every student
# whose family has more than one student (1 = has a sibling, 0 = does not).
family_count_dict = df.groupby('parent')['student'].count().to_dict()
df['has_sibling'] = [int(family_count_dict[parent] > 1) for parent in df['parent']]

Now that we have the grade and the sibling flag, let's create some example groupings. We'll make three groups:
1. K-2
2. 3-5
3. 6-8 No Siblings

In [13]:
def release_group(row):
    """Return the release-group label for one student row.

    Only ``row['release_grade']`` is read. The value is matched against each
    band in order and the first matching label is returned:

    * 'PK4', 'K5', 1, 2 -> 'K-2'
    * 3, 4, 5           -> '3-5'
    * 6, 7, 8           -> '6-8 No Siblings'

    Anything else yields 'No Group'.

    NOTE(review): the '6-8 No Siblings' label is assigned without looking at
    the 'has_sibling' column -- confirm that is intended.
    """
    grade = row['release_grade']
    bands = (
        ('K-2', ('PK4', 'K5', 1, 2)),
        ('3-5', (3, 4, 5)),
        ('6-8 No Siblings', (6, 7, 8)),
    )
    for label, grades in bands:
        if grade in grades:
            return label
    return 'No Group'

# Label every student: release_group is called once per row of df.
df['release_group'] = df.apply(release_group, axis='columns')

In [14]:
# Number of students assigned to each release group.
df.loc[:, 'release_group'].value_counts()

K-2                288
3-5                135
6-8 No Siblings    100
Name: release_group, dtype: int64

In [16]:
# Summary by release group: number of students and number of distinct parent
# entries in each group.
df_group = df.groupby('release_group').agg({'student': 'count', 'parent': 'nunique'})

# The same two counts, broken out by release grade.
df_grade = df.groupby('release_grade').agg({'student': 'count', 'parent': 'nunique'})

# Output workbook name (the '.xlsx' extension is appended below).
SAVE_FILE_PATH = 'driveline2023_results_v1'

# Write each dataframe to its own worksheet. Using ExcelWriter as a context
# manager saves and closes the workbook on exit (even if a write fails); the
# explicit writer.save() call is deprecated and no longer exists in pandas 2.x.
with pd.ExcelWriter(f'{SAVE_FILE_PATH}.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Raw Data')
    df_group.to_excel(writer, sheet_name='Possible Group Summary')
    df_grade.to_excel(writer, sheet_name='Release Grade Summary')

  writer.save()
