# Anonymization of iLearn Data

**PURPOSE** <br>
As of 02.02.2023, the ALMA IDs assigned to students are not mapped to their student IDs. For example, in one CSV, student 912345678 is mapped to ALMA ID 1, while in another CSV, 
another student ID is mapped to ALMA ID 1. 

This script aims to solve this problem through a program that takes in two inputs: (1) the CSV that needs to be modified, and (2) a CSV or TXT file that holds 
the mappings of the ALMA IDs and student IDS.


## Implementation

### V0 in Jupyter

In [68]:
import pandas as pd
import numpy as np
import pickle
import random
import json
import datetime
import re
import glob
import sys
import os
from datetime import datetime

# datetime object containing current date and time
 
# print("now =", now)

# dd/mm/YY H:M:S
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
print("date and time =", dt_string)
dt_string

date and time = 25_02_2023_15:59:51


'25_02_2023_15:59:51'

In [40]:
# get the alma keys path
key_dir = "alma_id_mapping.txt"

In [41]:
def jsonKeys2int(x):
    """
    Helper function to reconstruct json keys into ints. Used to reconstruct key file to a dictionary
    :param: x = jsonified dictionary from a text file
    """
    if isinstance(x, dict):
            return {int(k):v for k,v in x.items()}
    return x

In [64]:
def generate_folder(dir_path):
    folder_dic = {"season": "", "year": "", "course_name": "", "section": ""}
    output_path = ""
   
    try:
        season = re.search("Fall|Spring|FALL|SPRING", dir_path).group()
        # print(season)
        year = re.search("20\d+", dir_path).group()
        # print(year)
        course_name = re.search("((PHYS|CHEM|MATH|BIO|ASTR|CSC|SCI))\d+", dir_path).group()
        course_name = re.search("^[a-zA-Z]+", course_name).group() + " " + re.search("\d+", course_name).group()
        # print(course_name)
        section = "Section " + re.search("(?<=-)\d+", dir_path).group()
        # print(section)
        folder_dic["season"] = season
        folder_dic["year"] = year
        folder_dic["course_name"] = course_name
        folder_dic["section"] = section
    except:
        print("Error in get_folder_names(). New folder creation unsuccessful.")

    if os.path.isdir(f"./output/{folder_dic['season']} {folder_dic['year']}/{folder_dic['course_name']}/{folder_dic['section']}"):
        print("Folder already exists")
    else:
        os.makedirs(f"./output/{folder_dic['season']} {folder_dic['year']}/{folder_dic['course_name']}/{folder_dic['section']}")
        print(f"Successfully created ./output/{folder_dic['season']} {folder_dic['year']}/{folder_dic['course_name']}/{folder_dic['section']} folder")
    
    out_path = f"./output/{folder_dic['season']} {folder_dic['year']}/{folder_dic['course_name']}/{folder_dic['section']}"
    
    return out_path

In [69]:
def anonymize_csv(input_path, output_path, key_path, file_type="csv" first=False):
    print("Anonymization CSV function")
    print("input_path", input_path)
    print("output_path", output_path)
    print("key_path", key_path)
    # Step 1: Reading
    csv_to_search = pd.read_csv(input_path)                 # read the csv
    sfsu_id_list = csv_to_search["SF State ID"].to_list()   # get list of SF State IDs
    csv_modify = csv_to_search.copy()                       # make a copy of the input csv
    csv_name = os.path.basename(input_path)[:-4]                 # get the name of the csv
    id_mapping = {}
    
    with open(key_path) as key_file:                        # read the alma key file
        data = key_file.read()

    if data:
        id_mapping = json.loads(data)
        id_mapping = jsonKeys2int(id_mapping)               # reconstruct key file from string to dictionary
    
    original_id_mapping = id_mapping.copy()
    
    with open(f"./id_mapping_history/id_mapping_{dt_string}.txt", 'w') as key_file:     # save a copy of last alma id mapping
        key_file.write(json.dumps(original_id_mapping))
        print(f"Copy of id_mapping on {dt_string} saved")
    
    # Step 2: ID assignment
    ## The idea: first ID read is 1, second ID is alma ID 2 
    if first:                                               # default false. first="true" if alma key file is empty.
        last_alma_id = 0                                    # the first sfsu id read is mapped to alma id 1
    else:
        last_alma_id = max(id_mapping.values())             # the max id of the alma key file is the previous id file 
        
    for i in sfsu_id_list:                                  # for each id
        if i not in id_mapping.keys():                      # if current id is not in the alma key file
            last_alma_id += 1                                  # assign an alma id that is previous alma id + 1
            id_mapping[i] = last_alma_id                       # save the new alma id 
    
    for i in csv_modify.index:                              # for each row/index in the copy/to-be-modified csv
        csv_modify.loc[i, ["Alma ID"]] = id_mapping[csv_modify.loc[i, ["SF State ID"]].item()] # assign the "Alma ID" of that row to the Alma ID mapped to that row's "SF State ID"
    
    # Step 3: Return modified csv and updated id_mapping
    with open('alma_id_mapping.txt', 'w') as key_file:     # update the alma key file 
        key_file.write(json.dumps(id_mapping))
    
    # print("id_mapping", id_mapping)
    
    csv_modify.drop(columns="SF State ID", inplace=True)   # drop the SF State ID in the modified csv
    csv_modify.to_csv(f"{output_path}/" + str(csv_name) + "_anonymized.csv", index=False) # output modified csv to the output folder
        
    out_file_name = f"{output_path}/" + str(csv_name) + "_anonymized.csv"
    return csv_modify, out_file_name, id_mapping

## Testing the program with two essays

In [25]:
essay_4 = pd.read_csv("./input/Spring2020_ASTR0116-01_Essay4.csv")
essay_5 = pd.read_csv("./input/Spring2020_ASTR0116-01_Essay5.csv")

In [26]:
essay_4

Unnamed: 0,Year,Semester,Class,Type,Section,Alma ID,SF State ID,Essay: 4
0,2020,Spring,ASTR 0116,--,1,1,918338062,
1,2020,Spring,ASTR 0116,--,1,2,920060541,Noah Word What I do when life gets challengin...
2,2020,Spring,ASTR 0116,--,1,3,912329098,I am fortunate enough to be the first in my fa...


In [27]:
essay_5

Unnamed: 0,Year,Semester,Class,Type,Section,Alma ID,SF State ID,Essay: 5
0,2020,Spring,ASTR 0116,--,1,1,920375492,
1,2020,Spring,ASTR 0116,--,1,2,918338062,
2,2020,Spring,ASTR 0116,--,1,3,920060541,Noah Word What I do when life gets challengin...
3,2020,Spring,ASTR 0116,--,1,4,912329098,I am fortunate enough to be the first in my fa...


**Problem:** Alma IDs are not consistent throughout each CSV. For example, SF State ID 920060541 is Alma ID 2 in essay_4, but Alma ID 3 in essay_5. We want 920060541 to be mapped to one singular ALMA ID.

In [31]:
essay_4_modified, id_map = anonymize_csv("./input/Spring2020_ASTR0116-01_Essay4.csv", "alma_id_mapping.txt")

id_mapping {918794739: 1, 918317587: 2, 920976365: 3, 920968110: 4, 916713413: 5, 918734211: 6, 918375840: 7, 918574896: 8, 921484275: 9, 920231751: 10, 921210794: 11, 918543605: 12, 921360437: 13, 917633722: 14, 920264615: 15, 920826878: 16, 916809496: 17, 915562523: 18, 920279825: 19, 918207802: 20, 917691000: 21, 920515164: 22, 920550602: 23, 918414814: 24, 920247468: 25, 920456755: 26, 918560076: 27, 915221260: 28, 920132782: 29, 920838448: 30, 920339911: 31, 916931852: 32, 917489864: 33, 917005939: 34, 917903017: 35, 920829296: 36, 918507608: 37, 918389347: 38, 918445572: 39, 920476320: 40, 916870011: 41, 918254758: 42, 917448771: 43, 921002066: 44, 921236456: 45, 918253328: 46, 918604575: 47, 918575169: 48, 918745144: 49, 915477386: 50, 918442582: 51, 910939554: 52, 918453567: 53, 917547662: 54, 917698410: 55, 920161265: 56, 921171378: 57, 920361036: 58, 915342316: 59, 920271440: 60, 917648165: 61, 920703495: 62, 916995188: 63, 918460756: 64, 918210493: 65, 918323671: 66, 9181999

In [32]:
essay_4_modified

Unnamed: 0,Year,Semester,Class,Type,Section,Alma ID,Essay: 4
0,2020,Spring,ASTR 0116,--,1,226,
1,2020,Spring,ASTR 0116,--,1,227,Noah Word What I do when life gets challengin...
2,2020,Spring,ASTR 0116,--,1,228,I am fortunate enough to be the first in my fa...


In [110]:
essay_4_modified.drop(columns="SF State ID", inplace=True)

In [115]:
essay_4_modified

Unnamed: 0,Year,Semester,Class,Type,Section,Alma ID,Essay: 4
0,2020,Spring,ASTR 0116,--,1,1,
1,2020,Spring,ASTR 0116,--,1,2,Noah Word What I do when life gets challengin...
2,2020,Spring,ASTR 0116,--,1,3,I am fortunate enough to be the first in my fa...


In [72]:
max(essay_4_modified["Alma ID"].to_list())

3

In [33]:
essay_5_modified, id_map = anonymize_csv("./input/Spring2020_ASTR0116-01_Essay5.csv", "./alma_id_mapping.txt")

id_mapping {918794739: 1, 918317587: 2, 920976365: 3, 920968110: 4, 916713413: 5, 918734211: 6, 918375840: 7, 918574896: 8, 921484275: 9, 920231751: 10, 921210794: 11, 918543605: 12, 921360437: 13, 917633722: 14, 920264615: 15, 920826878: 16, 916809496: 17, 915562523: 18, 920279825: 19, 918207802: 20, 917691000: 21, 920515164: 22, 920550602: 23, 918414814: 24, 920247468: 25, 920456755: 26, 918560076: 27, 915221260: 28, 920132782: 29, 920838448: 30, 920339911: 31, 916931852: 32, 917489864: 33, 917005939: 34, 917903017: 35, 920829296: 36, 918507608: 37, 918389347: 38, 918445572: 39, 920476320: 40, 916870011: 41, 918254758: 42, 917448771: 43, 921002066: 44, 921236456: 45, 918253328: 46, 918604575: 47, 918575169: 48, 918745144: 49, 915477386: 50, 918442582: 51, 910939554: 52, 918453567: 53, 917547662: 54, 917698410: 55, 920161265: 56, 921171378: 57, 920361036: 58, 915342316: 59, 920271440: 60, 917648165: 61, 920703495: 62, 916995188: 63, 918460756: 64, 918210493: 65, 918323671: 66, 9181999

In [34]:
essay_5_modified

Unnamed: 0,Year,Semester,Class,Type,Section,Alma ID,Essay: 5
0,2020,Spring,ASTR 0116,--,1,229,
1,2020,Spring,ASTR 0116,--,1,226,
2,2020,Spring,ASTR 0116,--,1,227,Noah Word What I do when life gets challengin...
3,2020,Spring,ASTR 0116,--,1,228,I am fortunate enough to be the first in my fa...


In [39]:
essay_5_modified

Unnamed: 0,Year,Semester,Class,Type,Section,Alma ID,Essay: 5
0,2020,Spring,ASTR 0116,--,1,4,
1,2020,Spring,ASTR 0116,--,1,1,
2,2020,Spring,ASTR 0116,--,1,2,Noah Word What I do when life gets challengin...
3,2020,Spring,ASTR 0116,--,1,3,I am fortunate enough to be the first in my fa...


**Comment**: Alma IDs are now consistent throughout essay 4 and essay 5. For essay 5, SF State ID 920060541 maintains ALMA ID 2.

## Running the program with the entire input directory

In [44]:
# get all input paths from the input folder
input_dir = sorted(glob.glob("./input/*.csv"))

In [50]:
input_dir

['./input/Fall2020_PHYS0102-02_Essay1.csv',
 './input/Fall2020_PHYS0102-02_Essay2.csv',
 './input/Fall2020_PHYS0102-02_Essay3.csv',
 './input/Fall2020_PHYS0102-02_Essay4.csv',
 './input/Fall2020_PHYS0102-02_Essay5.csv',
 './input/Fall2020_PHYS0112-07_Essay1.csv',
 './input/Fall2020_PHYS0112-07_Essay2.csv',
 './input/Fall2020_PHYS0112-07_Essay3.csv',
 './input/Fall2020_PHYS0112-07_Essay4.csv',
 './input/Fall2020_PHYS0112-07_Essay5.csv',
 './input/Fall2020_PHYS0122-01_Essay1.csv',
 './input/Fall2020_PHYS0122-01_Essay2.csv',
 './input/Fall2020_PHYS0122-01_Essay3.csv',
 './input/Fall2020_PHYS0122-01_Essay4.csv',
 './input/Fall2020_PHYS0122-01_Essay5.csv',
 './input/Fall2020_PHYS0122-01_Essay6.csv',
 './input/Fall2020_PHYS0122-04_Essay1.csv',
 './input/Fall2020_PHYS0122-04_Essay2.csv',
 './input/Fall2020_PHYS0122-04_Essay3.csv',
 './input/Fall2020_PHYS0122-04_Essay4.csv',
 './input/Fall2020_PHYS0122-04_Essay5.csv',
 './input/Fall2020_PHYS0122-04_Essay6.csv',
 './input/Fall2020_PHYS0122-08_E

In [66]:
for i in input_dir:
    print("Input: " + i)
    # if input_dir.index(i) == 0:
    #     anonymize_csv(i, key_dir, first=True)
    o = generate_folder(i)
    anonymize_csv(i, o, key_dir)

Input: ./input/Fall2020_PHYS0102-02_Essay1.csv
Folder already exists
Copy of id_mapping on 25_02_2023_15:59:51 saved
id_mapping {918794739: 1, 918317587: 2, 920976365: 3, 920968110: 4, 916713413: 5, 918734211: 6, 918375840: 7, 918574896: 8, 921484275: 9, 920231751: 10, 921210794: 11, 918543605: 12, 921360437: 13, 917633722: 14, 920264615: 15, 920826878: 16, 916809496: 17, 915562523: 18, 920279825: 19, 918207802: 20, 917691000: 21, 920515164: 22, 920550602: 23, 918414814: 24, 920247468: 25, 920456755: 26, 918560076: 27, 915221260: 28, 920132782: 29, 920838448: 30, 920339911: 31, 916931852: 32, 917489864: 33, 917005939: 34, 917903017: 35, 920829296: 36, 918507608: 37, 918389347: 38, 918445572: 39, 920476320: 40, 916870011: 41, 918254758: 42, 917448771: 43, 921002066: 44, 921236456: 45, 918253328: 46, 918604575: 47, 918575169: 48, 918745144: 49, 915477386: 50, 918442582: 51, 910939554: 52, 918453567: 53, 917547662: 54, 917698410: 55, 920161265: 56, 921171378: 57, 920361036: 58, 915342316:

In [73]:
with open(key_dir) as key_file:                        # read the alma key file
        data = key_file.read()

if data:
    id_mapping = json.loads(data)
    id_mapping = jsonKeys2int(id_mapping) 

In [77]:
len(id_mapping)

790

In [71]:
id_mapping

{918794739: 1,
 918317587: 2,
 920976365: 3,
 920968110: 4,
 916713413: 5,
 918734211: 6,
 918375840: 7,
 918574896: 8,
 921484275: 9,
 920231751: 10,
 921210794: 11,
 918543605: 12,
 921360437: 13,
 917633722: 14,
 920264615: 15,
 920826878: 16,
 916809496: 17,
 915562523: 18,
 920279825: 19,
 918207802: 20,
 917691000: 21,
 920515164: 22,
 920550602: 23,
 918414814: 24,
 920247468: 25,
 920456755: 26,
 918560076: 27,
 915221260: 28,
 920132782: 29,
 920838448: 30,
 920339911: 31,
 916931852: 32,
 917489864: 33,
 917005939: 34,
 917903017: 35,
 920829296: 36,
 918507608: 37,
 918389347: 38,
 918445572: 39,
 920476320: 40,
 916870011: 41,
 918254758: 42,
 917448771: 43,
 921002066: 44,
 921236456: 45,
 918253328: 46,
 918604575: 47,
 918575169: 48,
 918745144: 49,
 915477386: 50,
 918442582: 51,
 910939554: 52,
 918453567: 53,
 917547662: 54,
 917698410: 55,
 920161265: 56,
 921171378: 57,
 920361036: 58,
 915342316: 59,
 920271440: 60,
 917648165: 61,
 920703495: 62,
 916995188: 63,
 

In [31]:
a = generate_folder("./input/Spring2020_ASTR0116-01_Essay4.csv")
a

Folder already exists


'./output/Spring 2020/ASTR 0116/Section 01'

In [19]:
input_dir

['./input/Fall2020_PHYS0102-02_Essay1.csv',
 './input/Fall2020_PHYS0102-02_Essay2.csv',
 './input/Fall2020_PHYS0102-02_Essay3.csv',
 './input/Fall2020_PHYS0102-02_Essay4.csv',
 './input/Fall2020_PHYS0102-02_Essay5.csv',
 './input/Fall2020_PHYS0112-07_Essay1.csv',
 './input/Fall2020_PHYS0112-07_Essay2.csv',
 './input/Fall2020_PHYS0112-07_Essay3.csv',
 './input/Fall2020_PHYS0112-07_Essay4.csv',
 './input/Fall2020_PHYS0112-07_Essay5.csv',
 './input/Fall2020_PHYS0122-01_Essay1.csv',
 './input/Fall2020_PHYS0122-01_Essay2.csv',
 './input/Fall2020_PHYS0122-01_Essay3.csv',
 './input/Fall2020_PHYS0122-01_Essay4.csv',
 './input/Fall2020_PHYS0122-01_Essay5.csv',
 './input/Fall2020_PHYS0122-01_Essay6.csv',
 './input/Fall2020_PHYS0122-04_Essay1.csv',
 './input/Fall2020_PHYS0122-04_Essay2.csv',
 './input/Fall2020_PHYS0122-04_Essay3.csv',
 './input/Fall2020_PHYS0122-04_Essay4.csv',
 './input/Fall2020_PHYS0122-04_Essay5.csv',
 './input/Fall2020_PHYS0122-04_Essay6.csv',
 './input/Fall2020_PHYS0122-08_E