Cleaning course data so that it can be analyzed

In [51]:
#import libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import MinMaxScaler

In [42]:
def clean_course(course):
    """
    Cleans a single course entry in place and returns it.

    Text fields are always written back (missing or null values become ""):
      - Course_Description: stripped, then capitalized.
      - Course_Teacher: "Professor, " removed, then stripped.
      - Course_Exam_Difficulty: stripped, then capitalized.
      - Course_Required_Math_Level: stripped, "Very high" folded into "High".

    Numeric fields (Course_Credit, Course_Class_Hours,
    Homework_mandatorily required_hours) are only touched when present; each
    becomes an int, or None when the value is not a whole number.

    With_Course_Videos is only touched when present; it becomes True for
    "yes" (any casing) and False otherwise.

    Values that are already clean (ints, bools) are kept as they are, so the
    function can safely be re-run on its own output.
    """
    def _as_text(key):
        # Missing keys and JSON nulls both normalize to the empty string.
        value = course.get(key)
        return "" if value is None else str(value)

    def _as_int(value):
        # bool is a subclass of int; it is never a meaningful credit/hour count.
        if isinstance(value, bool):
            return None
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            return int(value) if value.is_integer() else None
        if isinstance(value, str):
            digits = value.strip()
            # isdecimal() (unlike isdigit()) only accepts characters that
            # int() can actually parse, e.g. it rejects "²".
            return int(digits) if digits.isdecimal() else None
        return None

    def _as_bool(value):
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            return value.strip().lower() == "yes"
        return False

    # Normalize text fields.
    # Strip BEFORE capitalizing: capitalize() only upper-cases the very first
    # character, so leading whitespace would otherwise defeat it.
    # Note that capitalize() also lower-cases the rest of the text.
    course["Course_Description"] = _as_text("Course_Description").strip().capitalize()
    course["Course_Teacher"] = _as_text("Course_Teacher").replace("Professor, ", "").strip()
    course["Course_Exam_Difficulty"] = _as_text("Course_Exam_Difficulty").strip().capitalize()
    course["Course_Required_Math_Level"] = (
        _as_text("Course_Required_Math_Level").strip().replace("Very high", "High")
    )

    # Convert numerical fields (only those that are present)
    for field in ("Course_Credit", "Course_Class_Hours", "Homework_mandatorily required_hours"):
        if field in course:
            course[field] = _as_int(course[field])

    # Normalize boolean fields
    if "With_Course_Videos" in course:
        course["With_Course_Videos"] = _as_bool(course["With_Course_Videos"])

    # Return cleaned course
    return course

def clean_courses(file_path, output_path):
    """
    Loads the courses stored as JSON at file_path, runs clean_course on every
    entry and writes the cleaned list as JSON to output_path.

    A confirmation is printed on success. Any exception raised along the way is
    caught and printed instead of being propagated to the caller.
    """
    try:
        # Read the raw course list
        with open(file_path, "r", encoding="utf-8") as source:
            raw_courses = json.load(source)

        # Clean the entries one by one
        cleaned = []
        for entry in raw_courses:
            cleaned.append(clean_course(entry))

        # Persist the cleaned list; non-ASCII characters are written unescaped
        with open(output_path, "w", encoding="utf-8") as target:
            json.dump(cleaned, target, indent=4, ensure_ascii=False)

        print(f"Cleaned courses saved to {output_path}")
    except Exception as error:
        print(f"An error occurred: {error}")


In [44]:
# Clean the raw course file and write the cleaned copy alongside it.
# Both paths are relative to the notebook's working directory.
input_file = 'files/courses.json'
# output_file is read again by the feature-engineering cell further down.
output_file = 'files/cleaned_courses.json'

# clean_courses prints its outcome; failures are printed, not raised,
# so check the printed message before running the following cells.
clean_courses(input_file, output_file)


Cleaned courses saved to files/cleaned_courses.json


In [66]:
#feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def extract_course_concepts(courses, num_concepts=20):
    """
    Extracts top keywords from course descriptions using TF-IDF.

    Fits a TfidfVectorizer (English stop words removed, vocabulary capped at
    1000 terms) on all descriptions and stores, for every course, up to
    num_concepts terms with the highest TF-IDF score under the key
    "Extracted_Concepts" (highest score first).

    Only terms with a positive score are kept, i.e. terms that actually occur
    in that course's description. Short descriptions therefore yield fewer than
    num_concepts concepts instead of being padded with unrelated vocabulary.

    The course dicts are modified in place; the same list is returned.
    An empty course list is returned unchanged.
    """
    # Nothing to vectorize: fitting on an empty corpus would raise.
    if not courses:
        return courses

    # Missing or null descriptions are treated as empty text.
    descriptions = [course.get("Course_Description") or "" for course in courses]

    # Apply TF-IDF vectorization
    vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(descriptions)
    feature_names = vectorizer.get_feature_names_out()

    # Extract top keywords for each course
    for i, course in enumerate(courses):
        tfidf_scores = tfidf_matrix[i].toarray()[0]
        # Reverse first, then slice: unlike [-num_concepts:], this also yields
        # an empty selection for num_concepts == 0.
        ranked_indices = tfidf_scores.argsort()[::-1][:num_concepts]
        # A zero score means the term does not appear in this description.
        course["Extracted_Concepts"] = [
            feature_names[idx] for idx in ranked_indices if tfidf_scores[idx] > 0
        ]

    return courses

def encode_categorical_features(courses):
    """
    Wraps each course's language and module in a one-element set so the values
    can be compared with Jaccard similarity.

    Adds "Encoded_Language" and "Encoded_Module" to every course dict in place
    and returns the same list.
    """
    # (target key, source key), processed in this order for every course
    encodings = (
        ("Encoded_Language", "Course_Language"),
        ("Encoded_Module", "Course_Module"),
    )
    for entry in courses:
        for target_key, source_key in encodings:
            entry[target_key] = {entry[source_key]}
    return courses

def scale_numerical_features(courses, numerical_fields):
    """
    Min-max scales the given numerical fields across all courses.

    The listed fields are collected into a DataFrame, passed through a
    MinMaxScaler, and the scaled values are written back into the course dicts
    (overwriting the originals). The same list is returned.
    """
    scaler = MinMaxScaler()
    selected = pd.DataFrame(courses)[numerical_fields]
    normalized = scaler.fit_transform(selected)

    # Row order matches `courses`, column order matches `numerical_fields`
    for entry, row in zip(courses, normalized):
        for field, value in zip(numerical_fields, row):
            entry[field] = value
    return courses

def convert_sets_to_lists(courses):
    """
    Replaces every set-valued field of every course with a list so the data
    can be serialized to JSON.

    Works in place on the course dicts and returns the same list.
    """
    for entry in courses:
        # Collect the affected keys first, then rewrite their values
        set_keys = [name for name, item in entry.items() if isinstance(item, set)]
        for name in set_keys:
            entry[name] = list(entry[name])
    return courses


def feature_engineering_pipeline(courses):
    """
    Runs all feature engineering steps on the courses, in order:
    concept extraction, categorical encoding, numerical scaling.

    Returns the list produced by the final (scaling) step.
    """
    # Fields handed to the scaling step
    fields_to_scale = ["Course_Credit", "Course_Class_Hours", "Homework_mandatorily required_hours"]

    with_concepts = extract_course_concepts(courses)
    with_categories = encode_categorical_features(with_concepts)
    return scale_numerical_features(with_categories, fields_to_scale)



In [71]:
# Feature engineering: read the cleaned courses, run the pipeline and store
# the result in a separate file.
# Load cleaned data (output_file is set in the cleaning cell above)
with open(output_file, "r", encoding="utf-8") as file:
    courses = json.load(file)

# Apply feature engineering pipeline
processed_courses = feature_engineering_pipeline(courses)

# Convert sets to lists for JSON compatibility
# (the categorical encoding step stores sets, which json.dump cannot write)
processed_courses = convert_sets_to_lists(processed_courses)

# Save processed data
# ensure_ascii=False writes non-ASCII characters unescaped, hence UTF-8 output
with open("files/processed_courses.json", "w", encoding="utf-8") as file:
    json.dump(processed_courses, file, indent=4, ensure_ascii=False)

print("Feature engineering completed and saved!")


Feature engineering completed and saved!


In [70]:
# NOTE: this rebinds `processed_courses` from the list built by the
# feature-engineering cell to the path of the file that cell wrote.
processed_courses = 'files/processed_courses.json'

# Load JSON data from a file
# The file is written as UTF-8 with unescaped non-ASCII characters, so it must
# be decoded as UTF-8 explicitly rather than with the platform default.
with open(processed_courses, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert JSON data to DataFrame
df = pd.json_normalize(data)

# Set the index to course id
df.set_index('Course_ID', inplace=True)

# Display the first 5 records
print(df.head())

                              Course_Name  \
Course_ID                                   
ZKD50032         Advanced Image Synthesis   
ZKD50056              Cloud, Web & Mobile   
ZKD41171              Distributed Systems   
ZKD50039               Learning Analytics   
ZKD50001   Modeling of Concurrent Systems   

                                          Course_Description            Term  \
Course_ID                                                                      
ZKD50032   Abstract:\nThis lecture covers the fundamental...  Winter 2024/25   
ZKD50056   Cloud computing, web applications and mobile a...  Winter 2024/25   
ZKD41171     Distributed systems and distributed algorithms.  Winter 2024/25   
ZKD50039   Learning Analytics (LA) has attracted a great ...  Winter 2024/25   
ZKD50001   Transition systems, behavioral equivalences: l...  Winter 2024/25   

          Course_Language                  Course_Teacher  Course_Credit  \
Course_ID                                      