# KDD Cup MOOC Dataset

- dataset preparation and exploration
- the dataset preparation steps are similar to those used for the XuetangX dataset
- this dataset doesn't provide user demographics (age, sex, education, etc.)

In [3]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
import math
from sklearn.preprocessing import StandardScaler

In [4]:
#Modified version of original researcher's .sh file. Edited and implemented with the assistance of ChatGPT-4o

import os
import urllib.request
import tarfile
import zipfile
from tkinter import Tk, filedialog

#Hide the main tkinter window so that only the folder picker is shown
root = Tk()
root.withdraw()

#Ask the user to select the destination folder
download_folder = filedialog.askdirectory(title="Select Destination Folder to Download Data")

#The hidden root window only exists to host the dialog; release it once the
#dialog has returned so no Tk window/interpreter is left hanging around
root.destroy()

#If no folder is selected, stop here. SystemExit is raised explicitly because
#the exit() helper is installed by the `site` module for interactive use and
#should not be relied on to halt a program in every environment.
if not download_folder:
    print("No folder selected. Exiting...")
    raise SystemExit

#Map every local destination path (inside the selected folder) to its source URL
files_info = {
    f"{download_folder}/{file_name}": source_url
    for file_name, source_url in (
        ("prediction_data.tar.gz", "http://lfs.aminer.cn/misc/moocdata/data/prediction_data.tar.gz"),
        ("user_info.csv", "http://lfs.aminer.cn/misc/moocdata/data/user_info.csv"),
        ("course_info.csv", "http://lfs.aminer.cn/misc/moocdata/data/course_info.csv"),
        ("kddcup15.zip", "http://lfs.aminer.cn/misc/moocdata/data/kddcup15.zip"),
        ("kdd2_test.csv", "https://bitbucket.org/lics229/mooc-dropout-prediction/raw/8742cb34f2453955c474aa0a50df72d1d59b39f5/data/test/FeatureVectorWithLabel.csv"),
        ("kdd2_train.csv", "https://bitbucket.org/lics229/mooc-dropout-prediction/raw/8742cb34f2453955c474aa0a50df72d1d59b39f5/data/train/FeatureVectorWithLabel.csv"),
    )
}

#Make sure the destination directory is present before anything is written to it
if not os.path.exists(download_folder):
    os.makedirs(download_folder)

#Function to download a file
def download_file(url, file_path):
    """Download ``url`` to ``file_path`` without ever leaving a partial file there.

    The payload is first written to ``<file_path>.part`` and only moved onto
    ``file_path`` once the transfer has finished. Callers that treat the mere
    existence of ``file_path`` as "already downloaded" therefore never pick up
    a truncated file from an interrupted run.

    Args:
        url: Address of the resource to fetch.
        file_path: Final destination of the downloaded file.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``os.replace`` raises; the
        temporary ``.part`` file is removed before the error propagates.
    """
    print(f"Downloading {file_path}...")
    partial_path = f"{file_path}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Same directory as the target, so the rename stays on one filesystem
        os.replace(partial_path, file_path)
    finally:
        # After a successful rename nothing is left to clean up; after a
        # failure this removes the incomplete download
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {file_path} successfully.")

#Function to extract a tar.gz file
def extract_tar(file_path, extract_to=download_folder):
    """Extract the gzip-compressed tar archive at ``file_path`` into ``extract_to``.

    When the running interpreter supports tarfile extraction filters, the
    ``"data"`` filter is applied so that archive members cannot be written
    outside ``extract_to``; this also avoids the DeprecationWarning emitted
    for unfiltered ``extractall`` calls. Older interpreters fall back to the
    plain call.

    Args:
        file_path: Path of the ``.tar.gz`` archive to unpack.
        extract_to: Destination directory. The default is the value
            ``download_folder`` had when this function was defined.

    Raises:
        tarfile.TarError: If the archive cannot be read; with filters
            available, a member rejected by the ``"data"`` filter may also
            surface as an error of this family.
    """
    print(f"Extracting files from {file_path}...")
    with tarfile.open(file_path, "r:gz") as tar:
        # `tarfile.data_filter` only exists on interpreters that accept `filter=`
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_to, filter="data")
        else:
            tar.extractall(path=extract_to)
    print(f"Done extracting files from {file_path}.")

#Function to extract a zip file
def extract_zip(file_path, extract_to=download_folder):
    """Unpack every member of the zip archive at ``file_path`` into ``extract_to``.

    Args:
        file_path: Path of the ``.zip`` archive to unpack.
        extract_to: Destination directory. The default is the value
            ``download_folder`` had when this function was defined.
    """
    print(f"Extracting files from {file_path}...")
    with zipfile.ZipFile(file_path, mode="r") as archive:
        archive.extractall(path=extract_to)
    print(f"Done extracting files from {file_path}.")

#Fetch every file that is not on disk yet; archives are unpacked right after download
for target_path, source_url in files_info.items():
    if os.path.exists(target_path):
        # Already present from an earlier run: neither re-downloaded nor re-extracted
        print(f"{target_path} exists.")
        continue

    download_file(source_url, target_path)

    # Compressed downloads are expanded into the download folder
    if target_path.endswith(".tar.gz"):
        extract_tar(target_path)
    elif target_path.endswith(".zip"):
        extract_zip(target_path)

print("All done...")


Downloading C:/Users/chanc/Downloads/Module 5 Data/prediction_data.tar.gz...
Downloaded C:/Users/chanc/Downloads/Module 5 Data/prediction_data.tar.gz successfully.
Extracting files from C:/Users/chanc/Downloads/Module 5 Data/prediction_data.tar.gz...


  tar.extractall(path=extract_to)


Done extracting files from C:/Users/chanc/Downloads/Module 5 Data/prediction_data.tar.gz.
Downloading C:/Users/chanc/Downloads/Module 5 Data/user_info.csv...
Downloaded C:/Users/chanc/Downloads/Module 5 Data/user_info.csv successfully.
Downloading C:/Users/chanc/Downloads/Module 5 Data/course_info.csv...
Downloaded C:/Users/chanc/Downloads/Module 5 Data/course_info.csv successfully.
Downloading C:/Users/chanc/Downloads/Module 5 Data/kddcup15.zip...
Downloaded C:/Users/chanc/Downloads/Module 5 Data/kddcup15.zip successfully.
Extracting files from C:/Users/chanc/Downloads/Module 5 Data/kddcup15.zip...
Done extracting files from C:/Users/chanc/Downloads/Module 5 Data/kddcup15.zip.
Downloading C:/Users/chanc/Downloads/Module 5 Data/kdd2_test.csv...
Downloaded C:/Users/chanc/Downloads/Module 5 Data/kdd2_test.csv successfully.
Downloading C:/Users/chanc/Downloads/Module 5 Data/kdd2_train.csv...
Downloaded C:/Users/chanc/Downloads/Module 5 Data/kdd2_train.csv successfully.
All done...


In [5]:
#Unpack every nested .zip archive that sits directly inside the kddcup15 folder
active_folder = download_folder + "/kddcup15"
for entry_name in os.listdir(active_folder):
    if not entry_name.endswith(".zip"):
        continue
    with zipfile.ZipFile(active_folder + "/" + entry_name, 'r') as nested_zip:
        nested_zip.extractall(active_folder)



In [6]:
# Cap how much of a DataFrame is rendered when it is displayed
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 20)

In [7]:
# load the training activity log
train_log_path = os.path.join(active_folder, 'train/log_train.csv')
train_df = pd.read_csv(train_log_path)

In [8]:
# Preview the first rows of the training log
train_df.head()

Unnamed: 0,enrollment_id,time,source,event,object
0,1,2014-06-14T09:38:29,server,navigate,Oj6eQgzrdqBMlaCtaq1IkY6zruSrb71b
1,1,2014-06-14T09:38:39,server,access,3T6XwoiMKgol57cm29Rjy8FXVFcIomxl
2,1,2014-06-14T09:38:39,server,access,qxvBNYTfiRkNcCvM0hcGwG6hvHdQwnd4
3,1,2014-06-14T09:38:48,server,access,2cmZrZW2h6Il91itO3e89FGcABLWhf3W
4,1,2014-06-14T09:41:49,browser,problem,RMtgC2bTAqEeftenUUyia504wsyzeZWf


In [9]:
# Preview the last rows of the training log
train_df.tail()

Unnamed: 0,enrollment_id,time,source,event,object
8157272,200901,2014-07-24T14:11:31,browser,page_close,3T6XwoiMKgol57cm29Rjy8FXVFcIomxl
8157273,200901,2014-07-24T14:11:32,browser,video,HdMvr3A6vQzym6Xl0tOXpNbfHOyohlKE
8157274,200901,2014-07-24T14:11:41,browser,page_close,3T6XwoiMKgol57cm29Rjy8FXVFcIomxl
8157275,200904,2014-07-24T15:07:50,server,navigate,9Mur5ciTV9IBFfcPaz5c3nC1lrZaxBvG
8157276,200905,2014-07-24T15:41:51,server,navigate,9Mur5ciTV9IBFfcPaz5c3nC1lrZaxBvG


In [10]:
# read the ground truth for the training data (the file has no header row;
# its first column becomes the row label)
train_truth_path = os.path.join(active_folder, 'train/truth_train.csv')
train_truth_df = pd.read_csv(train_truth_path, index_col=0, header=None)

In [11]:
# Preview the raw truth table; it was read with header=None, so the column
# and index have no descriptive names yet
train_truth_df.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
1,0
3,0
4,0
5,0
6,0


In [12]:
# Name the single value column 'truth'
train_truth_df.columns = ['truth']

In [13]:
# Label the index (the first CSV column) 'enrollment_id'
train_truth_df.index.name = 'enrollment_id'

In [14]:
# Check the renamed truth table by looking at its last rows
train_truth_df.tail()

Unnamed: 0_level_0,truth
enrollment_id,Unnamed: 1_level_1
200898,1
200900,1
200901,1
200904,1
200905,1


In [15]:
# load the test split: the truth table (no header row; first column becomes
# the row label) and the activity log
test_truth_df = pd.read_csv(
    os.path.join(active_folder, 'test/truth_test.csv'), index_col=0, header=None
)
test_df = pd.read_csv(os.path.join(active_folder, 'test/log_test.csv'))

In [16]:
# Give the test truth table a named value column and a named index
test_truth_df.index.name = 'enrollment_id'
test_truth_df.columns = ['truth']

# combine train and test logs (rows are stacked; each split keeps its own row labels)
all_log_df = pd.concat([train_df, test_df], axis=0)

# combine train and test truth
all_truth_df = pd.concat([train_truth_df, test_truth_df], axis=0)

In [17]:
# Preview the first rows of the combined (train + test) log
all_log_df.head()

Unnamed: 0,enrollment_id,time,source,event,object
0,1,2014-06-14T09:38:29,server,navigate,Oj6eQgzrdqBMlaCtaq1IkY6zruSrb71b
1,1,2014-06-14T09:38:39,server,access,3T6XwoiMKgol57cm29Rjy8FXVFcIomxl
2,1,2014-06-14T09:38:39,server,access,qxvBNYTfiRkNcCvM0hcGwG6hvHdQwnd4
3,1,2014-06-14T09:38:48,server,access,2cmZrZW2h6Il91itO3e89FGcABLWhf3W
4,1,2014-06-14T09:41:49,browser,problem,RMtgC2bTAqEeftenUUyia504wsyzeZWf


In [18]:
# Preview the last ten rows of the combined (train + test) log
all_log_df.tail(10)

Unnamed: 0,enrollment_id,time,source,event,object
5387837,200894,2014-07-24T10:49:10,server,access,3T6XwoiMKgol57cm29Rjy8FXVFcIomxl
5387838,200894,2014-07-24T10:49:10,server,access,jcmKbpHQYLyzZZ34cJgLu7F4pgiCsoXV
5387839,200894,2014-07-24T10:54:17,browser,page_close,3T6XwoiMKgol57cm29Rjy8FXVFcIomxl
5387840,200894,2014-07-24T10:54:17,browser,video,HdMvr3A6vQzym6Xl0tOXpNbfHOyohlKE
5387841,200894,2014-07-24T10:57:03,server,navigate,9Mur5ciTV9IBFfcPaz5c3nC1lrZaxBvG
5387842,200899,2014-07-24T13:37:53,server,navigate,9Mur5ciTV9IBFfcPaz5c3nC1lrZaxBvG
5387843,200902,2014-07-24T14:22:33,server,navigate,9Mur5ciTV9IBFfcPaz5c3nC1lrZaxBvG
5387844,200903,2014-07-24T14:24:29,server,navigate,9Mur5ciTV9IBFfcPaz5c3nC1lrZaxBvG
5387845,200903,2014-07-24T14:24:41,server,navigate,Oj6eQgzrdqBMlaCtaq1IkY6zruSrb71b
5387846,200903,2014-07-24T14:25:26,server,navigate,LMYZjRiU5C2N9ih1oYVNmOe5jFu2XLwv


In [19]:
# collect the distinct enrollment ids that occur in each split's log
train_enroll_ids = list(set(train_df['enrollment_id']))
test_enroll_ids = list(set(test_df['enrollment_id']))

In [20]:
# number of distinct enrollment ids in the train and test logs, one per line
print(len(train_enroll_ids), len(test_enroll_ids), sep="\n")

120542
80362


In [21]:
# count the logged events (non-null `event` entries) for each enrollment
user_action_count_df = all_log_df.groupby('enrollment_id')[['event']].count()

In [22]:
# Preview the per-enrollment event counts (first ten rows)
user_action_count_df.head(10)

Unnamed: 0_level_0,event
enrollment_id,Unnamed: 1_level_1
1,314
2,875
3,288
4,99
5,633
6,23
7,479
8,353
9,97
10,2


In [23]:
# rename the single count column to 'action_count'
user_action_count_df.columns = ['action_count']

In [24]:
# Confirm the column rename (first ten rows)
user_action_count_df.head(10)

Unnamed: 0_level_0,action_count
enrollment_id,Unnamed: 1_level_1
1,314
2,875
3,288
4,99
5,633
6,23
7,479
8,353
9,97
10,2


In [25]:
# Distinct values of the `event` column in the combined log
user_events_df = all_log_df[['event']].drop_duplicates()

In [26]:
# Show the distinct event types
user_events_df

Unnamed: 0,event
0,navigate
1,access
4,problem
12,page_close
69,video
484,discussion
2655,wiki


In [27]:
# Plain Python list of the distinct event types
user_events_list = user_events_df['event'].tolist()

In [28]:
# Show the list of event types
user_events_list

['navigate', 'access', 'problem', 'page_close', 'video', 'discussion', 'wiki']

In [29]:
# Distinct values of the `source` column in the combined log
sources_df = all_log_df[['source']].drop_duplicates()

In [30]:
# Show the distinct sources
sources_df

Unnamed: 0,source
0,server
4,browser


In [31]:
# Plain Python list of the distinct sources
sources_list = sources_df['source'].tolist()

In [32]:
# Show the list of sources
sources_list

['server', 'browser']

In [33]:
# Tag every log row with a combined "<source>_<event>" key
all_log_df['source_event'] = all_log_df['source'] + '_' + all_log_df['event']

# Every source/event pairing that must end up as a feature column (source-major order)
source_event_combinations = [
    f'{source}_{action}'
    for source in sources_list
    for action in user_events_list
]

# Per-enrollment totals for each combined key that actually occurs in the log
action_counts = pd.crosstab(
    index=all_log_df['enrollment_id'],
    columns=all_log_df['source_event'],
    values=1,
    aggfunc='sum',
)
# Enrollments that never produced a given key come back as NaN; store them as 0
action_counts = action_counts.fillna(0)

# Feature columns carry a "_count" suffix
action_counts.columns = [f"{key}_count" for key in action_counts.columns]

# One column per expected pairing, in the order of source_event_combinations;
# pairings that never occur in the log become constant-zero columns
complete_action_counts = pd.DataFrame(
    {
        f"{combo}_count": (
            action_counts[f"{combo}_count"]
            if f"{combo}_count" in action_counts.columns
            else 0
        )
        for combo in source_event_combinations
    },
    index=action_counts.index.unique(),
)

# Attach the per-pairing counts to the per-enrollment totals, keeping every
# row already present in user_action_count_df
user_action_count_df = user_action_count_df.merge(
    complete_action_counts, left_index=True, right_index=True, how='left'
)

In [34]:
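# Earlier per-(source, event) counting loop, kept for reference only; every line below is commented out.
# for source in sources_list: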
#     for action in user_events_list:
#         action_label = f'{source}_{action}_count'
#         action_ = ((all_log_df['source'] == source) & (all_log_df['event'] == action)).astype(int)
#         #print(action_label)
#         #print(action_)
#         all_log_df[action_label] = action_
#         action_count = all_log_df.groupby(['enrollment_id']).sum()[[action_label]]
#         user_action_count_df = pd.merge(user_action_count_df, action_count, left_index=True, right_index=True)

In [35]:
# Preview the count features (first ten rows)
user_action_count_df.head(10)

Unnamed: 0_level_0,action_count,server_navigate_count,server_access_count,server_problem_count,server_page_close_count,server_video_count,server_discussion_count,server_wiki_count,browser_navigate_count,browser_access_count,browser_problem_count,browser_page_close_count,browser_video_count,browser_discussion_count,browser_wiki_count
enrollment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,314,25.0,86.0,8.0,0,0,0.0,0.0,0,21.0,79.0,66.0,29.0,0,0
2,875,87.0,223.0,4.0,0,0,324.0,12.0,0,16.0,16.0,143.0,50.0,0,0
3,288,14.0,45.0,3.0,0,0,26.0,0.0,0,34.0,135.0,22.0,9.0,0,0
4,99,15.0,64.0,1.0,0,0,0.0,0.0,0,0.0,5.0,10.0,4.0,0,0
5,633,30.0,106.0,32.0,0,0,34.0,0.0,0,120.0,138.0,87.0,86.0,0,0
6,23,5.0,12.0,0.0,0,0,0.0,0.0,0,0.0,2.0,2.0,2.0,0,0
7,479,20.0,81.0,20.0,0,0,33.0,0.0,0,122.0,74.0,60.0,69.0,0,0
8,353,20.0,108.0,7.0,0,0,7.0,1.0,0,19.0,43.0,90.0,58.0,0,0
9,97,12.0,65.0,3.0,0,0,0.0,0.0,0,6.0,3.0,6.0,2.0,0,0
10,2,2.0,0.0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0


In [36]:
# Summary statistics of the count features
user_action_count_df.describe()

Unnamed: 0,action_count,server_navigate_count,server_access_count,server_problem_count,server_page_close_count,server_video_count,server_discussion_count,server_wiki_count,browser_navigate_count,browser_access_count,browser_problem_count,browser_page_close_count,browser_video_count,browser_discussion_count,browser_wiki_count
count,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0,200904.0
mean,67.420878,8.359679,19.618539,1.384213,0.0,0.0,5.357395,0.7621,0.0,6.096335,9.018432,10.258701,6.565484,0.0,0.0
std,139.918512,12.90472,38.431194,5.138802,0.0,0.0,35.715282,4.400607,0.0,21.728713,29.694653,20.919278,14.58131,0.0,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,17.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0
75%,67.0,10.0,21.0,1.0,0.0,0.0,2.0,1.0,0.0,3.0,5.0,10.0,6.0,0.0,0.0
max,7697.0,649.0,3641.0,483.0,0.0,0.0,5321.0,1041.0,0.0,1122.0,877.0,694.0,536.0,0.0,0.0


In [37]:
# Attach the 'truth' column by index; only ids present in both frames are kept
user_action_count_df = user_action_count_df.merge(
    all_truth_df, how='inner', left_index=True, right_index=True
)

In [38]:
# Preview the features together with the 'truth' column (first ten rows)
user_action_count_df.head(10)

Unnamed: 0_level_0,action_count,server_navigate_count,server_access_count,server_problem_count,server_page_close_count,server_video_count,server_discussion_count,server_wiki_count,browser_navigate_count,browser_access_count,browser_problem_count,browser_page_close_count,browser_video_count,browser_discussion_count,browser_wiki_count,truth
enrollment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,314,25.0,86.0,8.0,0,0,0.0,0.0,0,21.0,79.0,66.0,29.0,0,0,0
2,875,87.0,223.0,4.0,0,0,324.0,12.0,0,16.0,16.0,143.0,50.0,0,0,0
3,288,14.0,45.0,3.0,0,0,26.0,0.0,0,34.0,135.0,22.0,9.0,0,0,0
4,99,15.0,64.0,1.0,0,0,0.0,0.0,0,0.0,5.0,10.0,4.0,0,0,0
5,633,30.0,106.0,32.0,0,0,34.0,0.0,0,120.0,138.0,87.0,86.0,0,0,0
6,23,5.0,12.0,0.0,0,0,0.0,0.0,0,0.0,2.0,2.0,2.0,0,0,0
7,479,20.0,81.0,20.0,0,0,33.0,0.0,0,122.0,74.0,60.0,69.0,0,0,1
8,353,20.0,108.0,7.0,0,0,7.0,1.0,0,19.0,43.0,90.0,58.0,0,0,0
9,97,12.0,65.0,3.0,0,0,0.0,0.0,0,6.0,3.0,6.0,2.0,0,0,1
10,2,2.0,0.0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,1


In [39]:
# (rows, columns) of the merged feature table
user_action_count_df.shape

(200904, 16)

In [40]:
# Feature columns are picked by name: anything containing 'count', 'time' or 'num'
numeric_features = [
    column
    for column in user_action_count_df.columns
    if any(marker in column for marker in ('count', 'time', 'num'))
]

In [41]:
# Show the selected feature column names
numeric_features

['action_count',
 'server_navigate_count',
 'server_access_count',
 'server_problem_count',
 'server_page_close_count',
 'server_video_count',
 'server_discussion_count',
 'server_wiki_count',
 'browser_navigate_count',
 'browser_access_count',
 'browser_problem_count',
 'browser_page_close_count',
 'browser_video_count',
 'browser_discussion_count',
 'browser_wiki_count']

In [42]:
# Standardise the selected feature columns.
# NOTE(review): the scaler is fitted on every row of user_action_count_df,
# i.e. train and test enrollments together - confirm this is intended.
feature_matrix = user_action_count_df[numeric_features]
scaler = StandardScaler()
# Despite its name, scaled_df is indexed below as a plain 2-D array
scaled_df = scaler.fit_transform(feature_matrix)

In [43]:
# Overwrite each raw feature column with its standardised values, logging
# the column position and name as it goes
for column_position, feature_name in enumerate(numeric_features):
    print(column_position, feature_name)
    user_action_count_df[feature_name] = scaled_df[:, column_position]

0 action_count
1 server_navigate_count
2 server_access_count
3 server_problem_count
4 server_page_close_count
5 server_video_count
6 server_discussion_count
7 server_wiki_count
8 browser_navigate_count
9 browser_access_count
10 browser_problem_count
11 browser_page_close_count
12 browser_video_count
13 browser_discussion_count
14 browser_wiki_count


In [44]:
# Preview the standardised features (first ten rows)
user_action_count_df.head(10)

Unnamed: 0_level_0,action_count,server_navigate_count,server_access_count,server_problem_count,server_page_close_count,server_video_count,server_discussion_count,server_wiki_count,browser_navigate_count,browser_access_count,browser_problem_count,browser_page_close_count,browser_video_count,browser_discussion_count,browser_wiki_count,truth
enrollment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1.76231,1.289479,1.727285,1.287421,0.0,0.0,-0.150003,-0.173181,0.0,0.685899,2.356712,2.664597,1.538584,0.0,0.0,0
2,5.771796,6.093934,5.292106,0.509028,0.0,0.0,8.921766,2.553722,0.0,0.455788,0.235113,6.345421,2.978788,0.0,0.0,0
3,1.576487,0.437075,0.660441,0.314429,0.0,0.0,0.577978,-0.173181,0.0,1.284187,4.242578,0.561268,0.166962,0.0,0.0,0
4,0.225697,0.514567,1.154832,-0.074767,0.0,0.0,-0.150003,-0.173181,0.0,-0.280567,-0.135325,-0.012367,-0.175944,0.0,0.0,0
5,4.042214,1.676935,2.247697,5.957782,0.0,0.0,0.801973,-0.173181,0.0,5.242094,4.343606,3.668458,5.447708,0.0,0.0,0
6,-0.317478,-0.260346,-0.198239,-0.269366,0.0,0.0,-0.150003,-0.173181,0.0,-0.280567,-0.236354,-0.39479,-0.313106,0.0,0.0,0
7,2.94157,0.902023,1.597182,3.622602,0.0,0.0,0.773973,-0.173181,0.0,5.334138,2.188331,2.377779,4.281829,0.0,0.0,1
8,2.041044,0.902023,2.299738,1.092823,0.0,0.0,0.045992,0.054061,0.0,0.593855,1.144369,3.811867,3.527437,0.0,0.0,0
9,0.211403,0.282093,1.180853,0.314429,0.0,0.0,-0.150003,-0.173181,0.0,-0.004434,-0.202678,-0.203578,-0.313106,0.0,0.0,1
10,-0.467565,-0.492819,-0.510486,-0.269366,0.0,0.0,-0.150003,-0.173181,0.0,-0.280567,-0.303706,-0.490396,-0.450268,0.0,0.0,1


In [45]:
# Write the standardised features of each split to its own CSV file
for split_ids, split_file_name in (
    (train_enroll_ids, 'kdd_train_normalized_features.csv'),
    (test_enroll_ids, 'kdd_test_normalized_features.csv'),
):
    user_action_count_df.loc[split_ids].to_csv(os.path.join(active_folder, split_file_name))

In [46]:
# save single file with all features (every row of user_action_count_df)
all_features_path = os.path.join(active_folder, 'kdd_all_normalized_features.csv')
user_action_count_df.to_csv(all_features_path)

In [47]:
# List the columns of the saved feature table
user_action_count_df.columns

Index(['action_count', 'server_navigate_count', 'server_access_count',
       'server_problem_count', 'server_page_close_count', 'server_video_count',
       'server_discussion_count', 'server_wiki_count',
       'browser_navigate_count', 'browser_access_count',
       'browser_problem_count', 'browser_page_close_count',
       'browser_video_count', 'browser_discussion_count', 'browser_wiki_count',
       'truth'],
      dtype='object')

## KDD Cup (Extended Features Dataset) Processing

In [57]:
#Load the extended-feature train and test splits (this rebinds train_df and
#test_df, which previously held the raw activity logs)
train_df, test_df = (
    pd.read_csv(os.path.join(download_folder, split_file))
    for split_file in ('kdd2_train.csv', 'kdd2_test.csv')
)
#Stack both splits; each keeps its own row labels, so labels repeat in all_df
all_df = pd.concat([train_df, test_df])
#Rename the label column to 'truth'
all_df = all_df.rename(columns={'label': 'truth'})
all_df.to_csv(os.path.join(download_folder, 'kdd_expanded_all.csv'))

In [58]:
#Show the combined extended-feature table
all_df

Unnamed: 0,enrollment_id,truth,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,act_cnt_day_02,act_cnt_day_03,act_cnt_day_04,act_cnt_day_05,act_cnt_day_06,...,act_cnt_hour_21,act_cnt_hour_20,act_cnt_hour_23,server_access,server_outlink_percent,server_course_percent,browser_course_info_percent,browser_course,browser_vertical_percent,sessions_in_week_1,sessions_in_week_0,sessions_in_week_3,sessions_in_week_2,sessions_in_week_4,browser_about
0,1,0,0.161290,0,21,0,4,0,0,0,0,13,0,0,0,...,0,0,6,86,0,0,0,0,0,5,1,4,6,2,0
1,135300,1,0.000000,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,131075,1,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0
3,4,0,0.250000,0,7,0,1,0,0,0,0,0,20,11,0,...,0,0,7,64,0,0,0,0,0,4,4,0,5,0,0
4,5,0,0.166667,0,77,0,0,0,0,2,0,145,0,0,0,...,34,23,63,106,0,0,0,0,0,16,2,13,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24008,131052,1,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,2,0,0,0,0,0
24009,131058,1,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0
24010,131061,1,1.000000,0,17,0,0,0,0,17,15,0,0,0,0,...,0,0,0,15,0,0,0,0,0,0,2,0,0,0,0
24011,131066,1,0.000000,0,0,0,0,0,0,0,0,30,0,0,0,...,0,0,0,23,0,0,0,0,0,4,1,0,0,0,0


In [59]:
#Summary statistics (count, mean, std, min, quartiles, max) for the columns of all_df
all_df.describe()


Unnamed: 0,enrollment_id,truth,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,act_cnt_day_02,act_cnt_day_03,act_cnt_day_04,act_cnt_day_05,act_cnt_day_06,...,act_cnt_hour_21,act_cnt_hour_20,act_cnt_hour_23,server_access,server_outlink_percent,server_course_percent,browser_course_info_percent,browser_course,browser_vertical_percent,sessions_in_week_1,sessions_in_week_0,sessions_in_week_3,sessions_in_week_2,sessions_in_week_4,browser_about
count,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,...,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0,96408.0
mean,100399.511576,0.793264,0.084532,0.0,10.113569,0.0,0.46326,0.0,0.0,3.90396,3.440358,2.913441,2.40645,2.162362,2.131877,...,0.141119,0.134066,0.601423,19.581394,0.0,0.0,0.0,0.0,0.0,0.877583,1.057049,0.753713,0.826238,0.233705,0.0
std,57973.78823,0.404967,0.191439,0.0,33.99137,0.0,0.972427,0.0,0.0,17.03347,14.936313,13.041771,12.084697,11.25907,11.390678,...,2.66548,2.691949,5.304163,38.176002,0.0,0.0,0.0,0.0,0.0,1.982218,2.003151,1.914626,1.9444,0.824688,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50149.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,100252.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,150749.25,1.0,0.086957,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
max,200904.0,1.0,5.0,0.0,3604.0,0.0,11.0,0.0,0.0,433.0,442.0,313.0,790.0,430.0,635.0,...,238.0,275.0,387.0,3641.0,0.0,0.0,0.0,0.0,0.0,81.0,86.0,81.0,71.0,22.0,0.0


In [60]:
#Keep only numeric columns, leaving out the id and the label
non_features = ['enrollment_id', 'truth']
numeric_features = all_df.drop(columns=non_features).select_dtypes(include=['number'])

#Split the numeric columns in a single pass: columns whose values all lie in
#{0, 1} are treated as binary and left unscaled, every other column is scaled
binary_features = []
features_to_scale = []
for column in numeric_features.columns:
    if set(all_df[column].unique()) <= {0, 1}:
        binary_features.append(column)
    else:
        features_to_scale.append(column)

#Display first 10 features to be scaled for confirmation
features_to_scale[:10]



['avg_chapter_delays',
 'act_cnt_weekDay_01',
 'parallel_enrollments',
 'act_cnt_day_01',
 'act_cnt_day_02',
 'act_cnt_day_03',
 'act_cnt_day_04',
 'act_cnt_day_05',
 'act_cnt_day_06',
 'act_cnt_day_07']

In [61]:
#Work on a copy so that all_df keeps the unscaled values
all_df_scaled = all_df.copy()

#Standardise only the non-binary columns; all other columns stay as they are
scaler = StandardScaler()
scaled_values = scaler.fit_transform(all_df[features_to_scale])
all_df_scaled[features_to_scale] = scaled_values




In [62]:
#Show the scaled extended-feature table
all_df_scaled

Unnamed: 0,enrollment_id,truth,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,act_cnt_day_02,act_cnt_day_03,act_cnt_day_04,act_cnt_day_05,act_cnt_day_06,...,act_cnt_hour_21,act_cnt_hour_20,act_cnt_hour_23,server_access,server_outlink_percent,server_course_percent,browser_course_info_percent,browser_course,browser_vertical_percent,sessions_in_week_1,sessions_in_week_0,sessions_in_week_3,sessions_in_week_2,sessions_in_week_4,browser_about
0,1,0,0.400957,0,0.320272,0,3.637042,0,0,-0.229195,-0.230336,0.773408,-0.199133,-0.192056,-0.187161,...,-0.052943,-0.049803,1.017805,1.739809,0,0,0,0,0,2.079710,-0.028480,1.695529,2.660866,2.141785,0
1,135300,1,-0.441564,0,-0.297535,0,-0.476398,0,0,-0.229195,-0.230336,-0.223394,-0.116383,-0.192056,-0.187161,...,-0.052943,-0.049803,-0.113388,-0.512927,0,0,0,0,0,-0.442730,-0.527696,-0.393663,-0.424935,-0.283387,0
2,131075,1,-0.441564,0,-0.297535,0,-0.476398,0,0,-0.229195,-0.230336,-0.223394,-0.199133,-0.192056,-0.187161,...,-0.052943,-0.049803,-0.113388,-0.460538,0,0,0,0,0,0.061758,-0.527696,-0.393663,-0.424935,-0.283387,0
3,4,0,0.864344,0,-0.091599,0,0.551962,0,0,-0.229195,-0.230336,-0.223394,1.455861,0.784939,-0.187161,...,-0.052943,-0.049803,1.206337,1.163528,0,0,0,0,0,1.575222,1.469168,-0.393663,2.146566,-0.283387,0
4,5,0,0.429041,0,1.967758,0,-0.476398,0,0,-0.111778,-0.230336,10.894786,-0.199133,-0.192056,-0.187161,...,12.702797,8.494236,11.764138,2.263701,0,0,0,0,0,7.629078,0.470736,6.396209,0.089366,4.566957,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24008,131052,1,-0.441564,0,-0.297535,0,-0.476398,0,0,-0.229195,-0.230336,-0.223394,-0.199133,-0.192056,-0.187161,...,-0.052943,-0.049803,-0.113388,-0.434343,0,0,0,0,0,0.566246,-0.527696,-0.393663,-0.424935,-0.283387,0
24009,131058,1,-0.441564,0,-0.297535,0,-0.476398,0,0,-0.229195,-0.230336,-0.223394,-0.199133,-0.192056,-0.187161,...,-0.052943,-0.049803,-0.113388,-0.460538,0,0,0,0,0,0.566246,-0.527696,-0.393663,-0.424935,-0.283387,0
24010,131061,1,4.782067,0,0.202595,0,-0.476398,0,0,0.768846,0.773933,-0.223394,-0.199133,-0.192056,-0.187161,...,-0.052943,-0.049803,-0.113388,-0.120008,0,0,0,0,0,-0.442730,0.470736,-0.393663,-0.424935,-0.283387,0
24011,131066,1,-0.441564,0,-0.297535,0,-0.476398,0,0,-0.229195,-0.230336,2.076919,-0.199133,-0.192056,-0.187161,...,-0.052943,-0.049803,-0.113388,0.089549,0,0,0,0,0,1.575222,-0.028480,-0.393663,-0.424935,-0.283387,0


In [63]:
# Write the scaled extended-feature table to a new CSV file in the download folder
scaled_output_path = os.path.join(download_folder, 'kdd_expanded_all_scaled.csv')
all_df_scaled.to_csv(scaled_output_path)
