In [1]:
import pandas as pd
import csv
import numpy as np

In [2]:
# Load the raw keystroke log, keep only "File.Edit" events, and build a
# per-file key of the form <SubjectID>_<AssignmentID>_<CodeStateSection>.
# Missing values are stored as the placeholder string "NaN".
# NOTE(review): read_csv's default NA parsing also turns literal text such as
# "NA", "null" or "nan" into NaN, so typed text of that form may become
# indistinguishable from a missing field here - confirm this is acceptable
# for InsertText / DeleteText.
kept_columns = ["Key", "EditType", "SourceLocation", "InsertText", "DeleteText", "ClientTimestamp"]

df = (
    pd.read_csv("./app/src/main/resources/keystrokes.csv")
    .loc[lambda events: events.EventType == "File.Edit"]
    .assign(
        Key=lambda edits: edits.SubjectID + "_" + edits.AssignmentID + "_" + edits.CodeStateSection
    )
    .loc[:, kept_columns]
    .replace(np.nan, "NaN")
)
df

Unnamed: 0,Key,EditType,SourceLocation,InsertText,DeleteText,ClientTimestamp
0,Student1_Assign10_task1.py,Insert,0.0,# @@@@@@@@@@@@@\n# CS1400 - MW1\n# Assignment ??,,1636696834375
1,Student1_Assign10_task1.py,Insert,46.0,\n,,1636696835921
3,Student1_Assign10_task1.py,Insert,47.0,\n,,1636696839138
6,Student1_Assign10_task1.py,Insert,48.0,from modules.wordinator import Wordinator\ndef...,,1636696840750
8,Student1_Assign10_task1.py,Insert,2820.0,\n,,1636696848989
...,...,...,...,...,...,...
2088851,Student9_Assign9_test.py,Insert,243.0,F,,1636155787631
2088853,Student9_Assign9_test.py,Insert,244.0,a,,1636155787737
2088855,Student9_Assign9_test.py,Insert,245.0,l,,1636155787941
2088857,Student9_Assign9_test.py,Insert,246.0,s,,1636155788077


In [3]:
# Average per-column row count over the 50 files with the most rows.
(
    df.groupby("Key")
    .count()                                     # rows per file, per column
    .sort_values(by="EditType", ascending=False)  # busiest files first
    .head(50)
    .mean()
)

EditType           5480.62
SourceLocation     5480.62
InsertText         5480.62
DeleteText         5480.62
ClientTimestamp    5480.62
dtype: float64

In [4]:
# Rows whose EditType holds the "NaN" placeholder string.
missing_edit_type = df.EditType.eq("NaN")
to_drop = df.loc[missing_edit_type]
to_drop

Unnamed: 0,Key,EditType,SourceLocation,InsertText,DeleteText,ClientTimestamp
7630,Student1_Assign6_flukytester.py,,0.0,,,1634606481580
28217,Student10_Assign10_Chessboard.py,,0.0,,,1636738782004
28219,Student10_Assign10_blobber.py,,0.0,,,1636738782013
28221,Student10_Assign10_main.py,,0.0,,,1636738782005
28223,Student10_Assign10_task1.py,,0.0,,,1636738782007
...,...,...,...,...,...,...
2077681,Student9_Assign9_task1.py,,2409.0,,,1636140089288
2077683,Student9_Assign9_task1.py,,1971.0,,,1636140089289
2080805,Student9_Assign9_task1.py,,3803.0,,,1636150704353
2080814,Student9_Assign9_task1.py,,4191.0,,,1636150710842


In [5]:
# Unique keys of files that contain at least one row whose EditType is the
# "NaN" placeholder. The keys are computed straight from `df` rather than from
# the previous value of `to_drop`, so this cell can be re-run safely:
# `to_drop` is rebound to an array of keys here, and an array has no `.Key`.
to_drop = df.loc[df.EditType == "NaN", "Key"].unique()
print(f"Bad Files: {len(to_drop)}")

Bad Files: 573


In [6]:
# Drop every file listed in `to_drop`.
good_files = df[~df.Key.isin(to_drop)]

# Rows per remaining file, busiest files first.
counts = good_files.groupby("Key").count().sort_values(by="EditType", ascending=False)

# Fragments that exclude a file when they appear anywhere in its key
# (compared case-insensitively).
excluded_fragments = (".txt", "plan", "starter")
# Only keep files with at least this many edits; files below it are skipped.
min_edits = 25

# Iterate the per-file counts directly instead of walking `counts.iloc`
# row by row; the resulting list of keys is the same.
to_keep = [
    key
    for key, n_edits in counts.EditType.items()
    if not any(fragment in key.lower() for fragment in excluded_fragments)
    and n_edits >= min_edits
]
good_files = good_files[good_files.Key.isin(to_keep)]

good_files

Unnamed: 0,Key,EditType,SourceLocation,InsertText,DeleteText,ClientTimestamp
281,Student1_Assign12_task1.py,Insert,0.0,"def main():\n baseList = list(range(1, 101)...",,1638570621931
283,Student1_Assign12_task1.py,Insert,92.0,',,1638570621971
285,Student1_Assign12_task1.py,Insert,94.0,',,1638570622065
287,Student1_Assign12_task1.py,Insert,95.0,,,1638570622369
289,Student1_Assign12_task1.py,Insert,170.0,,,1638570625139
...,...,...,...,...,...,...
2088851,Student9_Assign9_test.py,Insert,243.0,F,,1636155787631
2088853,Student9_Assign9_test.py,Insert,244.0,a,,1636155787737
2088855,Student9_Assign9_test.py,Insert,245.0,l,,1636155787941
2088857,Student9_Assign9_test.py,Insert,246.0,s,,1636155788077


In [7]:
# The 25 retained files with the most rows, largest first.
(
    good_files.groupby("Key")
    .count()
    .sort_values("EditType", ascending=False)
    .head(25)
)

Unnamed: 0_level_0,EditType,SourceLocation,InsertText,DeleteText,ClientTimestamp
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Student26_Assign7_chessboard.py,10914,10914,10914,10914,10914
Student26_Assign9_Task1.py,8268,8268,8268,8268,8268
Student22_Assign6_task2.py,8241,8241,8241,8241,8241
Student4_Assign6_task1.py,7489,7489,7489,7489,7489
Student30_Assign6_task1.py,7394,7394,7394,7394,7394
Student39_Assign6_Task1.py,7261,7261,7261,7261,7261
Student43_Assign8_pattern.py,6815,6815,6815,6815,6815
Student22_Assign6_task1.py,6680,6680,6680,6680,6680
Student42_Assign10_test.py,6644,6644,6644,6644,6644
Student39_Assign8_pattern.py,6266,6266,6266,6266,6266


In [8]:
# Turn the "NaN" placeholder strings back into empty fields and cast
# SourceLocation to int (it is displayed as float, e.g. 46.0, in earlier cells).
out_files = good_files.replace("NaN", "").astype({"SourceLocation": int})

# QUOTE_NONNUMERIC quotes every non-numeric field in the written file.
out_files.to_csv(
    "./app/src/main/resources/keystrokes_quoted.csv",
    index=False,
    header=True,
    quoting=csv.QUOTE_NONNUMERIC,
    quotechar='"',
)