**Part One: Working with Student Grade Data**

**Step 1: Load the Student Grade Files**

In [None]:
import numpy as np

# Read each grade file into an array of strings (dtype=str)
grade_files = (
    "./lab2-hw.txt",
    "./lab2-test.txt",
    "./lab2-quiz.txt",
    "./lab2-project.txt",
)
hwd, testd, quizd, projd = (
    np.loadtxt(path, dtype=str) for path in grade_files
)


**Step 2: Convert Strings to Floats**

In [None]:
# Drop row 0 and column 0 of every table (treated here as the header row and
# the name column), then parse the remaining cells as floats
hw1, test1, quiz1, proj1 = (
    raw_table[1:, 1:].astype(float)
    for raw_table in (hwd, testd, quizd, projd)
)


**Step 3: Compute Averages for Each Category**

In [None]:
# Row-wise average (axis=1): one value per row, i.e. per student, in each category
hw_ave, test_ave, quiz_ave, proj_ave = (
    np.average(scores, axis=1)
    for scores in (hw1, test1, quiz1, proj1)
)


**Step 4: Normalize Scores to a 100-Point Scale**

In [None]:
# Maximum raw points available in each category
max_points = {"hw": 50, "test": 100, "quiz": 10, "proj": 100}

# Rescale every category average so that full marks equal 100
hw_w = hw_ave * (100 / max_points["hw"])
test_w = test_ave * (100 / max_points["test"])    # factor is 1.0: already out of 100
quiz_w = quiz_ave * (100 / max_points["quiz"])
proj_w = proj_ave * (100 / max_points["proj"])    # factor is 1.0: already out of 100


**Step 5: Build the Grade Matrix**

In [None]:
# Each category vector becomes one column, in the order
# homework, tests, quizzes, project; each row is one student
category_columns = [hw_w, test_w, quiz_w, proj_w]
grades = np.column_stack(category_columns)

# Show the full matrix of percentage scores
print(grades)


[[92.  94.  90.  90. ]
 [80.4 90.  88.  95. ]
 [73.2 83.  80.  85. ]
 [83.6 87.5 94.  71. ]
 [88.4 74.5 88.  82. ]
 [71.2 80.  76.  70. ]]


**Step 6: Define the Weight Vector**

In [None]:
# Fraction of the final grade contributed by each category.
# Insertion order (homework, tests, quizzes, project) fixes the order of wgt.
category_weights = {
    "Homework": 0.3,
    "Tests": 0.4,
    "Quizzes": 0.1,
    "Project": 0.2,
}
wgt = list(category_weights.values())


**Step 7: Apply Matrix Multiplication**

In [None]:
# Matrix-vector product: each row of grades is combined with the weights,
# giving one weighted final grade per row
final_grade = grades @ wgt

# Show the final grade of every student
print(final_grade)


[92.2  87.92 80.16 83.68 81.52 74.96]


**Bonus Step: Build a Final Grades Table**

In [None]:
import pandas as pd

# Column 0 of the homework table (header row skipped) is taken as the student names
student_names = hwd[1:, 0]

# Score columns in display order; each is rounded to two decimals below
score_columns = {
    "Homework (%)": hw_w,
    "Tests (%)": test_w,
    "Quizzes (%)": quiz_w,
    "Project (%)": proj_w,
    "Final Grade": final_grade,
}

# Assemble the table: names first, then the rounded scores
table_data = {"Student": student_names}
for label, scores in score_columns.items():
    table_data[label] = scores.round(2)
df_grades = pd.DataFrame(table_data)

print(df_grades)

  Student  Homework (%)  Tests (%)  Quizzes (%)  Project (%)  Final Grade
0       A          92.0       94.0         90.0         90.0        92.20
1       B          80.4       90.0         88.0         95.0        87.92
2       C          73.2       83.0         80.0         85.0        80.16
3       D          83.6       87.5         94.0         71.0        83.68
4       E          88.4       74.5         88.0         82.0        81.52
5       F          71.2       80.0         76.0         70.0        74.96


**Part Two: Cleaning Goodreads Data**

**Step 1: Load the Dataset**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Read the pickled Goodreads DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
goodreads_path = 'lab2-T2-data.pkl'
df = pd.read_pickle(goodreads_path)

**Step 2: Explore the Data**

In [None]:
# Show the first three rows. A notebook only auto-displays a cell's final
# expression, so a bare df.head(3) in the middle of the cell produces no
# output; print it explicitly.
print(df.head(3))

# Summarize the frame: entry count, column names, non-null counts and dtypes
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           671 non-null    object
 1   subtitle        218 non-null    object
 2   series          122 non-null    object
 3   author          671 non-null    object
 4   my_rating       640 non-null    object
 5   avg_rating      671 non-null    object
 6   publisher       669 non-null    object
 7   binding         671 non-null    object
 8   pages           671 non-null    int64 
 9   year_published  671 non-null    object
 10  month_read      671 non-null    object
 11  month_read_num  671 non-null    int64 
 12  year_read       671 non-null    int64 
 13  bookshelf       671 non-null    object
dtypes: int64(3), object(11)
memory usage: 73.5+ KB


**Step 3: Check for Missing Values**

In [None]:
# Missing values in a single column. Printed explicitly: it is not the cell's
# last expression, so its value would otherwise be discarded.
print(df['my_rating'].isnull().sum())

# Missing values for every column, computed in one pass and printed as
# "<column> <count>" lines
for col, n_missing in df.isnull().sum().items():
    print(col, n_missing)

title 0
subtitle 453
series 549
author 0
my_rating 31
avg_rating 0
publisher 2
binding 0
pages 0
year_published 0
month_read 0
month_read_num 0
year_read 0
bookshelf 0


**Step 4: Drop Rows Missing Critical Fields**

In [None]:
# Columns that must be present for a row to be kept
critical_fields = ['my_rating', 'avg_rating', 'year_published']

# Discard every row with a missing value in any critical column
df1 = df.dropna(subset=critical_fields)

**Step 5: Fill Missing Non-Critical Fields**

In [None]:
# Replace NaN in the optional text columns with a single-space placeholder.
# fillna with a per-column dict returns a new frame, which is rebound to df1;
# this avoids assigning through .loc into the result of dropna(), a pattern
# that can trigger pandas' SettingWithCopyWarning.
df1 = df1.fillna({'subtitle': ' ', 'series': ' ', 'publisher': ' '})

**Step 6: Remove Invalid Records**

In [None]:
# year_published and avg_rating are object-dtype columns, so a direct numeric
# comparison fails on any string value. Coerce them to numbers for the checks
# only: unparseable values become NaN and fail both comparisons. The columns
# stored in df1 are left as they are.
year_num = pd.to_numeric(df1['year_published'], errors='coerce')
avg_num = pd.to_numeric(df1['avg_rating'], errors='coerce')

# Keep rows with a positive publication year and an average rating in [0, 5].
# NOTE(review): my_rating is not range-checked here; confirm whether it should be.
# .copy() gives later cells an independent frame rather than a slice of the old one.
df1 = df1[(year_num > 0) & (avg_num >= 0) & (avg_num <= 5)].copy()


**Step 7: Group Data by Author**

In [None]:
# Group the cleaned rows by author
group_author = df1.groupby('author')

# Number of rows (books in this dataset) for each author
df_num = group_author['author'].count()

# Author with the highest count. idxmax returns the first label holding the
# maximum, which is the same winner a first-strictly-greater scan would pick.
if df_num.empty:
    # Nothing to rank: fall back to an empty name and a zero count
    max_author, max_count = '', 0
else:
    max_author = df_num.idxmax()
    max_count = df_num.loc[max_author]

print(max_author, max_count)


Grisham, John 22


**Step 8: Display Books by Most Prolific Author**

In [None]:
# Select the top author's rows directly instead of walking every author group
top_books = df1[df1['author'] == max_author]

# Print the author once, then one "title year" line per book, in row order.
# Nothing is printed when no row matches max_author.
if not top_books.empty:
    print(max_author)
    # Iterate only the two needed columns; avoids building a Series per row
    for title, year in zip(top_books['title'], top_books['year_published']):
        print(title, year)


Grisham, John
A Time to Kill 1989
Calico Joe 2012
Ford County 2008
Playing for Pizza 2007
Rogue Lawyer 2015
Sycamore Row 2013
The Associate 2009
The Brethren 2000
The Broker 2005
The Chamber 1994
The Confession 2010
The King of Torts 2003
The Last Juror 2004
The Litigators 2011
The Partner 1997
The Racketeer 2012
The Rooster Bar 2018
The Street Lawyer 1998
The Summons 2002
The Testament 1999
The Whistler 2016
Theodore Boone 2010
