-
Notifications
You must be signed in to change notification settings - Fork 0
/
RateGradeInfl.R
107 lines (79 loc) · 7.54 KB
/
RateGradeInfl.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(ggthemes)
library(car)
library(boot)
library(stats)
# Point the session at the project folder that contains ./AggData.
# NOTE: this path is machine-specific; the original author flagged it to be
# commented out when running elsewhere.
setwd("~/Documents/DataScienceAcademy/Python/ScrapingProject/")

# Read the four input tables with fread().
DOEdata   <- fread("./AggData/DOEschoolData.csv") # DOE College Scorecard data
inflation <- fread("./AggData/inflation.csv")     # data on grade inflation
schools   <- fread("./AggData/schools.csv")       # information/classification on schools
ratings   <- fread("./AggData/ratings.csv")       # Rate My Prof reviews
# Histograms of the raw per-review scores, one for quality (`rat`) and one for
# difficulty (`diff`). Each plot is built and then printed.

# Quality ratings.
NumRate <- ggplot(data = ratings) +
  geom_histogram(
    aes(rat),
    col = "red", fill = "blue",
    binwidth = 0.5, bins = 9
  ) +
  labs(
    x = "Quality Ratings",
    y = "Frequency of Occurrence",
    title = "Quality: 138,747 Reviews"
  ) +
  theme(plot.title = element_text(size = 16, face = "bold", color = "darkgreen"))
NumRate

# Difficulty ratings.
NumDiff <- ggplot(data = ratings) +
  geom_histogram(
    aes(diff),
    col = "red", fill = "blue",
    binwidth = 1, bins = 5
  ) +
  labs(
    x = "Difficulty Ratings",
    y = "Frequency of Occurrence",
    title = "Difficulty: 138,747 Reviews"
  ) +
  theme(plot.title = element_text(size = 16, face = "bold", color = "darkgreen"))
NumDiff
# Collapse the individual reviews into summary rows for more detailed analysis.
# Each summary carries the mean quality, the mean difficulty and the number of
# reviews (N) in the group.

# One row per (name, hot, school, dept) combination, i.e. per instructor.
aggRatings <- summarise(
  group_by(ratings, name, hot, school, dept),
  meanRat = mean(rat),
  meanDiff = mean(diff),
  N = n()
)

# One row per level of `hot`.
aggRatingsHot <- summarise(
  group_by(ratings, as.factor(hot)),
  meanRat = mean(rat),
  meanDiff = mean(diff),
  N = n()
)
# Explore the relationship between perceived quality and difficulty.
# Scatter of the per-instructor means (point colour = number of reviews, N)
# with a cubic-polynomial `lm` trend line drawn over the points.
#
# Every `+` is placed at the END of a line: a line that starts with `+` is
# parsed by R as a new, separate expression, so a theme written that way is
# never added to the plot.
# `N` is referenced by bare column name inside aes() rather than via
# `aggRatings$N`, so the mapping is evaluated against the layer's data.
RateDiff <- ggplot(data = aggRatings, aes(x = meanDiff, y = meanRat)) +
  geom_point(aes(color = N), shape = 7, alpha = 0.4) +
  labs(
    x = "Mean Difficulty Rating per Instructor",
    y = "Mean Quality Rating per Instructor",
    title = "138,747 Rate My Professor Reviews Averaged by Instructor",
    color = "# Reviews"
  ) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3)) +
  theme(plot.title = element_text(size = 16, face = "bold", color = "darkgreen"))
RateDiff
# Review-level linear model of quality (`rat`), followed by a correlation test
# between quality and difficulty.
# NOTE(review): inside an R formula `-` removes a term instead of subtracting,
# so `(hot - diff)` reduces to `hot` and `diff` is not part of this model.
# Confirm whether `hot + diff` (or `hot * diff`) was intended.
EZQ <- lm(formula = rat ~ (hot - diff), data = ratings)
summary(EZQ)

# The result is stored but not printed here.
RatDiffTest <- cor.test(x = ratings$rat, y = ratings$diff)
# Per-instructor scatter of mean quality against mean difficulty, coloured by
# the `hot` column (converted to a factor so the manual blue/red scale applies).
# `hot` is referenced by bare column name inside aes() rather than via
# `aggRatings$hot`, so the mapping is evaluated against the plot's data.
HotNot <- ggplot(
  data = aggRatings,
  aes(meanDiff, meanRat, color = factor(hot))
) +
  geom_point(shape = 7, alpha = 0.7) +
  labs(
    x = "Mean Difficulty Rating per Instructor",
    y = "Mean Quality Rating per Instructor",
    title = "138,747 Rate My Professor Reviews Averaged by Instructor",
    color = "Hot?"
  ) +
  theme(plot.title = element_text(size = 12, face = "bold", color = "darkgreen")) +
  scale_colour_manual(name = "Hot \n or \n Not?", values = c("blue", "red"))
HotNot
# Overlaid histograms of the raw review scores, with the outline colour split
# by the `hot` column (position = "identity" draws the groups on top of each
# other instead of stacking them).
# NOTE(review): scale_colour_manual() is a discrete scale; this assumes `hot`
# is read as a discrete column (logical/character/factor) - confirm, and wrap
# it as factor(hot) if it turns out to be numeric.

# Quality ratings.
HistHotRat <- ggplot(ratings, aes(x = rat, color = hot)) +
  geom_histogram(
    binwidth = 0.5, bins = 9,
    alpha = 0.5, position = "identity"
  ) +
  labs(
    x = "Quality Rating",
    y = "Frequency of Occurrence",
    title = "Quality: 138,747 Reviews"
  ) +
  theme(plot.title = element_text(size = 16, face = "bold", color = "darkgreen")) +
  scale_colour_manual(name = "Hot \n or \n Not?", values = c("blue", "red"))
HistHotRat

# Difficulty ratings.
HistHotDiff <- ggplot(ratings, aes(x = diff, color = hot)) +
  geom_histogram(
    alpha = 0.5, position = "identity",
    binwidth = 1, bins = 5
  ) +
  labs(
    x = "Difficulty Rating",
    y = "Frequency of Occurrence",
    title = "Difficulty: 138,747 Reviews"
  ) +
  theme(plot.title = element_text(size = 16, face = "bold", color = "darkgreen")) +
  scale_colour_manual(name = "Hot \n or \n Not?", values = c("blue", "red"))
HistHotDiff
# School-level DOE figures: delta = exp - rev, then the NA-tolerant means of
# `rate` and `SAT` for each (school, delta) pair.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
DoeAgg <- DOEdata %>%
  mutate(delta = (exp - rev)) %>%
  group_by(school, delta) %>%
  summarise(
    Rate = mean(rate, na.rm = TRUE),
    SAT = mean(SAT, na.rm = TRUE)
  )

# Mean quality, mean difficulty and review count per school.
SchoolaggRatings <- ratings %>%
  group_by(school) %>%
  summarise(meanRat = mean(rat), meanDiff = mean(diff), N = n())

# Keep only the schools present in both tables.
SchoolSpendComp <- inner_join(SchoolaggRatings, DoeAgg, by = "school")
# School-level scatter plots of the mean ratings against `delta`, with point
# size and fill both mapped to `Rate`.
# The gradient is attached to the FILL scale: the plots map `fill = Rate` and
# never map `colour`, so a colour gradient scale would have no effect on the
# shape-23 (filled diamond) points.

# Mean quality rating vs. delta.
SpendOut1 <- ggplot(
  data = SchoolSpendComp,
  aes(x = delta, y = meanRat, label = school, size = Rate, fill = Rate)
) +
  geom_point(shape = 23, alpha = 0.7) +
  labs(
    x = "Difference Between Instructional Expenditures \n and Costs per FTE Equivalent",
    y = "Mean Quality Rating per Instructor",
    title = "Examining the Influence of Net \n Instructional Expenditures on \n Quality Rating"
  ) +
  theme(plot.title = element_text(size = 16, face = "bold", color = "darkgreen")) +
  scale_fill_gradientn(colours = rainbow(3))
SpendOut1

# Mean difficulty rating vs. delta.
SpendOut2 <- ggplot(
  data = SchoolSpendComp,
  aes(x = delta, y = meanDiff, label = school, size = Rate, fill = Rate)
) +
  geom_point(shape = 23, alpha = 0.7) +
  labs(
    x = "Difference Between Instructional Expenditures \n and Costs per FTE Equivalent",
    y = "Mean Difficulty Rating per Instructor",
    title = "Examining the Influence of Net \n Instructional Expenditures on \n Difficulty Rating"
  ) +
  theme(plot.title = element_text(size = 16, face = "bold", color = "darkgreen")) +
  scale_fill_gradientn(colours = rainbow(3))
SpendOut2
# Simple school-level regressions of the mean ratings on `delta`.
model1 <- lm(meanRat ~ delta, data = SchoolSpendComp)  # quality
model2 <- lm(meanDiff ~ delta, data = SchoolSpendComp) # difficulty
model1
model2

# Influence plots, one per model. The difficulty plot is drawn from model2
# (the meanDiff regression) so that it matches its own title.
InfluenceRat <- influencePlot(
  model1,
  id.method = "identify",
  main = "Influence Plot for Model of \n Mean Quality Rating vs \n Net Instructional Expenditures",
  sub = "Circle size is proportial to Cook's Distance"
)
InfluenceDiff <- influencePlot(
  model2,
  id.method = "identify",
  main = "Influence Plot for Model of \n Mean Difficulty Rating vs \n Net Instructional Expenditures",
  sub = "Circle size is proportial to Cook's Distance"
)
# Mean GPA per school over the rows with year > 2005, ignoring missing GPAs.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
GradeInflRecent <- inflation %>%
  filter(year > 2005) %>%
  group_by(school) %>%
  summarise(meanGPA = mean(GPA, na.rm = TRUE))

# Attach the GPA figures to the school-level ratings/DOE table; only schools
# present in both tables are kept.
SchoolsGradesRat <- inner_join(SchoolSpendComp, GradeInflRecent, by = "school")
# School-level scatter plots of the mean ratings against mean GPA, each with a
# linear (`lm`) trend line.
# `size` and `fill` are passed as fixed per-row vectors taken from the `Rate`
# column (outside aes()), so they are not scaled and produce no legend.

# Mean quality rating vs. mean GPA.
InflGradesRat <- ggplot(SchoolsGradesRat, aes(x = meanGPA, y = meanRat)) +
  geom_point(
    shape = 7, alpha = 0.7,
    size = SchoolsGradesRat$Rate,
    fill = SchoolsGradesRat$Rate
  ) +
  labs(
    x = "Mean GPA",
    y = "Mean Quality Rating per Instructor",
    title = "Quality Rating and GPA"
  ) +
  theme(plot.title = element_text(size = 12, face = "bold", color = "darkgreen")) +
  geom_smooth(method = "lm")
InflGradesRat

# Mean difficulty rating vs. mean GPA.
InflGradesDiff <- ggplot(
  SchoolsGradesRat,
  aes(x = meanGPA, y = meanDiff, label = school)
) +
  geom_point(
    shape = 7, alpha = 0.7,
    size = SchoolsGradesRat$Rate,
    fill = SchoolsGradesRat$Rate
  ) +
  labs(
    x = "Mean GPA",
    y = "Mean Difficulty Rating per Instructor",
    title = "Difficulty Rating and GPA"
  ) +
  theme(plot.title = element_text(size = 12, face = "bold", color = "darkgreen")) +
  geom_smooth(method = "lm")
InflGradesDiff
# Mean quality and difficulty per department.
test <- ratings %>%
  group_by(dept) %>%
  summarise(meanRat = mean(rat), meanDiff = mean(diff))

### all below for future analyses
# Clean the department label: drop everything up to and including "the"
# (plus following whitespace) and the word "department", then trim trailing
# whitespace.
test$dept <- trimws(gsub(".*the\\s*|department*", "", test$dept), which = "right")

# Regex alternation of the physical-science / math keywords.
# It is defined BEFORE its first use, and built with paste() so that no line
# break can end up inside the pattern (a literal split across two source lines
# embeds a newline, which would stop "Nuclear" from ever matching).
PSM <- paste(
  c(
    "Actuarial", "Aerospace", "Astronomy", "Chemistry", "Engineering",
    "Computer", "Earth", "Electrical", "Industrial", "Macromolecular",
    "Materials", "Mathematics", "Nuclear", "Physics", "Statistics"
  ),
  collapse = "|"
)

# TRUE when the department label contains any of the keywords. Both columns
# are kept because the script creates both names.
test$class <- grepl(PSM, test$dept)
test$classif <- grepl(PSM, test$dept)
# Notched box plots comparing departments flagged by `classif` (TRUE/FALSE)
# on their mean ratings.

# Mean quality rating by classification.
PStestPlotRat <- ggplot(test, aes(x = classif, y = meanRat)) +
  geom_boxplot(notch = TRUE, color = "orange", fill = "blue", alpha = 0.2) +
  labs(
    x = "Physical Science/Math or Other",
    y = "Mean Quality Rating per Instructor",
    title = "Quality Rating and Disciplines"
  ) +
  theme(plot.title = element_text(size = 12, face = "bold", color = "darkgreen"))
PStestPlotRat

# Mean difficulty rating by classification.
PStestPlotDiff <- ggplot(test, aes(x = classif, y = meanDiff)) +
  geom_boxplot(notch = TRUE, color = "orange", fill = "blue", alpha = 0.2) +
  labs(
    x = "Physical Science/Math or Other",
    y = "Mean Difficulty Rating per Instructor",
    title = "Difficulty Rating and Disciplines"
  ) +
  theme(plot.title = element_text(size = 12, face = "bold", color = "darkgreen"))
PStestPlotDiff