# Topic Modeling을 통한 강의평 분석

### Topic 개수 정하기

In [1]:
import warnings
# Install a blanket "ignore" filter: no category, module or message is given,
# so every warning raised after this cell is silenced.
# NOTE(review): this also hides pandas/deprecation warnings from later cells;
# consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings(action='ignore')

In [2]:
# load libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
from utils import *

In [3]:
# Load the lecture-review CSV, using its first column as the DataFrame index.
# NOTE(review): the preview printed below still lists an 'Unnamed: 0' column,
# so the file appears to carry a second saved index column - confirm and drop
# it if it is not needed.
data = pd.read_csv('../data/2020LectureReview.csv', index_col=0)

In [4]:
# Add a 'COVID-19' period label per review: rows whose ReviewSemester is one of
# the two 2020 semesters are tagged 'after', every other value is tagged 'before'.
data['COVID-19'] = (
    data['ReviewSemester']
    .isin(['20년 1학기 수강자', '20년 2학기 수강자'])
    .map({True: 'after', False: 'before'})
)

In [5]:
# Display the first rows to check the loaded columns and the new 'COVID-19' label.
data.head()

Unnamed: 0,LectureUrl,LectureName,LectureProf,ReviewStar,ReviewSemester,ReviewText,COVID-19
0,https://yonsei.everytime.kr/lecture/view/3640,INTERNATIONAL MONEY AND CAPITAL MARKETS,함준호,100.0,20년 1학기 수강자,시험 잘 멋본 것 같은데 에이제로 떠서 놀람 감사합니다 교수님,after
1,https://yonsei.everytime.kr/lecture/view/3640,INTERNATIONAL MONEY AND CAPITAL MARKETS,함준호,80.0,20년 1학기 수강자,"에 뭐,,, 그냥 괜찮았는데 자습기간을 안 주고 전범위 시험본다그러니까 당황스러웠네...",after
2,https://yonsei.everytime.kr/lecture/view/3640,INTERNATIONAL MONEY AND CAPITAL MARKETS,함준호,80.0,19년 1학기 수강자,한국은행 금융통화위원을 지내고 오신 교수님이시라 그런지 간간히 들려주시는 교수님의 ...,before
3,https://yonsei.everytime.kr/lecture/view/3640,INTERNATIONAL MONEY AND CAPITAL MARKETS,함준호,100.0,19년 1학기 수강자,교수님 너무 똑똑하시고 피피티 위주로 수업하시기는 하는데 현장경험이 풍부하셔서 그런...,before
4,https://yonsei.everytime.kr/lecture/view/943471,학교교실의현장연구방법입문,박순용,80.0,20년 1학기 수강자,매주 온라인 강의를 늦게 올려주셔서 불편했지만 수업 내용은 무난하고 좋았습니다. 매...,after


## Task1 : 전체 강의평에 대하여 토픽 모델링 진행 후 코로나 전/후 강의평 토픽 비교

### Preprocessing

In [6]:
# Preprocess the whole dataset: Preprocessor (not defined in this file, presumably
# from utils) receives the frame plus its review texts as a plain list and returns
# two values, bound here to data_ and text_tokenized_.
data_, text_tokenized_ = Preprocessor(data, data['ReviewText'].tolist())

### Topic Modeling (LDA) for Choosing TopicNum

In [7]:
# LDA with N=5 topics on the tokenized reviews of the full dataset (text_tokenized_).
# ModelingLDA is not defined in this file (presumably from utils); the call yields
# two values, and corpus_/lda_ are rebound here, replacing any earlier run's results.
print('============ Topic List (N=5) ============')
corpus_, lda_ = ModelingLDA(text_tokenized_, N=5)

(0, '0.042*"과제" + 0.017*"발표" + 0.017*"시간" + 0.013*"시험" + 0.013*"기말"')
(1, '0.016*"그냥" + 0.014*"모르" + 0.013*"사람" + 0.011*"학기" + 0.011*"시간"')
(2, '0.022*"생각" + 0.010*"학생" + 0.008*"배우" + 0.008*"지식" + 0.008*"이해"')
(3, '0.045*"시험" + 0.028*"문제" + 0.020*"공부" + 0.012*"시간" + 0.010*"열심히"')
(4, '0.032*"학생" + 0.028*"학점" + 0.019*"최고" + 0.018*"학기" + 0.017*"재밌"')


In [8]:
# LDA with N=10 topics on the tokenized reviews of the full dataset (text_tokenized_).
# ModelingLDA is not defined in this file (presumably from utils); the call yields
# two values, and corpus_/lda_ are rebound here, replacing any earlier run's results.
print('============ Topic List (N=10) ============')
corpus_, lda_ = ModelingLDA(text_tokenized_, N=10)

(0, '0.036*"그냥" + 0.023*"모르" + 0.019*"시간" + 0.019*"ㅋㅋ" + 0.015*"ㅋㅋㅋ"')
(1, '0.047*"프로젝트" + 0.027*"조교" + 0.027*"과제" + 0.024*"코딩" + 0.024*"파이썬"')
(2, '0.023*"생각" + 0.013*"학생" + 0.012*"부분" + 0.010*"이해" + 0.009*"아니"')
(3, '0.041*"발표" + 0.017*"시간" + 0.015*"생각" + 0.015*"주제" + 0.012*"학생"')
(4, '0.094*"과제" + 0.030*"기말" + 0.029*"중간" + 0.020*"시험" + 0.020*"학기"')
(5, '0.039*"학점" + 0.035*"시험" + 0.026*"열심히" + 0.023*"과제" + 0.021*"공부"')
(6, '0.033*"재밌" + 0.027*"기독교" + 0.027*"흥미" + 0.024*"학생" + 0.017*"학기"')
(7, '0.060*"시험" + 0.051*"문제" + 0.019*"공부" + 0.016*"시간" + 0.015*"기말"')
(8, '0.035*"수강" + 0.032*"과목" + 0.032*"학기" + 0.029*"학점" + 0.015*"전공"')
(9, '0.073*"영어" + 0.059*"질문" + 0.050*"학생" + 0.044*"설명" + 0.032*"친절"')


In [9]:
# LDA with N=15 topics on the tokenized reviews of the full dataset (text_tokenized_).
# ModelingLDA is not defined in this file (presumably from utils); the call yields
# two values, and corpus_/lda_ are rebound here, replacing any earlier run's results.
print('============ Topic List (N=15) ============')
corpus_, lda_ = ModelingLDA(text_tokenized_, N=15)

(0, '0.084*"학생" + 0.037*"질문" + 0.025*"학기" + 0.019*"친절" + 0.017*"영상"')
(1, '0.183*"영어" + 0.037*"실력" + 0.036*"비율" + 0.027*"채워" + 0.025*"발음"')
(2, '0.034*"설명" + 0.030*"이해" + 0.014*"공부" + 0.013*"질문" + 0.013*"부분"')
(3, '0.062*"ㅋㅋ" + 0.050*"ㅋㅋㅋ" + 0.050*"그냥" + 0.027*"마세요" + 0.025*"학점"')
(4, '0.050*"발표" + 0.027*"과제" + 0.022*"시간" + 0.019*"조별" + 0.017*"주제"')
(5, '0.069*"기말" + 0.067*"중간" + 0.062*"과제" + 0.034*"퀴즈" + 0.021*"점수"')
(6, '0.086*"최고" + 0.072*"기독교" + 0.061*"교양" + 0.053*"재미있" + 0.051*"흥미"')
(7, '0.073*"과제" + 0.056*"학점" + 0.042*"시험" + 0.026*"모임" + 0.026*"부담"')
(8, '0.086*"채플" + 0.036*"계절" + 0.035*"패스" + 0.033*"갑니다" + 0.032*"그냥"')
(9, '0.048*"시험" + 0.041*"공부" + 0.029*"열심히" + 0.018*"그냥" + 0.016*"피피티"')
(10, '0.030*"생각" + 0.015*"학생" + 0.009*"보다" + 0.009*"지식" + 0.009*"아니"')
(11, '0.107*"ㅎㅎ" + 0.090*"ㅠㅠ" + 0.056*"개꿀" + 0.041*"천사" + 0.040*"이쁠"')
(12, '0.093*"문제" + 0.081*"시험" + 0.021*"족보" + 0.018*"시간" + 0.018*"나오"')
(13, '0.069*"시간" + 0.041*"학기" + 0.014*"학생" + 0.012*"내내" + 0.012*"다음"')
(14, '0.0

In [10]:
# LDA with N=20 topics on the tokenized reviews of the full dataset (text_tokenized_).
# ModelingLDA is not defined in this file (presumably from utils); the call yields
# two values, and corpus_/lda_ are rebound here, replacing any earlier run's results.
print('============ Topic List (N=20) ============')
corpus_, lda_ = ModelingLDA(text_tokenized_, N=20)

(0, '0.045*"채점" + 0.044*"기준" + 0.023*"조교" + 0.019*"단어" + 0.017*"점수"')
(1, '0.243*"과제" + 0.038*"부담" + 0.036*"조별" + 0.035*"모임" + 0.027*"시험"')
(2, '0.074*"재밌" + 0.055*"배우" + 0.039*"학점" + 0.033*"추천" + 0.027*"관심"')
(3, '0.098*"시험" + 0.045*"공부" + 0.034*"족보" + 0.031*"피피티" + 0.028*"필기"')
(4, '0.057*"모르" + 0.041*"영어" + 0.025*"그냥" + 0.018*"별로" + 0.015*"근데"')
(5, '0.075*"채플" + 0.055*"대면" + 0.050*"영상" + 0.046*"비대" + 0.039*"온라인"')
(6, '0.029*"생각" + 0.016*"학생" + 0.009*"보다" + 0.009*"학기" + 0.009*"아니"')
(7, '0.016*"최악" + 0.016*"학교" + 0.014*"이런" + 0.011*"더라" + 0.011*"강연"')
(8, '0.045*"학점" + 0.036*"과목" + 0.030*"사람" + 0.030*"그냥" + 0.023*"다른"')
(9, '0.103*"시간" + 0.033*"출석" + 0.020*"출결" + 0.017*"그냥" + 0.010*"으면"')
(10, '0.071*"문제" + 0.055*"시험" + 0.022*"공부" + 0.018*"이해" + 0.016*"설명"')
(11, '0.094*"흥미" + 0.053*"갑니다" + 0.049*"실험" + 0.043*"지정" + 0.038*"완벽"')
(12, '0.083*"학생" + 0.054*"최고" + 0.051*"학기" + 0.028*"ㅎㅎ" + 0.026*"감사"')
(13, '0.125*"질문" + 0.109*"친절" + 0.070*"주심" + 0.057*"설명" + 0.042*"착하"')
(14, '0.118*"

## Task2 : 코로나 전/후 강의평에 대하여 각각 토픽 모델링 진행 후 코로나 전/후 강의평 토픽 비교

### Preprocessing

In [11]:
# Split the reviews into two independent frames by their 'COVID-19' label.
# .copy() detaches each subset from `data`: the subsets are handed to Preprocessor
# and rebound afterwards, so if that helper assigns columns (not visible here -
# TODO confirm) the write lands on an unambiguous, standalone frame instead of a
# selection of `data`, whose chained-assignment warning would be silenced by the
# blanket warnings filter set at the top of this notebook.
before_data = data[data['COVID-19'] == 'before'].copy()
after_data = data[data['COVID-19'] == 'after'].copy()

In [12]:
# Pull the raw review texts of each period out of the frames as plain Python lists.
before_text = before_data['ReviewText'].tolist()
after_text = after_data['ReviewText'].tolist()

In [13]:
# Run Preprocessor (not defined in this file, presumably from utils) on each period
# separately. Each call returns two values: the first rebinds before_data/after_data,
# the second is kept as the tokenized text that the LDA cells below consume.
before_data, before_text_tokenized = Preprocessor(before_data, before_text)
after_data, after_text_tokenized = Preprocessor(after_data, after_text)

### Topic Modeling (LDA)

In [14]:
# LDA with N=5 topics on the tokenized reviews labelled 'before' COVID-19.
# ModelingLDA is not defined in this file (presumably from utils); before_corpus and
# before_lda are rebound here, replacing any earlier run's results.
print('========== Topic List before COVID-19 (N=5) ==========')
before_corpus, before_lda = ModelingLDA(before_text_tokenized, N=5)

(0, '0.038*"과제" + 0.021*"시간" + 0.019*"발표" + 0.016*"모임" + 0.016*"시험"')
(1, '0.028*"그냥" + 0.021*"족보" + 0.020*"학점" + 0.013*"ㅋㅋ" + 0.012*"시험"')
(2, '0.019*"생각" + 0.013*"학생" + 0.009*"아니" + 0.007*"학기" + 0.007*"사람"')
(3, '0.020*"학생" + 0.017*"재밌" + 0.014*"최고" + 0.013*"학점" + 0.012*"학기"')
(4, '0.047*"시험" + 0.032*"문제" + 0.021*"공부" + 0.011*"시간" + 0.010*"설명"')


In [15]:
# LDA with N=10 topics on the tokenized reviews labelled 'before' COVID-19.
# ModelingLDA is not defined in this file (presumably from utils); before_corpus and
# before_lda are rebound here, replacing any earlier run's results.
print('========== Topic List before COVID-19 (N=10) ==========')
before_corpus, before_lda = ModelingLDA(before_text_tokenized, N=10)

(0, '0.021*"학점" + 0.021*"사람" + 0.020*"그냥" + 0.016*"모르" + 0.016*"아니"')
(1, '0.065*"시험" + 0.035*"공부" + 0.027*"문제" + 0.018*"족보" + 0.017*"열심히"')
(2, '0.102*"문제" + 0.061*"영어" + 0.045*"시험" + 0.020*"연습" + 0.013*"진도"')
(3, '0.086*"과제" + 0.027*"출석" + 0.027*"시험" + 0.027*"학점" + 0.019*"모임"')
(4, '0.022*"관심" + 0.016*"생각" + 0.015*"지식" + 0.015*"흥미" + 0.011*"역사"')
(5, '0.050*"발표" + 0.028*"과제" + 0.024*"모임" + 0.020*"조별" + 0.015*"보고서"')
(6, '0.025*"설명" + 0.024*"이해" + 0.016*"부분" + 0.013*"공부" + 0.011*"생각"')
(7, '0.041*"시간" + 0.024*"학기" + 0.015*"생각" + 0.014*"학생" + 0.014*"채플"')
(8, '0.038*"학생" + 0.020*"열심히" + 0.019*"친절" + 0.017*"재밌" + 0.016*"ㅎㅎ"')
(9, '0.036*"중간" + 0.036*"기말" + 0.031*"시험" + 0.026*"점수" + 0.021*"과제"')


In [16]:
# LDA with N=15 topics on the tokenized reviews labelled 'before' COVID-19.
# ModelingLDA is not defined in this file (presumably from utils); before_corpus and
# before_lda are rebound here, replacing any earlier run's results.
print('========== Topic List before COVID-19 (N=15) ==========')
before_corpus, before_lda = ModelingLDA(before_text_tokenized, N=15)

(0, '0.044*"그냥" + 0.040*"모르" + 0.024*"학점" + 0.019*"아니" + 0.019*"사람"')
(1, '0.051*"경제학" + 0.047*"경제" + 0.043*"단점" + 0.039*"장점" + 0.018*"유일"')
(2, '0.097*"출석" + 0.068*"출결" + 0.025*"결석" + 0.021*"부르" + 0.020*"지각"')
(3, '0.042*"학기" + 0.039*"학생" + 0.015*"ㅎㅎ" + 0.014*"사랑" + 0.010*"감사"')
(4, '0.034*"시간" + 0.027*"발표" + 0.024*"주제" + 0.018*"보고서" + 0.016*"토론"')
(5, '0.072*"과제" + 0.029*"모임" + 0.028*"시험" + 0.021*"발표" + 0.018*"시간"')
(6, '0.062*"친절" + 0.044*"질문" + 0.042*"설명" + 0.029*"착하" + 0.023*"ㅠㅠ"')
(7, '0.180*"ㅋㅋ" + 0.144*"ㅋㅋㅋ" + 0.090*"개꿀" + 0.021*"포인트" + 0.016*"들으"')
(8, '0.023*"흥미" + 0.022*"관심" + 0.020*"지식" + 0.016*"재밌" + 0.015*"추천"')
(9, '0.062*"최고" + 0.023*"그저" + 0.021*"음악" + 0.019*"학년" + 0.018*"교양"')
(10, '0.060*"시험" + 0.051*"공부" + 0.033*"열심히" + 0.020*"학점" + 0.018*"그냥"')
(11, '0.102*"중간" + 0.098*"기말" + 0.061*"퀴즈" + 0.030*"학점" + 0.026*"점수"')
(12, '0.055*"영어" + 0.037*"시간" + 0.026*"채플" + 0.020*"수강" + 0.018*"학기"')
(13, '0.045*"생각" + 0.039*"학생" + 0.019*"사람" + 0.018*"아니" + 0.015*"본인"')
(14, '0.062

In [17]:
# LDA with N=20 topics on the tokenized reviews labelled 'before' COVID-19.
# ModelingLDA is not defined in this file (presumably from utils); before_corpus and
# before_lda are rebound here, replacing any earlier run's results.
print('========== Topic List before COVID-19 (N=20) ==========')
before_corpus, before_lda = ModelingLDA(before_text_tokenized, N=20)

(0, '0.031*"지식" + 0.017*"역사" + 0.015*"관심" + 0.010*"텍스트" + 0.010*"분야"')
(1, '0.067*"학생" + 0.056*"질문" + 0.048*"친절" + 0.032*"설명" + 0.019*"착하"')
(2, '0.121*"무난" + 0.088*"채플" + 0.087*"그냥" + 0.034*"패스" + 0.028*"채우"')
(3, '0.057*"공부" + 0.045*"시험" + 0.031*"열심히" + 0.017*"어렵" + 0.017*"학점"')
(4, '0.038*"문제" + 0.037*"시험" + 0.030*"부분" + 0.029*"설명" + 0.023*"이해"')
(5, '0.031*"생각" + 0.028*"학생" + 0.012*"학기" + 0.011*"시간" + 0.010*"아니"')
(6, '0.149*"ㅎㅎ" + 0.031*"ㅎㅎㅎ" + 0.022*"힐링" + 0.019*"노래" + 0.016*"여러분"')
(7, '0.077*"발표" + 0.032*"기말" + 0.027*"시간" + 0.027*"보고서" + 0.025*"토론"')
(8, '0.047*"최고" + 0.042*"과목" + 0.038*"학점" + 0.033*"전공" + 0.032*"배우"')
(9, '0.162*"문제" + 0.089*"시험" + 0.073*"족보" + 0.033*"나오" + 0.023*"연습"')
(10, '0.049*"모르" + 0.042*"그냥" + 0.024*"사람" + 0.022*"아니" + 0.019*"학점"')
(11, '0.058*"생각" + 0.044*"재밌" + 0.036*"흥미" + 0.034*"나름" + 0.031*"학점"')
(12, '0.074*"시험" + 0.033*"피피티" + 0.030*"외우" + 0.030*"필기" + 0.027*"공부"')
(13, '0.092*"출석" + 0.074*"시간" + 0.064*"출결" + 0.024*"결석" + 0.020*"부르"')
(14, '0.07

In [18]:
# LDA with N=5 topics on the tokenized reviews labelled 'after' COVID-19.
# ModelingLDA is not defined in this file (presumably from utils); after_corpus and
# after_lda are rebound here, replacing any earlier run's results.
print('========== Topic List after COVID-19 (N=5) ==========')
after_corpus, after_lda = ModelingLDA(after_text_tokenized, N=5)

(0, '0.042*"시험" + 0.030*"문제" + 0.016*"과제" + 0.015*"공부" + 0.014*"퀴즈"')
(1, '0.025*"학생" + 0.024*"학점" + 0.019*"생각" + 0.013*"학기" + 0.012*"과제"')
(2, '0.015*"모르" + 0.014*"학생" + 0.012*"그냥" + 0.010*"생각" + 0.010*"아니"')
(3, '0.034*"과제" + 0.017*"시간" + 0.015*"기말" + 0.015*"발표" + 0.014*"중간"')
(4, '0.032*"학점" + 0.023*"그냥" + 0.020*"학기" + 0.015*"최고" + 0.012*"비대"')


In [19]:
# LDA with N=10 topics on the tokenized reviews labelled 'after' COVID-19.
# ModelingLDA is not defined in this file (presumably from utils); after_corpus and
# after_lda are rebound here, replacing any earlier run's results.
print('========== Topic List after COVID-19 (N=10) ==========')
after_corpus, after_lda = ModelingLDA(after_text_tokenized, N=10)

(0, '0.022*"학점" + 0.020*"열심히" + 0.019*"발표" + 0.017*"힘들" + 0.014*"점수"')
(1, '0.040*"학생" + 0.030*"학점" + 0.028*"학기" + 0.024*"과제" + 0.017*"시험"')
(2, '0.044*"기말" + 0.042*"중간" + 0.027*"레포트" + 0.022*"점수" + 0.015*"보고서"')
(3, '0.038*"영상" + 0.010*"퀄리티" + 0.009*"올려" + 0.009*"역학" + 0.008*"계속"')
(4, '0.045*"기독교" + 0.020*"코딩" + 0.014*"파이썬" + 0.014*"패스" + 0.012*"최고"')
(5, '0.058*"문제" + 0.045*"시험" + 0.016*"공부" + 0.016*"시간" + 0.016*"퀴즈"')
(6, '0.027*"생각" + 0.015*"주제" + 0.011*"진행" + 0.011*"채플" + 0.010*"개인"')
(7, '0.102*"과제" + 0.027*"시간" + 0.022*"출석" + 0.021*"기말" + 0.020*"매주"')
(8, '0.034*"시험" + 0.028*"학점" + 0.026*"공부" + 0.023*"그냥" + 0.013*"과목"')
(9, '0.019*"모르" + 0.017*"학기" + 0.017*"학생" + 0.013*"평가" + 0.011*"그냥"')


In [20]:
# LDA with N=15 topics on the tokenized reviews labelled 'after' COVID-19.
# ModelingLDA is not defined in this file (presumably from utils); after_corpus and
# after_lda are rebound here, replacing any earlier run's results.
print('========== Topic List after COVID-19 (N=15) ==========')
after_corpus, after_lda = ModelingLDA(after_text_tokenized, N=15)

(0, '0.062*"문제" + 0.061*"시험" + 0.021*"공부" + 0.014*"퀴즈" + 0.013*"시간"')
(1, '0.051*"최고" + 0.038*"학기" + 0.033*"그저" + 0.023*"학점" + 0.023*"수강"')
(2, '0.061*"과제" + 0.036*"출석" + 0.026*"매주" + 0.025*"제출" + 0.025*"영상"')
(3, '0.067*"과제" + 0.053*"프로젝트" + 0.026*"코딩" + 0.018*"파이썬" + 0.018*"조교"')
(4, '0.021*"단어" + 0.017*"인가" + 0.012*"영어" + 0.011*"전공" + 0.011*"친구"')
(5, '0.030*"발표" + 0.025*"과제" + 0.016*"조별" + 0.015*"생각" + 0.013*"시간"')
(6, '0.046*"성적" + 0.045*"기준" + 0.041*"학점" + 0.041*"평가" + 0.035*"학기"')
(7, '0.062*"그냥" + 0.061*"ㅋㅋ" + 0.046*"ㅋㅋㅋ" + 0.045*"채플" + 0.039*"개꿀"')
(8, '0.055*"기말" + 0.051*"중간" + 0.049*"과제" + 0.035*"시험" + 0.022*"학점"')
(9, '0.024*"모르" + 0.023*"그냥" + 0.019*"학생" + 0.018*"아니" + 0.017*"사람"')
(10, '0.075*"학생" + 0.024*"학기" + 0.022*"배려" + 0.021*"부담" + 0.019*"학점"')
(11, '0.056*"학점" + 0.026*"과제" + 0.021*"과목" + 0.020*"힘들" + 0.019*"공부"')
(12, '0.105*"시간" + 0.015*"투자" + 0.011*"배우" + 0.011*"생각" + 0.010*"가지"')
(13, '0.025*"생각" + 0.016*"지식" + 0.015*"관심" + 0.012*"영화" + 0.011*"역사"')
(14, '0.099*

In [21]:
# LDA with N=20 topics on the tokenized reviews labelled 'after' COVID-19.
# ModelingLDA is not defined in this file (presumably from utils); after_corpus and
# after_lda are rebound here, replacing any earlier run's results.
print('========== Topic List after COVID-19 (N=20) ==========')
after_corpus, after_lda = ModelingLDA(after_text_tokenized, N=20)

(0, '0.077*"과제" + 0.074*"기말" + 0.066*"중간" + 0.036*"대체" + 0.032*"출석"')
(1, '0.034*"아니" + 0.032*"사람" + 0.027*"학점" + 0.025*"생각" + 0.016*"그냥"')
(2, '0.034*"모르" + 0.023*"그냥" + 0.018*"건지" + 0.018*"최악" + 0.015*"다가"')
(3, '0.119*"코로나" + 0.105*"학기" + 0.062*"온라인" + 0.032*"조교" + 0.022*"비대"')
(4, '0.034*"생각" + 0.024*"교양" + 0.023*"보다" + 0.022*"과목" + 0.018*"전공"')
(5, '0.043*"그저" + 0.042*"배우" + 0.040*"유익" + 0.036*"재밌" + 0.030*"철학"')
(6, '0.111*"학생" + 0.036*"질문" + 0.033*"생각" + 0.021*"배려" + 0.016*"노력"')
(7, '0.027*"시간" + 0.024*"프로젝트" + 0.014*"초반" + 0.013*"만들" + 0.012*"화학"')
(8, '0.049*"문제" + 0.029*"시험" + 0.019*"시간" + 0.012*"퀴즈" + 0.011*"설명"')
(9, '0.055*"채플" + 0.032*"코딩" + 0.026*"그냥" + 0.025*"물리" + 0.022*"파이썬"')
(10, '0.053*"과목" + 0.046*"최고" + 0.041*"ㅋㅋ" + 0.041*"학점" + 0.040*"그냥"')
(11, '0.064*"학점" + 0.058*"기준" + 0.048*"모르" + 0.040*"점수" + 0.035*"성적"')
(12, '0.035*"주제" + 0.021*"논문" + 0.018*"리딩" + 0.017*"생각" + 0.016*"분석"')
(13, '0.093*"대면" + 0.046*"비대" + 0.043*"시험" + 0.041*"학점" + 0.020*"깔끔"')
(14, '0.099