# Open Syllabus Data Preprocessing

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
spacy.prefer_gpu(0)
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
from tqdm import tqdm
from nltk.corpus import stopwords

# Gensim
import gensim
import gensim.corpora as corpora
from gensim import corpora, models
from gensim.utils import simple_preprocess
from collections import Counter
import re
from ast import literal_eval
import csv

  from .autonotebook import tqdm as notebook_tqdm


## Data

In [2]:
# Load the preprocessed Open Syllabus records from the pickle file.
data = pd.read_pickle('OA_Prep_230918.pkl')

# Restrict to rows whose year lies in 2004..2019 (both ends inclusive).
data2 = data[data['year'].between(2004, 2019)]

# Draw up to 30,000 rows per year without replacement; the fixed seed keeps
# the draw reproducible. Years with fewer rows contribute all of their rows.
sampled_df = (
    data2.groupby('year')
    .apply(lambda group: group.sample(n=min(len(group), 30000), replace=False, random_state=42))
    .reset_index(drop=True)
)

df = sampled_df

Unnamed: 0,id,display_name,title,title_highlight,code,institution,description,description_highlight,topic_outline,credits,learning_outcomes,year,class,text,tokens,category,graduate,original
0,4810363376059,Object-Oriented Programming and Reuse,Object-Oriented Programming and Reuse,,CS 412/512,"{'id': '19694', 'display_name': 'Villanova Uni...",This course will explore the techniques of obj...,,,,Students will gain facility in an object-orien...,2000,ComputerScience,Object-Oriented Programming and Reuse This cou...,"[object, programming, reuse, course, technique...",Engineering and Technology,1,True
1,1503238564723,C++ Language for Programmers,C++ Language for Programmers,,CSC 225,"{'id': '17779', 'display_name': 'DePaul Univer...","The student designs, codes and documents progr...",,,,,2000,ComputerScience,C++ Language for Programmers The student desig...,"[language, programmer, student, code, program,...",Engineering and Technology,-1,True
2,326417520379,Microcomputer Systems Programming,Microcomputer Systems Programming,,3720,"{'id': '20074', 'display_name': 'Utah State Un...",Â Advanced assembly language and systems prog...,,"I. Modern Architectures, Advanced Assembly Pro...",,,2000,ComputerScience,Microcomputer Systems Programming Â Advanced ...,"[microcomputer, system, programming, advanced,...",Engineering and Technology,0,True
3,9165460233360,Introduction to Computer Science I,Introduction to Computer Science I,,CS 10,"{'id': '17420', 'display_name': 'Westmont Coll...",This course provides an introduction to comput...,,,,,2000,ComputerScience,Introduction to Computer Science I This course...,"[introduction, computer, science, course, intr...",Engineering and Technology,-1,True
4,10411000737303,Object-Oriented Software Design and Construction,Object-Oriented Software Design and Construction,,CS 2704,"{'id': '20168', 'display_name': 'Virginia Tech...",The purpose of this course is to provide a mea...,,,3,,2000,ComputerScience,Object-Oriented Software Design and Constructi...,"[object, software, design, construction, purpo...",Engineering and Technology,-1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2426740,12077448060301,PUBLIC OPINION AND POLITICAL BEHAVIOR,PUBLIC OPINION AND POLITICAL BEHAVIOR,,POL3853,"{'id': '264', 'display_name': 'Yonsei Universi...",This course provides an overview of the nature...,,,3,,2022,Korea,PUBLIC OPINION AND POLITICAL BEHAVIOR This cou...,"[public, opinion, political, behavior, course,...",,0,True
2426741,6949257094087,STRATEGIC MANAGEMENT,STRATEGIC MANAGEMENT,,IEE3293,"{'id': '264', 'display_name': 'Yonsei Universi...",Strategic Management deals with the organizati...,,,3,Understanding how firms gain and sustain compe...,2022,Korea,STRATEGIC MANAGEMENT Strategic Management deal...,"[strategic, management, strategic, management,...",,0,True
2426742,12386685702458,User Experience and Psychology,User Experience and Psychology,,PSYC 493,"{'id': '582', 'display_name': 'Korea Universit...",The course provides a survey of Human Factors ...,,,,,2022,Korea,User Experience and Psychology The course prov...,"[user, experience, psychology, course, survey,...",,0,False
2426743,1417339211261,Macroeconomic Theory,Macroeconomic Theory,,ECON8022,"{'id': '696', 'display_name': 'Sungkyunkwan Un...",This course introduces contemporary theory for...,,,6,Be familiar with the main macroeconomic models...,2022,Korea,Macroeconomic Theory This course introduces co...,"[macroeconomic, theory, course, contemporary, ...",,1,True


In [5]:
# NLTK's English stopword list, plus a bare hyphen treated as a stopword.
stop_words = stopwords.words('english')
stop_words.append('-')

## Lemmatization

In [6]:
def lemmatization(texts, allowed_postags):  # https://spacy.io/api/annotation#
    """Lemmatize each text with the module-level spaCy pipeline ``nlp``.

    Args:
        texts: Iterable of documents to process. Entries that are ``None``
            or a float NaN (pandas' missing-value marker) are treated as
            missing and are not passed to the pipeline.
        allowed_postags: Collection of coarse part-of-speech tags, compared
            against ``token.pos_``; only matching tokens are kept. Pass
            ``None`` to keep every token.

    Returns:
        A list aligned one-to-one with ``texts``: a list of lemma strings
        for each processed document, and ``None`` for each missing one.
    """
    texts_out = []
    print('starting lemmatization \n')
    for sent in tqdm(texts):
        # NaN is the only float that is not equal to itself; checking it here
        # keeps missing values from reaching nlp(), which expects text.
        if sent is None or (isinstance(sent, float) and sent != sent):
            texts_out.append(None)
            continue
        doc = nlp(sent)
        if allowed_postags is None:
            texts_out.append([token.lemma_ for token in doc])
        else:
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Helper functions for stopword removal and bigram detection
def remove_stopwords(texts):
    """Tokenize every document and drop tokens found in ``stop_words``.

    Each document is converted with ``str()`` and tokenized by gensim's
    ``simple_preprocess``, so a document that is already a token list is
    re-tokenized from its printed representation.

    Args:
        texts: Iterable of documents (strings or token lists).

    Returns:
        A list with one list of remaining tokens per input document.
    """
    # Build the set once per call: set membership is O(1), whereas the
    # module-level ``stop_words`` list is scanned linearly for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigram(data_words, bigram_min_count):
    """Merge frequent word pairs in each document into single bigram tokens.

    The phrase model is trained on ``data_words`` exactly as passed in; it is
    then applied to the stopword-filtered version of the same documents.

    Args:
        data_words: Tokenized documents (one token list per document).
        bigram_min_count: Value forwarded to gensim ``Phrases`` as
            ``min_count``.

    Returns:
        A list with one token sequence per document, as produced by the
        phraser.
    """
    phrase_model = gensim.models.Phrases(data_words, min_count=bigram_min_count)
    phraser = gensim.models.phrases.Phraser(phrase_model)
    # Stopwords are removed first, then the phraser joins detected pairs.
    merged_docs = []
    for cleaned_doc in remove_stopwords(data_words):
        merged_docs.append(phraser[cleaned_doc])
    return merged_docs

### VERB

In [8]:
# Lemmatize the lower-cased learning outcomes, keeping verbs only.
lemmatized = lemmatization(df['learning_outcomes'].str.lower().tolist(),['VERB'])

# lemmatization() yields None for missing texts and the phrase model cannot
# iterate over None, so bigrams are built from the present documents only.
# min_count=2: only pairs seen at least twice are merged.
bigrams = make_bigram([doc for doc in lemmatized if doc is not None],2)

# Tokens containing "_" are merged bigrams; inspect the 50 most frequent
# (the value is only displayed when this is the last expression of a cell).
bigram_list = [word for sent in bigrams for word in sent if "_" in word]
Counter(bigram_list).most_common(50)

df['lemmatized_outcomes'] = lemmatized
# The full frame goes to its own file. It used to be written to
# 'lemmatized_learningoutcomes.csv', which the block below reopens in 'w'
# mode, so the frame was overwritten immediately after being saved.
df.to_csv("lemmatized_learningoutcomes_df.csv")

# One CSV row of lemmas per document; a missing document becomes an empty
# row so that row numbers stay aligned with df (csv.writer rejects None).
with open('lemmatized_learningoutcomes.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(doc if doc is not None else [] for doc in lemmatized)

lemmatization 시작 



100%|██████████| 480000/480000 [4:16:40<00:00, 31.17it/s]   


### NOUN, PROPN (proper noun), ADJ

In [42]:
# Lemmatize the lower-cased titles, keeping nouns, proper nouns (PROPN) and
# adjectives.
lemmatized = lemmatization(df['title'].str.lower().tolist(),['NOUN','PROPN','ADJ'])

import csv

# One CSV row of lemmas per title.
with open('lemmatized_title.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(lemmatized)

# NOTE(review): the file above is written relative to the working directory
# but read back through an absolute path; they are the same file only when
# the notebook runs from that directory - confirm.
file_path = "/mnt/hdd/bjkim/KISDI/OpenSyllabus/opensyllabus_paper/lemmatized_title.csv"

# newline='' lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader.
with open(file_path, mode='r', newline='') as file:
    csv_reader = csv.reader(file, delimiter=',')

    # Echo every row for a visual check of the written file.
    for row in csv_reader:
        print(row)


lemmatization 시작 



100%|██████████| 480000/480000 [2:11:26<00:00, 60.87it/s]  


Including VERB

In [None]:
# lemmatized2 = lemmatization(df['title'].str.lower().tolist(),['NOUN','PROPN','ADJ','VERB']) # nouns, proper nouns, adjectives and verbs

lemmatization 시작 



100%|██████████| 480000/480000 [2:05:16<00:00, 63.86it/s]   


In [None]:
# Detect title bigrams; 100 is passed to gensim as the min_count threshold.
bigrams = make_bigram(lemmatized,100)

# Merged bigrams are the tokens containing "_"; look at the 50 most common.
bigram_list = [token for doc in bigrams for token in doc if "_" in token]
Counter(bigram_list).most_common(50)

# Attach the token lists to the frame (the new Series is aligned on index).
df['title_tokens'] = pd.Series(bigrams)

NameError: name 'lemmatized' is not defined

In [None]:
# bigrams = make_bigram(lemmatized2,100)  # only merge word pairs that occur at least N times
# #
# # inspect the detected bigrams
# bigram_list = [word for sent in bigrams for word in sent if "_" in word]
# Counter(bigram_list).most_common(50) # top N

# df['title_tokens2'] = pd.Series(bigrams)

In [None]:
# Persist the current frame to tokenized_title2.pkl in the working directory.
df.to_pickle("tokenized_title2.pkl")

In [None]:
# Persist the current frame to tokenized_title.pkl in the working directory.
df.to_pickle("tokenized_title.pkl")

In [None]:
# Reload the pickled title-token frame from its absolute path and display it.
title = pd.read_pickle("/mnt/hdd/bjkim/KISDI/OpenSyllabus/opensyllabus_paper/tokenized_title2.pkl")
title

Unnamed: 0,id,display_name,title,title_highlight,code,institution,description,description_highlight,topic_outline,credits,learning_outcomes,year,class,text,tokens,category,graduate,original,title_tokens2
0,6262062326506,Multi-level Analysis of Survey Data,Multi-level Analysis of Survey Data,,SURVMETH 790,"{'id': '18429', 'display_name': 'University of...",Students are introduced to an increasingly com...,,,3,,2004,Mathematics,Multi-level Analysis of Survey Data Students a...,"[multi, level, analysis, survey, datum, studen...",Natural Sciences,1,True,"[multi, level, analysis, survey, datum]"
1,6966436956976,Class Guitar,Class Guitar,,MUS 130,"{'id': '19460', 'display_name': 'Oral Roberts ...",A lecture/demonstration teaching method design...,,,1,acquire the basic understanding of guitar. dev...,2004,Music,Class Guitar A lecture/demonstration teaching ...,"[class, guitar, lecture, demonstration, teachi...",Arts and Humanities,-1,False,"[class, guitar]"
2,11665131195894,Greek Historiography,Greek Historiography,,Greek 303,"{'id': '20152', 'display_name': 'University of...",This course is designed as a capstone experien...,,,,,2004,Classics,Greek Historiography This course is designed a...,"[greek, historiography, course, capstone, expe...",Arts and Humanities,0,False,"[greek, historiography]"
3,15590731307182,Evaluation and Impact of Public Organizations,Evaluation and Impact of Public Organizations,,PA 513,"{'id': '18641', 'display_name': 'Park Universi...",An examination of the evaluation of the output...,,,3,Identify an issue or problem and the program o...,2004,Education,Evaluation and Impact of Public Organizations ...,"[evaluation, impact, public, organization, exa...",Social Sciences,1,True,"[evaluation, impact, public, organization]"
4,1735166804574,URBAN GEOGRAPHY AND PLANNING,URBAN GEOGRAPHY AND PLANNING,,GEOG 3334,"{'id': '18506', 'display_name': 'University of...",This course examines processes of urbanization...,,,,,2004,Geography,URBAN GEOGRAPHY AND PLANNING This course exami...,"[urban, geography, planning, course, process, ...",Natural Sciences,0,True,"[urban, geography, planning]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479995,6382321406943,Feminist Practice of Oral History,Feminist Practice of Oral History,,WOMENSTD 425,"{'id': '18429', 'display_name': 'University of...",Do you have a Grandma/Lola/Auntie/Role Model y...,,,3,Students will learn different strategies of ho...,2019,History,Feminist Practice of Oral History Do you have ...,"[feminist, practice, oral, history, grandma, l...",Arts and Humanities,0,False,"[feminist, practice, oral, history]"
479996,13984413529295,Effective Media Analysis,Effective Media Analysis,,STCO 546,"{'id': '20123', 'display_name': 'Liberty Unive...",Using social media platforms to build a compet...,,,,,2019,MilitaryScience,Effective Media Analysis Using social media pl...,"[effective, medium, analysis, social, medium, ...",Social Sciences,1,False,"[effective, media, analysis]"
479997,575525638547,Directed Research,Directed Research,,PSYC-495,"{'id': '20123', 'display_name': 'Liberty Unive...",Majors in Psychology will engage in directed r...,,,,Identify a gap in the literature related to an...,2019,Psychology,Directed Research Majors in Psychology will en...,"[research, major, psychology, research, superv...",Social Sciences,0,False,"[direct, research]"
479998,4853313065239,Health Informatics,Health Informatics,,INFO-505,"{'id': '20123', 'display_name': 'Liberty Unive...",Focused on the foundations of healthcare infor...,,,,Integrate biblical principles within the conte...,2019,Medicine,Health Informatics Focused on the foundations ...,"[health, informatic, foundation, healthcare_in...",Medicine and Health Sciences,1,False,"[health, informatic]"


In [None]:
# Reload the pickled description-token frame from its absolute path and
# display it.
description = pd.read_pickle("/mnt/hdd/bjkim/KISDI/OpenSyllabus/opensyllabus_paper/OA_Tokenized_0118.pkl")
description

Unnamed: 0,id,display_name,title,title_highlight,code,institution,description,description_highlight,topic_outline,credits,learning_outcomes,year,class,text,tokens,category,graduate,original,description_tokens2
0,6262062326506,Multi-level Analysis of Survey Data,Multi-level Analysis of Survey Data,,SURVMETH 790,"{'id': '18429', 'display_name': 'University of...",Students are introduced to an increasingly com...,,,3,,2004,Mathematics,Multi-level Analysis of Survey Data Students a...,"[multi, level, analysis, survey, datum, studen...",Natural Sciences,1,True,"[student, introduce, common, statistical, tech..."
1,6966436956976,Class Guitar,Class Guitar,,MUS 130,"{'id': '19460', 'display_name': 'Oral Roberts ...",A lecture/demonstration teaching method design...,,,1,acquire the basic understanding of guitar. dev...,2004,Music,Class Guitar A lecture/demonstration teaching ...,"[class, guitar, lecture, demonstration, teachi...",Arts and Humanities,-1,False,"[lecture_demonstration, teaching, method, desi..."
2,11665131195894,Greek Historiography,Greek Historiography,,Greek 303,"{'id': '20152', 'display_name': 'University of...",This course is designed as a capstone experien...,,,,,2004,Classics,Greek Historiography This course is designed a...,"[greek, historiography, course, capstone, expe...",Arts and Humanities,0,False,"[course, design, capstone, experience, classic..."
3,15590731307182,Evaluation and Impact of Public Organizations,Evaluation and Impact of Public Organizations,,PA 513,"{'id': '18641', 'display_name': 'Park Universi...",An examination of the evaluation of the output...,,,3,Identify an issue or problem and the program o...,2004,Education,Evaluation and Impact of Public Organizations ...,"[evaluation, impact, public, organization, exa...",Social Sciences,1,True,"[examination, evaluation, output, public, orga..."
4,1735166804574,URBAN GEOGRAPHY AND PLANNING,URBAN GEOGRAPHY AND PLANNING,,GEOG 3334,"{'id': '18506', 'display_name': 'University of...",This course examines processes of urbanization...,,,,,2004,Geography,URBAN GEOGRAPHY AND PLANNING This course exami...,"[urban, geography, planning, course, process, ...",Natural Sciences,0,True,"[course, examine, process, urbanization, certa..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479995,6382321406943,Feminist Practice of Oral History,Feminist Practice of Oral History,,WOMENSTD 425,"{'id': '18429', 'display_name': 'University of...",Do you have a Grandma/Lola/Auntie/Role Model y...,,,3,Students will learn different strategies of ho...,2019,History,Feminist Practice of Oral History Do you have ...,"[feminist, practice, oral, history, grandma, l...",Arts and Humanities,0,False,"[grandma, lola, auntie, role, model, want, lea..."
479996,13984413529295,Effective Media Analysis,Effective Media Analysis,,STCO 546,"{'id': '20123', 'display_name': 'Liberty Unive...",Using social media platforms to build a compet...,,,,,2019,MilitaryScience,Effective Media Analysis Using social media pl...,"[effective, medium, analysis, social, medium, ...",Social Sciences,1,False,"[use, social, medium, platform, build, competi..."
479997,575525638547,Directed Research,Directed Research,,PSYC-495,"{'id': '20123', 'display_name': 'Liberty Unive...",Majors in Psychology will engage in directed r...,,,,Identify a gap in the literature related to an...,2019,Psychology,Directed Research Majors in Psychology will en...,"[research, major, psychology, research, superv...",Social Sciences,0,False,"[major, psychology, engage, direct, research, ..."
479998,4853313065239,Health Informatics,Health Informatics,,INFO-505,"{'id': '20123', 'display_name': 'Liberty Unive...",Focused on the foundations of healthcare infor...,,,,Integrate biblical principles within the conte...,2019,Medicine,Health Informatics Focused on the foundations ...,"[health, informatic, foundation, healthcare_in...",Medicine and Health Sciences,1,False,"[focus, foundation, healthcare, informatic, st..."


In [46]:
# Lemmatize the lower-cased descriptions, keeping nouns, proper nouns (PROPN)
# and adjectives.
lemmatized_description = lemmatization(
    df['description'].str.lower().tolist(),
    ['NOUN','PROPN','ADJ'],
)

import csv

# One CSV row of lemmas per description.
with open('lemmatized_description.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(lemmatized_description)

# Build bigram-merged documents at two min_count thresholds (100 and 1000).
bigrams_2 = make_bigram(lemmatized_description,100)
bigrams_3 = make_bigram(lemmatized_description,1000)

# Merged bigrams are the tokens containing "_"; tally the 50 most common for
# each threshold.
bigram_list_2 = [token for doc in bigrams_2 for token in doc if "_" in token]
Counter(bigram_list_2).most_common(50)
bigram_list_3 = [token for doc in bigrams_3 for token in doc if "_" in token]
Counter(bigram_list_3).most_common(50)

lemmatization 시작 



100%|██████████| 480000/480000 [5:30:20<00:00, 24.22it/s]   


[('united_states', 17069),
 ('decision_making', 13443),
 ('critical_thinking', 9797),
 ('health_care', 9307),
 ('problem_solving', 9278),
 ('real_world', 6427),
 ('criminal_justice', 6142),
 ('special_attention', 4965),
 ('twentieth_century', 4921),
 ('wide_range', 4911),
 ('financial_statement', 4831),
 ('small_group', 4502),
 ('particular_attention', 4024),
 ('civil_war', 3988),
 ('differential_equation', 3593),
 ('hypothesis_testing', 3500),
 ('race_ethnicity', 3309),
 ('african_american', 3305),
 ('wide_variety', 3299),
 ('mental_health', 3268),
 ('latin_america', 3221),
 ('short_story', 2944),
 ('faculty_member', 2910),
 ('point_view', 2878),
 ('everyday_life', 2838),
 ('nineteenth_century', 2817),
 ('second_half', 2787),
 ('long_term', 2751),
 ('first_half', 2688),
 ('oral_presentation', 2574),
 ('life_cycle', 2458),
 ('high_school', 2373),
 ('close_reading', 2352),
 ('middle_east', 2338),
 ('cold_war', 2300),
 ('linear_equation', 2293),
 ('mass_medium', 2263),
 ('new_testament',

In [None]:
# Lemmatize the lower-cased descriptions again, this time also keeping verbs
# (nouns, proper nouns, adjectives and verbs).
lemmatized_description2 = lemmatization(
    df['description'].str.lower().tolist(),
    ['NOUN','PROPN','ADJ','VERB'],
)

# Detect bigrams; 100 is passed to gensim as the min_count threshold.
bigrams_2 = make_bigram(lemmatized_description2,100)

# Merged bigrams are the tokens containing "_"; tally the 50 most common.
bigram_list_2 = [token for doc in bigrams_2 for token in doc if "_" in token]
Counter(bigram_list_2).most_common(50)

# Attach the token lists (aligned on index) and persist the frame.
df['description_tokens2'] = pd.Series(bigrams_2)

df.to_pickle('OA_Tokenized_0118.pkl')

lemmatization 시작 



100%|██████████| 480000/480000 [5:11:45<00:00, 25.66it/s]   


# Building the grouped (category × year) table

In [9]:
# Rebind df to the tokenized frame stored in OA_Tokenized_1226.pkl.
df = pd.read_pickle('OA_Tokenized_1226.pkl')

In [10]:
# Keep only the columns listed below and persist that slimmer frame.
df_essential = df.loc[:, [
    'title', 'description', 'learning_outcomes', 'year', 'class', 'category',
    'graduate', 'title_tokens', 'description_tokens',
]]
df_essential.to_pickle("OA_Tokenized_essential.pkl")

In [None]:
import pandas as pd
from itertools import chain

# Concatenate the token lists of all rows in each (category, year) group.
# chain.from_iterable flattens in linear time; the earlier sum(x, []) copied
# the growing list on every addition, which is quadratic in the group size.
grouped_titles = (
    df.groupby(['category', 'year'])['title_tokens']
    .agg(lambda x: list(chain.from_iterable(x)))
    .reset_index()
)
grouped_descriptions = (
    df.groupby(['category', 'year'])['description_tokens']
    .agg(lambda x: list(chain.from_iterable(x)))
    .reset_index()
)

# Merge the two grouped DataFrames on their shared group keys.
grouped = pd.merge(grouped_titles, grouped_descriptions, on=['category', 'year'])

# Combine title and description tokens into a single column
# (title tokens first, then description tokens).
grouped['combined_tokens'] = grouped.apply(lambda row: row['title_tokens'] + row['description_tokens'], axis=1)


In [11]:
# Load the grouped token table from year_title_description.pkl and display it.
grouped = pd.read_pickle("year_title_description.pkl")
grouped

Unnamed: 0,category,year,title_tokens,description_tokens,combined_tokens
0,Arts and Humanities,2004,"[class, guitar, greek, historiography, introdu...","[lecture_demonstration, teaching, method, stud...","[class, guitar, greek, historiography, introdu..."
1,Arts and Humanities,2005,"[stranger, creative, tension, german, cinema, ...","[jews, resilience, ambient, contempt, country,...","[stranger, creative, tension, german, cinema, ..."
2,Arts and Humanities,2006,"[theological, study, seminar, seminar, early, ...","[capstone, academic, experience, degree, maste...","[theological, study, seminar, seminar, early, ..."
3,Arts and Humanities,2007,"[environmental, history, north_america, music,...","[course, idea, nature, wilderness, environment...","[environmental, history, north_america, music,..."
4,Arts and Humanities,2008,"[elementary, spanish, theory, writing, late, i...","[introduction, hispanic, language, culture, ta...","[elementary, spanish, theory, writing, late, i..."
...,...,...,...,...,...
75,Social Sciences,2015,"[introduction, sociology, cognitive, behaviora...","[introduction, sociological_perspective, cours...","[introduction, sociology, cognitive, behaviora..."
76,Social Sciences,2016,"[infant, toddler, practicum, seminar, tech, re...","[hour_week, teaching, experience, infant_toddl...","[infant, toddler, practicum, seminar, tech, re..."
77,Social Sciences,2017,"[interperson, group, global, inequality, power...","[principle, appropriate, effective, communicat...","[interperson, group, global, inequality, power..."
78,Social Sciences,2018,"[basic, social, work, research, field, experie...","[course, content, logic, inquiry, necessity, e...","[basic, social, work, research, field, experie..."


In [29]:
# Collect every merged-bigram token (those containing "_") from all rows.
# Iterating the column directly avoids the range(len()) loop with label-based
# lookups, which only worked while the index was exactly 0..n-1.
bigram_combined = [
    token
    for tokens in grouped['combined_tokens']
    for token in tokens
    if "_" in token
]

In [33]:
# Frequency of each collected bigram.
dic = Counter(bigram_combined)

# Keep only the bigrams counted fewer than 100 times.
filtered_words = dict(filter(lambda item: item[1] < 100, dic.items()))

In [34]:
# Display the bigram -> count mapping built in the previous cell.
filtered_words

{'rights_movement': 16,
 'renaissance_baroque': 97,
 'japan_korea': 56,
 'war_reconstruction': 28,
 'television_show': 98,
 'citation_format': 14,
 'park_university': 60,
 'sculpture_architecture': 36,
 'report_proposal': 95,
 'america_caribbean': 43,
 'editing_proofreading': 96,
 'letter_resume': 72,
 'photoshop_illustrator': 79,
 'israeli_conflict': 20,
 'syntax_semantics': 81,
 'greeks_romans': 93,
 'nutritional_need': 96,
 'revising_editing': 44,
 'ensemble_repertoire': 58,
 'workshop_meeting': 28,
 'texts_the': 2,
 'confessions_to': 2,
 'marital_status': 37,
 'parent_friend': 13,
 'siâˆ_â': 2,
 'modular_programming': 58,
 'adolescence_adulthood': 92,
 'strength_endurance': 54,
 'response_reflection': 1,
 'manual_mental': 6,
 'eng_bem': 1,
 'sociolinguistic_sociocultural': 15,
 'augmented_sixth': 4,
 'typical_workplace': 1,
 'momentum_energy': 85,
 'integrity_ethical': 10,
 'argumentation_college': 2,
 'trade_agreement': 85,
 't_aught': 1,
 'union_movement': 3,
 's_c': 3,
 'natural

In [None]:
# Copy the description token column into the title frame; assigning a Series
# aligns on the row index.
title['description_tokens2'] = description['description_tokens2']

In [None]:
# Persist the combined frame.
# NOTE(review): same basename as the pickle `description` was loaded from; if
# the working directory is that folder, this overwrites it - confirm intended.
title.to_pickle("OA_Tokenized_0118.pkl")

In [None]:
# Drop the None placeholders that lemmatization() emits for missing texts.
# Identity comparison is the idiomatic None check, and `doc` replaces the
# easily misread name `l`.
lemmatized = [doc for doc in lemmatized if doc is not None]

In [None]:
# Wrap the lemma lists in a Series for a compact display.
pd.Series(lemmatized)

0       [student, appropriate, statistic, datum, contr...
1       [student, history, english, language, modern, ...
2                         [student, financial, statement]
3       [awareness, cultural, linguistic, difference, ...
4       [student, history, america, eye, american, min...
                              ...                        
1750    [film, television, story, unique, storytelling...
1751    [understanding, positive, psychological, appro...
1752    [firm, competitive, advantage, strategic, busi...
1753                                                   []
1754    [familiar, main, macroeconomic, model, economi...
Length: 1755, dtype: object

In [None]:
bigrams = make_bigram(lemmatized,100)  # only merge word pairs that occur at least N times

In [None]:
# Inspect the detected bigrams: tokens containing "_" are merged pairs.
bigram_list = [token for doc in bigrams for token in doc if "_" in token]
Counter(bigram_list).most_common(100)  # top N

[('compound_complex', 238),
 ('peer_editing', 172),
 ('professional_journal', 131),
 ('punctuation_spelling', 125),
 ('brainstorming_outlining', 118),
 ('drafting_self', 118),
 ('outlining_drafting', 7)]

In [None]:
# Write the bigram token lists into 'lo_tokens' for the rows that have a
# learning outcome. .values drops the Series index, so the assignment is
# positional: bigrams must hold one entry per non-missing row, in row order.
df.loc[~pd.isna(df['learning_outcomes']),'lo_tokens'] = pd.Series(bigrams).values

In [None]:
# Display the new learning-outcome token column.
df['lo_tokens']

2436884                                                  NaN
2436885                                                  NaN
2436886                                                  NaN
2436887                                                  NaN
2436888                                                  NaN
                                 ...                        
2444562                                                  NaN
2444563    [firm, competitive, advantage, strategic, busi...
2444564                                                   []
2444565    [familiar, main, macroeconomic, model, economi...
2444566                                                  NaN
Name: lo_tokens, Length: 7683, dtype: object

In [None]:
# Replace zero-length token lists with NaN.
df.loc[df['lo_tokens'].str.len()==0,'lo_tokens'] = np.nan

In [None]:
# Save only the needed columns to a separate pickle.
# df = pd.read_pickle('./OA_ALL_2000_2022_token.pkl')
df[['id','title','year','class','category','tokens','lo_tokens']].to_pickle('./OA_ALL_Tokens.pkl')

### Extracting courses that contain AI keywords

In [None]:
# Rebind df to the frame stored in OA_ALL_2000_2022_token.pkl.
df = pd.read_pickle('./OA_ALL_2000_2022_token.pkl')

In [None]:
# Keywords that mark a course as AI-related.
keywords = ["AI", "Artificial Intelligence", "Machine Learning", "Deep Learning", 'NLP', 'Natural Language Processing', 'Data Science','Algorithm']

# Whole-word alternation of the escaped keywords. The group is non-capturing:
# str.contains only tests for a match, and a capturing group makes pandas
# warn that the pattern "has match groups".
pattern = r"\b(?:{})\b".format("|".join(map(re.escape, keywords)))

# Keep rows whose title, description or learning outcomes contain a keyword
# (case-insensitive). na=False counts missing or non-string cells as
# "no match" so the boolean mask never contains NaN.
filtered_df = df[df["title"].str.contains(pattern, case=False, regex=True, na=False) |
                 df["description"].str.contains(pattern, case=False, regex=True, na=False) |
                 df["learning_outcomes"].str.contains(pattern, case=False, regex=True, na=False)]

# Renumber the rows 0..n-1.
filtered_df = filtered_df.reset_index(drop=True)

In [None]:
# Load the slim token frame saved earlier.
# NOTE(review): df_token is not referenced by the cells visible here.
df_token = pd.read_pickle('./OA_ALL_Tokens.pkl')

In [None]:
# Select the rows of df whose id appears among the keyword matches.
df_ai = df.loc[df['id'].isin(filtered_df['id'].tolist())]

In [None]:
# Keep the first row for each distinct description, then renumber the rows.
df_ai = df_ai.drop_duplicates(subset=['description']).reset_index(drop=True)

In [None]:
# Display the deduplicated AI-related courses.
df_ai

Unnamed: 0,id,display_name,title,title_highlight,code,institution,description,description_highlight,topic_outline,credits,learning_outcomes,year,class,text,tokens
0,9165460233360,Introduction to Computer Science I,Introduction to Computer Science I,,CS 10,"{'id': '17420', 'display_name': 'Westmont Coll...",This course provides an introduction to comput...,,,,,2000,ComputerScience,Introduction to Computer Science I This course...,"[introduction, computer, science, course, intr..."
1,7868380090132,Artificial Intelligence,Artificial Intelligence,,CS430,"{'id': '17621', 'display_name': 'Rollins Colle...",A selective survey of key concepts and applica...,,1 Introduction 2 Stimulus-Response Agents 3 Ne...,,[Learn some of the more important programming ...,2000,ComputerScience,Artificial Intelligence A selective survey of ...,"[artificial_intelligence, selective, survey, k..."
2,4827543253459,Artificial Intelligence in Decision Making,Artificial Intelligence in Decision Making,,CIS 305,"{'id': '19553', 'display_name': 'Clarion Unive...",This course surveys the thinking and some of t...,,,,[describe the major research areas in artifici...,2000,ComputerScience,Artificial Intelligence in Decision Making Thi...,"[artificial_intelligence, decision_making, cou..."
3,3169685886993,Artificial Intelligence,Artificial Intelligence,,CS 263,"{'id': '18307', 'display_name': 'Hampshire Col...",Artificial Intelligence (AI) is a branch of co...,,,,,2000,ComputerScience,Artificial Intelligence Artificial Intelligenc...,"[artificial_intelligence, artificial_intellige..."
4,12034498364754,Artificial Intelligence,Artificial Intelligence,,CPTR 440,"{'id': '18240', 'display_name': 'University of...","An introduction to artificial intelligence, it...",,,,[Understand the artificial intelligence approa...,2000,ComputerScience,Artificial Intelligence An introduction to art...,"[artificial_intelligence, introduction, artifi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10233,4294967310036,Algorithms,Algorithms,,CSED331,"{'id': '16392', 'display_name': 'Pohang Univer...",Algorithms are procedures or methods that solv...,,,,,2020,Korea,Algorithms Algorithms are procedures or method...,"[algorithm, algorithm, procedure, method, prob..."
10234,14224931703163,Machine Learning in Medicine,Machine Learning in Medicine,,DHC5036,"{'id': '696', 'display_name': 'Sungkyunkwan Un...",This class will introduce machine learning the...,,,,,2020,Korea,Machine Learning in Medicine This class will i...,"[machine, learning, medicine, class, machine, ..."
10235,4930622460217,Data Analytics and Visualization,Data Analytics and Visualization,,M3239,"{'id': '376', 'display_name': 'Seoul National ...",Businesses and organizations today collect and...,,,,[Explain pros and cons of various visual repre...,2021,Korea,Data Analytics and Visualization Businesses an...,"[datum, analytic, visualization, business, org..."
10236,8022998932659,Biomedical NLP,Biomedical NLP,,IPH5018,"{'id': '696', 'display_name': 'Sungkyunkwan Un...",This class will introduce basic concept of nat...,,,,,2021,Korea,Biomedical NLP This class will introduce basic...,"[biomedical, nlp, class, basic, concept, natur..."


##### Merging the undergraduate/graduate course indicator variable