In [1]:
import pandas as pd

# Read the word table and discard the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV — TODO confirm).
df = pd.read_csv('./data/final_data.csv').drop('Unnamed: 0', axis=1)
df

Unnamed: 0,학년,학기,낱말,빈도수,품사
0,1,1,봅시다,595,Verb
1,1,1,말,421,Noun
2,1,1,해,414,Noun
3,1,1,친구,164,Noun
4,1,1,붙임,154,Verb
...,...,...,...,...,...
46829,6,2,나누지,1,Verb
46830,6,2,친해졌다,1,Verb
46831,6,2,좋겠다고,1,Adjective
46832,6,2,가깝게,1,Adjective


In [2]:
# Percentage of rows in `df` that carry each part-of-speech ('품사') tag.
pos_count = df['품사'].value_counts()
total_words = len(df)
pos_ratio = pos_count.div(total_words).mul(100)

pos_ratio

품사
Noun           46.009309
Verb           38.482726
Adjective      13.199812
Adverb          1.908870
Exclamation     0.228466
Eomi            0.053380
Josa            0.044839
Modifier        0.032028
Suffix          0.012811
PreEomi         0.012811
VerbPrefix      0.010676
Conjunction     0.004270
Name: count, dtype: float64

In [3]:
# Row counts per part-of-speech tag for every grade (grades as rows, tags as columns).
pos_ratio_by_grade = df.groupby('학년')['품사'].value_counts().unstack(fill_value=0)
# Number of rows per grade, ordered by grade.
total_words_by_grade = df['학년'].value_counts().sort_index()
# Divide each grade's row by that grade's total and express the result in percent.
pos_ratio_by_grade = pos_ratio_by_grade.div(total_words_by_grade, axis=0) * 100

pos_ratio_by_grade

품사,Adjective,Adverb,Conjunction,Eomi,Exclamation,Josa,Modifier,Noun,PreEomi,Suffix,Verb,VerbPrefix
학년,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,10.975963,2.577469,0.0,0.086881,0.463365,0.02896,0.02896,50.333044,0.0,0.0,35.476397,0.02896
2,14.595821,2.267941,0.0162,0.080998,0.259193,0.048599,0.032399,42.410497,0.0,0.0162,40.272153,0.0
3,13.650043,2.009888,0.0,0.042992,0.204213,0.032244,0.032244,44.357266,0.032244,0.010748,39.628117,0.0
4,12.65132,1.885295,0.0,0.03969,0.287756,0.03969,0.049613,44.840246,0.009923,0.009923,40.1667,0.019845
5,13.317129,1.688627,0.0,0.038378,0.191889,0.063963,0.025585,48.509658,0.0,0.025585,36.113599,0.025585
6,13.148167,1.558597,0.009991,0.059946,0.119892,0.049955,0.019982,47.497252,0.019982,0.009991,37.506244,0.0


In [4]:
# Remove pandas' column/row display limits (a global setting that also
# affects every frame displayed after this cell).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Sum of '빈도수' for each part-of-speech tag within every (grade, semester) pair.
grouped_df = df.groupby(['학년', '학기', '품사'], as_index=False)['빈도수'].sum()

# Sum of '빈도수' over all rows of every (grade, semester) pair.
total_frequency = (
    df.groupby(['학년', '학기'], as_index=False)['빈도수']
    .sum()
    .rename(columns={'빈도수': '전체 빈도수'})
)

# Attach the (grade, semester) total to each part-of-speech row.
merged_df = grouped_df.merge(total_frequency, on=['학년', '학기'])

# Share of each part-of-speech tag within its (grade, semester), in percent.
merged_df['품사 비율'] = merged_df['빈도수'].div(merged_df['전체 빈도수']).mul(100)

# Show the result.
merged_df

Unnamed: 0,학년,학기,품사,빈도수,전체 빈도수,품사 비율
0,1,1,Adjective,958,16793,5.704758
1,1,1,Adverb,301,16793,1.792414
2,1,1,Eomi,2,16793,0.01191
3,1,1,Exclamation,107,16793,0.63717
4,1,1,Modifier,2,16793,0.01191
5,1,1,Noun,11147,16793,66.378848
6,1,1,Verb,4276,16793,25.462991
7,1,2,Adjective,570,4852,11.747733
8,1,2,Adverb,98,4852,2.019786
9,1,2,Eomi,3,4852,0.06183


---

In [5]:
# Grade and semester to inspect, given as strings.
grade, sem = "1", "2"

# Reload the word table from disk, without the 'Unnamed: 0' column.
df = pd.read_csv('./data/final_data.csv').drop('Unnamed: 0', axis=1)

# Cast the selectors to int before comparing them with the '학년'/'학기' columns.
grade, sem = int(grade), int(sem)
textbook_words_df = df.loc[df['학년'].eq(grade) & df['학기'].eq(sem)]
textbook_words_df.head()

Unnamed: 0,학년,학기,낱말,빈도수,품사
70,1,2,인물,52,Noun
71,1,2,예시,52,Noun
72,1,2,답안,52,Noun
78,1,2,국활,49,Noun
83,1,2,오전,48,Noun


In [10]:
# Grade and semester used for the quiz, kept as strings.
grade, sem = "1", "2"

def filter_by_grade_sem(grade, sem, n_words=5, top_fraction=0.1, random_state=None, data=None):
    """Pick random high-frequency words for one grade and semester.

    Rows matching ``grade`` and ``sem`` are ordered by '빈도수' (descending),
    the leading ``top_fraction`` of them is kept, and up to ``n_words`` rows
    are sampled from that slice.

    Args:
        grade: Grade to select; anything ``int()`` accepts (e.g. ``"1"`` or ``1``).
        sem: Semester to select; anything ``int()`` accepts.
        n_words: Maximum number of words to return.
        top_fraction: Fraction of the frequency-sorted rows to sample from.
        random_state: Passed to ``DataFrame.sample``; set it for reproducible
            picks. ``None`` keeps the selection random on every call.
        data: Optional DataFrame with '학년', '학기', '낱말', '빈도수' and '품사'
            columns. When omitted, './data/final_data.csv' is read.

    Returns:
        A list of ``{'낱말': ..., '품사': ...}`` dicts. It holds fewer than
        ``n_words`` entries (possibly none) when the top slice is smaller
        than ``n_words``, instead of raising ``ValueError``.
    """
    if data is None:
        data = pd.read_csv('./data/final_data.csv').drop(columns='Unnamed: 0', errors='ignore')

    grade, sem = int(grade), int(sem)
    textbook_words_df = data[(data['학년'] == grade) & (data['학기'] == sem)]

    # Most frequent words first.
    sorted_df = textbook_words_df.sort_values(by='빈도수', ascending=False)

    # Keep only the leading `top_fraction` of the sorted rows.
    top_count = int(top_fraction * len(sorted_df))
    top_df = sorted_df.iloc[:top_count]
    if top_df.empty:
        return []

    # Never request more rows than the slice holds; `sample` would raise otherwise.
    sample_size = min(n_words, len(top_df))
    selected_words_df = top_df.sample(sample_size, random_state=random_state)

    return selected_words_df[['낱말', '품사']].to_dict('records')

# Draw the quiz words for the chosen grade and semester; the selection is
# sampled randomly, so the displayed list can differ between runs.
textbook_words = filter_by_grade_sem(grade=grade, sem=sem)
textbook_words

[{'낱말': '되었어요', '품사': 'Verb'},
 {'낱말': '바라보며', '품사': 'Verb'},
 {'낱말': '외쳤어요', '품사': 'Verb'},
 {'낱말': '괴물', '품사': 'Noun'},
 {'낱말': '운전자', '품사': 'Noun'}]

In [7]:
def textbook_words_list(grade,sem,textbook_words,prompt_template):
    """Build the quiz-generation prompt for one grade and semester.

    Args:
        grade: Grade number interpolated into the prompt text.
        sem: Semester number interpolated into the prompt text.
        textbook_words: Collection of words to build the quiz from; its
            ``str()`` form is embedded in the prompt as-is.
        prompt_template: Extra instructions appended after the header lines.

    Returns:
        The assembled prompt string.
    """
    # Named `prompt` rather than `input` so the builtin is not shadowed.
    prompt = f'''
    I would like to create a context quiz for the {sem} semester of the {grade} grade.
    For the corresponding collection of words {textbook_words}, create a quiz by considering the part of speech and frequency.
    {prompt_template}
    '''
    return prompt

In [8]:
# Few-shot instructions appended to the quiz prompt.
# NOTE: this is an f-string, so `grade` and `sem` must already be defined and
# their values are captured when this assignment runs; changing them later
# does not update the template.
prompt_template_4 = f'''
    If the answer to be used is @'낱말': '관용', '빈도수': 67, '품사': 'Noun'@
    낱말을 활용하여 아래와 조건을 따라 문제를 만든다.
    The other option is to create a new one confusingly with the correct answer.
    The options of all problems should not overlap those of other problems.
    Description should be within the category that elementary school {grade} grade {sem} semester students can understand as much as possible.
    In the description, as in the example, the reason for the answer and what the answer means should be explained.
    Answer is only one.
    ------------------------------------------------------------------------------------------------------------------------
    This is Sample for Context Comprehension Question:

    Question: 아래 문자에서 @ @사이에 들어올 표현을 고르세요.
    content: "눈에는 눈, 이에는 이"라는 말은 우리나라에서 흔히 쓰이지만, 이것은 한국 문화의 @ @입니다.
    
    options: [관용, 신조, 전통, 현상]
    
    Value: 1
    description: 정답은 1입니다. "관용"이 정답인 이유는 문장에서 "우리나라에서 흔히 쓰이지만"이라는 표현이 있어, 일반적인 논리나 해석과는 다르지만 특정 문화나 상황에서는 받아들여지는 표현이나 개념을 설명하고 있습니다.
    ------------------------------------------------------------------------------------------------------------------------
    '''

# Printed (rather than displayed) so the prompt's line breaks are rendered.
print(textbook_words_list(grade, sem, textbook_words, prompt_template_4))


    I would like to create a context quiz for the 2 semester of the 1 grade.
    For the corresponding collection of words [{'낱말': '주전자', '품사': 'Noun'}, {'낱말': '대표', '품사': 'Noun'}, {'낱말': '어울린다', '품사': 'Verb'}, {'낱말': '많았어요', '품사': 'Adjective'}, {'낱말': '농장', '품사': 'Noun'}], create a quiz by considering the part-time and frequency in.
    
    If the answer to be used is @'낱말': '관용', '빈도수': 67, '품사': 'Noun'@
    낱말을 활용하여 아래와 조건을 따라 문제를 만든다.
    The other option is to create a new one confusingly with the correct answer.
    The options of all problems should not overlap those of other problems.
    Description should be within the category that elementary school 1 grade 2semseter students can understand as much as possible.
    In the description, as in the example, the reason for the answer and what the answer means should be explained.
    Answer is only one.
    ------------------------------------------------------------------------------------------------------------------------
 

In [9]:
# Recomputes the per-grade part-of-speech percentages from the current `df`
# (same steps as the earlier per-grade cell).
pos_ratio_by_grade = df.groupby('학년')['품사'].value_counts().unstack(fill_value=0)
# Number of rows per grade, ordered by grade.
total_words_by_grade = df['학년'].value_counts().sort_index()
# Divide each grade's row by that grade's total and express the result in percent.
pos_ratio_by_grade = pos_ratio_by_grade.div(total_words_by_grade, axis=0) * 100

pos_ratio_by_grade

품사,Adjective,Adverb,Conjunction,Eomi,Exclamation,Josa,Modifier,Noun,PreEomi,Suffix,Verb,VerbPrefix
학년,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,10.975963,2.577469,0.0,0.086881,0.463365,0.02896,0.02896,50.333044,0.0,0.0,35.476397,0.02896
2,14.595821,2.267941,0.0162,0.080998,0.259193,0.048599,0.032399,42.410497,0.0,0.0162,40.272153,0.0
3,13.650043,2.009888,0.0,0.042992,0.204213,0.032244,0.032244,44.357266,0.032244,0.010748,39.628117,0.0
4,12.65132,1.885295,0.0,0.03969,0.287756,0.03969,0.049613,44.840246,0.009923,0.009923,40.1667,0.019845
5,13.317129,1.688627,0.0,0.038378,0.191889,0.063963,0.025585,48.509658,0.0,0.025585,36.113599,0.025585
6,13.148167,1.558597,0.009991,0.059946,0.119892,0.049955,0.019982,47.497252,0.019982,0.009991,37.506244,0.0
