### 크롤링한 데이터 전처리 & 통합
#### 추천해줄 문제 리스트 추출

In [1]:
import pandas as pd
import numpy as np

from collections import Counter

In [2]:
# Load the crawled CSV exports from raw_data/.
solved = pd.read_csv("raw_data/all_problem_in_solved.csv")  # id, name, level, avg_try
baekjoon = pd.read_csv("raw_data/all_problem_in_baekjoon.csv")  # id, name, information, submission statistics
baekjoon_with_algorithm = pd.read_csv("raw_data/all_problem_with_algorithm.csv")  # one row per (problem, algorithm tag)
algorithm = pd.read_csv("raw_data/algorithms.csv")  # algorithm table; its 'algo_name_kr' column is used below

detail_infos = pd.read_csv("raw_data/all_problem_detail_infos.csv")  # number, time/memory limits, submission statistics

In [3]:
# Problems listed on solved.ac (carries the level information)
solved.info()
solved.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18598 entries, 0 to 18597
Data columns (total 4 columns):
id         18598 non-null int64
name       18598 non-null object
level      18598 non-null int64
avg_try    18598 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 581.3+ KB


Unnamed: 0,id,name,level,avg_try
0,1191,게임,0,6.8
1,1192,장갑,0,3.5
2,1203,게임,0,20.25
3,1223,마법의 돌,0,8.0
4,1224,분수,0,6.33


In [4]:
# Problems listed on Baekjoon
baekjoon.info()
baekjoon.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18855 entries, 0 to 18854
Data columns (total 6 columns):
id                18855 non-null int64
name              18854 non-null object
information       18426 non-null object
correct_user      18855 non-null int64
submission_cnt    18855 non-null int64
correct_rate      18855 non-null float64
dtypes: float64(1), int64(3), object(2)
memory usage: 884.0+ KB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753
1,1001,A-B,"디버그,분류",97109,159168,72.049
2,1002,터렛,분류,16141,105843,20.562
3,1003,피보나치 함수,분류,23293,113982,30.087
4,1004,어린 왕자,분류,6503,19933,41.555


In [5]:
# Baekjoon problem / algorithm-tag pairs (algorithm information)
baekjoon_with_algorithm.info()
baekjoon_with_algorithm.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23212 entries, 0 to 23211
Data columns (total 4 columns):
id         23212 non-null int64
name       23212 non-null object
algo       23212 non-null object
algo_id    23212 non-null int64
dtypes: int64(2), object(2)
memory usage: 725.5+ KB


Unnamed: 0,id,name,algo,algo_id
0,1000,A+B,수학,124
1,1001,A-B,수학,124
2,10998,A×B,수학,124
3,10869,사칙연산,수학,124
4,1008,A/B,수학,124


In [6]:
# Per-problem detail information
detail_infos.info()
detail_infos.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9267 entries, 0 to 9266
Data columns (total 7 columns):
number              9267 non-null int64
time_limit          9267 non-null object
memory_limit        9267 non-null object
submission_cnt      9267 non-null int64
answer_cnt          9267 non-null int64
correct_user_cnt    9267 non-null int64
answer_rate         9267 non-null object
dtypes: int64(4), object(3)
memory usage: 506.9+ KB


Unnamed: 0,number,time_limit,memory_limit,submission_cnt,answer_cnt,correct_user_cnt,answer_rate
0,1000,2 초,128 MB,383131,163788,118678,43.743%
1,1001,2 초,128 MB,159820,112223,97502,72.056%
2,1002,2 초,128 MB,106189,20605,16191,20.558%
3,1003,0.25 초 (추가 시간 없음),128 MB,114308,29761,23381,30.099%
4,1004,2 초,128 MB,20001,7967,6516,41.572%


In [7]:
# Keep only the problem number and its limits, and rename 'number' to 'id' so
# the frame can later be merged with the other tables on 'id'.
detail_infos = (
    detail_infos[['number', 'time_limit', 'memory_limit']]
    .rename(columns={'number': 'id'})
)

In [8]:
# Check the trimmed and renamed detail table
detail_infos.info()
detail_infos.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9267 entries, 0 to 9266
Data columns (total 3 columns):
id              9267 non-null int64
time_limit      9267 non-null object
memory_limit    9267 non-null object
dtypes: int64(1), object(2)
memory usage: 217.3+ KB


Unnamed: 0,id,time_limit,memory_limit
0,1000,2 초,128 MB
1,1001,2 초,128 MB
2,1002,2 초,128 MB
3,1003,0.25 초 (추가 시간 없음),128 MB
4,1004,2 초,128 MB


In [9]:
# Distinct problem ids: every Baekjoon problem / problems with an algorithm
# tag / problems that have a level
all_problems, algorithm_problems, level_problems = (
    list(frame['id'].unique())
    for frame in (baekjoon, baekjoon_with_algorithm, solved)
)

tuple(len(ids) for ids in (all_problems, algorithm_problems, level_problems))

(18855, 9283, 18598)

In [10]:
# Problems that have an algorithm tag but no measured level.
# NOTE: despite its name, `level_but_not_algo` holds the ids that are in
# algorithm_problems and NOT in level_problems.
# NOTE: the list order comes from set iteration, so it is not a meaningful or
# guaranteed order; code that consumes this list should not depend on positions.
level_but_not_algo = list(set(algorithm_problems) - set(level_problems))
print(level_but_not_algo)

[4706, 18135, 18127, 18128, 17491, 18132, 18133, 17494, 18134, 15320, 18165, 10303]


In [11]:
# Hand-entered information for the problems that have no level set.
#
# The values are keyed by problem id instead of being kept in lists that are
# positionally paired with `level_but_not_algo`: that list is built from a set,
# so its order is an implementation detail and a positional pairing can
# silently attach a name/level to the wrong problem.
# Each entry is: id -> (name, level, avg_try).
manual_problem_info = {
    4706: ('쌍둥이 역설', 3, 1.23),
    18135: ('겨울나기', 18, 2.00),
    18127: ('모형 결정', 3, 1.35),
    18128: ('치삼이의 징검다리 건너기', 12, 3.71),
    17491: ('고양이 밥주기', 13, 3.00),
    18132: ('내 이진트리를 돌려줘!!!', 12, 1.94),
    18133: ('가톨릭대학교에 워터 슬라이드를??', 18, 1.22),
    17494: ('후임 간식 뺏어먹기', 9, 1.50),
    18134: ('치삼이의 대모험', 18, 1.00),
    15320: ('단신쓴짠', 19, 6.06),
    18165: ('Dao Robot', 16, 5.17),
    10303: ('Headshot', 6, 1.00),
}

new_problem_ids = level_but_not_algo
# A KeyError here means a problem without a level has no hand-entered data yet;
# failing loudly is preferable to pairing it with another problem's values.
new_problem_names = [manual_problem_info[problem_id][0] for problem_id in new_problem_ids]
new_levels = [manual_problem_info[problem_id][1] for problem_id in new_problem_ids]
new_avg_trys = [manual_problem_info[problem_id][2] for problem_id in new_problem_ids]

len(new_problem_ids), len(new_problem_names), len(new_levels), len(new_avg_trys)

(12, 12, 12, 12)

In [12]:
# Assemble the hand-entered values into a frame with the columns id, name,
# level and avg_try.
new_df = pd.DataFrame(dict(
    id=new_problem_ids,
    name=new_problem_names,
    level=new_levels,
    avg_try=new_avg_trys,
))
new_df.head()

Unnamed: 0,id,name,level,avg_try
0,4706,쌍둥이 역설,3,1.23
1,18135,겨울나기,18,2.0
2,18127,모형 결정,3,1.35
3,18128,치삼이의 징검다리 건너기,12,3.71
4,17491,고양이 밥주기,13,3.0


In [13]:
# Add the hand-entered level rows to the level table.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; ignore_index=True renumbers the rows.
solved = pd.concat([solved, new_df], ignore_index=True)

In [14]:
# Check that the hand-entered rows were appended at the end
solved.info()
solved.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18610 entries, 0 to 18609
Data columns (total 4 columns):
id         18610 non-null int64
name       18610 non-null object
level      18610 non-null int64
avg_try    18610 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 581.7+ KB


Unnamed: 0,id,name,level,avg_try
18605,17494,후임 간식 뺏어먹기,9,1.5
18606,18134,치삼이의 대모험,18,1.0
18607,15320,단신쓴짠,19,6.06
18608,18165,Dao Robot,16,5.17
18609,10303,Headshot,6,1.0


In [15]:
# Ids of problems that are answered by submitting plain text; these are
# excluded from the data. The ids are written as strings and converted to int
# so they compare equal to the integer 'id' columns.
text_problems = [
    int(problem_id)
    for problem_id in (
        '1237', '2555', '5620', '9073', '9987', '9995', '9999', '10212',
        '10831', '10943', '10944', '10945', '10946', '10947', '10948',
        '11506', '12091', '12092', '12093', '12096', '13062', '14406',
        '14443', '14918', '15547', '15631', '15635', '15636', '15637',
        '15639', '15641', '15643', '15802', '15891', '15913', '15915',
        '16075', '16076', '16647', '17107', '17108', '17110', '17111',
        '17112', '17113', '17116', '17467', '17468', '18169', '18351',
        '18822', '18823', '18824', '18825', '18826', '18828', '18829',
        '19617', '19999',
    )
]

In [16]:
# Exclude the problems that are submitted as plain text.
# .copy() makes each filtered frame an independent object: new columns are
# assigned into `baekjoon` further down, and assigning into the result of a
# boolean filter raises SettingWithCopyWarning in pandas without copy-on-write.
baekjoon = baekjoon[~baekjoon['id'].isin(text_problems)].copy()
solved = solved[~solved['id'].isin(text_problems)].copy()
baekjoon_with_algorithm = baekjoon_with_algorithm[~baekjoon_with_algorithm['id'].isin(text_problems)].copy()

In [17]:
# Level table after removing the text problems
solved.info()
solved.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18554 entries, 0 to 18609
Data columns (total 4 columns):
id         18554 non-null int64
name       18554 non-null object
level      18554 non-null int64
avg_try    18554 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 724.8+ KB


Unnamed: 0,id,name,level,avg_try
18605,17494,후임 간식 뺏어먹기,9,1.5
18606,18134,치삼이의 대모험,18,1.0
18607,15320,단신쓴짠,19,6.06
18608,18165,Dao Robot,16,5.17
18609,10303,Headshot,6,1.0


In [18]:
# Baekjoon table after removing the text problems
baekjoon.info()
baekjoon.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18796 entries, 0 to 18854
Data columns (total 6 columns):
id                18796 non-null int64
name              18796 non-null object
information       18367 non-null object
correct_user      18796 non-null int64
submission_cnt    18796 non-null int64
correct_rate      18796 non-null float64
dtypes: float64(1), int64(3), object(2)
memory usage: 1.0+ MB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753
1,1001,A-B,"디버그,분류",97109,159168,72.049
2,1002,터렛,분류,16141,105843,20.562
3,1003,피보나치 함수,분류,23293,113982,30.087
4,1004,어린 왕자,분류,6503,19933,41.555


In [19]:
# Algorithm-tag table after removing the text problems
baekjoon_with_algorithm.info()
baekjoon_with_algorithm.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23198 entries, 0 to 23211
Data columns (total 4 columns):
id         23198 non-null int64
name       23198 non-null object
algo       23198 non-null object
algo_id    23198 non-null int64
dtypes: int64(2), object(2)
memory usage: 906.2+ KB


Unnamed: 0,id,name,algo,algo_id
0,1000,A+B,수학,124
1,1001,A-B,수학,124
2,10998,A×B,수학,124
3,10869,사칙연산,수학,124
4,1008,A/B,수학,124


In [20]:
# Prepare the columns that will describe each problem's algorithm tags.
# Columns are appended in this order: 'algos' (text, empty for now), one
# 0-initialised indicator column per name in algorithm['algo_name_kr'],
# then 'cnt' (tag counter, 0 for now).
baekjoon['algos'] = ''
for column in algorithm['algo_name_kr'] :
    baekjoon[column] = 0
baekjoon['cnt'] = 0

In [21]:
# Check the newly added (still empty) algorithm columns
baekjoon.info()
baekjoon.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18796 entries, 0 to 18854
Columns: 180 entries, id to cnt
dtypes: float64(1), int64(176), object(3)
memory usage: 26.0+ MB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate,algos,수학,다이나믹 프로그래밍,구현,...,델로네 삼각분할,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근,cnt
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,A-B,"디버그,분류",97109,159168,72.049,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1002,터렛,분류,16141,105843,20.562,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1003,피보나치 함수,분류,23293,113982,30.087,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1004,어린 왕자,분류,6503,19933,41.555,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Distinct ids of the problems that have at least one algorithm tag,
# recomputed from the tag table as it stands now (this rebinds
# algorithm_problems to the array returned by unique()).
algorithm_problems = baekjoon_with_algorithm['id'].unique()
len(algorithm_problems)

9277

In [23]:
# Fill in, for every tagged problem, the comma-joined tag names ('algos'), the
# number of tags ('cnt') and a 1 in each matching indicator column.
#
# The (problem, tag) rows are grouped once up front instead of re-filtering the
# whole tag table for every problem, and the row mask on `baekjoon` is built
# once per problem instead of once per assignment.
# groupby keeps the original row order inside each group, so the tag order in
# 'algos' is the order of appearance in baekjoon_with_algorithm.
algos_by_problem = (
    baekjoon_with_algorithm.groupby('id', sort=False)['algo']
    .apply(list)
    .to_dict()
)

# [problem_id, [tag, ...]] pairs, in the order of algorithm_problems
all_algorithms = []

for problem_id in algorithm_problems :
    # An id without any tag row yields an empty list (algos '' and cnt 0).
    algorithms = algos_by_problem.get(problem_id, [])
    problem_rows = baekjoon['id'] == problem_id

    baekjoon.loc[problem_rows, 'algos'] = ",".join(algorithms)
    baekjoon.loc[problem_rows, 'cnt'] = len(algorithms)

    all_algorithms.append([problem_id, algorithms])

    for algo in algorithms :
        baekjoon.loc[problem_rows, algo] = 1

In [24]:
# Check the filled-in algorithm columns
baekjoon.info()
baekjoon.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18796 entries, 0 to 18854
Columns: 180 entries, id to cnt
dtypes: float64(1), int64(176), object(3)
memory usage: 26.0+ MB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate,algos,수학,다이나믹 프로그래밍,구현,...,델로네 삼각분할,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근,cnt
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,0,0,0,0,3
1,1001,A-B,"디버그,분류",97109,159168,72.049,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,0,0,0,0,3
2,1002,터렛,분류,16141,105843,20.562,"수학,기하학",1,0,0,...,0,0,0,0,0,0,0,0,0,2
3,1003,피보나치 함수,분류,23293,113982,30.087,다이나믹 프로그래밍,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,1004,어린 왕자,분류,6503,19933,41.555,기하학,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
# Count the algorithm tags of each problem by summing its 0/1 indicator columns.
# The indicator columns are selected by name (the names in the algorithm table)
# rather than with the positional slice iloc[:, 7:-1], which silently picks the
# wrong columns as soon as a column is added, removed or reordered.
algo_columns = list(algorithm['algo_name_kr'].unique())
baekjoon['algo_cnt'] = baekjoon[algo_columns].sum(axis=1)

baekjoon.info()
baekjoon.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18796 entries, 0 to 18854
Columns: 181 entries, id to algo_cnt
dtypes: float64(1), int64(177), object(3)
memory usage: 26.1+ MB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate,algos,수학,다이나믹 프로그래밍,구현,...,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근,cnt,algo_cnt
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,0,0,0,3,3
1,1001,A-B,"디버그,분류",97109,159168,72.049,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,0,0,0,3,3
2,1002,터렛,분류,16141,105843,20.562,"수학,기하학",1,0,0,...,0,0,0,0,0,0,0,0,2,2
3,1003,피보나치 함수,분류,23293,113982,30.087,다이나믹 프로그래밍,0,1,0,...,0,0,0,0,0,0,0,0,1,1
4,1004,어린 왕자,분류,6503,19933,41.555,기하학,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [26]:
# Sanity check: number of rows where the summed indicators disagree with 'cnt'
(baekjoon['algo_cnt'] != baekjoon['cnt']).sum()

0

In [27]:
# Leave out the problems that have no algorithm tag at all
recommend_problems = baekjoon[baekjoon['algo_cnt'].ne(0)]
recommend_problems.info()
recommend_problems.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9277 entries, 0 to 18854
Columns: 181 entries, id to algo_cnt
dtypes: float64(1), int64(177), object(3)
memory usage: 12.9+ MB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate,algos,수학,다이나믹 프로그래밍,구현,...,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근,cnt,algo_cnt
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,0,0,0,3,3
1,1001,A-B,"디버그,분류",97109,159168,72.049,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,0,0,0,3,3
2,1002,터렛,분류,16141,105843,20.562,"수학,기하학",1,0,0,...,0,0,0,0,0,0,0,0,2,2
3,1003,피보나치 함수,분류,23293,113982,30.087,다이나믹 프로그래밍,0,1,0,...,0,0,0,0,0,0,0,0,1,1
4,1004,어린 왕자,분류,6503,19933,41.555,기하학,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [28]:
# Tagged problem ids that did not end up in recommend_problems
unknown = list(set(algorithm_problems) - set(recommend_problems['id'].unique()))
unknown

[]

In [29]:
# Inspect the recommendation candidates before joining further information
recommend_problems.info()
recommend_problems.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9277 entries, 0 to 18854
Columns: 181 entries, id to algo_cnt
dtypes: float64(1), int64(177), object(3)
memory usage: 12.9+ MB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate,algos,수학,다이나믹 프로그래밍,구현,...,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근,cnt,algo_cnt
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,0,0,0,3,3
1,1001,A-B,"디버그,분류",97109,159168,72.049,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,0,0,0,3,3
2,1002,터렛,분류,16141,105843,20.562,"수학,기하학",1,0,0,...,0,0,0,0,0,0,0,0,2,2
3,1003,피보나치 함수,분류,23293,113982,30.087,다이나믹 프로그래밍,0,1,0,...,0,0,0,0,0,0,0,0,1,1
4,1004,어린 왕자,분류,6503,19933,41.555,기하학,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [30]:
# Attach the level information; the left join on 'id' keeps every candidate row.
# Both frames have a 'name' column, so the result carries name_x / name_y.
recommend_problems = recommend_problems.merge(solved, on='id', how='left')

recommend_problems.info()
recommend_problems.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9277 entries, 0 to 9276
Columns: 184 entries, id to avg_try
dtypes: float64(2), int64(178), object(4)
memory usage: 13.1+ MB


Unnamed: 0,id,name_x,information,correct_user,submission_cnt,correct_rate,algos,수학,다이나믹 프로그래밍,구현,...,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근,cnt,algo_cnt,name_y,level,avg_try
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,3,3,A+B STANDARD,1,2.29
1,1001,A-B,"디버그,분류",97109,159168,72.049,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,3,3,A-B STANDARD,1,1.39
2,1002,터렛,분류,16141,105843,20.562,"수학,기하학",1,0,0,...,0,0,0,0,0,2,2,터렛,7,4.86
3,1003,피보나치 함수,분류,23293,113982,30.087,다이나믹 프로그래밍,0,1,0,...,0,0,0,0,0,1,1,피보나치 함수,8,3.32
4,1004,어린 왕자,분류,6503,19933,41.555,기하학,0,0,0,...,0,0,0,0,0,1,1,어린 왕자,8,2.41


In [31]:
# Attach the time / memory limits with a left join on 'id'
recommend_problems = recommend_problems.merge(detail_infos, on='id', how='left')

recommend_problems.info()
recommend_problems.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9277 entries, 0 to 9276
Columns: 186 entries, id to memory_limit
dtypes: float64(2), int64(178), object(6)
memory usage: 13.2+ MB


Unnamed: 0,id,name_x,information,correct_user,submission_cnt,correct_rate,algos,수학,다이나믹 프로그래밍,구현,...,레드-블랙 트리,탑 트리,이산 k제곱근,cnt,algo_cnt,name_y,level,avg_try,time_limit,memory_limit
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,"수학,구현,사칙연산",1,0,1,...,0,0,0,3,3,A+B STANDARD,1,2.29,2 초,128 MB
1,1001,A-B,"디버그,분류",97109,159168,72.049,"수학,구현,사칙연산",1,0,1,...,0,0,0,3,3,A-B STANDARD,1,1.39,2 초,128 MB
2,1002,터렛,분류,16141,105843,20.562,"수학,기하학",1,0,0,...,0,0,0,2,2,터렛,7,4.86,2 초,128 MB
3,1003,피보나치 함수,분류,23293,113982,30.087,다이나믹 프로그래밍,0,1,0,...,0,0,0,1,1,피보나치 함수,8,3.32,0.25 초 (추가 시간 없음),128 MB
4,1004,어린 왕자,분류,6503,19933,41.555,기하학,0,0,0,...,0,0,0,1,1,어린 왕자,8,2.41,2 초,128 MB


In [32]:
# Remove the helper count and the duplicate name column left by the merge,
# then restore the plain 'name' column label.
recommend_problems = (
    recommend_problems
    .drop(columns=['algo_cnt', 'name_y'])
    .rename(columns={'name_x': 'name'})
)

recommend_problems.info()
recommend_problems.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9277 entries, 0 to 9276
Columns: 184 entries, id to memory_limit
dtypes: float64(2), int64(177), object(5)
memory usage: 13.1+ MB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate,algos,수학,다이나믹 프로그래밍,구현,...,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근,cnt,level,avg_try,time_limit,memory_limit
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,3,1,2.29,2 초,128 MB
1,1001,A-B,"디버그,분류",97109,159168,72.049,"수학,구현,사칙연산",1,0,1,...,0,0,0,0,0,3,1,1.39,2 초,128 MB
2,1002,터렛,분류,16141,105843,20.562,"수학,기하학",1,0,0,...,0,0,0,0,0,2,7,4.86,2 초,128 MB
3,1003,피보나치 함수,분류,23293,113982,30.087,다이나믹 프로그래밍,0,1,0,...,0,0,0,0,0,1,8,3.32,0.25 초 (추가 시간 없음),128 MB
4,1004,어린 왕자,분류,6503,19933,41.555,기하학,0,0,0,...,0,0,0,0,0,1,8,2.41,2 초,128 MB


In [33]:
# Reorder the columns: the first six columns, then the last five columns,
# then everything that was in between.
columns = list(recommend_problems.columns)
new_columns = [*columns[:6], *columns[-5:], *columns[6:-5]]

len(new_columns) == len(columns)

True

In [34]:
# Apply the new column order
recommend_problem_df = recommend_problems[new_columns]

recommend_problem_df.info()
recommend_problem_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9277 entries, 0 to 9276
Columns: 184 entries, id to 이산 k제곱근
dtypes: float64(2), int64(177), object(5)
memory usage: 13.1+ MB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate,cnt,level,avg_try,time_limit,...,통계학,델로네 삼각분할,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,3,1,2.29,2 초,...,0,0,0,0,0,0,0,0,0,0
1,1001,A-B,"디버그,분류",97109,159168,72.049,3,1,1.39,2 초,...,0,0,0,0,0,0,0,0,0,0
2,1002,터렛,분류,16141,105843,20.562,2,7,4.86,2 초,...,0,0,0,0,0,0,0,0,0,0
3,1003,피보나치 함수,분류,23293,113982,30.087,1,8,3.32,0.25 초 (추가 시간 없음),...,0,0,0,0,0,0,0,0,0,0
4,1004,어린 왕자,분류,6503,19933,41.555,1,8,2.41,2 초,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# NOTE: rebinds recommend_problems from a DataFrame to the array of distinct
# candidate problem ids
recommend_problems = recommend_problem_df['id'].unique()
recommend_problems, len(recommend_problems)

(array([ 1000,  1001,  1002, ..., 21271, 21272, 21291], dtype=int64), 9277)

In [36]:
# Exclude problems that have both an algorithm tag and a level but that no
# user has solved

# Load the per-user problem records used for that check
ranker_problem = pd.read_csv("raw_data/ranker_problems.csv")

In [37]:
# Inspect the per-user problem records
ranker_problem.info()
ranker_problem.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6322246 entries, 0 to 6322245
Data columns (total 5 columns):
id            int64
user_id       int64
user          object
problem_id    int64
TF            object
dtypes: int64(3), object(2)
memory usage: 241.2+ MB


Unnamed: 0,id,user_id,user,problem_id,TF
0,0,0,koosaga,1000,o
1,1,0,koosaga,1001,o
2,2,0,koosaga,1002,o
3,3,0,koosaga,1003,o
4,4,0,koosaga,1004,o


In [38]:
# Make sure problem_id is int64 so it compares with the integer problem ids
# (the info() output above already reports int64 for this data)
ranker_problem['problem_id'] = ranker_problem['problem_id'].astype('int64')

In [39]:
# Confirm the dtype of problem_id
ranker_problem.info()
ranker_problem.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6322246 entries, 0 to 6322245
Data columns (total 5 columns):
id            int64
user_id       int64
user          object
problem_id    int64
TF            object
dtypes: int64(3), object(2)
memory usage: 241.2+ MB


Unnamed: 0,id,user_id,user,problem_id,TF
0,0,0,koosaga,1000,o
1,1,0,koosaga,1001,o
2,2,0,koosaga,1002,o
3,3,0,koosaga,1003,o
4,4,0,koosaga,1004,o


In [40]:
# Distinct candidate problem ids that occur at least once in the per-user records.
# NOTE(review): the 'TF' column is not consulted here, so every record counts
# regardless of its TF value - confirm that this is the intended meaning.
ranker_problems = (
    ranker_problem
    .loc[ranker_problem['problem_id'].isin(recommend_problems), 'problem_id']
    .unique()
)
ranker_problems, len(ranker_problems)

(array([ 1000,  1001,  1002, ..., 21188, 21233, 19019], dtype=int64), 9267)

In [41]:
# Candidate problems (algorithm tag and level both set) that never occur in
# the per-user records

set(recommend_problems).difference(ranker_problems)

{17631, 21109, 21230, 21244, 21266, 21268, 21270, 21271, 21272, 21291}

In [42]:
# Keep only the candidates that occur in the per-user records
recommend_problem_df = recommend_problem_df.loc[recommend_problem_df['id'].isin(ranker_problems)]

In [43]:
# Candidates that remain after the filter
recommend_problem_df.info()
recommend_problem_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9267 entries, 0 to 9272
Columns: 184 entries, id to 이산 k제곱근
dtypes: float64(2), int64(177), object(5)
memory usage: 13.1+ MB


Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate,cnt,level,avg_try,time_limit,...,통계학,델로네 삼각분할,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,3,1,2.29,2 초,...,0,0,0,0,0,0,0,0,0,0
1,1001,A-B,"디버그,분류",97109,159168,72.049,3,1,1.39,2 초,...,0,0,0,0,0,0,0,0,0,0
2,1002,터렛,분류,16141,105843,20.562,2,7,4.86,2 초,...,0,0,0,0,0,0,0,0,0,0
3,1003,피보나치 함수,분류,23293,113982,30.087,1,8,3.32,0.25 초 (추가 시간 없음),...,0,0,0,0,0,0,0,0,0,0
4,1004,어린 왕자,분류,6503,19933,41.555,1,8,2.41,2 초,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Show the rows that contain at least one null value
recommend_problem_df[recommend_problem_df.isnull().any(axis=1)]

Unnamed: 0,id,name,information,correct_user,submission_cnt,correct_rate,cnt,level,avg_try,time_limit,...,통계학,델로네 삼각분할,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근


In [45]:
# Rebind recommend_problems to a two-column (id, name) frame of the final
# candidates
recommend_problems = recommend_problem_df[['id', 'name']]

recommend_problems.info()
recommend_problems.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9267 entries, 0 to 9272
Data columns (total 2 columns):
id      9267 non-null int64
name    9267 non-null object
dtypes: int64(1), object(1)
memory usage: 217.2+ KB


Unnamed: 0,id,name
0,1000,A+B
1,1001,A-B
2,1002,터렛
3,1003,피보나치 함수
4,1004,어린 왕자


In [46]:
# Expose the problem id under the column name 'number' in both output frames
recommend_problems = recommend_problems.rename(columns={'id': 'number'})
recommend_problem_df = recommend_problem_df.rename(columns={'id': 'number'})

In [47]:
# Check the renamed key column in the full frame
recommend_problem_df.info()
recommend_problem_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9267 entries, 0 to 9272
Columns: 184 entries, number to 이산 k제곱근
dtypes: float64(2), int64(177), object(5)
memory usage: 13.1+ MB


Unnamed: 0,number,name,information,correct_user,submission_cnt,correct_rate,cnt,level,avg_try,time_limit,...,통계학,델로네 삼각분할,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,3,1,2.29,2 초,...,0,0,0,0,0,0,0,0,0,0
1,1001,A-B,"디버그,분류",97109,159168,72.049,3,1,1.39,2 초,...,0,0,0,0,0,0,0,0,0,0
2,1002,터렛,분류,16141,105843,20.562,2,7,4.86,2 초,...,0,0,0,0,0,0,0,0,0,0
3,1003,피보나치 함수,분류,23293,113982,30.087,1,8,3.32,0.25 초 (추가 시간 없음),...,0,0,0,0,0,0,0,0,0,0
4,1004,어린 왕자,분류,6503,19933,41.555,1,8,2.41,2 초,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Check the renamed key column in the (number, name) frame
recommend_problems.info()
recommend_problems.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9267 entries, 0 to 9272
Data columns (total 2 columns):
number    9267 non-null int64
name      9267 non-null object
dtypes: int64(1), object(1)
memory usage: 217.2+ KB


Unnamed: 0,number,name
0,1000,A+B
1,1001,A-B
2,1002,터렛
3,1003,피보나치 함수
4,1004,어린 왕자


In [49]:
# The tag counter is not part of the exported data
recommend_problem_df = recommend_problem_df.drop(columns=['cnt'])

In [50]:
# Final (number, name) frame
recommend_problems.info()
recommend_problems.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9267 entries, 0 to 9272
Data columns (total 2 columns):
number    9267 non-null int64
name      9267 non-null object
dtypes: int64(1), object(1)
memory usage: 217.2+ KB


Unnamed: 0,number,name
0,1000,A+B
1,1001,A-B
2,1002,터렛
3,1003,피보나치 함수
4,1004,어린 왕자


In [51]:
# Final full frame, now without the 'cnt' column
recommend_problem_df.info()
recommend_problem_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9267 entries, 0 to 9272
Columns: 183 entries, number to 이산 k제곱근
dtypes: float64(2), int64(176), object(5)
memory usage: 13.0+ MB


Unnamed: 0,number,name,information,correct_user,submission_cnt,correct_rate,level,avg_try,time_limit,memory_limit,...,통계학,델로네 삼각분할,4차원 이상의 기하학,로프,춤추는 링크,크누스 X,접미사 트리,레드-블랙 트리,탑 트리,이산 k제곱근
0,1000,A+B,"다국어,디버그,분류",118169,381174,43.753,1,2.29,2 초,128 MB,...,0,0,0,0,0,0,0,0,0,0
1,1001,A-B,"디버그,분류",97109,159168,72.049,1,1.39,2 초,128 MB,...,0,0,0,0,0,0,0,0,0,0
2,1002,터렛,분류,16141,105843,20.562,7,4.86,2 초,128 MB,...,0,0,0,0,0,0,0,0,0,0
3,1003,피보나치 함수,분류,23293,113982,30.087,8,3.32,0.25 초 (추가 시간 없음),128 MB,...,0,0,0,0,0,0,0,0,0,0
4,1004,어린 왕자,분류,6503,19933,41.555,8,2.41,2 초,128 MB,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Subset of columns written to the *_for_db export files
recommend_problem_for_DB = recommend_problem_df[[
    'number',
    'name',
    'correct_user',
    'submission_cnt',
    'correct_rate',
    'level',
    'avg_try',
]]

recommend_problem_for_DB.info()
recommend_problem_for_DB.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9267 entries, 0 to 9272
Data columns (total 7 columns):
number            9267 non-null int64
name              9267 non-null object
correct_user      9267 non-null int64
submission_cnt    9267 non-null int64
correct_rate      9267 non-null float64
level             9267 non-null int64
avg_try           9267 non-null float64
dtypes: float64(2), int64(4), object(1)
memory usage: 579.2+ KB


Unnamed: 0,number,name,correct_user,submission_cnt,correct_rate,level,avg_try
0,1000,A+B,118169,381174,43.753,1,2.29
1,1001,A-B,97109,159168,72.049,1,1.39
2,1002,터렛,16141,105843,20.562,7,4.86
3,1003,피보나치 함수,23293,113982,30.087,8,3.32
4,1004,어린 왕자,6503,19933,41.555,8,2.41


In [53]:
# Write the DB subset as CSV (UTF-8 with BOM, no index column) and as a pickle
recommend_problem_for_DB.to_csv(
    "final_data/recommend_problem_infos_for_db.csv",
    index=False,
    encoding='utf-8-sig',
)
recommend_problem_for_DB.to_pickle("final_data/recommend_problem_infos_for_db.pkl")

In [54]:
# Write the full problem frame as CSV (UTF-8 with BOM, no index column) and as a pickle
recommend_problem_df.to_csv(
    "final_data/recommend_problem_infos.csv",
    index=False,
    encoding='utf-8-sig',
)
recommend_problem_df.to_pickle("final_data/recommend_problem_infos.pkl")

In [55]:
# Write the (number, name) frame as CSV (UTF-8 with BOM, no index column) and as a pickle
recommend_problems.to_csv(
    "final_data/recommend_problems.csv",
    index=False,
    encoding='utf-8-sig',
)
recommend_problems.to_pickle("final_data/recommend_problems.pkl")