forked from krespers/python-TJ-karaoke-songlist-maker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
TJ노래방 번호순 추출기 고정버전.py
87 lines (69 loc) · 3.28 KB
/
TJ노래방 번호순 추출기 고정버전.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import requests
from multiprocessing import Pool
from bs4 import BeautifulSoup
import pandas as pd
from tenacity import retry, stop_after_attempt, wait_fixed
import time
# Retry decorator with exponential backoff strategy
@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def get_url(url):
return requests.get(url)
# Function to connect to the web page and scrape data
def tjmediasongnumberfindingaddressmaker(x):
print(f"Searching for song with query parameter: {x}")
# Send a GET request to the TJ Media website with the given query parameter
req = get_url('https://www.tjmedia.com/tjsong/song_search_list.asp?strType=16&natType=&strText='+str(x)+'&strCond=1&strSize05=100')
# Decode the content of the response to UTF-8 encoding
html = req.content.decode('utf-8', 'replace')
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(html, 'lxml')
# Extract song information from the parsed HTML
songnumber = soup.select('#BoardType1 > table > tbody > tr:nth-child(2) > td:nth-child(1) > span')
# If no songs found, return None
if not songnumber:
print(f"No songs found for query parameter: {x}")
return None
songname = soup.select('#BoardType1 > table > tbody > tr:nth-child(2) > td.left')
singername = soup.select('#BoardType1 > table > tbody > tr:nth-child(2) > td:nth-child(3)')
lyricist = soup.select('#BoardType1 > table > tbody > tr:nth-child(2) > td:nth-child(4)')
composer = soup.select('#BoardType1 > table > tbody > tr:nth-child(2) > td:nth-child(5)')
# Extract text from HTML tags and store them in lists
list1 = []
list2 = []
list3 = []
for data1 in songnumber:
list1.append(data1.text)
for data2 in songname:
list2.append(data2.text)
for data3 in singername:
list3.append(data3.text)
# Combine the lists into a single list
sets = list1 + list2 + list3
print(f"Found {len(sets)//3} songs matching the query parameter: {x}")
return sets
# Function to periodically write data to an Excel file
def write_to_excel(data, file_name):
df = pd.DataFrame.from_records(data, columns=['Song Number', 'Song Name', 'Singer Name'])
df.to_excel(file_name, index=False)
if __name__ == "__main__":
# Create a Pool of processes for parallel processing
pool = Pool()
# Use the Pool to map the function over the range of values
# This will execute the function tjmediasongnumberfindingaddressmaker in parallel for each value in the range
print("Starting parallel processing...")
file_name = 'all_songs.xlsx'
datasets = []
for result in pool.imap_unordered(tjmediasongnumberfindingaddressmaker, range(1, 99999)):
if result:
datasets.append(result)
write_to_excel(datasets, file_name)
print("Parallel processing complete!")
# Combine all datasets into a single list
combined_data = []
for dataset in datasets:
combined_data.extend(dataset)
# Convert the combined data into a DataFrame
df = pd.DataFrame.from_records([combined_data], columns=['Song Number', 'Song Name', 'Singer Name'])
# Write the DataFrame to a single Excel file
df.to_excel(file_name, index=False)
print(f"Excel file '{file_name}' created with all songs.")