## 采集url文本内容

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os

class crawlText(object):
    """Same-domain crawler that appends each page's visible text to a file.

    Starting from a URL, every reachable page whose network location matches
    that of ``base_url`` is fetched once, and all of its text nodes are
    appended (one per line) to ``self.file_path``.
    """

    def __init__(self, base_url):
        """Remember the site root and set up crawl state.

        :param base_url: URL whose network location defines which links are
            considered part of the site.
        """
        self.base_url = base_url
        # URLs that have already been requested (successfully or not).
        self.visited_urls = set()
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
        }
        # Output file; text is appended, so callers clear it beforehand.
        self.file_path = 'website_texts.txt'

    def get_html(self, url):
        """Fetch ``url`` and return its body decoded as UTF-8.

        :param url: address to request.
        :return: the decoded page, or ``None`` when the request fails, the
            server answers with an error status, or the body is not valid
            UTF-8.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()  # turn HTTP error statuses into exceptions
            return response.content.decode('utf-8')
        except (requests.RequestException, UnicodeDecodeError) as e:
            # A page that cannot be fetched or decoded is skipped instead of
            # aborting the whole crawl.
            print(f'Error fetching {url}: {e}')
            return None

    def extract_and_save_text(self, html):
        """Append every non-empty text node of ``html`` to the output file.

        :param html: page markup to extract text from.
        """
        soup = BeautifulSoup(html, 'lxml')
        with open(self.file_path, 'a', encoding='utf-8') as file:
            for text in soup.stripped_strings:
                file.write(text)
                file.write('\n')

    def get_all_links(self, html, base_url):
        """Collect the same-site HTTP(S) links found in ``html``.

        :param html: page markup to scan for ``<a href=...>`` tags.
        :param base_url: URL of the page itself, used to resolve relative
            links.
        :return: set of absolute URLs with any ``#fragment`` removed.
        """
        soup = BeautifulSoup(html, 'lxml')
        site_netloc = urlparse(self.base_url).netloc
        links = set()
        for a_tag in soup.find_all('a', href=True):
            try:
                target = urlparse(urljoin(base_url, a_tag['href']))
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): ignore it.
                continue
            if target.scheme not in ('http', 'https'):
                continue
            # Only follow links on the same network location as the site root.
            if target.netloc != site_netloc:
                continue
            # "page#a" and "page#b" are the same document; drop the fragment
            # so the page is not fetched once per anchor.
            links.add(target._replace(fragment='').geturl())
        return links

    def crawl(self, url):
        """Visit ``url`` and every same-site page reachable from it.

        The traversal uses an explicit stack rather than recursion so that
        large sites cannot exhaust Python's recursion limit.

        :param url: page to start from.
        """
        pending = [url]
        while pending:
            current = pending.pop()
            if current in self.visited_urls:
                continue
            print(f'Crawling: {current}')
            self.visited_urls.add(current)
            html = self.get_html(current)
            if not html:
                continue
            self.extract_and_save_text(html)
            pending.extend(
                link
                for link in self.get_all_links(html, current)
                if link not in self.visited_urls
            )

if __name__ == '__main__':
    # Entry page of the site to crawl
    base_url = 'https://shop.10086.cn/mall_871_871.html'
    danmu = crawlText(base_url)
    # The crawler appends to its output file, so drop any previous run's file
    output_path = danmu.file_path
    if os.path.exists(output_path):
        os.remove(output_path)
    danmu.crawl(base_url)


## 清理数据（只保留长度不小于100的缅甸语文本）

In [None]:
import re
from bs4 import BeautifulSoup
import pandas as pd

def is_burmese(text):
    """Return True when ``text`` contains at least one Myanmar-block character.

    The check covers the Unicode range U+1000 through U+109F.
    """
    return re.search(r'[\u1000-\u109F]', text) is not None

def clean_text_from_file(filename):
    """Read ``filename`` line by line and keep the long Burmese lines.

    Each line has its HTML markup removed and its whitespace collapsed to
    single spaces. A line is kept when it is at least 100 characters long
    and contains a Burmese character.

    :param filename: path of the UTF-8 text file to clean.
    :return: DataFrame with a single ``text`` column, duplicates removed.
    """
    kept_lines = []
    with open(filename, 'r', encoding='utf-8') as source:
        for raw_line in source:
            # Strip HTML tags
            plain = BeautifulSoup(raw_line, "lxml").text
            # Trim the ends and collapse inner runs of whitespace
            normalized = ' '.join(plain.strip().split())
            if len(normalized) < 100 or not is_burmese(normalized):
                continue
            kept_lines.append(normalized)

    # Build the frame and drop repeated lines
    return pd.DataFrame(kept_lines, columns=['text']).drop_duplicates()

# Input written by the crawler and the CSV file that receives the cleaned rows
filename = 'website_texts.txt'
output_filename = 'cleaned_data.csv'

# Clean the crawled text, then write the result to CSV
cleaned_df = clean_text_from_file(filename)
cleaned_df.to_csv(output_filename, index=False, encoding='utf-8')

# Show the result (optional)
print(cleaned_df)


## 拆分数据集并保存到 Excel

In [None]:
import pandas as pd

# Read back the cleaned rows
df = pd.read_csv("cleaned_data.csv")

# Number of output files and the row count of every part except the last
num_parts = 6
total_rows = len(df)
chunk_size = total_rows // num_parts

# Write one Excel file per part; the final part also takes the leftover rows
for i in range(num_parts):
    start = i * chunk_size
    end = total_rows if i == num_parts - 1 else start + chunk_size

    chunk_df = df.iloc[start:end]
    chunk_df.to_excel(f"cleaned_data_part_{i+1}.xlsx", index=False)

print("Data has been split into 6 Excel files.")

## 生成带国家名的运营商短信

In [None]:
from datetime import datetime, timedelta
import random
from googletrans import Translator
import pandas as pd

def ggtran(text, dest='en', src='auto'):
    """
    Translate text through the googletrans API.

    :param text: source text to translate
    :param dest: language code of the output
    :param src: language code of the source ('auto' lets the service detect it)
    :return: the translated text, or None when the call raises
    """
    client = Translator()
    try:
        translated = client.translate(text, dest=dest, src=src)
        return translated.text
    except Exception as e:
        # Best effort: report the failure and let the caller handle None
        print(f'Error: {str(e)}')
        return None
                
def generate_sms_messages(count=150, excel_path='sms_messages1.xlsx'):
    """Generate synthetic carrier order-confirmation SMS texts and save them.

    Each row gets a random order time, a random price and a random group of
    12-15 country names. These are filled into a Chinese template and a
    Burmese template. For the Burmese message the country list is passed
    through ``ggtran`` with ``dest='en'``.
    NOTE(review): the names are translated to English, not Burmese — confirm
    this is intended.

    :param count: number of messages to generate (default 150).
    :param excel_path: path of the Excel file to write
        (default 'sms_messages1.xlsx').
    :return: DataFrame with one row per generated message.
    """
    # Full list of destinations. Some literals hold two names joined by an
    # ASCII comma (e.g. "秘鲁,缅甸"); they are split apart below so every
    # entry is a single name.
    raw_countries = ["阿尔巴尼亚", "阿富汗", "阿根廷", "阿联酋", "阿曼", "阿塞拜疆", "埃及", "埃塞俄比亚", "艾伦岛（奥兰府）", "爱奥尼亚", "爱尔兰", "爱沙尼亚", "安的列斯群岛", "奥地利", "奥克尼群岛", "澳大利亚,巴布亚新几内亚", "巴基斯坦", "巴拉圭", "巴勒斯坦", "巴林", "巴拿马", "巴西", "白俄罗斯", "保加利亚", "北爱尔兰,梵蒂冈", "菲律宾", "斐济", "芬兰", "刚果", "刚果民主共和国", "哥伦比亚", "哥斯达黎加", "哥特兰岛", "格鲁吉亚", "古巴", "瓜德罗普岛", "关岛", "圭亚那", "哈萨克斯坦", "海峡群岛", "韩国", "北马其顿", "贝弗敖群岛", "贝宁", "比利时", "波多黎各", "波恩荷尔摩岛,波黑", "波兰", "伯罗奔尼撒", "博茨瓦纳", "布基纳法索", "丹麦", "丹麦措辛厄岛", "丹麦朗厄兰岛", "德国", "多米尼加共和国", "俄罗斯", "厄瓜多尔", "厄兰岛", "法国", "法罗群岛", "法属圭亚那,科特迪瓦（象牙海岸）", "科威特", "科西嘉岛", "克里特岛", "克罗地亚", "库拉索岛和博奈尔岛", "拉脱维亚", "老挝", "立陶宛", "利比里亚", "列支敦士登", "留尼汪岛", "卢森堡", "卢旺达", "罗德岛,郝布里底群岛", "荷兰", "黑山", "洪都拉斯", "基克拉泽", "基里巴斯", "吉布提", "吉尔吉斯斯坦", "几内亚", "几内亚比绍", "加拿大", "加纳", "加蓬", "柬埔寨", "捷克", "喀麦隆", "卡塔尔,罗弗敦群岛", "罗马尼亚", "马德拉群岛", "马尔代夫", "马耳他", "马拉维", "马来西亚", "马里", "马提尼岛", "马约特岛", "曼岛", "毛里求斯", "美国（本土）", "美属维尔京群岛", "蒙古", "孟加拉国", "秘鲁,缅甸", "摩尔多瓦", "摩洛哥", "摩纳哥", "墨西哥", "纳米比亚", "南贝佛兰岛", "南非", "南苏丹", "瑙鲁", "尼泊尔", "尼加拉瓜", "尼日尔", "尼日利亚", "挪威", "葡萄牙", "日本", "瑞典", "瑞士,萨尔瓦多", "萨摩亚", "塞班岛", "塞尔维亚", "塞内加尔", "塞浦路斯", "塞舌尔", "沙特", "设得兰群岛", "圣港岛", "圣马力诺", "斯里兰卡", "斯洛伐克", "斯洛文尼亚", "斯图尔特岛", "斯威士兰,斯雅尔巴群岛", "苏丹", "苏里南", "所罗门群岛", "塔吉克斯坦", "泰国", "坦桑尼亚", "汤加", "特立尼达和多巴哥", "天宁岛", "土耳其", "瓦努阿图", "危地马拉", "文莱", "乌干达", "乌克兰", "乌拉圭,乌兹别克斯坦", "西奥仑群岛", "西班牙", "西班牙福门特拉", "西班牙加那利群岛", "西班牙卡夫雷拉岛", "西班牙卡那利群岛", "西班牙马略卡岛", "西班牙梅诺卡岛", "西班牙美利利亚,西班牙切乌塔", "西班牙伊比沙岛", "西佛里西亚群岛", "希腊", "夏威夷", "新加坡", "新西兰", "匈牙利", "亚美尼亚", "亚述尔群岛", "伊拉克", "伊朗", "以色列", "意大利", "意大利撒丁岛,意大利西西里岛", "印度", "印度尼西亚", "英国", "约旦河西岸", "越南", "赞比亚", "泽西岛", "乍得", "智利", "中非共和国", "中国澳门", "中国台湾", "中国香港", "海地", "也门", "冰岛", "科摩罗", "圣马丁（法属）", "圣巴泰勒米", "北马里亚纳群岛", "罗塔岛"]
    countries = [
        name.strip()
        for entry in raw_countries
        for name in entry.split(",")
        if name.strip()
    ]

    def get_random_dates():
        """Return (effective time, expiry time, order date) as strings."""
        start_date = datetime(2024, 1, 1)
        # Offsets 0-364 cover 2024-01-01 through 2024-12-30.
        random_days = random.randint(0, 364)
        random_hours = random.randint(0, 23)
        random_minutes = random.randint(0, 59)

        order_time = start_date + timedelta(days=random_days, hours=random_hours, minutes=random_minutes)
        # The package expires 90 days after it takes effect.
        end_time = order_time + timedelta(days=90)

        return (
            order_time.strftime("%Y-%m-%d %H:%M:%S"),
            end_time.strftime("%Y-%m-%d %H:%M:%S"),
            order_time.strftime("%Y-%m-%d")
        )

    def get_random_price():
        """Return a price from 158, 168, ..., 358."""
        return random.randrange(158, 359, 10)

    def get_random_country_group():
        """Return 12-15 distinct entries drawn from the country list."""
        group_size = random.randint(12, 15)
        return random.sample(countries, group_size)

    sms_template = """【订购提醒】尊敬的客户，您好！您已于{order_date}通过中国移动线上渠道成功办理15G国漫流量月包_基础包。具体业务内容如下：
1、资费内容：{price}元/次，{countries}
2、生效时间：{start_time}
3、失效时间：{end_time}
若需提前解约您可前往号码归属地沟通100服务厅或拨打10086咨询办理。感谢您的参与！【中国移动】"""

    sms_template_my = """[အော်ဒါသတိပေးချက်] ချစ်လှစွာသော customer၊ မင်္ဂလာပါ။ သင်သည် China Mobile ၏အွန်လိုင်းချန်နယ်များမှ {order_date} တွင် 15G National Comic Traffic Monthly Package_Basic Package အတွက် အောင်မြင်စွာ လျှောက်ထားပြီးဖြစ်သည်။ အထူးစီးပွားရေးလုပ်ငန်းအကြောင်းအရာမှာ အောက်ပါအတိုင်းဖြစ်သည်။
1. စည်းကြပ်ခွန်ပါဝင်မှု-အကြိမ် {price} ယွမ်，{countries}
2. အကျိုးသက်ရောက်ချိန်- {start_time}
3. သက်တမ်းကုန်ချိန်- {end_time}
စာချုပ်ကို ကြိုတင်ဖျက်သိမ်းလိုပါက နံပါတ်ပိုင်ဆိုင်သည့်နေရာကို သွားပြီး 100 ဝန်ဆောင်မှုခန်းမသို့ ဆက်သွယ်မေးမြန်းနိုင်သည် သို့မဟုတ် 10086 သို့ခေါ်ဆို၍ ညှိနှိုင်းဆွေးနွေးနိုင်ပါသည်။ သင်၏ပါဝင်မှုအတွက် ကျေးဇူးတင်ပါသည်။ [တရုတ်မိုဘိုင်း]"""

    # Build one record per message
    data = []
    for i in range(count):
        start_time, end_time, order_date = get_random_dates()
        price = get_random_price()
        country_group = get_random_country_group()
        countries_str = ", ".join(country_group)

        message_cn = sms_template.format(
            countries=countries_str,
            start_time=start_time,
            end_time=end_time,
            order_date=order_date,
            price=price
        )

        # ggtran returns None when the translation call fails; fall back to
        # the untranslated list so the message never contains the text "None".
        translated_countries = ggtran(countries_str, dest='en')
        if translated_countries is None:
            translated_countries = countries_str

        message_my = sms_template_my.format(
            countries=translated_countries,
            start_time=start_time,
            end_time=end_time,
            order_date=order_date,
            price=price
        )

        data.append({
            '序号': i + 1,
            '订购日期': order_date,
            '生效时间': start_time,
            '失效时间': end_time,
            '资费(元/次)': price,
            '包含国家': countries_str,
            '短信内容(中文)': message_cn,
            '短信内容(缅甸语)': message_my
        })

    # Build the DataFrame and write it to Excel
    df = pd.DataFrame(data)
    df.to_excel(excel_path, index=False, engine='openpyxl')
    print(f"数据已保存到 {excel_path}")

    return df

# Generate the messages and save them to Excel
messages_df = generate_sms_messages()
messages_df.head()  # Preview the first 5 rows

In [None]:
from datasets import load_dataset

# Load the "ai4privacy/pii-masking-200k" dataset through the `datasets` library.
# `ds` is not used by any later statement visible in this file.
ds = load_dataset("ai4privacy/pii-masking-200k")


