## Get the webpage

In [1]:
import requests

In [2]:
res = requests.get('https://pandas.pydata.org/docs/user_guide/10min.html')
res.status_code

200

In [3]:
content = res.text
content[:1000]

'\n<!DOCTYPE html>\n<html lang="en" data-content_root="../">\n<head>\n<meta charset="utf-8" />\n<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />\n<title>10 minutes to pandas &#8212; pandas 2.2.2 documentation</title>\n<script data-cfasync="false">\n    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";\n    document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";\n  </script>\n\n<link href="../_static/styles/theme.css?digest=5b4479735964841361fd" rel="stylesheet" />\n<link href="../_static/styles/bootstrap.css?digest=5b4479735964841361fd" rel="stylesheet" />\n<link href="../_static/styles/pydata-sphinx-theme.css?digest=5b4479735964841361fd" rel="stylesheet" />\n<link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=5b4479735964841361fd" rel="stylesheet" />\n<link rel="preload" as="font" type="font/woff2" crossorigin href="../_s

## split the words

In [4]:
import re

In [5]:
content = content.lower()

In [6]:
words = re.findall(r'\w+', content)

words = [x for x in words if len(x) >= 2 and not str(x).isdigit()]

In [7]:
print(words[:100])

['doctype', 'html', 'html', 'lang', 'en', 'data', 'content_root', 'head', 'meta', 'charset', 'utf', 'meta', 'name', 'viewport', 'content', 'width', 'device', 'width', 'initial', 'scale', 'meta', 'name', 'viewport', 'content', 'width', 'device', 'width', 'initial', 'scale', 'title', 'minutes', 'to', 'pandas', 'pandas', 'documentation', 'title', 'script', 'data', 'cfasync', 'false', 'document', 'documentelement', 'dataset', 'mode', 'localstorage', 'getitem', 'mode', 'document', 'documentelement', 'dataset', 'theme', 'localstorage', 'getitem', 'theme', 'light', 'script', 'link', 'href', '_static', 'styles', 'theme', 'css', 'digest', '5b4479735964841361fd', 'rel', 'stylesheet', 'link', 'href', '_static', 'styles', 'bootstrap', 'css', 'digest', '5b4479735964841361fd', 'rel', 'stylesheet', 'link', 'href', '_static', 'styles', 'pydata', 'sphinx', 'theme', 'css', 'digest', '5b4479735964841361fd', 'rel', 'stylesheet', 'link', 'href', '_static', 'vendor', 'fontawesome', 'css', 'all', 'min', 'css

## construct dataframe

In [8]:
import pandas as pd

In [9]:
df = pd.DataFrame(words, columns=['Words'])
df.head()

Unnamed: 0,Words
0,doctype
1,html
2,html
3,lang
4,en


In [10]:
df = df['Words'].value_counts().reset_index()
df.head()

Unnamed: 0,Words,count
0,span,5248
1,class,3245
2,go,494
3,div,434
4,quot,386


## read english dictionary 

In [11]:
df_dict = pd.read_csv('./stardict.csv')

  df_dict = pd.read_csv('./stardict.csv')


In [12]:
df_dict.shape

(3402564, 13)

In [13]:
df_dict.head()

Unnamed: 0,word,phonetic,definition,translation,pos,collins,oxford,tag,bnc,frq,exchange,detail,audio
0,'a,eɪ,,na. 一\nn. 英文字母表的第一字母；【乐】A音\nart. 冠以不定冠词主要表示类别\...,,,,,,,,,
1,'A' game,,,[网络] 游戏；一个游戏；一局,,,,,,,,,
2,'Abbāsīyah,,,[地名] 阿巴西耶 ( 埃 ),,,,,,,,,
3,'Abd al Kūrī,,,[地名] 阿卜杜勒库里岛 ( 也门 ),,,,,,,,,
4,'Abd al Mājid,,,[地名] 阿卜杜勒马吉德 ( 苏丹 ),,,,,,,,,


In [14]:
df_dict = df_dict[['word', 'translation']]
df_dict.head()

Unnamed: 0,word,translation
0,'a,na. 一\nn. 英文字母表的第一字母；【乐】A音\nart. 冠以不定冠词主要表示类别\...
1,'A' game,[网络] 游戏；一个游戏；一局
2,'Abbāsīyah,[地名] 阿巴西耶 ( 埃 )
3,'Abd al Kūrī,[地名] 阿卜杜勒库里岛 ( 也门 )
4,'Abd al Mājid,[地名] 阿卜杜勒马吉德 ( 苏丹 )


## connect the data 

In [15]:
df_merge = pd.merge(left=df, right=df_dict, left_on='Words', right_on='word', how='inner')

In [16]:
df_merge.head()

Unnamed: 0,Words,count,word,translation
0,span,5248,span,"n. 指距, 全长, 跨距, 一段时间, 小范围\nvt. 以手指测量, 跨越, 架设, 持续"
1,class,3245,class,"n. 班级, 阶级, 种类, 课\nvt. 分类\n[计] 类别; 类; 种类; 类程"
2,go,494,go,"vi. 去, 走, 达到, 运转, 查阅, 消失, 结束, 放弃, 花费, 流传, 趋于, ..."
3,div,434,div,abbr. 分开（divide）；区分（division）
4,quot,386,quot,"n. 引用语；开价, 报价；行市"


## export dictionary

In [17]:
data = pd.read_csv('./stardict.csv')

  data = pd.read_csv('./stardict.csv')


In [18]:
print(data)

                  word phonetic definition  \
0                   'a       eɪ        NaN   
1             'A' game      NaN        NaN   
2           'Abbāsīyah      NaN        NaN   
3         'Abd al Kūrī      NaN        NaN   
4        'Abd al Mājid      NaN        NaN   
...                ...      ...        ...   
3402559          Zūzan      NaN        NaN   
3402560         Zǎbala      NaN        NaN   
3402561        Zǎbrani      NaN        NaN   
3402562       Zǎbrǎtǎu      NaN        NaN   
3402563        Zǔrnevo      NaN        NaN   

                                               translation  pos  collins  \
0        na. 一\nn. 英文字母表的第一字母；【乐】A音\nart. 冠以不定冠词主要表示类别\...  NaN      NaN   
1                                          [网络] 游戏；一个游戏；一局  NaN      NaN   
2                                          [地名] 阿巴西耶 ( 埃 )  NaN      NaN   
3                                      [地名] 阿卜杜勒库里岛 ( 也门 )  NaN      NaN   
4                                      [地名] 阿卜杜勒马吉德 ( 苏丹 )  NaN    

In [19]:
data.head(100)

Unnamed: 0,word,phonetic,definition,translation,pos,collins,oxford,tag,bnc,frq,exchange,detail,audio
0,'a,eɪ,,na. 一\nn. 英文字母表的第一字母；【乐】A音\nart. 冠以不定冠词主要表示类别\...,,,,,,,,,
1,'A' game,,,[网络] 游戏；一个游戏；一局,,,,,,,,,
2,'Abbāsīyah,,,[地名] 阿巴西耶 ( 埃 ),,,,,,,,,
3,'Abd al Kūrī,,,[地名] 阿卜杜勒库里岛 ( 也门 ),,,,,,,,,
4,'Abd al Mājid,,,[地名] 阿卜杜勒马吉德 ( 苏丹 ),,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,'Amrīyah,,,[地名] 阿姆里耶 ( 苏丹 ),,,,,,,,,
96,'ams,,,n. 美国数学学会\n[网络] 阿姆斯特丹(Amsterdam)；阿尔法磁谱仪(Alpha ...,,,,,,,,,
97,'Amādīyah,,,[地名] 阿马迪耶 ( 伊拉 ),,,,,,,,,
98,'Amīnābād,,,[地名] 阿明阿巴德 ( 伊朗 ),,,,,,,,,
