# General Web Scraping

In [99]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

html_text = requests.get('https://community.infineon.com/?profile.language=en').text
soup = BeautifulSoup(html_text, 'lxml')

product_types = soup.find_all('a', class_='board-link')
#page = soup.find('a', class_='lia-js-data-page-last').text.strip()

data = []
        
for index, product_type in enumerate(product_types):
    # Find the title, body and time elements within the specific product type
    title = product_type.find_next('div', class_='subject').text.strip()
    body = product_type.find_next('div', class_='full-body body').text.strip()
    #time = product_type.find_next('span', class_='time').text.strip().replace(' ', '')
    
    # link
    raw_link = product_type["href"]
    domain_link = 'https://community.infineon.com'
    link = domain_link + raw_link
    
    data.append({
        'product_types': product_type.text.strip(),
        'Title': title,
        'Body': body,
        'href': link
    })

df = pd.DataFrame(data)
df

Unnamed: 0,product_types,Title,Body,href
0,Radar sensor,"Rapid IoT Connect Platform stuck on ""In Progress""","Hello, I am currently using the Rapid IoT Conn...",https://community.infineon.com/t5/Radar-sensor...
1,AURIX™,peak_hold喷油驱动控制,"Hi,\nI am looking for Eval Board for BTS711L1 ...",https://community.infineon.com/t5/AURIX/bd-p/A...
2,Smart Power Switches,Classic PROFET,"Hi,\nI am looking for Eval Board for BTS711L1 ...",https://community.infineon.com/t5/Smart-Power-...
3,MOSFET (Si/SiC),IPB107N20N3 TINA TI MODEL,Please provide a link to the documentation tha...,https://community.infineon.com/t5/MOSFET-Si-Si...
4,AURIX™,TC 397 HSM User Development Guide,Please provide a link to the documentation tha...,https://community.infineon.com/t5/AURIX/bd-p/A...
5,IGBT,驱动芯片,"Hello , i need to know which tool can flash re...",https://community.infineon.com/t5/IGBT/bd-p/IGBT
6,TRAVEO™ T2G,S6J32HEL,"Hello , i need to know which tool can flash re...",https://community.infineon.com/t5/TRAVEO-T2G/b...
7,AURIX™,Performance comparison between TC399XX and TRA...,"Hello,\nDo you have any benchmark that compare...",https://community.infineon.com/t5/AURIX/bd-p/A...
8,Gate Driver ICs,Controlling AC load with two mosfets and 1ED31...,"Hi,I am investigating the possibility of contr...",https://community.infineon.com/t5/Gate-Driver-...
9,Intelligent Power Modules (IPM),Three Phase Sine Wave Power Supply using CIPOS...,"Hi all,I am an Embedded Engineer and I am look...",https://community.infineon.com/t5/Intelligent-...


In [34]:
options = soup.find_all('li', class_='options')

print(f'Number of Types: {len(options)} \n')

data2 = []

for option in options:
    temp = option.text.strip()
    data2.append({'product_types': temp})
    
df_product = pd.DataFrame(data2)
df_product

Number of Types: 72 



Unnamed: 0,product_types
0,PSoC™ 6
1,Wi-Fi Combo
2,Nor Flash
3,USB low-full-high speed peripherals
4,FIRST Robotics Competition (FRC)
...,...
67,Power Management ICs
68,MOTIX™ MCU
69,Legacy microcontrollers
70,Battery Management ICs


# Extend our web scraping according to the product
For example, IGBT...

In [105]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the base URL for the first page
base_url = 'https://community.infineon.com/t5/IGBT/bd-p/IGBT'

parts = base_url.split('/')
name = parts[-1]

def web2csv(base_url):

    data3 = []
    page = 0

    while True:
        if page == 0:
            url = base_url
            print(url)
        else:
            # Construct the URL for the current page
            url = f'{base_url}/page/{page+1}'
            print(url)

        response = requests.get(url, allow_redirects=False)
        # if page is not found, then break
        if response.status_code != 200: 
            break

        # Request HTML content
        html_text = response.text
        soup = BeautifulSoup(html_text, 'lxml')

        # Find all product types on the current page
        product_types = soup.find_all('a', class_='board-link')

        for product_type in product_types:
            # Find the title, body, and time elements within the specific product type
            title = product_type.find_next('div', class_='subject').text.strip()
            body = product_type.find_next('div', class_='full-body body')

            if body == None:
                body = product_type.find_next('div', class_='truncated-body body').text.strip()
            else:
                body = body.text.strip()

            #time = product_type.find_next('span', class_='time').text.strip().replace(' ', '')

            data3.append({
                'Product_types': product_type.text.strip(),
                'Title': title,
                'Body': body
            })

        page+=1

        if page > 3: #set page 20
            break
            
    # Create DataFrame
    df_IGBT = pd.DataFrame(data3)
    df_IGBT.to_csv(f"data/{name}.csv")
    
    return #df_IGBT

In [106]:
web2csv(base_url)

https://community.infineon.com/t5/IGBT/bd-p/IGBT
https://community.infineon.com/t5/IGBT/bd-p/IGBT/page/2
https://community.infineon.com/t5/IGBT/bd-p/IGBT/page/3
https://community.infineon.com/t5/IGBT/bd-p/IGBT/page/4


In [107]:
df_IGBT

Unnamed: 0,Product_types,Title,Body
0,IGBT,驱动芯片,"Hello, \nI am learning about all the infineon'..."
1,IGBT,Differents housing for igbt modules,"Hello, \nI am learning about all the infineon'..."
2,IGBT,Question about Infineon’s Simulation Models,Hello! I am trying to use the PLECS thermal mo...
3,IGBT,Eval-M1-CM610N3,Regarding the principle provided in the thread...
4,IGBT,"IKCM30F60GD , what is the max output current @...","IFCM20U65GD IPM module has normal VFO pin, nor..."
...,...,...,...
195,IGBT,Power cycling curves/ lifetime parameters of I...,"Hi, I am trying to use your IGW25N120H3 IGBT i..."
196,IGBT,Igbt module DF75R12W1H4-B21\n\n\n S...,Double pulse test recommended test chart The c...
197,IGBT,About the location of the IGBT double pulse te...,Double pulse test recommended test chart The c...
198,IGBT,PCsec过程中芯片与焊锡间裂纹\n\n\n Solved,Why are the IGBT parameters inconsistent betwe...


# Explore Input Data (not used)

According to different product, we have different input data. To simplify the process, alternatively, we could use "Instant Data Scraper" chrome extension to do data scraping from the community website. (https://chromewebstore.google.com/detail/instant-data-scraper/ofaokhiedipichpaobibbnahnkdoiiah)

In [5]:
df1 = pd.read_csv('data/PSoC6.csv')
df2 = pd.read_csv('data/Wi-Fi Combo.csv')
df3 = pd.read_csv('data/Nor Flash.csv')
df4 = pd.read_csv('data/USB low-full-high speed peripherals.csv')
df5 = pd.read_csv('data/MOSFET.csv')

In [6]:
df1.loc[:,['board-link','subject-link','truncated-body']].head() #PSoC6

Unnamed: 0,board-link,subject-link,truncated-body
0,PSoC™ 6,使用PSoc 62系列板卡时遇到了无法烧录和调试的问题，似乎是flash的问题,我的板卡型号是Psoc6-evaluationkit-062S2，在我按下板卡上的MODE按...
1,PSoC™ 6,PSoC6 CY8CPROTO-063,"Hello, good afternoon, I am going to work on a..."
2,PSoC™ 6,Encountering Issues with Programming and Debug...,Translated Content:\nBoard Model: Psoc6-evalua...
3,PSoC™ 6,SCB Managing Slave Select Peripheral Lines,- Device Configurator 3.10.0.6117\n- 7e6892ee1...
4,PSoC™ 6,How to properly use dma and spi together with ...,We are having a problem communicating with two...


In [7]:
df2.loc[:,['board-link','subject-link','truncated-body']].head() #Wi-Fi Combo

Unnamed: 0,board-link,subject-link,truncated-body
0,Wi-Fi Combo,Malloc is thread safe ?,"Hello,\nI'm using WICED STUDIO 6.6. and CYW943..."
1,Wi-Fi Combo,CYW943907 - Amazon FreeRTOS OTA support,"Hi, Per Amazon doc, CYW943907AEVAL1F is not su..."
2,Wi-Fi Combo,how to enable or disable the save restore feat...,when I read the register of CHIPCOMMON_SR_CONT...
3,Wi-Fi Combo,HTTPS speed problem,Good afternoon. I am facing HTTPS speed issue ...
4,Wi-Fi Combo,Disable dns server.,"Hi, I have a Laird Sterling EWB. What I try to..."


In [8]:
df3.loc[:,['board-link','subject-link','truncated-body']].head() #Nor Flash

Unnamed: 0,board-link,subject-link,truncated-body
0,Nor Flash,how to use or unprotect the highest address se...,"Hi,\nIam using a S70GL02GT11FHA010 NOR Flash...."
1,Nor Flash,S25FL256LAGNFM010 material specifications,I would like to know what the potting compound...
2,Nor Flash,S29AL016J70TFN020 Thermal Data?,"Hello,\nWe would like to use this memory: S29A..."
3,Nor Flash,Where is the file ---- slld_fll_256l.h,A member from Infineon posted me a zip file re...
4,Nor Flash,S29JL032J60TFI010 of Product Status,How is the production of S29JL032J60TFI010 pro...


In [9]:
df4.loc[:,['board-link','subject-link','truncated-body']].head() #USB low-full-high speed peripherals

Unnamed: 0,board-link,subject-link,truncated-body
0,USB low-full-high speed peripherals,"CY7C65215, configuration image CRC","From a linux system, I want to read back the c..."
1,USB low-full-high speed peripherals,Arming of bulk/interrupt out endpoint in fx2lp,"Hello,I am stuck at a point,in my application ..."
2,USB low-full-high speed peripherals,Pinout error on documentation of CY7C6514D,"Hi Infineon Community,\nI am reviewing the sec..."
3,USB low-full-high speed peripherals,CY7C65210 TID Number,"Hi,\nWhat is CY7C65210 TID number?\nBR\nEason"
4,USB low-full-high speed peripherals,Setting up isochronous out endpoint in fx2lp,"Hello,Can someone point me to examples codes f..."


In [10]:
df5.loc[:,['subject-link','truncated-body']].head() # MOSFET # No feature 'board-link'

Unnamed: 0,subject-link,truncated-body
0,Double Pulse Test,"In the datasheet, a dual pulse test circuit wa..."
1,TCC data,"Hello,\nI am reaching out to request TCC data ..."
2,Cu Clip in Automotive Mosfet,Hi\n \nDoes infineon has Cu Clip technology in...
3,SiC IMBG120R350M1HXTMA1 sense pin unconnected,"Hello together,\nI'm planning to use a SiC IMB..."
4,EVAL Inverter,I'm playing around with the EVAL_3K3W_TP_PFC_S...
