In [1]:
import re
import pickle
import pandas as pd

def save_for_training(list_to_save, save_name, file_name="adv02_training_dataset.pkl"):
    """
    __ Parameters __
    [1D] list_to_save:  values to store in the pkl file
    [string] save_name: column (key) under which to save the values
    [string] file_name: pkl file to update (defaults to the training dataset)

    __ Description __
    Load the existing pkl file, add (or overwrite) the `save_name` entry
    and write the result back to the same file.
    The file must already exist, as it is read before it is written.
    """
    # NOTE: unpickling can execute arbitrary code - only use pkl files from a trusted source
    dataset = pd.read_pickle(file_name)

    dataset[save_name] = list_to_save

    with open(file_name, 'wb') as fout:
        pickle.dump(dataset, fout)

def read_pickle(file_name_to_read="adv02_training_dataset.pkl"):
    """
    __ Parameters __
    [str] file_name_to_read:    path of the pkl file to load data from

    __ Description __
    Return whatever object is stored in the pkl file
    """
    loaded = pd.read_pickle(file_name_to_read)
    return loaded

# Datacleaning and why even use them
***
Every single neural network at <font color=purple>Dreams AI</font>, regardless of its architecture (from the simple 2-layer with ReLU activation to a big-brained Bayesian Network) or the genius that stands behind its creation, requires a good quality dataset to be trained on.

Such a dataset would be a collection of **properties**, **labelled** into **fields**. In the ideal case scenario, data is **labelled** as it is acquired by either:
- Chinese worker diligently copying and pasting information from emails into a spreadsheet;
- `Selenium` or `cdp` web scraper that allocates properties depending on their position on the webpage e.g. titles → Brand, Timestamp → Unique ID etc.

<img src="images/clean_dataset.png" title="Clean dataset" />

However, such unregulated **Feature Extraction** can occasionally lead to errors in the form of:
- Typographical errors;
- Incorrect field classification (*below is an example of the `Brand` being cluttered with properties that should be in the `description`*).

<img src="images/dirty_dataset.png" title="Dirty dataset" />

## The dataset used is the same as from the DataFrame tutorial, with fields
<center><font color=red>Brand, Description, Part Number, Price, Qty</font>, Remark, GroupName, UniqueID</center>

The highlighted fields have numerous errors that need to be cleaned.

# Example use-case of regular expressions
**Regular expressions** are used to catch word patterns to correct such errors. Think of a regular expression as a word search on steroids that, apart from simple words, can search for letter-number combinations, multiple words, word endings, etc.

The regexp procedure in the code below eliminates errors from the brand column of the above set.

*It will become clear what it all means by the end of the tutorial*
***
1. Build regular expression to catch "Orig" and "[stuff in square brackets]":
```python
regexp = r"\b(Orig)|(\[.*?\])"
```
<img src="images_inkscape/filtering_brand.png" title="Filtering out the brand"/>
2. <font color=red><b>Very important to have the leading `r` as it ensures that backslashes are passed on without modification</b></font>.
3. Call the regular expression function (in this case `re.sub` replaces the pattern that is found with an empty string):
```python
mg = re.sub(regexp, "", i)
```
4. Ignore the case of what is being matched:
```python
re.IGNORECASE
```
5. Search proceeds left to right, returning matched patterns or `None` if nothing was found.

In [None]:
########################################
# 🍏 Example showing a typical use case of regexp
########################################
# 1 - load data in
brand = read_pickle()['example-1']

# 2 - construct regexp (raw string so the backslashes reach the regexp engine unmodified):
#     \b(Orig)    → a word boundary followed by "Orig"
#     (\[.*?\])   → "[", then as few characters as possible, then "]"
regexp = r"\b(Orig)|(\[.*?\])"

# 3 - run through the list and remove "orig" and "[1Gx8...]"
for i in brand:
    # flags must be passed by keyword: the 4th positional argument of re.sub is `count`,
    # so a positional re.IGNORECASE (== 2) would be read as "replace at most 2 matches"
    # and the case would not be ignored at all
    mg = re.sub(regexp, "", i, flags=re.IGNORECASE)
    print(mg, "\t\t←\t\t", i)

Samsung  		←		 Samsung Orig
Kingston   		←		 Kingston Orig [1Gx8 16C]
Crucial  		←		 Crucial Orig
Samsung   		←		 Samsung Orig [1Gx8 16C]
MIcron   		←		 MIcron Orig [1Gx8 16C]
Hynix   		←		 Hynix Orig [1Gx8 16C]
Micron   		←		 Micron Orig [1Gx8 16C]
Hynix   		←		 Hynix Orig [1Gx8 16C]
OEM 		←		 OEM
Micron   		←		 Micron Orig [1Gx8 8C]
Samsung   		←		 Samsung Orig [1Gx8 8C]
Samsung   		←		 Samsung Orig [1Gx8 8C]
Micron   		←		 Micron Orig [1Gx8 8C]
Crucial  		←		 Crucial Orig
Hynix   		←		 Hynix Orig [1Gx8 8C]
Hynix   		←		 Hynix Orig [1Gx8 8C]
OEM 		←		 OEM
Hynix   		←		 Hynix Orig [512Mx64 8C]
Samsung   		←		 Samsung Orig [512Mx16 4C]
Kingston retail  		←		 Kingston retail [512Mx8 8C]
Samsung   		←		 Samsung Orig [512Mx16 4C]
Samsung   		←		 Samsung Orig [512Mx8 8C]
Micron 		←		 Micron
Micron   		←		 Micron Orig [512Mx16 4C]
Hynix   		←		 Hynix Orig [512Mx64 8C]
Samsung 		←		 Samsung


# 1 - Basics of regular expressions
Basic regular expression patterns match single characters:

<table class="table table-striped table-bordered table-hover table-condensed">
<colgroup>
<col  class="left">
<col  class="left">
</colgroup>
<thead>
<tr>
<th scope="col" class="text-left"><b>Characters</b></th>
<th scope="col" class="text-left"><b>Description</b></th>
</tr>
</thead>
<tbody>

<tr>
<td class="text-left"><code>"a", "X", "9", etc</code></td>
<td class="text-left">Normal characters <font color=red>(Characters with special meaning need to be backslashed: "." "^" "$" "*" "+" "?" "{" "}" "[" "]" "\" "|" "(" ")")</font></td>
</tr>                                                                                                                                           <tr>                                                                                                                 <td class="text-left">"."</td>
<td class="text-left">Matches any character except for newline</td>
</tr>
                                                                                                                     <tr>
<td class="text-left"><code>"\d"</code> (<code>"\D"</code> complement set)</td>
<td class="text-left">Decimal digits ↔ [0-9]</td>
</tr>

<tr>
<td class="text-left"><code>"\s"</code> (<code>"\S"</code> complement set)</td>
<td class="text-left">White-space character</td>
</tr>
                                                                                                                    <tr>
<td class="text-left"><code>"\w"</code> (<code>"\W"</code> complement set)</td>
<td class="text-left">Word characters ↔ [0-9a-zA-Z_] - <font color=red>Though called word, this only matches a <b>single</b> word character, not a full word</font></td>
</tr>

<tr>
<td class="text-left"><code>"A|B"</code></td>
<td class="text-left">Match either A or B</td>
</tr>

<tr>
<td class="text-left"><code>"[✘✘✘✘]"</code></td>
<td class="text-left">A set of characters, all of which will produce a match <font color=red>☩</font></td>
</tr>

<tr>
<td class="text-left"><code>"[^✘✘✘]"</code></td>
<td class="text-left">Anything <font color=red><b>except</b></font> these characters will produce a match</td>
</tr>

<tr>
<td class="text-left"><code>"^"</code></td>
<td class="text-left">Matches beginning of line - rarely used</td>
</tr>

<tr>
<td class="text-left"><code>"$"</code></td>
<td class="text-left">Matches end of line - rarely used</td>
</tr>

</tbody>
</table>
<font color=red>☩ A range of characters can be specified e.g. `A-Z, a-z, 0-9`</font>

In [28]:
########################################
# 🍏 Basic examples
########################################
# 1 - load the data
brand = ["Intel", "intel", "Intel2010", "Intel2011", "Intel2012", "AMD Pro"]

# 2 - regular expression to catch all of above cases
#     (raw string, so that "\d" and "\s" are handed to the regexp engine untouched)
regexp = r"Intel\d\d\d\d|[iI]ntel|AMD\sPro"

for idx, i in enumerate(brand):

    # 3 - try to match one of the brands
    mg = re.search(regexp, i, re.IGNORECASE)

    if mg is None:
        # 4 - print if brand did not exist
        print("✘ Error with\t", i)
    else:
        # 5 - keep only the text that was matched
        brand[idx] = mg.group()

print(brand)

['Intel', 'intel', 'Intel2010', 'Intel2011', 'Intel2012', 'AMD Pro']


In [16]:
########################################
# 📓 Modify the regexp by manually adding the possible brands using the "|" separator
########################################
# 1 - load the data
brand = read_pickle("datasets/DRAM_OfferScraperOutput.pkl")['Brand']

# 2 - Use regular expressions to explicitly spell out brands that are accepted
#     ("SK Hynix" is listed before "Hynix" so that the longer name is tried first)
regexp = "SK Hynix|Samsung|Hynix|Micron"

for idx, i in enumerate(brand):

    # 3 - try to match one of the brands
    mg = re.search(regexp, i, re.IGNORECASE)

    if mg is None:
        # 4 - print if brand did not exist
        print("✘ Error with\t", i)
    else:
        # 5 - store the matched text (mg.group()), not the match object itself
        brand[idx] = mg.group()

Elpida
Elpida
Elpida
Elpida
Elpida
Elpida
Elpida
Qimonda
Qimonda
Qimonda
Qimonda
Qimonda
Qimonda
Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Qimonda
Qimonda
Infineon / Qimonda
Infineon / Qimonda
Qimonda
Qimonda
Qimonda
Qimonda
Qimonda
Qimonda
Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Qimonda
Qimonda
Qimonda
Qimonda
Qimonda
Qimonda / Infineon
Qimonda / Infineon
Qimonda / Infineon
Elixir
Elixir
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Nanya
Nanya
Nanya
Nanya
Winbond
Elixir
Elixir
Elixir
Elixir
Winbond
Winbond
Elixir
Elixir
Nanya
Elixir Orig
Elixir Orig
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Infineon / Qimonda
Spectek
Specte

StarRam
Winbond
Winbond
Winbond
AMIC (Taiwan)
Elpida
Etron
Etron
Etron
ICSI
ICSI
ISSI
Nanya
Nanya
Nanya
OEM ACE
OEM VT
PIECEMAKERS
Vanguard
Vanguard
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Elite MT
Eudar
Elite MT
Etron
ESMT
Etron
Mosel Vitelic
Etron
Elite MT
Elpida
EtronTech
EtronTech
EtronTech
EtronTech
EtronTech
GBIT
ISSI
OEM VT
StarRam
Toshiba
Winbond
Winbond
Elite MT
Eorex
Elite MT
Etron
Etron
Etron
Etron
Fujitsu
Fujitsu
Fujitsu
PIECEMAKERS
IC Sensors
ICSI
ICSI
Infineon / Qimonda
ISSI
Mira
Mosel Vitelic
Mosel Vitelic
Winbond
Winbond
Etron
Etron
Etron
Etron
Etron
ISSI
Nanya
Nanya
Nanya
Elpida
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
Winbond
PIECEMAKERS
ISSI
WINBOND
Winbond
WINBOND
ISSI
Elpida
Mira
Elpida
Elpida
Nanya
Nanya
Nanya
WINBOND
Nanya
ICSI
ICSI
ISSI
Etron
Powerchip Semicon
ISSI
ISSI
ISSI
Nanya
Nanya
Nanya
Nanya/Blank
ISSI
Elixir
ISSI
NANYA
Elpida
Winbond
Winbond
ICSI
Winbond
OEM
ISSI
Winbond
ISSI


# 2 - Functions to use
There are 5 major functions for performing the actual matching. Only 3 are extensively used, as we want to process full strings, and not stop at the first matched pattern.

<div id="content" class="container">
<div class="row"><div class="col-md-9"><h1 class="title"></h1>
<table class="table table-striped table-bordered table-hover table-condensed">

<colgroup>
<col  class="left">
<col  class="left">
<col  class="left">
</colgroup>

<tbody>
<tr>
<td class="text-left"><code>re.search(regexp, string, FLAGS)</code></td>
<td class="text-left">Find the first occurrence of the regular expression</td>
<td class="text-left"></td>
</tr>

<tr>
<td class="text-left"><code><font color=red>re.findall(regexp, string, FLAGS)</font></code></td>
<td class="text-left">Find all occurrences of the regular expression</td>
<td class="text-left">Returns a list: <code>[match1, match2, match3]</code></td>
</tr>

<tr>
<td class="text-left"><code>re.match(regexp, string, FLAGS)</code></td>
<td class="text-left">Match from the beginning</td>
<td class="text-left"></td>
</tr>

<tr>
<td class="text-left"><code><font color=red>re.fullmatch(regexp, string, FLAGS)</font></code></td>
<td class="text-left">Match from beginning to end</td>
<td class="text-left">Returns either: <code>None</code> or a match, whose contents can be accessed with <code>mg.group()</code></td>
</tr>

<tr>
<td class="text-left"><code><font color=red>re.sub(regexp, replacement, string, FLAGS)</font></code></td>
<td class="text-left">Substitutes every match of the regular expression with a replacement of choice</td>
<td>Returns a string with implemented substitutions: <code>substituted_string</code></td>
</tr>

<tr>
<td class="text-left"><code><font color=red>re.IGNORECASE</font></code></td>
<td class="text-left">Use as a <code>FLAG</code> to ignore lower and uppercase differences</td>
<td class="text-left"></td>
</tr>
</tbody>
</table>

In [17]:
########################################
# 🍏 It is possible for certain entries to have multiple brands
#     If this is the case, then perform a findall search
########################################
# 1 - prepare data
brand = ["Elpida", "Qimonda", "Infineon / Qimonda"]

# 2 - regular expression listing every accepted brand
regexp = "SK Hynix|Samsung|Hynix|Micron|Elpida|Qimonda|Infineon"

for idx, i in enumerate(brand):

    # 3 - collect every brand that occurs in the entry
    mg = re.findall(regexp, i, re.IGNORECASE)

    # re.findall always returns a list (never None) - an empty list means no brand was found
    if not mg:
        # 4 - print if brand did not exist
        print("✘ Error with\t", i)
    else:
        # 5 - join multiple brands into one string e.g. "Infineon-Qimonda"
        brand[idx] = "-".join(mg)

print(brand)

########################################
# 📔 Modify the previous exercise to include multiple brands
########################################

['Elpida', 'Qimonda', 'Infineon-Qimonda']

In [32]:
########################################
# 🍏 re.fullmatch would be used for verification of a whole string
########################################
# 1 - load in data
example = ["1243GF2222", "124gF2222"]

# 2 - part number format is: at least 4 digits, at least 2 letters A-Z, exactly 4 digits
#     (raw string, so that "\d" is handed to the regexp engine untouched;
#      no re.IGNORECASE here, so lowercase letters are rejected)
regexp = r"\d{4,}[A-Z]{2,}\d{4}"

for i in example:
    # 3 - the whole string must fit the pattern, otherwise None is returned
    mg = re.fullmatch(regexp, i)
    if mg is None:
        print("✘ Error with\t", i)

✘ Error with	 124gF2222


# Repetitions
Occasionally, one wants to specify a pattern where a certain number of characters is repeated. For example, to match all of the following speeds (that need to be labelled as description)

<center>800MHz, speed 600 kHz, 30 mHz, 100-Mhz</center>

***
<table class="table table-striped table-bordered table-hover table-condensed">

<colgroup>
<col  class="left">

<col  class="left">
</colgroup>
<thead>
<tr>
<th scope="col" class="text-left"><b>Repetition</b></th>
<th scope="col" class="text-left"><b>Description</b></th>
</tr>
</thead>
<tbody>
<tr>
<td class="text-left"><code>"*"</code></td>
<td class="text-left">Matches 0 or more of the preceding RE</td>
</tr>

<tr>
<td class="text-left"><code>"+"</code></td>
<td class="text-left">Matches 1 or more of the preceding RE.</td>
</tr>

<tr>
<td class="text-left"><code>"?"</code></td>
<td class="text-left">Matches 0 or 1  of the preceding RE.</td>
</tr>

<tr>
<td class="text-left"><code>"{m,n}"</code></td>
<td class="text-left">Matches from m to n repetitions of the preceding RE.</td>
</tr>

<tr>
<td class="text-left"><code>"*?", "+?", "??", "{m,n}?"</code></td>
<td class="text-left"><b><font color=red>Non-greedy</font></b> versions of the four quantifiers above</td>
</tr>
</tbody>
</table>
<b><font color=red>Non-greedy means the minimal amount will be matched (not used commonly in our workflow)</font></b>

In [None]:
########################################
# 🍏 Example showing how the re.sub function can be used to remove junk in square brackets that occurs in some of the brands
########################################

# 1 - prepare data
example = ["Kingston Orig [1Gx8 16C]", "Crucial Orig", "Hynix Orig [1Gx8 8C]", "Hynix Orig [1Gx8 8C]", "Hynix Orig [512Mx64 8C]", "Samsung Orig [512Mx16 4C]", "Kingston retail", "Samsung Orig [512Mx16 4C]", "Samsung Orig [512Mx8 8C]"]

# 2 - regexp to eliminate square brackets and everything inside them
# \[ and \] match the literal brackets (raw string keeps the backslashes intact)
# . matches any character
# * repeats it any number of times
regexp = r"\[.*\]"

for i in example:
    # 3 - replace the bracketed part with an empty string (the space before it is kept)
    mg = re.sub(regexp, "", i)
    print(mg)

Kingston Orig 
Crucial Orig
Hynix Orig 
Hynix Orig 
Hynix Orig 
Samsung Orig 
Kingston retail
Samsung Orig 
Samsung Orig 


In [None]:
########################################
# 📔 Redo the brand check, but this time go from the opposite end, trying to remove all the junk
########################################
# 1 - brands as they come out of the scraper
brand = read_pickle("datasets/DRAM_OfferScraperOutput.pkl")['Brand']

# 2 - placeholder to be replaced with a regular expression describing the common junk
regexp = "🍄🍄🍄🍄"

# 3 - strip the junk from every entry and show what is left
for raw_brand in brand:
    print(re.sub(regexp, "", raw_brand))

In [34]:
########################################
# 🍏 Example catching word errors
########################################
# Swapped letters are a common typo: accepting any two characters out of "t" and "e"
# in the middle of the word lets both Intel ↔ Inetl through

# 1 - brands to verify
brand = ["Intel", "Inetl", "AMD"]

# 2 - accepted spellings
regexp = "In[te]{2}l|AMD"

for position, candidate in enumerate(brand):

    # 3 - look for one of the accepted brands
    found = re.search(regexp, candidate, re.IGNORECASE)

    # 4 - report entries without any known brand and move on
    if found is None:
        print("✘ Error with", candidate)
        continue

    # 5 - keep only the matched text
    brand[position] = found.group()

print(brand)
########################################
# 📔 Simplify the brand search in the previous cells e.g. \d\d\d\d → \d{4}
########################################


['Intel', 'Inetl', 'AMD']


In [12]:
########################################
# 🍏 Example of matching frequencies
########################################
frequencies_to_match = [
    "800MHz",
    "speed 600 kHz",
    "30 mHz",
    "100-Mhz",
    "100",
    "2666Hz",
]

# 1 - word boundary + 2-3 digits + optional white space or dash + khz or mhz
regexp = r"\b\d{2,3}[\s-]?[km]hz"

# 2 - show every frequency found next to the text it came from
for text in frequencies_to_match:
    print(re.findall(regexp, text, re.IGNORECASE), "\t\t←\t\t", text)

['800MHz'] 		←		 800MHz
['600 kHz'] 		←		 speed 600 kHz
['30 mHz'] 		←		 30 mHz
['100-Mhz'] 		←		 100-Mhz
[] 		←		 100
[] 		←		 2666Hz


In [8]:
########################################
# 📓 Amend the regular expression below, to double check that prices are of the correct format
########################################
# Call          →       Call
# Bid           →       Bid
# Sold out      →       Sold out
# ---           →       NA

# 1 - load data and prepare list of verified prices
prices = read_pickle("datasets/DRAM_OfferScraperOutput.pkl")['Price']

# 2 - regular expression to allow only certain prices
#     (raw string, so that "\d" and "\." are handed to the regexp engine untouched)
#     As written it accepts only "USD" + 1 digit + "." + 2 digits - extending it is the exercise
regexp = r"USD\d{1}\.\d{2}"

# 3 - move through the list and try to find a match
for idx, i in enumerate(prices):
    # the whole entry has to fit the pattern
    mg = re.fullmatch(regexp, i, re.IGNORECASE)

    if mg is None:
        # 4 - print elements which did not fit the regexp
        print(i)
        prices[idx] = "INCORRECT FORMAT"

Call
Call
---
Call
Call
Call
Call
---
---
Bid
---
Sold Out
Call
Call
Call
Bid
Call
Call
---
---
Call
Call
Call
---
---
Call
Call
Bid
Call
Call
Call
Call
Bid
Bid
Call
---
Call
Bid
Call
Call
Call
Call
Call
---
Call
Bid
Bid
---
---
Call
Bid
---
Call
---
Call
Call
Bid
Call
Call
Call
---
Call
Bid
---
Call
Bid
Sold Out
Call
Call
Bid
Bid
---
Sold Out
Bid
Call
Bid
Call
Call
---
Bid
Call
Call
Call
Call
---
Call
Call
Call
---
Call
Bid
Call
Call
Call
Call
---
Call
Call
---
Call
---
---
---
---
Call
Call
Call
Bid
Call
Call
Call
Call
---
Bid
---
Call
---
Call
Call
---
Call
---
Bid
Bid
---
Bid
---
Call
---
---
---
Call
Call
---
Call
Bid
---
Bid
---
---
---
---
---
Bid
---
---
---
---
Call
Bid
---
Call
Call
Call
---
Bid
Bid
---
Bid
---
Call
Call
Call
---
---
Bid
Bid
Bid
Call
Bid
Bid
Bid
Call
Call
Call
Bid
Call
Call
Call
Bid
Call
Call
---
---
Bid
Call
Call
Call
Call
Call
Call
Call
---
Call
Call
Bid
Call
Call
---
Call
Call
Call
---
Call
Call
Call
Call
Call
Call
Call
Bid
Call
---
Bid
Call
---
---
Bid
Bi

---
Bid
Bid
Bid
Call
---
Call
Call
Call
---
Call
Call
Call
Bid
Call
---
Call
---
---
Call
Call
Call
Call
---
---
Call
Call
---
---
---
---
Call
Bid
---
---
Call
Call
---
Call
Call
Call
---
---
Call
---
---
Call
---
---
Bid
Call
Bid
Bid
---
Call
Call
---
---
---
Call
Call
---
Call
Bid
---
Call
Call
Call
Bid
---
---
---
Bid
---
---
---
Call
---
---
---
Call
---
---
Call
---
Call
---
---
Call
---
Call
Bid
---
---
---
---
---
Call
Bid
Call
---
---
---
---
Bid
Call
---
Call
---
---
---
Bid
---
Bid
---
---
---
Call
Call
---
---
---
Call
Call
Call
Call
---
Call
Call
---
Call
Call
---
---
Call
Call
Sold Out
Bid
Call
---
Call
---
---
Call
---
---
---
---
Bid
---
---
---
---
---
---
Call
---
---
Call
---
---
---
---
USD15.00
---
---
Call
---
---
---
---
---
---
---
---
---
---
---
---
Call
---
---
Bid
---
---
Sold Out
---
---
---
---
---
Call
Bid
Call
Call
Call
---
Call
Call
Call
Call
---
---
---
Call
Call
---
Call
---
Bid
Bid
Bid
Call
Bid
---
Call
Call
---
---
Call
---
---
---
Bid
---
---
Call


---
---
---
---
USD18.54
Call
---
Call
Bid
Call
---
---
---
---
---
Call
---
---
---
---
---
---
---
Call
Call
---
---
---
---
Call
---
Call
---
Bid
Call
Call
---
Call
---
---
---
---
---
---
---
---
Bid
---
---
---
---
---
---
Call
---
---
---
Call
---
---
---
Bid
Call
---
---
Call
---
---
Call
Call
Call
Call
Bid
Call
---
---
Call
---
Call
Call
Call
Call
---
Call
---
Call
---
---
Call
---
---
---
Call
---
---
---
Bid
Call
Call
Call
---
---
Call
---
Call
---
Bid
---
Bid
Call
---
Call
Sold Out
Call
Call
---
Call
Call
---
Call
---
---
Call
Sold Out
---
Call
Call
---
---
---
---
---
---
Call
---
Bid
---
Call
---
---
---
---
---
Call
Call
---
---
---
Call
---
---
---
Call
---
Call
Bid
Bid
---
Call
---
USD20.50
---
---
Bid
---
---
---
---
Bid
Bid
Call
---
---
Call
Call
Call
Call
---
Call
---
Call
Call
Bid
Bid
Call
---
---
Sold Out
Call
---
---
---
---
---
---
---
---
---
---
Call
---
Call
---
---
Call
Call
---
Call
Call
Call
---
---
Call
---
---
---
---
---
---
---
---
---
---
---
Bid
---
-

In [None]:
########################################
# 📓 Repeat the check that the quantity numbers are valid numbers
########################################
# 1 - load data
qty = read_pickle("datasets/DRAM_OfferScraperOutput.pkl")['Qty']

# 2 - regular expression (placeholder - to be filled in as part of the exercise)
regexp = "🍄🍄🍄🍄🍄"

# 3 - move through the quantities (not the prices of the previous cell) and try to find a match
for idx, i in enumerate(qty):

    # str(): presumably quantities may be stored as numbers rather than text - TODO confirm
    mg = re.fullmatch(regexp, str(i), re.IGNORECASE)

    if mg is None:
        # 4 - print and flag elements which did not fit the regexp
        print(i)
        qty[idx] = "INCORRECT FORMAT"

# Using groups
<a id="groups"></a>
Brackets are used to access specific elements of a matched pattern. Brackets are evaluated in order of: external → internal and then by order of occurrence.

<img src="images_inkscape/groups.png" height="100">

Brackets are also required to
- Encapsulate `or` regular expressions e.g. `(Intel|Microsoft|AMD)`
- Encapsulate expressions for repetitions e.g. `(\d\w)*` to match a number-letter pattern such as 1F2G3G

<table class="table table-striped table-bordered table-hover table-condensed">
<colgroup>
<col  class="left">
<col  class="left">
</colgroup>

<tbody>

<tr>
<td class="text-left"><code>"(&#x2026;)"</code></td>
<td class="text-left">Create a group for the chosen regexp</td>
<td class="text-left"></td>
</tr>

<tr>
<td class="text-left"><code>"(?:&#x2026;)"</code></td>
<td class="text-left">Content is not assigned to a group for later access</td>
</tr>

<tr>
<td class="text-left"><code>"(?!&#x2026;)"</code></td>
<td class="text-left">Matches if &#x2026; doesn't match</td>
</tr>

</tbody>
</table>

In [36]:
########################################
# 🍏 Creating groups to eliminate white spaces
########################################
# 1 - load data
example = ["1666MHz", "200 mHz", "speed = 300 khz"]

# 2 - regular expression to match:
# 2-4 digits, optional white space, khz or mhz - each part in its own group
# (raw string, so that "\d" and "\s" are handed to the regexp engine untouched)
regexp = r"(\d{2,4})(\s)?([km]hz)"

for i in example:
    # 3 - perform search: with groups, findall returns one tuple of groups per match
    matches = re.findall(regexp, i, re.IGNORECASE)

    if not matches:
        # nothing matched - report it instead of failing on matches[0]
        print("✘ Error with\t", i)
        continue

    # only the first match of each entry is used
    mg = matches[0]
    # 4 - eliminate white space during writing by skipping the middle group
    print(mg[0] + mg[2])

1666MHz
200mHz
300khz


# Using advanced regular expressions
<div id="content" class="container">
<div class="row"><div class="col-md-9"><h1 class="title"></h1>
<table class="table table-striped table-bordered table-hover table-condensed">


<colgroup>
<col  class="left">

<col  class="left">
</colgroup>
<tbody>
<tr>
<td class="text-left"><code>"DDR[0-9]\b"</code></td>
<td class="text-left">DDR3, DDR4</td>
</tr>

<tr>
<td class="text-left"><code>"i\\. o\\. data"</code></td>
<td class="text-left">i. o. data</td>
</tr>

<tr>
<td class="text-left"><code>"so[-\s]?dimm"</code></td>
<td class="text-left">so-dimm</td>
</tr>

<tr>
<td class="text-left"><code>"class\s?\d+"</code></td>
<td class="text-left">class10, class 6</td>
</tr>

<tr>
<td class="text-left"><code>"\w+-\w+-\w+"</code></td>
<td class="text-left">3-3-3, 66-200-21</td>
</tr>

<tr>
<td class="text-left"><code>"\d+\s?pcs?/pack"</code></td>
<td class="text-left">2pcs/pack, 4pc/pack</td>
</tr>

<tr>
<td class="text-left"><code>"i\d[-\s]\w+"</code></td>
<td class="text-left">i7-21421</td>
</tr>

<tr>
<td class="text-left"><code>"\w{0,2}1333"</code></td>
<td class="text-left">PC1333</td>
</tr>

<tr>
<td class="text-left"><code>"[0-9]{1,3}[MK]\s?Cache"</code></td>
<td class="text-left">2M Cache, 20M Cache, 300MCache</td>
</tr>

<tr>
<td class="text-left"><code>"[0-9]{1,2}x\s\w+"</code></td>
<td class="text-left">2x L3311, 12x L3123, 14x L3423</td>
</tr>

<tr>
<td class="text-left"><code>"[0-9]+[Mm][Hh][Zz]"</code></td>
<td class="text-left">800MHz, 12mhz, 30mhz</td>
</tr>

<tr>
<td class="text-left"><code>"[0-9]{2,4}pins"</code></td>
<td class="text-left">666pins, 77pins, 700pins</td>
</tr>

<tr>
<td class="text-left"><code>"\w+\s?[bB][iI][tT]"</code></td>
<td class="text-left">64BIT, 128bit, 34bit</td>
</tr>

<tr>
<td class="text-left"><code>"\b\d+[GMK]b?/s"</code></td>
<td class="text-left">100M/s, 200MB/s, 12Gb/s</td>
</tr>

<tr>
<td class="text-left"><code>"\d+(\\.\d+)?(\s)?[gmk]bps"</code></td>
<td class="text-left">8.0Gbps, 20 kbps</td>
</tr>

<tr>
<td class="text-left"><code>"(\w{4,20})\s(I{2,4})"</code></td>
<td class="text-left">Pentium II</td>
</tr>

<tr>
<td class="text-left"><code>"E[0-9]-\w{4,6}"</code></td>
<td class="text-left">E2-12421</td>
</tr>

<tr>
<td class="text-left"><code>"\d\\.\dv"</code></td>
<td class="text-left">2.2v</td>
</tr>


<tr>
<td class="text-left"><code>"\w+\sEd(ition|\\.)"</code></td>
<td class="text-left">Black edition</td>
</tr>

<tr>
<td class="text-left"><code>"\d+(\\.\d+)?\s?(\"|\s?inch)"</code></td>
<td class="text-left">2.5 inch</td>
</tr>

<tr>
<td class="text-left"><code>"SATA(-|\s)?II{2,4}"</code></td>
<td class="text-left">sata III</td>
</tr>

</tbody>
</table>
</div><div class="col-md-3"></div></div></div>

***
In addition to the table, this is a further list of common elements
```python
common_regexp = [r"heat\s?shield|hyper|EEC|Reg|Chip(s)?|BGA|CSP|quad",
                 r"mobile|b[op]x|giant|tin|gold|lead|dual|die|tray",
                 r"legacy|sodimm|(s)?TSOP|stack(ed)?|layer|SDRAM|cache",
                 r"CPU|coated|Orig(inal)?|FBGA|for",
                 r"scalable|bridge|north|south|type|core|duo|dual"]
```

In [46]:
########################################
# 📓 1 - Use the above regexp and slot them into the "regexp = [....]" variable below, to extract all meaningful data
#    2 - try it for different description files
########################################
# 1 - load data
#description = read_pickle("datasets/SSD_OfferScraperOutput.pkl")['Description']
#description = read_pickle("datasets/Module_OfferScraperOutput.pkl")['Description']
#description = read_pickle("datasets/FlashCard_OfferScraperOutput.pkl")['Description']
description = read_pickle("datasets/DRAM_OfferScraperOutput.pkl")['Description']

# 2 - regular expression, given as a list
#     Raw strings are essential here: in a normal string "\b" is the backspace character,
#     not the regexp word boundary, and the digits have to be written as "\d+"
regexp = [r"[0-9]+[Mm][Hh][Zz]", r"\b\d+[GMK]b?/s"]

# 3 - regular expressions are joined with an OR and wrapped into a single group
regexp = "(" + "|".join(regexp) + ")"

# 4 - move through the description list
description_extracted = []
for idx, i in enumerate(description):

    mg = re.findall(regexp, i, re.IGNORECASE)

    # 5 - add what was extracted
    if len(mg) != 0:
        description_extracted.append(mg)
        print("Extracted\t\t", mg)
    else:
        # 6 - flag what is still to be extracted
        print("✘ Nothing found with\t", i)

✘ Nothing found with	 GDDR5 256Mx32 1.35v (8Gb)
✘ Nothing found with	 GDDR5 256Mx32-60
✘ Nothing found with	 GDDR5 256Kx32-03
✘ Nothing found with	 GDDR5 256Kx32-28
✘ Nothing found with	 GDDR5 256Mx32-70
✘ Nothing found with	 GDDR5 256Mx32-70
✘ Nothing found with	 GDDR5 256Kx32-24
✘ Nothing found with	 GDDR5 256Kx32-25
✘ Nothing found with	 GDDR5 256Mx32 1.35v (8Gb)
✘ Nothing found with	 GDDR5 256Mx32-80
✘ Nothing found with	 GDDR5 256Mx32
✘ Nothing found with	 GDDR5 256Mx32-80
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32 8.0Gbps
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Nothing found with	 GDDR5 128Mx32
✘ Not

✘ Nothing found with	 DDR3 256Mx16 PC1866
✘ Nothing found with	 DDR3 256Mx16 PC1600
✘ Nothing found with	 DDR3 256Mx16 PC1866
✘ Nothing found with	 DDR3 64Mx16 PC1600
✘ Nothing found with	 DDR3 64Mx16 PC1866
✘ Nothing found with	 DDR3 128Mx16 PC1866 1.5V
✘ Nothing found with	 DDR3 256Mx16 PC2133 Indutrial
✘ Nothing found with	 DDR3 256Mx16 PC2133
✘ Nothing found with	 DDR3 256Mx16 PC2133
✘ Nothing found with	 DDR3 256Mx16 PC2133
✘ Nothing found with	 DDR3 256Mx16 PC2133
✘ Nothing found with	 DDR3 256Mx16 PC2133
✘ Nothing found with	 DDR3 256Mx16 PC2133
✘ Nothing found with	 DDR4 1Gx16 PC2400
✘ Nothing found with	 DDR3 256Mx16 PC2133
✘ Nothing found with	 SDRAM - DDR3 128Mx16 PC2400
✘ Nothing found with	 SDRAM - DDR3 128Mx16 PC2400 2GB
✘ Nothing found with	 SDRAM - DDR3 128Mx16 PC2400 2 G
✘ Nothing found with	 SDRAM - DDR3 64M x 8 PC2400 512Mb
✘ Nothing found with	 DDR3 256Mx16 PC1600
✘ Nothing found with	 DDR3 256Mx16 PC2133
✘ Nothing found with	 DDR3 256Mx16 PC2133
✘ Nothing found wit

✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 64Mx16 PC1333 Halogen Free
✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 64Mx16 PC1333 Halogen Free
✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 64Mx16 PC1333 Halogen Free
✘ Nothing found with	 DDR3 64Mx16 PC1333 Halogen Free
✘ Nothing found with	 DDR3 64Mx16 PC1333 Halogen Free
✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 64Mx16 PC1333 Halogen Free
✘ Nothing found with	 DDR3 64Mx16 PC1333 Halogen Free
✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 64Mx16 PC1333
✘ Nothing found with	 DDR3 32GB PC1066
✘ Nothing found with	 DDR3 32GB PC1066
✘ Nothing found with	 DDR3 1Gx4 PC1066
✘ Nothing found with	 DDR3 1Gx4 PC1066
✘ Nothing found with	 DDR3 256Mx16 PC1066
✘ Nothing foun

✘ Nothing found with	 DDR2 256Mx4 PC667
✘ Nothing found with	 DDR2 256Mx4 PC667 Pb free
✘ Nothing found with	 DDR2 256Mx4 PC667
✘ Nothing found with	 DDR2 256Mx4 PC667
✘ Nothing found with	 DDR2 256Mx4 PC667
✘ Nothing found with	 DDR2 256Mx4 PC667
✘ Nothing found with	 DDR2 256Mx4 PC667
✘ Nothing found with	 DDR2 128Mx16 PC667
✘ Nothing found with	 DDR2 128Mx16 PC667
✘ Nothing found with	 DDR2 128Mx16 PC667
✘ Nothing found with	 DDR2 128Mx16 PC667
✘ Nothing found with	 DDR2 128Mx16 PC667
✘ Nothing found with	 DDR2 128Mx16 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667
✘ Nothing found with	 DDR2 128Mx8 PC667 CL5
✘ Nothing found with	 

✘ Nothing found with	 DDR 64Mx8 PC400 Pb Free
✘ Nothing found with	 DDR 64Mx8 PC400
✘ Nothing found with	 DDR 64Mx8 PC400
✘ Nothing found with	 DDR 64Mx8 PC400
✘ Nothing found with	 DDR1 64Mx8 PC400
✘ Nothing found with	 DDR1 64Mx8 PC400
✘ Nothing found with	 DDR1 64Mx8 PC400
✘ Nothing found with	 DDR1 64Mx8 PC400
✘ Nothing found with	 DDR1 64Mx8 PC400
✘ Nothing found with	 DDR1 64Mx8 PC400
✘ Nothing found with	 DDR 64Mx8 PC400
✘ Nothing found with	 DDR 64Mx8 PC400
✘ Nothing found with	 DDR 64Mx8 PC400 Pb Free
✘ Nothing found with	 DDR 64Mx8 PC400 Industrial
✘ Nothing found with	 DDR 64Mx8 PC400 BGA
✘ Nothing found with	 DDR 64Mx8 PC400 TSOP
✘ Nothing found with	 DDR 64Mx8 PC400 Industrial
✘ Nothing found with	 DDR 64Mx8 PC400 Pb Free
✘ Nothing found with	 DDR 64Mx8 PC400 PB Free
✘ Nothing found with	 DDR 64Mx8 PC400 Pb Free
✘ Nothing found with	 DDR 64Mx8 PC400 PB Free
✘ Nothing found with	 DDR 64Mx8 PC400
✘ Nothing found with	 DDR 64Mx8 PC400
✘ Nothing found with	 DDR 64Mx8 PC400
✘ N

✘ Nothing found with	 DDR 32Mx8 PC333
✘ Nothing found with	 DDR 32Mx8 PC333 CL2.5
✘ Nothing found with	 DDR 32Mx8 PC333
✘ Nothing found with	 DDR 32Mx4 PC333
✘ Nothing found with	 DDR 32Mx4 PC333 BGA
✘ Nothing found with	 DDR 16Mx16 PC333 Lead Free
✘ Nothing found with	 DDR 16Mx16 PC333 Lead Free
✘ Nothing found with	 DDR 16Mx16 PC333 TSOP(II)
✘ Nothing found with	 DDR 16Mx16 PC333
✘ Nothing found with	 DDR 16Mx16 PC333
✘ Nothing found with	 DDR 16Mx16 PC333
✘ Nothing found with	 DDR 16Mx16 PC333
✘ Nothing found with	 DDR 16Mx16 PC333
✘ Nothing found with	 DDR 16Mx16 PC333
✘ Nothing found with	 DDR 16Mx16 PC333 CL2.5 Pb Free
✘ Nothing found with	 DDR 16Mx16 PC333
✘ Nothing found with	 DDR1 16X16 PC400
✘ Nothing found with	 DDR 16Mx16 PC333
✘ Nothing found with	 DDR 16Mx16 PC333 Pb Free
✘ Nothing found with	 DDR 16Mx16 PC333 Pb Free
✘ Nothing found with	 DDR 16Mx16 PC333 Pb Free Industrial temp
✘ Nothing found with	 DDR 16Mx16 PC333 Pb Free Industrial temp
✘ Nothing found with	 DDR 16Mx

Extracted		 ['133MHZ']
✘ Nothing found with	 Mobile SD 16Mx16 PC133 CL3 3.3V
✘ Nothing found with	 Mobile SD 16Mx16 PC133 CL3 3.3V
✘ Nothing found with	 Mobile SD 16Mx16 PC133 CL3 1.8V
✘ Nothing found with	 Mobile SD 16Mx16 PC133 CL3
✘ Nothing found with	 Mobile SD 16Mx16 PC133
Extracted		 ['133Mhz']
Extracted		 ['133Mhz']
Extracted		 ['133Mhz']
Extracted		 ['133Mhz']
Extracted		 ['133Mhz']
Extracted		 ['133Mhz']
Extracted		 ['133Mhz']
Extracted		 ['133Mhz']
Extracted		 ['133Mhz']
Extracted		 ['133MHz']
✘ Nothing found with	 Mobile SDRAM 8Mx32 PC133 FBGA90 ind Temp
✘ Nothing found with	 Mobile SDRAM 8Mx32 PC133 FBGA90 ind Temp
✘ Nothing found with	 Mobile SDRAM 8Mx32 FBGA ind Temp
Extracted		 ['166MHz']
Extracted		 ['166MHz']
✘ Nothing found with	 Mobile SDRAM 8Mx32 PC166 FBGA90 ind Temp
✘ Nothing found with	 Mobile SDRAM 8Mx32 PC166 FBGA90 ind Temp
✘ Nothing found with	 Mobile SDRAM 8Mx32 PC166 FBGA90 ind Temp
Extracted		 ['133MHz']
Extracted		 ['133MHz']
Extracted		 ['133MHz']
Extrac

✘ Nothing found with	 SD 4Mx16-75 PC166 Pb Free
✘ Nothing found with	 SD 4Mx16-75 PC166 Pb Free
✘ Nothing found with	 SD 4Mx16 PC133 VFBGA
✘ Nothing found with	 SD 4Mx16-75 PC166 Pb Free
Extracted		 ['166MHz']
✘ Nothing found with	 SD 16Mx16 PC143 3.3V
✘ Nothing found with	 SD 1Mx16-7 PC166
Extracted		 ['143MHz']
Extracted		 ['143MHz']
Extracted		 ['143MHz']
Extracted		 ['143MHz']
✘ Nothing found with	 SD 4Mx16 PC133
✘ Nothing found with	 SD 16Mx16-7 PC143
Extracted		 ['143MHz']
✘ Nothing found with	 SD 16Mx16-7 PC143
✘ Nothing found with	 SD 32Mx16-7 PC143
Extracted		 ['143MHz']
Extracted		 ['143MHz']
Extracted		 ['143MHz']
Extracted		 ['143MHz']
Extracted		 ['143MHz']
Extracted		 ['143MHz']
Extracted		 ['143MHz']
✘ Nothing found with	 SD 8Mx32-7 PC143 Ind Temp
✘ Nothing found with	 SD 8Mx32-7 PC143
✘ Nothing found with	 SD 8Mx32-7 PC143 Pb Free
✘ Nothing found with	 SD 8Mx32-7 PC143
✘ Nothing found with	 SD 2Mx32 PC143
✘ Nothing found with	 SD 8Mx16 PC133 Pb Free
✘ Nothing found with

✘ Nothing found with	 SD 16Mx16-75 133 MHz PB Free
Extracted		 ['133MHz']
✘ Nothing found with	 SD 16Mx16 PC133
✘ Nothing found with	 SD 16Mx16-75 133 MHz PB Free
✘ Nothing found with	 SD 16Mx16-75 133 MHz PB Free
Extracted		 ['133MHz']
✘ Nothing found with	 SD 16Mx16 PC133
✘ Nothing found with	 SD 16Mx16-75 133 MHz Lead Free
✘ Nothing found with	 SD 16Mx16 PC133 Low Pwr
✘ Nothing found with	 SD 16Mx16-75 PC133
✘ Nothing found with	 SD 16Mx16-75 133 MHz PB Free
✘ Nothing found with	 SD 16Mx16-75 133 MHz PB Free Industrial
Extracted		 ['133MHz']
✘ Nothing found with	 SD 16Mx16-75 133 MHz PB Free
✘ Nothing found with	 SD 16Mx16-7C133 MHz
Extracted		 ['133MHz']
Extracted		 ['133MHz']
✘ Nothing found with	 SD 16Mx16-75 133 MHz PB Free
Extracted		 ['133MHz']
Extracted		 ['133MHz']
✘ Nothing found with	 SD 16Mx16 PC133
✘ Nothing found with	 SD 16Mx16-75 133 MHz PB Free
✘ Nothing found with	 SD 16Mx16-75 133 MHz PB Free
✘ Nothing found with	 SD 16Mx16 PC133
✘ Nothing found with	 SD 16Mx16-75 

✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16MX16 Network DRAM
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16 Network DRAM
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 32Mx16 PC133
✘ Nothing found with	 SD 32Mx16 PC133
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16-75
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx16
✘ Nothing found with	 SD 16Mx8-TI75
✘ Nothing found with	 SD 16Mx8-TL75
✘ Nothing found with	 S

✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 8K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 8K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K SR TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 4K 3.3 SOJ
✘ Nothing found with	 EDO 16Mx4-50 4K 3.3 SOJ
✘ Nothing found with	 EDO 16Mx4-50
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3V 8K SOJ
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 3.3 4K TSOP
✘ Nothing found with	 EDO 16Mx4-50 SOJ
✘ Nothing found with	 EDO 16Mx4-50 3.3V 4K TOSP
✘ Nothing found with	 EDO 16Mx4

✘ Nothing found with	 FPM 4Mx1-70 SOJ LP
✘ Nothing found with	 FPM 4Mx1-70 SOJ
✘ Nothing found with	 FPM 4Mx1-70 ZIP LP
✘ Nothing found with	 FPM 4Mx1-70 SOJ
✘ Nothing found with	 FPM 4Mx1-70 SOJ
✘ Nothing found with	 FPM 4Mx1-6 5V 2K SOJ
✘ Nothing found with	 FPM 4Mx1-60 5V 2K SOJ
✘ Nothing found with	 FPM 4Mx1-60 5V 2K SOJ
✘ Nothing found with	 FPM 4Mx1-60 5V 2K SOJ
✘ Nothing found with	 FPM 4Mx1-60 5V 2K SOJ
✘ Nothing found with	 FPM 4Mx1-60 5V 2K SOJ
✘ Nothing found with	 FPM 4Mx1-60 5V 1K SOJ
✘ Nothing found with	 FPM 4Mx1-60 5V 1K SOJ
✘ Nothing found with	 FPM 4Mx1-60 5V 1K SOJ
✘ Nothing found with	 FPM 4Mx1-60 SOJ
✘ Nothing found with	 FPM 4Mx1-60 ZIP
✘ Nothing found with	 Nibble Mode 4Mx1-60 ZIP
✘ Nothing found with	 FPM 4Mx1-60 5V 1K SOJ
✘ Nothing found with	 FPM 4Mx1-60 ZIP
✘ Nothing found with	 FPM 4Mx1-60 SOJ
✘ Nothing found with	 FPM 4Mx1-60 SOJ 5V
✘ Nothing found with	 FPM 4Mx1-60 5V SOJ
✘ Nothing found with	 FPM 4Mx1-60 SOJ
✘ Nothing found with	 FPM 4Mx1-60 ZIP
✘ Nothing

# Advanced options
For even more complex patterns you would need to check the content around what is being matched.

<table class="table table-striped table-bordered table-hover table-condensed">

<colgroup>
<col  class="left">

<col  class="left">
</colgroup>
<thead>
<tr>
<th scope="col" class="text-left"></th>
<th scope="col" class="text-left"><b>Description</b></th>
<th scope="col" class="text-left"><b>Use case</b></th>
</tr>
</thead>
<tbody>


<tr>
<td class="text-left"><code>"(?P&lt;name&gt;&#x2026;)"</code></td>
<td class="text-left">The substring matched by the group is accessible by name.</td>
<td class="text-left"></td>
</tr>

<tr>
<td class="text-left"><code>"(?P=name)"</code></td>
<td class="text-left">Matches the text matched earlier by the group named name.</td>
<td class="text-left"></td>
</tr>

<tr>
<td class="text-left"><code>"(?=&#x2026;)"</code></td>
<td class="text-left">Matches if &#x2026; matches next, but doesn't consume the string.</td>
<td class="text-left">Use instead of creating <a href="#groups">groups</a> and selecting which ones to keep</td>
</tr>

<tr>
<td class="text-left"><code>"(?&lt;=&#x2026;)"</code></td>
<td class="text-left">Matches if preceded by &#x2026; (must be fixed length).</td>
<td class="text-left">Use instead of creating <a href="#groups">groups</a> and selecting which ones to keep</td>
</tr>



<tr>
<td class="text-left"><code>"(?&lt;!&#x2026;)"</code></td>
<td class="text-left">Matches if not preceded by &#x2026; (must be fixed length).</td>
<td class="text-left">Use to check for things to not match</td>
</tr>

</tbody>
</table>

# Trimming white spaces
Cleaning using regular expressions will often leave stray white space at the start and end of the extracted strings, which needs to be removed using the following script:
1. Match an expression with 3 groups: 1 - leading white space; 2 - main content; 3 - trailing white space
2. Return only the main content

In [None]:
def trim_white_space(string):
    """
    __ Parameters __
    [str] string:   text to trim white space from

    __ Description __
    Trim white space from the start and end of the string.
    White space inside the main content is left untouched.

    __ Return __
    [str] trimmed string
    """

    # 1 - split into 3 groups: leading white space, main content, trailing white space.
    #     The main content is matched lazily (.*?) so it cannot swallow the
    #     trailing white space, and re.DOTALL lets it span line breaks.
    groupMatch = re.match(r"^(\s*)(.*?)(\s*)$", string, re.DOTALL)

    # 2 - return only the main content
    if groupMatch:
        return groupMatch.group(2)
    return string

# Extra datasets to practise filtering on

In [None]:
# Load the Module offers and pull out the fields that need cleaning
data = read_pickle("datasets/Module_OfferScraperOutput.pkl")
brand, qty, price, description = (
    data[field] for field in ('Brand', 'Qty', 'Price', 'Description')
)

In [None]:
# Load the SSD offers and pull out the fields that need cleaning
data = read_pickle("datasets/SSD_OfferScraperOutput.pkl")
brand, qty, price, description = (
    data[field] for field in ('Brand', 'Qty', 'Price', 'Description')
)

In [None]:
# Load the FlashCard offers and pull out the fields that need cleaning
data = read_pickle("datasets/FlashCard_OfferScraperOutput.pkl")
brand, qty, price, description = (
    data[field] for field in ('Brand', 'Qty', 'Price', 'Description')
)