# MATLAB plots scrape

Run the following two code cells.

The second code cell will create the `1A-plot-types/` folder (you can find it on the left side under `Files`).

* Inside you will find Matlab's `12 plot categories`.

* Inside every one of these you will find one `<plot_name>.md` file per plot, created by scraping the documentation page of every plot under the `12 plot categories`. Additionally, you will find a `plots-code` folder which contains the code of every plot example, one file per example (`<plot_name>_<example_index>.md`).

Note: scraping will take approx. 2 minutes because I set up a random 0-3.5 s waiting time before every request, in order to avoid bombarding MATLAB's domain. You can turn this off by setting `considerate_scraping=False` at the bottom of the second code cell.

In [1]:
import glob
import os
import random
import re
import shutil
import textwrap
import time
import urllib.request

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Printed only after every import above succeeded, as a visible confirmation in the cell output.
print("Packages loaded")

Packages loaded


In [7]:
class ScrapeMATLABPlotExamples():
  """ Scrape the examples of every plot listed on MATLAB's "Types of MATLAB Plots" page.

      Calling an instance runs the whole pipeline:
        1. get_all_plot_links_and_names() - read the overview table (plot groups, plot names, plot links)
        2. scrape_all_plot_pages()        - write one .md file per plot and one code file per example
        3. finalize_plots_code()          - close every generated code file with `end`
        4. create_index_page()            - write `index_page.html` referencing every plot that has code
  """
  URL_BASE = "https://mathworks.com/help/matlab/"         # DO NOT change, this points to the base url location that is used in the file
  URL_plots = "creating_plots/types-of-matlab-plots.html" # DO NOT change, this points where the page with the list of all the plots is located, relative to URL_BASE
  URL_IMAGE_BASE = "https://mathworks.com/help/"          # DO NOT change, OBSOLETE, once POST is properly created. For now this is used to get image links as a reference.

  DISK_SAVE_LOCATION = "/content/1A-plot-types"   # New directories and subdirectories are created in this location, where the results of the model are saved.
  REQUEST_TIMEOUT = 30                            # Seconds to wait for a response, so that a stalled connection cannot hang the scrape forever.

  # Index page skeleton. It is dedented once here, so that the front matter delimiters (`---`) and the
  # <pre> content start at column 0. The `{front_page_index}` marker is filled in with str.replace()
  # rather than an f-string, which keeps the Liquid expression `{{page.description}}` intact.
  _INDEX_PAGE_TEMPLATE = textwrap.dedent("""\
    ---
    name: Plotly Graphing Library for MATLAB®
    permalink: /matlab/
    description: Create interactive charts in your web browser with MATLAB<sup>&reg;</sup> and Plotly.
    layout: langindex
    language: matlab
    ---


    <header class="--welcome">
      <div class="--welcome-body">
        <div class="--title">
          <div class="--body">
            <h1>Plotly Graphing Library for MATLAB<sup>&reg;</sup></h1>
            <p>
              {{page.description}}
            </p>
            <p>
              Head over to the [community forum](https://community.plotly.com/c/api/matlab/22) to ask questions and get help.
            </p>
          </div>
        </div>
      </div>
    </header>

    <div class="section">
      <div class="row auto-eg-padding">
        <div class="row">
          <div class="twelve columns">
            <h6>Plotly brings interactive graphing to your MATLAB<sup>&reg;</sup> console.</h6>
            <div class="z-depth-1">
    <pre><code>
    x = 80 * randn(1, 30);
    y = 80 * randn(size(x));
    r = randi(1500, size(x));
    c = randi(10, size(x));

    fig = figure;

    scatter(x, y, r, c, 'filled', 'MarkerEdgeColor', 'k')

    %--PLOTLY--%

    response = fig2plotly(fig, 'filename', 'matlab-bubble-chart',  'strip', false);
    plotly_url = response.url;

    </code></pre>
            </div>
          </div>
        </div>
        <div class="row">
          <div class="twelve columns">
            <iframe
              scrolling="no"
              margin="none"
              padding="none"
              seamless=seamless
              frameBorder="0"
              style="border:0"
              src="https://plotly.com/~PlotBot/113.embed"
              height="500"
              width="100%">
            </iframe>
          </div>
        </div>

        <div class="row">

          <h2>Types of MATLAB® Plots</h2>
          {front_page_index}
        </div>
      </div>
    </div>

    <p style="margin-top:200px">
      <em>MATLAB<sup>&reg;</sup> is a registered trademark of The MathWorks, Inc.</em>
    </p>
    """)

  def __init__(self, considerate_scraping, create_folers=True):
    """ param:: (bool) considerate_scraping: True - random waiting time from 0 to 3.5s will be added for every request to prevent bombarding the domain with requests
                                             False - no waiting between requests
                (bool) create_folers - OBSOLETE - used during development to reduce time creating folders, once they had already been created.
                                       True wipes and re-creates DISK_SAVE_LOCATION, False re-uses the existing folders.
    """
    self.considerate_scraping = considerate_scraping
    self.create_folers = create_folers

  def __call__(self):
    """ Run the whole pipeline: collect the plot links, scrape every plot page, finalize the code files, build the index page. """
    if self.considerate_scraping:
      print("Considerate scraping mode is ON. Scraping will take approximately 2-5 minutes.\n")
    self.get_all_plot_links_and_names()
    self.scrape_all_plot_pages()
    self.finalize_plots_code()
    self.create_index_page()

  def considerate_scraping_sleep_time(self):
    """ Sleep for a random time from 0-3.5s, in order to be nice and prevent bombarding the domain with requests. """
    sleep_time = round(random.random()*3.5, 2)
    time.sleep(sleep_time)

  @staticmethod
  def _normalize_name(text):
    """ Turn the text of a table cell into a lower-case, underscore-joined name - eg. 'Line Plots' -> 'line_plots'. """
    return '_'.join(text.replace("\n", "").lower().split())

  def _resolve_plot_href(self, href):
    """ Convert a relative '../<path>' link of the overview table into an absolute url.

        param::  (str) href - value of the `href` attribute of a link in the overview table
        return:: (str) URL_BASE + the path that follows the first '../', or "" when the href contains no '../'
    """
    parts = href.split("../")
    if len(parts) < 2:
      return ""
    return self.URL_BASE + parts[1]

  def _extract_plot_link(self, cell):
    """ Return the absolute url of the first link inside a table cell, or "" when the cell holds no usable link. """
    anchor = cell.find("a")
    if anchor is None:
      return ""
    return self._resolve_plot_href(anchor.get("href", ""))

  def _parse_plots_table(self, table):
    """ Read the overview table into a nested dictionary {plot_group_name: {plot_name: plot_url}}.

        Rows that contain <th> cells supply the plot group names (one per column), all other rows
        supply the plots; the n-th cell of a row belongs to the n-th plot group. Empty cells are dropped.
    """
    columns, rows = [], []
    for row in table.find_all("tr"):
      headers = row.find_all("th")
      if headers:                                   # Header row - names of the plot groups
        columns.extend(self._normalize_name(header.text) for header in headers)
      else:                                         # Data row - (name, link) for every cell
        rows.append([(self._normalize_name(cell.text), self._extract_plot_link(cell))
                     for cell in row.find_all("td")])

    overview = {column: {} for column in columns}   # Every group gets an entry, even when it ends up empty
    for cells in rows:
      for column, (name, link) in zip(columns, cells):
        if not name:                                # Empty table cell
          continue
        if not link:
          print(f"-----WARNING----- can't find a link for plot '{name}' in group '{column}'.")
          print("Code will continue without including this plot.")
          continue
        overview[column][name] = link
    return overview

  def _create_output_folders(self):
    """ Wipe DISK_SAVE_LOCATION and re-create one folder (with a `plots-code` subfolder) per plot group.

        Done with os/shutil instead of shell commands, because the group names come from the scraped
        page and must never be interpreted by a shell.
    """
    shutil.rmtree(self.DISK_SAVE_LOCATION, ignore_errors=True)
    os.makedirs(self.DISK_SAVE_LOCATION, exist_ok=True)
    for plot_group in self.plots_overview:
      os.makedirs(os.path.join(self.DISK_SAVE_LOCATION, plot_group, "plots-code"), exist_ok=True)

  def get_all_plot_links_and_names(self):
    """ Scrape links + plot group names + plot names. This data will be used to scrape all examples inside these links.
        Create appropriate folders where the scraped documentation will be stored.

        Sets `self.plots_overview` ({plot_group_name: {plot_name: plot_url}}) and
        `self.concat_list_of_all_plots` (flat list of all plot names, duplicates included).

        raises:: requests.HTTPError - the overview page answered with an error status
                 RuntimeError       - the overview page contains no table
    """
    response = requests.get(self.URL_BASE + self.URL_plots, timeout=self.REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml")
    table = soup.find("table")
    if table is None:
      raise RuntimeError(f"No table with plot types found on {self.URL_BASE + self.URL_plots}")

    self.plots_overview = self._parse_plots_table(table)

    # Create corresponding folders
    if self.create_folers:
      self._create_output_folders()

    # Create concat (de-nested) list out of all plots. This is used to find duplicates and to track progress of the scraping.
    self.concat_list_of_all_plots = [plot_name
                                     for plots in self.plots_overview.values()
                                     for plot_name in plots]

    seen, plots_that_appear_more_than_once = set(), set()
    for plot_name in self.concat_list_of_all_plots:
      if plot_name in seen:
        plots_that_appear_more_than_once.add(plot_name)
      seen.add(plot_name)
    if plots_that_appear_more_than_once:            # Warn only when there really is something to warn about
      print(f"----WARNING --- These plots appear more than once on the webpage: {sorted(plots_that_appear_more_than_once)}. Code will proceed to download them multiple times.\n")
    time.sleep(2)     # Leave this here so that everything catches up, so that progress bar is initialized properly

  def scrape_all_plot_pages(self):
    """ Loop through all the plot pages, and 'scrape_all_examples_on_a_page()' scrapes their content.
        pbar - this creates progress bar of the loop in the command line; the `with` block closes it when the loop ends """
    with tqdm(total=len(self.concat_list_of_all_plots), position=0, leave=True) as pbar:
      for plot_group_name, plots in self.plots_overview.items():    # Loop through main plot categories
        for plot_name, plot_url in plots.items():                   # Loop through all plots in the category
          self.scrape_all_examples_on_a_page(plot_name, plot_group_name, plot_url)
          pbar.update(1)

  def scrape_all_examples_on_a_page(self, plot_name, plot_group_name, plot_url):
    """ Scrape content of the page, and call save_page_content_as_markdown() to save the content into the correct format

        A page that cannot be downloaded, or that has no examples section, is reported and skipped,
        so that a single bad page does not abort the whole scrape.

        param:: (str) plot_name       - name of the plot - eg. area
                (str) plot_group_name - name of the group the plot belongs to - eg. 'line_plots'"
                (str) plot_url        - url directing to the plot's location
    """
    if self.considerate_scraping:
      self.considerate_scraping_sleep_time()

    try:
      response = requests.get(plot_url, timeout=self.REQUEST_TIMEOUT)
    except requests.RequestException as error:
      print(f"-----WARNING----- request for {plot_url} failed ({error}). Code will continue without this plot.")
      return
    if not response.ok:
      print(f"-----WARNING----- {plot_url} answered with status {response.status_code}. Code will continue without this plot.")
      return

    soup = BeautifulSoup(response.content, "lxml")
    examples_section = soup.find("div", attrs={"id": "expandableExamples"})
    if examples_section is None:
      print(f"-----WARNING----- no examples found on {plot_url}. Code will continue without this plot.")
      return
    examples = examples_section.find_all("div", attrs={"class": "panel-group"})  # get all plot examples on the page

    with open(f"{self.DISK_SAVE_LOCATION}/{plot_group_name}/{plot_name}.md", "w", encoding="utf-8") as writer: # Create a file at the given location to write into

      self.create_plot_file_meta_data(writer = writer,
                                      layout = "post",
                                      title = "MATLAB plot",
                                      description = "Learn how to make line charts in MATLAB, then publish them to the Web with Plotly.",
                                      permalink = f"/matlab/{plot_group_name}/{plot_name}/",
                                      layout_2 = "matlab", # TODO: WHY are there 2 layouts
                                      function = "plot",
                                      reference = plot_url,
                                      github = f"{plot_group_name}/{plot_name}.md")

      for idx_example_suffix, example in enumerate(examples):                          # Loop through all plot examples on the page
        panel_title = example.find("h3", attrs={"class": "panel-title"})
        example_content = example.find("div", attrs={"class": "panel-body"})
        if panel_title is None or example_content is None:                             # Not an example panel
          continue

        # Some examples wrap their content in a "procedure" div; in that case the wrapped content is scraped.
        # (This used to be tested with `temp[0]`, which always raised, so the wrapper was never unpacked.)
        procedure = example_content.find("div", attrs={"class": "procedure"})
        if procedure is not None:
          example_content = procedure

        self.save_page_content_as_markdown(writer, plot_name, plot_group_name, panel_title.text, example_content, idx_example_suffix)  # scrape and save plot example

  def create_plot_file_meta_data(self, writer, layout, title, description, permalink, layout_2, function, reference, github):
    """ Create metadata (front matter) at the top of every plot file

        Every line is written without leading whitespace: front matter is only recognised when its
        `---` delimiters start at the beginning of a line, and indented text that follows it would be
        rendered as a code block.

        param:: (writer) writer - active writer of the plot file
                (str) layout, title, description, permalink, function, reference, github - values of the keys of the same name
                (str) layout_2  - written as a second `layout` key (TODO at the call site: why are there 2 layouts)
    """
    front_matter = ["---",
                    f"layout: {layout}",
                    f"title:  {title}",
                    f"description: {description}",
                    f"permalink: {permalink}",
                    f"layout: {layout_2}",
                    f"function: {function}",
                    f"reference: {reference}",
                    f"github: {github}",
                    "---"]
    writer.write("\n".join(front_matter) + "\n\n")

  def save_code_as_markdown(self, plot_name, plot_group_name, idx_example_suffix, code_index, code_text):
    """ Saves the code into a separate file

        The first snippet of an example (code_index == 0) starts the file with the function header and
        truncates whatever an earlier run left behind; following snippets are appended.

        param:: (str) plot_name           - name of the plot - eg. area
                (str) plot_group_name     - name of the group the plot belongs to - eg. 'line_plots'"
                (int) idx_example_suffix  - incrementing numerical suffix, it is unique for all examples for every plot
                (int) code_index          - incrementing numerical suffix, for every code in the example
                (str) code_text           - code
    """
    is_first_snippet = code_index == 0
    code = f"function {plot_name}_{idx_example_suffix}()\n" if is_first_snippet else ""

    code += code_text + ";\n"

    if plot_name in code_text: #! TODO: this won't trigger if the figure calling is called something else, like p(1), p(2)
      is_3d = "true" if "3" in plot_name else "false"
      code += f"thumbnail_generator(gcf, '{plot_group_name}', '{plot_name}', {is_3d});\n\n"

    code = code.replace(";;", ";")                  # The appended ';' doubles up when the code already ended with one
    mode = "w" if is_first_snippet else "a"
    with open(f"{self.DISK_SAVE_LOCATION}/{plot_group_name}/plots-code/{plot_name}_{idx_example_suffix}.md", mode, encoding="utf-8") as code_writer:
      code_writer.write(code)

  def save_page_content_as_markdown(self, writer, plot_name, plot_group_name, pannel_title, content, idx_example_suffix):
    """ Saves the example into the desired format, with unique file name, and correct references to the code markdown files

        param: (writer) writer         - active writer that writes into a single file for all examples of the given plot
               (str) plot_name         - name of the plot - eg. area
               (str) plot_group_name   - name of the group the plot belongs to - eg. 'line_plots'"
               (str) pannel_title      - title of the example plot pannel
               (BeautifulSoup) content - content of the example
               (int) idx_example_suffix- incrementing numerical suffix, for every example section
    """
    self.write_line(writer, "header", pannel_title)
    code_index = 0
    for node in content:                        # Loop through every direct child of the (BeautifulSoup) object
      if isinstance(node, str):                 # Bare text between the tags (bs4 text nodes are str subclasses) has no
        continue                                # attributes to inspect; skip it instead of stopping the whole example

      self.write_line(writer, "new_line")
      classes = node.attrs.get("class")

      if classes is not None:                   # If "class" is in the tag, else it will be "p" which stands for text
        if "shortdesc" in classes:              # Row is text
          self.write_line(writer, "text", self.reformat_text(node))

        elif "code_responsive" in classes:      # Row is code
          code_input = node.find("div", attrs={"class": "codeinput"})
          if code_input is not None:            # Code typed by the user - also saved into the code file
            example_code = code_input.text
            self.save_code_as_markdown(plot_name,
                                       plot_group_name,
                                       idx_example_suffix,
                                       code_index,
                                       self.format_code(example_code, False))
            code_index += 1
          else:                                 # Output printed by MATLAB
            example_code = node.find("div", attrs={"class": "codeoutput"})
            if example_code is None:            # Neither input nor output - nothing to write
              continue

          self.write_line(writer, "code", self.format_code(example_code))

        elif "informalfigure" in classes:       # Row is image
          image = node.find("img")
          src_parts = image.get("src", "").split("../../") if image is not None else []
          if len(src_parts) < 2:                # No image, or a source that is not relative to the help root
            continue
          example_image_source_url = self.URL_IMAGE_BASE + src_parts[1]
          self.write_line(writer, "matlab_image", example_image_source_url, tag_alt=image.get("alt", ""))
          # TODO: POST image - write an "image" line that references the code file of this example

      elif node.name == "p":                    # Row is text
        self.write_line(writer, "text", self.reformat_text(node))

  def reformat_text(self, expression):
    """ This is used in places where is text. It replaces all the html formatting with .md style formatting, and then returns pure text that will be written into the file.

        param:: (BeautifulSoup) expression - expression that requires formatting
    """
    replacements = (('<code class="literal">', "`"),
                    ('</code>', "`"),
                    ('<span class="emphasis"><em>', "*"),
                    ('</em></span>', "*"),
                    ('<span class="inlineequation"><span><span class="MathEquation" role="math" style="font-size: 15px;">', "`"),
                    ('</span></span></span></span></span>', "`"))
    markup = str(expression)
    for html_fragment, markdown_fragment in replacements:
      markup = markup.replace(html_fragment, markdown_fragment)
    return BeautifulSoup(markup, "html.parser").text  # Strip every tag that is left

  def format_code(self, code, save_with_formatting=True):
    """ Re-format code with .md style formatting      !# TODO: add '\\n\\nfig2plotly()' at the end, however, only into the last code cell of the example.

        param:: (BeautifulSoup text) code  - code that requires formatting
                (bool) save_with_formatting: True  - returns the code wrapped in a <pre class="mcode"> tag
                                             False - returns just the string
    """
    if save_with_formatting:
      return f'<pre class="mcode">{str(code)}</pre>'
    else:
      return str(code)

  def write_line(self, writer, line_type, expression=None, tag_alt=None, code_file_name=None):
    """ Write expression into the file. Unknown line types write nothing.

        param: (writer) writer            - active writer that writes into a single file for all examples of the given plot
               (str) line_type: header    - write a header of the example
                                new_line  - create new line
                                text      - write formatted text of the example
                                code      - create code box
                                matlab_image - OBSOLETE - remove after POST is fixed, temporary image plot matlab reference
                                image     - TODO: create POST
               (str) expression           - expression to be written
               (str) tag_alt              - OBSOLETE - remove after POST is fixed, temporary image plot matlab reference
               (str) code_file_name       - name of the corresponding code that will be used in POST to inplace the image
    """
    if line_type == "header":
      writer.write(f"## {expression}")
    elif line_type == "new_line":
      writer.write("\n\n")
    elif line_type == "text":
      writer.write(f"> {expression}")
    elif line_type == "code":
      writer.write(expression)
    elif line_type == "matlab_image":
      writer.write(f"![{tag_alt}]({expression})")
    elif line_type == "image":
      writer.write("{% include posts/mframe.html src='https://plotly.com/' %}")

  def finalize_plots_code(self):
    """ There is a need to finalize all the plot files by writing `end` at the end of every .m function file """
    for plot_category in sorted(glob.glob(self.DISK_SAVE_LOCATION + '/*')):
      for plot_file in sorted(glob.glob(plot_category + '/plots-code/*')):
        with open(plot_file, "a", encoding="utf-8") as code_writer:
          code_writer.write("end")

  def create_index_page(self):
    """ Create the content of the index page with references to all plot figures

        Every folder inside DISK_SAVE_LOCATION becomes one section, and every plot that has at least one
        file in its `plots-code` folder becomes one thumbnail. The page is written to `index_page.html`
        in the current working directory.
    """
    front_page_index = ""

    for folder in sorted(glob.glob(self.DISK_SAVE_LOCATION + '/*')):  # Sorted, so that the page is the same on every run
      if not os.path.isdir(folder):
        continue
      plot_category = os.path.basename(folder)
      # Code files are named <plot_name>_<example index>.md; only the last '_' separates the index,
      # because the plot name itself may contain underscores.
      list_of_all_plots = sorted({os.path.basename(plot_file).rsplit("_", 1)[0]
                                  for plot_file in glob.glob(folder + '/plots-code/*')})

      category_title = " ".join(plot_category.split("_")).title()    # eg. 'line_plots' -> 'Line Plots'
      front_page_index += f"\n<h2>{category_title}</h2>\n\n"

      for plot in list_of_all_plots:
        front_page_index += "{% include posts/thumb.html type='" + plot_category + "' function='" + plot + "' %}\t\n"

    # Add section with additional examples
    front_page_index += "\n\n<h2>More Examples</h2>\n"
    front_page_index += '{% assign languagelist = site.posts | where:"page_type","example_index" | where:"language","matlab"  | sort: "order" %}\n'
    front_page_index += "{% include posts/documentation_eg.html %}\n"

    index_page = self._INDEX_PAGE_TEMPLATE.replace("{front_page_index}", front_page_index)
    with open("index_page.html", "w", encoding="utf-8") as code_writer:
      code_writer.write(index_page)

# Run the whole pipeline with the random 0-3.5s pause before every plot page request:
# collect the plot links, scrape every plot page, finalize the code files and build the index page.
model = ScrapeMATLABPlotExamples(considerate_scraping=True)
model()
# IPython shell escape: pack the `1A-plot-types` folder (relative path) into `1A-plot-types.zip`.
!zip -r -q 1A-plot-types.zip 1A-plot-types

Considerate scraping mode is ON. Scraping will take approximately 2-5 minutes.




100%|██████████| 80/80 [02:57<00:00,  1.93s/it]

In [None]:
strings = []  # TEST
plot_categories_folders = glob.glob("/content/1A-plot-types" + '/*')
front_page_index = ""
first_plot = ""

!rm -rf /content/first-plot
!mkdir /content/first-plot

for folder in plot_categories_folders:
  plot_code = glob.glob(folder + '/plots-code/*')
  
  list_of_all_plots_code = [plot.split("/")[-1].split("_")[0] for plot in plot_code]
  list_of_all_plots = list(set(list_of_all_plots_code))
  print(list_of_all_plots)

  category_title = " ".join(folder.split("_")).title()
  front_page_index += f"\n<h2>{category_title}</h4>\n\n"
  plot_category= folder.split("/")[-1]
  !mkdir /content/first-plot/$plot_category
  
  for plot in list_of_all_plots:
    print(plot)
    first_plot = ""

    
    front_page_index += "{% include posts/thumb.html type='" + plot_category + "' function='" + plot + "' %}	\n"

    first_plot += f"function {plot}_plot()\n"
    idx = 0
    while True:
      file_path = f"{folder}/plots-code/{plot}_{idx}.md"
      with open(file_path, mode="r") as f:
        code = f.read()
        first_plot += code + ";\n"

        if plot in code:
          is_3d = "true" if "3" in plot else "false"
          first_plot += f"thumbnail_generator(gcf, '{plot_category}', '{plot}', {is_3d});\n\n"
          first_plot += "end"
          break
        idx += 1
    with open(f"/content/first-plot/{plot_category}/{plot}_plot.m", "w") as code_writer:
      first_plot = first_plot.replace(";;", ";")
      code_writer.write(first_plot)

  strings.append(first_plot) # TEST

# Add section with additional examples
front_page_index += "\n\n<h2>More Examples</h2>\n"
front_page_index += '{% assign languagelist = site.posts | where:"page_type","example_index" | where:"language","matlab"  | sort: "order" %}\n'
front_page_index += "{% include posts/documentation_eg.html %}\n"
			

			
!zip -r -q first-plot.zip first-plot