In [1]:
import pandas as pd
import dataprep.clean as clean
import plotly.express as px

def select_relevant_columns(df):
    # Find correlations between numeric columns
    correlations = df.corr().abs().unstack().sort_values(ascending=False)

    # Remove correlations of a column with itself
    correlations = correlations[correlations < 1]
    print(correlations)

    # Get the pair of columns with the highest correlation
    most_correlated_columns = correlations.index[0]
    print(most_correlated_columns)

    # Get the categorical column with the highest number of unique values
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    most_unique_categorical = None
    max_unique_values = -1
    for column in categorical_columns:
        unique_values = df[column].nunique()
        if unique_values > max_unique_values:
            most_unique_categorical = column
            max_unique_values = unique_values

    return most_correlated_columns, most_unique_categorical


In [2]:
def auto_visualize(df, most_correlated_columns, most_unique_categorical):
    x, y = most_correlated_columns

    # Generate a scatter plot for the most correlated numerical columns
    scatter_fig = px.scatter(df, x=x, y=y, color=most_unique_categorical)

    return scatter_fig


In [3]:
url = 'https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv'
df = pd.read_csv(url)

# Clean and preprocess the DataFrame using DataPrep, if necessary


In [4]:
most_correlated_columns, most_unique_categorical = select_relevant_columns(df)
scatter_fig = auto_visualize(df, most_correlated_columns, most_unique_categorical)
scatter_fig.show()


  correlations = df.corr().abs().unstack().sort_values(ascending=False)


lifeExp    gdpPercap    0.583706
gdpPercap  lifeExp      0.583706
year       lifeExp      0.435611
lifeExp    year         0.435611
year       gdpPercap    0.227318
gdpPercap  year         0.227318
year       pop          0.082308
pop        year         0.082308
           lifeExp      0.064955
lifeExp    pop          0.064955
pop        gdpPercap    0.025600
gdpPercap  pop          0.025600
dtype: float64
('lifeExp', 'gdpPercap')


In [5]:
import inspect

plotly_vizu_list = [px.scatter, px.line, px.bar, px.histogram, px.box]

plotly_vizu_dict = {}
for vizu_func in plotly_vizu_list:
    plotly_vizu_dict[vizu_func.__name__] = vizu_func


common_params = set.intersection(
    *[set(inspect.signature(func).parameters.keys()) for func in plotly_vizu_list]
)
common_param_names = [p for p in list(common_params)]
common_param_names.sort(
    key=lambda x: list(inspect.signature(plotly_vizu_list[0]).parameters).index(x)
)

specific_params = {}

for vizu_func in plotly_vizu_list:
    func_params = inspect.signature(vizu_func).parameters
    param_names = list(func_params.keys())

    common_params_tmp = (
        common_params.intersection(func_params.keys())
        if common_params
        else set(func_params.keys())
    )

    specific_params[vizu_func] = [p for p in param_names if p not in common_params_tmp]

print("Common parameters:", common_param_names)
for vizu_func, params in specific_params.items():
    print(f"Specific parameters for {vizu_func.__name__}: {params}")


Common parameters: ['data_frame', 'x', 'y', 'color', 'hover_name', 'hover_data', 'facet_row', 'facet_col', 'facet_col_wrap', 'facet_row_spacing', 'facet_col_spacing', 'animation_frame', 'animation_group', 'category_orders', 'labels', 'orientation', 'color_discrete_sequence', 'color_discrete_map', 'log_x', 'log_y', 'range_x', 'range_y', 'title', 'template', 'width', 'height']
Specific parameters for scatter: ['symbol', 'size', 'custom_data', 'text', 'error_x', 'error_x_minus', 'error_y', 'error_y_minus', 'color_continuous_scale', 'range_color', 'color_continuous_midpoint', 'symbol_sequence', 'symbol_map', 'opacity', 'size_max', 'marginal_x', 'marginal_y', 'trendline', 'trendline_options', 'trendline_color_override', 'trendline_scope', 'render_mode']
Specific parameters for line: ['line_group', 'line_dash', 'symbol', 'custom_data', 'text', 'error_x', 'error_x_minus', 'error_y', 'error_y_minus', 'line_dash_sequence', 'line_dash_map', 'symbol_sequence', 'symbol_map', 'markers', 'line_shape

In [6]:
inspect.signature(plotly_vizu_list[0]).parameters

mappingproxy({'data_frame': <Parameter "data_frame=None">,
              'x': <Parameter "x=None">,
              'y': <Parameter "y=None">,
              'color': <Parameter "color=None">,
              'symbol': <Parameter "symbol=None">,
              'size': <Parameter "size=None">,
              'hover_name': <Parameter "hover_name=None">,
              'hover_data': <Parameter "hover_data=None">,
              'custom_data': <Parameter "custom_data=None">,
              'text': <Parameter "text=None">,
              'facet_row': <Parameter "facet_row=None">,
              'facet_col': <Parameter "facet_col=None">,
              'facet_col_wrap': <Parameter "facet_col_wrap=0">,
              'facet_row_spacing': <Parameter "facet_row_spacing=None">,
              'facet_col_spacing': <Parameter "facet_col_spacing=None">,
              'error_x': <Parameter "error_x=None">,
              'error_x_minus': <Parameter "error_x_minus=None">,
              'error_y': <Parameter "error_y

In [13]:
import plotly.express as px
import inspect

sig = inspect.signature(px.scatter)
default_color = sig.parameters['color'].default
sig.parameters['color'].annotation.__values__

AttributeError: type object '_empty' has no attribute '__values__'

In [36]:
import plotly.express as px
import inspect
import re

plotly_vizu_list = [px.scatter, px.line, px.bar, px.histogram, px.box]

param_info = {}

for func in plotly_vizu_list:
    param_info[func.__name__] = {}
    docstring = func.__doc__
    print(docstring)
    # print(func.__name__, func.__doc__)
    if not docstring:
        continue

    first_match = False
    for j, line in enumerate(docstring.split("\n")):
        match = re.search(r"\s*([^\s]+):\s*(.+)", line)
        if match:
            param_name, param_desc = match.groups()
            if param_name not in ["'", "Optional"]:
                first_match = True
                param_info[func.__name__][param_name] = {
                    "description": param_desc.strip(),
                    "type": None,
                    # "options": None,
                }
        elif not match and first_match is True:
            print(j, line)

            # print(param_info)

            # signature = inspect.signature(func)
            # param = signature.parameters.get(param_name)
            # if param:
            #     if param.default is not inspect.Parameter.empty:
            #         param_info[func.__name__][param_name]["default"] = param.default

            #     if param.annotation is not inspect.Parameter.empty:
            #         if hasattr(param.annotation, "__args__"):
            #             if param.annotation.__args__[0] is not inspect.Parameter.empty:
            #                 param_info[func.__name__][param_name]["options"] = param.annotation.__args__[0]
            #         elif param.annotation is not None:
            #             param_info[func.__name__][param_name]["options"] = param.annotation

from pprint import pprint
# pprint(param_info)



    In a scatter plot, each row of `data_frame` is represented by a symbol
    mark in 2D space.
    
Parameters
----------
data_frame: DataFrame or array-like or dict
    This argument needs to be passed for column names (and not keyword
    names) to be used. Array-like and dict are transformed internally to a
    pandas DataFrame. Optional: if missing, a DataFrame gets constructed
    under the hood using the other arguments.
x: str or int or Series or array-like
    Either a name of a column in `data_frame`, or a pandas Series or
    array_like object. Values from this column or array_like are used to
    position marks along the x axis in cartesian coordinates. Either `x` or
    `y` can optionally be a list of column references or array_likes,  in
    which case the data will be treated as if it were 'wide' rather than
    'long'.
y: str or int or Series or array-like
    Either a name of a column in `data_frame`, or a pandas Series or
    array_like object. Values from this colu

In [28]:
import plotly.express as px
import inspect
import re

plotly_vizu_list = [px.scatter, px.line, px.bar, px.histogram, px.box]

param_info = {}

for func in plotly_vizu_list:
    param_info[func.__name__] = {}
    docstring = func.__doc__
    if not docstring:
        continue

    for line in docstring.split("\n"):
        match = re.search(r"\s*([^\s]+):\s*(.+)", line)
        if match:
            param_name, param_desc = match.groups()
            param_info[func.__name__][param_name] = {
                "description": param_desc.strip(),
                "default": None,
                "options": None,
            }

            signature = inspect.signature(func)
            param = signature.parameters.get(param_name)
            if param and param.default is not inspect.Parameter.empty:
                param_info[func.__name__][param_name]["default"] = param.default

                if param.annotation is not inspect.Parameter.empty:
                    if hasattr(param.annotation, "__args__"):
                        if param.annotation.__args__[0] is not inspect.Parameter.empty and param_name not in ["'", "Optional"]:
                            param_info[func.__name__][param_name]["options"] = param.annotation.__args__[0]
                    elif param.annotation is not None and param_name not in ["'", "Optional"]:
                        param_info[func.__name__][param_name]["options"] = param.annotation

from pprint import pprint
pprint(param_info)


{'bar': {"'": {'default': None,
               'description': ".3f' or '|%a' or list-like data to appear in "
                              'the hover tooltip or',
               'options': None},
         'Optional': {'default': None,
                      'description': 'if missing, a DataFrame gets constructed',
                      'options': None},
         'animation_frame': {'default': None,
                             'description': 'str or int or Series or '
                                            'array-like',
                             'options': None},
         'animation_group': {'default': None,
                             'description': 'str or int or Series or '
                                            'array-like',
                             'options': None},
         'barmode': {'default': 'relative',
                     'description': "str (default `'relative'`)",
                     'options': None},
         'base': {'default': None,
               

In [44]:
import plotly.express as px
import inspect
import re

plotly_vizu_list = [px.scatter, px.line, px.bar, px.histogram, px.box]

param_info = {}

# for func in plotly_vizu_list:
    # param_info[func.__name__] = {}
docstring = plotly_vizu_list[0].__doc__
print(docstring)


    In a scatter plot, each row of `data_frame` is represented by a symbol
    mark in 2D space.
    
Parameters
----------
data_frame: DataFrame or array-like or dict
    This argument needs to be passed for column names (and not keyword
    names) to be used. Array-like and dict are transformed internally to a
    pandas DataFrame. Optional: if missing, a DataFrame gets constructed
    under the hood using the other arguments.
x: str or int or Series or array-like
    Either a name of a column in `data_frame`, or a pandas Series or
    array_like object. Values from this column or array_like are used to
    position marks along the x axis in cartesian coordinates. Either `x` or
    `y` can optionally be a list of column references or array_likes,  in
    which case the data will be treated as if it were 'wide' rather than
    'long'.
y: str or int or Series or array-like
    Either a name of a column in `data_frame`, or a pandas Series or
    array_like object. Values from this colu

In [56]:
import plotly.express as px
import inspect
import re

plotly_vizu_list = [px.scatter, px.line, px.bar, px.histogram, px.box]

param_info = {}

# for func in plotly_vizu_list:
    # param_info[func.__name__] = {}
docstring = plotly_vizu_list[0].__doc__
# print(docstring)

# docstring = '''
#     # Put the entire docstring here
# '''


def extract_info_from_docstring(docstring):
    lines = docstring.split("\n")
    # print(lines)
    parameters_section = False
    result = {}

    for line in lines:
        # print(line)
        if line.startswith("Parameters"):
            parameters_section = True
            continue
        if parameters_section:
            # if line.startswith("----------"):
            #     break
            if line.startswith("    ") is False:
                print(line.split(': '))
                line_processed = line.split(': ')
                parameter, type = line_processed[0], line_processed[1]
                
                result[parameter] = {"type" : type} 

            elif line.startswith("    ") is True:
                result[-1] += " " + line.strip()

    return result

parameters_info = extract_info_from_docstring(docstring)
for info in parameters_info:
    print(info)
    print()


['----------']
['data_frame', 'DataFrame or array-like or dict']
['x', 'str or int or Series or array-like']
['y', 'str or int or Series or array-like']
['color', 'str or int or Series or array-like']
['symbol', 'str or int or Series or array-like']
['size', 'str or int or Series or array-like']
['hover_name', 'str or int or Series or array-like']
['hover_data', 'str, or list of str or int, or Series or array-like, or dict']
['custom_data', 'str, or list of str or int, or Series or array-like']
['text', 'str or int or Series or array-like']
['facet_row', 'str or int or Series or array-like']
['facet_col', 'str or int or Series or array-like']
['facet_col_wrap', 'int']
['facet_row_spacing', 'float between 0 and 1']
['facet_col_spacing', 'float between 0 and 1']
['error_x', 'str or int or Series or array-like']
['error_x_minus', 'str or int or Series or array-like']
['error_y', 'str or int or Series or array-like']
['error_y_minus', 'str or int or Series or array-like']
['animation_frame