In [5]:
import pandas as pd
import dataprep.clean as clean
import plotly.express as px

def select_relevant_columns(df):
    # Find correlations between numeric columns
    correlations = df.corr().abs().unstack().sort_values(ascending=False)

    # Remove correlations of a column with itself
    correlations = correlations[correlations < 1]
    print(correlations)

    # Get the pair of columns with the highest correlation
    most_correlated_columns = correlations.index[0]
    print(most_correlated_columns)

    # Get the categorical column with the highest number of unique values
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    most_unique_categorical = None
    max_unique_values = -1
    for column in categorical_columns:
        unique_values = df[column].nunique()
        if unique_values > max_unique_values:
            most_unique_categorical = column
            max_unique_values = unique_values

    return most_correlated_columns, most_unique_categorical


In [2]:
def auto_visualize(df, most_correlated_columns, most_unique_categorical):
    x, y = most_correlated_columns

    # Generate a scatter plot for the most correlated numerical columns
    scatter_fig = px.scatter(df, x=x, y=y, color=most_unique_categorical)

    return scatter_fig


In [3]:
url = 'https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv'
df = pd.read_csv(url)

# Clean and preprocess the DataFrame using DataPrep, if necessary


In [6]:
most_correlated_columns, most_unique_categorical = select_relevant_columns(df)
scatter_fig = auto_visualize(df, most_correlated_columns, most_unique_categorical)
scatter_fig.show()


lifeExp    gdpPercap    0.583706
gdpPercap  lifeExp      0.583706
year       lifeExp      0.435611
lifeExp    year         0.435611
year       gdpPercap    0.227318
gdpPercap  year         0.227318
year       pop          0.082308
pop        year         0.082308
           lifeExp      0.064955
lifeExp    pop          0.064955
pop        gdpPercap    0.025600
gdpPercap  pop          0.025600
dtype: float64




