In [35]:
import numpy as np 
import pandas as pd
import glob
import os
from tqdm import tqdm
import re
import plotly.graph_objects as go

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [2]:
def calculate_searches_by_race(df):
    searched_df = df[df['search_conducted'] == True]
    searches_by_race = searched_df.groupby('subject_race').size().reset_index(name='searches_count')
    searches_by_race = searches_by_race.set_index('subject_race')
    return searches_by_race

In [3]:
def calculate_stops_by_race(df):
    stops_by_race = df.groupby('subject_race').size().reset_index(name='stops_count')
    stops_by_race = stops_by_race.set_index('subject_race')
    return stops_by_race

In [4]:
def extract_filename(filenames):
    """
    Extracts the filename without extension from a list of filenames with paths.
    
    Args:
        filenames (list): A list of filenames with paths.
        
    Returns:
        list: A list of filenames without extensions.
    """
    filenames_without_ext = []
    for filename in filenames:
        # Split the filename into the path and the filename with extension
        path, filename_with_ext = os.path.split(filename)
        
        # Split the filename with extension into the filename and the extension
        filename, ext = os.path.splitext(filename_with_ext)
        
        filenames_without_ext.append(filename)
    
    return filenames_without_ext

In [5]:
def reformat_filename(filename):
    # Extract state name until first underscore
    state_match = re.match(r'^([^_]+)', filename)
    state = state_match.group(1)
    
    # Extract city name until underscore followed by number
    city_match = re.match(r'^[a-zA-Z]+_(.+?)_\d{4}_\d{2}_\d{2}', filename)
    city = city_match.group(1)
    
    # Dictionary mapping state abbreviations to full state names
    state_names = {
        "al": "Alabama", "ak": "Alaska", "az": "Arizona", "ar": "Arkansas",
        "ca": "California", "co": "Colorado", "ct": "Connecticut", "de": "Delaware",
        "fl": "Florida", "ga": "Georgia", "hi": "Hawaii", "id": "Idaho",
        "il": "Illinois", "in": "Indiana", "ia": "Iowa", "ks": "Kansas",
        "ky": "Kentucky", "la": "Louisiana", "me": "Maine", "md": "Maryland",
        "ma": "Massachusetts", "mi": "Michigan", "mn": "Minnesota", "ms": "Mississippi",
        "mo": "Missouri", "mt": "Montana", "ne": "Nebraska", "nv": "Nevada",
        "nh": "New Hampshire", "nj": "New Jersey", "nm": "New Mexico", "ny": "New York",
        "nc": "North Carolina", "nd": "North Dakota", "oh": "Ohio", "ok": "Oklahoma",
        "or": "Oregon", "pa": "Pennsylvania", "ri": "Rhode Island", "sc": "South Carolina",
        "sd": "South Dakota", "tn": "Tennessee", "tx": "Texas", "ut": "Utah",
        "vt": "Vermont", "va": "Virginia", "wa": "Washington", "wv": "West Virginia",
        "wi": "Wisconsin", "wy": "Wyoming"
    }
    
    state_name = state_names.get(state.lower(), state)
    city_name = city.replace("_", " ").capitalize()
    
    return f"{city_name}, {state_name}"

In [6]:
path = r'./data_h2/'
all_files = glob.glob(path + "/*.csv")
filenames = extract_filename(all_files)
raw_filenames = filenames.copy()
print(filenames)

for i in range(len(filenames)):
    filenames[i] = reformat_filename(filenames[i])

stops_by_race_list = []
search_conducted_by_race_list = []
all_races = set()

for filename in tqdm(all_files, desc="Processing csv", unit="csv"):
    print(f"Loading {filename}")
    df = pd.read_csv(filename, index_col=None, header=0)
    print(f"Loaded {filename}")
    stops_by_race_list.append(calculate_stops_by_race(df))
    all_races.update(df['subject_race'].unique())
    search_conducted_by_race_list.append(calculate_searches_by_race(df))

['mn_saint_paul_2020_04_01', 'nc_winston-salem_2020_04_01', 'nc_durham_2020_04_01', 'ca_san_diego_2020_04_01', 'nc_greensboro_2020_04_01', 'ct_hartford_2020_04_01', 'tx_austin_2020_04_01', 'vt_burlington_2023_01_26', 'nc_raleigh_2020_04_01', 'ca_oakland_2020_04_01', 'ky_louisville_2023_01_26', 'ca_stockton_2020_04_01', 'ri_statewide_2020_04_01', 'pa_philadelphia_2020_04_01', 'tx_san_antonio_2023_01_26', 'ca_san_francisco_2020_04_01', 'la_new_orleans_2020_04_01', 'tx_plano_2020_04_01']


Processing csv:   0%|          | 0/18 [00:00<?, ?csv/s]

Loading ./data_h2/mn_saint_paul_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:   6%|▌         | 1/18 [00:00<00:15,  1.07csv/s]

Loaded ./data_h2/mn_saint_paul_2020_04_01.csv
Loading ./data_h2/nc_winston-salem_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:  11%|█         | 2/18 [00:02<00:17,  1.10s/csv]

Loaded ./data_h2/nc_winston-salem_2020_04_01.csv
Loading ./data_h2/nc_durham_2020_04_01.csv


Processing csv:  17%|█▋        | 3/18 [00:03<00:15,  1.02s/csv]

Loaded ./data_h2/nc_durham_2020_04_01.csv
Loading ./data_h2/ca_san_diego_2020_04_01.csv


Processing csv:  22%|██▏       | 4/18 [00:03<00:13,  1.05csv/s]

Loaded ./data_h2/ca_san_diego_2020_04_01.csv
Loading ./data_h2/nc_greensboro_2020_04_01.csv


Processing csv:  28%|██▊       | 5/18 [00:05<00:15,  1.16s/csv]

Loaded ./data_h2/nc_greensboro_2020_04_01.csv
Loading ./data_h2/ct_hartford_2020_04_01.csv
Loaded ./data_h2/ct_hartford_2020_04_01.csv
Loading ./data_h2/tx_austin_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:  44%|████▍     | 8/18 [00:06<00:07,  1.42csv/s]

Loaded ./data_h2/tx_austin_2020_04_01.csv
Loading ./data_h2/vt_burlington_2023_01_26.csv
Loaded ./data_h2/vt_burlington_2023_01_26.csv
Loading ./data_h2/nc_raleigh_2020_04_01.csv


Processing csv:  50%|█████     | 9/18 [00:09<00:10,  1.13s/csv]

Loaded ./data_h2/nc_raleigh_2020_04_01.csv
Loading ./data_h2/ca_oakland_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:  56%|█████▌    | 10/18 [00:09<00:07,  1.06csv/s]

Loaded ./data_h2/ca_oakland_2020_04_01.csv
Loading ./data_h2/ky_louisville_2023_01_26.csv


Processing csv:  61%|██████    | 11/18 [00:10<00:05,  1.25csv/s]

Loaded ./data_h2/ky_louisville_2023_01_26.csv
Loading ./data_h2/ca_stockton_2020_04_01.csv
Loaded ./data_h2/ca_stockton_2020_04_01.csv
Loading ./data_h2/ri_statewide_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:  72%|███████▏  | 13/18 [00:11<00:03,  1.36csv/s]

Loaded ./data_h2/ri_statewide_2020_04_01.csv
Loading ./data_h2/pa_philadelphia_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:  78%|███████▊  | 14/18 [00:15<00:05,  1.46s/csv]

Loaded ./data_h2/pa_philadelphia_2020_04_01.csv
Loading ./data_h2/tx_san_antonio_2023_01_26.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:  83%|████████▎ | 15/18 [00:18<00:06,  2.06s/csv]

Loaded ./data_h2/tx_san_antonio_2023_01_26.csv
Loading ./data_h2/ca_san_francisco_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:  89%|████████▉ | 16/18 [00:20<00:04,  2.08s/csv]

Loaded ./data_h2/ca_san_francisco_2020_04_01.csv
Loading ./data_h2/la_new_orleans_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:  94%|█████████▍| 17/18 [00:22<00:01,  1.91s/csv]

Loaded ./data_h2/la_new_orleans_2020_04_01.csv
Loading ./data_h2/tx_plano_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv: 100%|██████████| 18/18 [00:23<00:00,  1.30s/csv]

Loaded ./data_h2/tx_plano_2020_04_01.csv





In [7]:
print(all_races)
for i in range(len(search_conducted_by_race_list)):
    print(filenames[i])
    print(search_conducted_by_race_list[i])

{'hispanic', 'black', 'asian/pacific islander', nan, 'unknown', 'other', 'white'}
Saint paul, Minnesota
                        searches_count
subject_race                          
asian/pacific islander            2989
black                            21701
hispanic                          4197
other                              328
white                            13861
Winston-salem, North Carolina
                        searches_count
subject_race                          
asian/pacific islander              24
black                             4555
hispanic                          1398
other                               16
unknown                             19
white                             3197
Durham, North Carolina
                        searches_count
subject_race                          
asian/pacific islander              76
black                            15382
hispanic                          2625
other                               35
unknown                 

In [8]:
results_race = {race: [] for race in all_races}
results_searches = {race: [] for race in all_races}

for i in range(len(stops_by_race_list)):
    for race in all_races:
        if race in stops_by_race_list[i].index:
            results_race[race].append(stops_by_race_list[i].loc[race, 'stops_count'])
        else:
            results_race[race].append(0)
        
        if race in search_conducted_by_race_list[i].index:
            results_searches[race].append(search_conducted_by_race_list[i].loc[race, 'searches_count'])
        else:
            results_searches[race].append(0)


results_df_race = pd.DataFrame(results_race)
results_df_searches = pd.DataFrame(results_searches)

results_df_race.index = [filename for filename in filenames]
results_df_searches.index = [filename for filename in filenames]

In [9]:
results_df_race.head(n=len(filenames))

Unnamed: 0,hispanic,black,asian/pacific islander,NaN,unknown,other,white
"Saint paul, Minnesota",43062,188761,54904,0,0,5292,265143
"Winston-salem, North Carolina",46554,201226,3300,0,1579,726,199175
"Durham, North Carolina",38968,180851,4935,0,2591,1263,97415
"San diego, California",117083,42705,32541,0,0,27238,162226
"Greensboro, North Carolina",27929,299893,10401,0,5886,2344,253574
"Hartford, Connecticut",5073,7104,176,0,0,29,6057
"Austin, Texas",123943,72324,13167,0,3135,2626,268058
"Burlington, Vermont",223,2935,1453,0,63,472,29994
"Raleigh, North Carolina",87460,383628,15349,0,8089,555,361319
"Oakland, California",26257,78925,8099,0,0,4498,15628


In [10]:
results_df_searches.head(n=len(filenames))

Unnamed: 0,hispanic,black,asian/pacific islander,NaN,unknown,other,white
"Saint paul, Minnesota",4197,21701,2989,0,0,328,13861
"Winston-salem, North Carolina",1398,4555,24,0,19,16,3197
"Durham, North Carolina",2625,15382,76,0,66,35,3522
"San diego, California",6501,3873,910,0,0,451,4510
"Greensboro, North Carolina",1828,20654,258,0,199,96,8901
"Hartford, Connecticut",1389,2037,47,0,0,10,1705
"Austin, Texas",7057,5071,191,0,90,73,6774
"Burlington, Vermont",5,100,9,0,0,5,300
"Raleigh, North Carolina",4568,19395,225,0,146,30,8926
"Oakland, California",6722,30025,1304,0,0,706,2400


In [11]:
# remove the nan column
results_df_race = results_df_race.loc[:, results_df_race.columns.notnull()]
results_df_searches = results_df_searches.loc[:, results_df_searches.columns.notnull()]

In [12]:
# rename the index column as regions
results_df_race.rename_axis('regions', inplace=True)
results_df_searches.rename_axis('regions', inplace=True)

In [13]:
# export to csv
results_df_race.to_csv('h2_race.csv')
results_df_searches.to_csv('h2_searches.csv')

In [14]:
results_df_race

Unnamed: 0_level_0,hispanic,black,asian/pacific islander,unknown,other,white
regions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Saint paul, Minnesota",43062,188761,54904,0,5292,265143
"Winston-salem, North Carolina",46554,201226,3300,1579,726,199175
"Durham, North Carolina",38968,180851,4935,2591,1263,97415
"San diego, California",117083,42705,32541,0,27238,162226
"Greensboro, North Carolina",27929,299893,10401,5886,2344,253574
"Hartford, Connecticut",5073,7104,176,0,29,6057
"Austin, Texas",123943,72324,13167,3135,2626,268058
"Burlington, Vermont",223,2935,1453,63,472,29994
"Raleigh, North Carolina",87460,383628,15349,8089,555,361319
"Oakland, California",26257,78925,8099,0,4498,15628


In [15]:
results_df_searches

Unnamed: 0_level_0,hispanic,black,asian/pacific islander,unknown,other,white
regions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Saint paul, Minnesota",4197,21701,2989,0,328,13861
"Winston-salem, North Carolina",1398,4555,24,19,16,3197
"Durham, North Carolina",2625,15382,76,66,35,3522
"San diego, California",6501,3873,910,0,451,4510
"Greensboro, North Carolina",1828,20654,258,199,96,8901
"Hartford, Connecticut",1389,2037,47,0,10,1705
"Austin, Texas",7057,5071,191,90,73,6774
"Burlington, Vermont",5,100,9,0,5,300
"Raleigh, North Carolina",4568,19395,225,146,30,8926
"Oakland, California",6722,30025,1304,0,706,2400


In [16]:
search_rate_df = results_df_searches.copy()
for col in results_df_race.columns[0:]:
    search_rate_df[col] = results_df_searches[col] / results_df_race[col].replace(0, 1)  # Avoid division by zero

search_rate_df

Unnamed: 0_level_0,hispanic,black,asian/pacific islander,unknown,other,white
regions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Saint paul, Minnesota",0.097464,0.114965,0.05444,0.0,0.06198,0.052277
"Winston-salem, North Carolina",0.03003,0.022636,0.007273,0.012033,0.022039,0.016051
"Durham, North Carolina",0.067363,0.085053,0.0154,0.025473,0.027712,0.036155
"San diego, California",0.055525,0.090692,0.027965,0.0,0.016558,0.027801
"Greensboro, North Carolina",0.065452,0.068871,0.024805,0.033809,0.040956,0.035102
"Hartford, Connecticut",0.273802,0.28674,0.267045,0.0,0.344828,0.281492
"Austin, Texas",0.056937,0.070115,0.014506,0.028708,0.027799,0.025271
"Burlington, Vermont",0.022422,0.034072,0.006194,0.0,0.010593,0.010002
"Raleigh, North Carolina",0.05223,0.050557,0.014659,0.018049,0.054054,0.024704
"Oakland, California",0.256008,0.380424,0.161008,0.0,0.156959,0.153571


In [17]:
search_rate_df.rename_axis('regions', inplace=True)
search_rate_df.to_csv('h2_search_rate.csv')

In [1]:
traces = []
for race in search_rate_df.columns:
    trace = go.Bar(x=search_rate_df.index, y=search_rate_df[race], name=race)
    traces.append(trace)

# Create layout
layout = go.Layout(title='Search Rate by Race and Region',
                   xaxis=dict(title='Region'),
                   yaxis=dict(title='Search Rate'))

# Create figure
fig = go.Figure(data=traces, layout=layout)

# Show plot
fig.show()


NameError: name 'search_rate_df' is not defined

In [25]:
from scipy.stats import levene

# Assuming your DataFrame is named "data"
# Perform Levene's test for each numeric column
for column in search_rate_df.select_dtypes(include='number').columns:
    # Perform Levene's test
    statistic, p_value = levene(*[search_rate_df[column][~search_rate_df[column].isnull()]])  # Exclude NaN values
    
    # Print the results
    print("Column:", column)
    print("Levene's statistic:", statistic)
    print("p-value:", p_value)
    
    if p_value < 0.05:
        print("There is significant variation in column", column)
    else:
        print("There is no significant variation in column", column)
    print()


ValueError: Must enter at least two input sample vectors.

In [22]:
import numpy as np

def compute_difference(region1, region2):
    return np.mean(region1) - np.mean(region2)

# Perform permutation testing for each race
for race in search_rate_df.columns:
    print(f"Race: {race}")
    region_names = search_rate_df.index
    num_permutations = 1000
    observed_difference = compute_difference(search_rate_df[race], search_rate_df[race])

    differences = []
    for _ in range(num_permutations):
        permuted_indices = np.random.permutation(len(region_names))
        permuted_df = search_rate_df.iloc[permuted_indices]
        permuted_difference = compute_difference(permuted_df[race], search_rate_df[race])
        # permuted_df = pretty_print_dataframe(permuted_df)
        # print(permuted_difference)
        differences.append(permuted_difference)

    differences = np.array(differences)
    p_value = np.mean(differences >= observed_difference)
    print(f"Observed Difference: {observed_difference}, p-value: {p_value}")
    

Race: other
Observed Difference: 0.0, p-value: 1.0
Race: unknown
Observed Difference: 0.0, p-value: 1.0
Race: hispanic
Observed Difference: 0.0, p-value: 0.868
Race: black
Observed Difference: 0.0, p-value: 0.693
Race: white
Observed Difference: 0.0, p-value: 0.626
Race: asian/pacific islander
Observed Difference: 0.0, p-value: 0.872


In [21]:
import pandas as pd
import numpy as np
from itertools import permutations

# Function to calculate search rate
def calculate_search_rate(df):
    total_stops = len(df)
    searched_stops = df['search_conducted'].sum()
    search_rate = searched_stops / total_stops
    return search_rate

# Read CSV files in chunks and calculate search rates
regions = all_files.copy()
search_rates = {}

for region in regions:
    search_rates[region] = []
    chunk_size = 10000  # Adjust based on your memory constraints
    for chunk in pd.read_csv(region, chunksize=chunk_size):
        search_rate = calculate_search_rate(chunk)
        search_rates[region].append(search_rate)

# Perform permutation test
observed_means = {region: np.mean(search_rates[region]) for region in regions}

# Generate all possible permutations of the regions
region_permutations = permutations(regions)

# Calculate mean search rates for each permutation
permuted_means = []
for perm in region_permutations:
    permuted_search_rates = [search_rates[region] for region in perm]
    mean_search_rate = np.mean(permuted_search_rates)
    permuted_means.append(mean_search_rate)

# Calculate p-value
observed_mean = np.mean(list(observed_means.values()))
p_value = np.mean(np.array(permuted_means) >= observed_mean)

print("Observed Mean Search Rates:", observed_means)
print("P-value:", p_value)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (9,) + inhomogeneous part.

In [None]:
import pandas as pd

def pretty_print_dataframe(df):
    # Function to highlight the maximum value in each column
    def highlight_max(s):
        is_max = s == s.max()
        return ['background-color: yellow' if v else '' for v in is_max]

    # Style DataFrame for better visualization
    styled_df = df.style \
        .set_caption('Sample DataFrame') \
        .set_table_styles([{'selector': 'caption', 'props': [('color', 'blue'), ('font-size', '18px')]}]) \
        .background_gradient(cmap='coolwarm', subset=['Salary']) \
        .apply(highlight_max, subset=['Age', 'Salary']) \
        .format({'Salary': "${:,.0f}"}) \
        .set_properties(**{'text-align': 'center'})

    # Print the styled DataFrame
    print(styled_df.to_string)

# Sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emily'],
        'Age': [25, 30, 35, 40, 45],
        'Salary': [50000, 60000, 70000, 80000, 90000]}

df = pd.DataFrame(data)

# Print styled DataFrame
pretty_print_dataframe(df)
df

<bound method Styler.to_string of <pandas.io.formats.style.Styler object at 0x7f4db06e5490>>


Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Charlie,35,70000
3,David,40,80000
4,Emily,45,90000


In [32]:
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Step 1: Load CSV into DataFrame
data = pd.read_csv('h2_search_rate.csv')

# Step 2: Perform ANOVA
# Assuming 'Region' is the first column and the rest are races
formula = ' '.join(data.columns[1:]) + ' ~ C(Region)'

print(formula)
# lm = ols(formula, data=data).fit()
# anova_table = anova_lm(lm)

# print(anova_table)


other unknown hispanic black white asian/pacific_islander ~ C(Region)


In [31]:
df_new = search_rate_df.stack().reset_index()
df_new.columns = ['index', 'Subject_Race', 'Search_Rate']
df_new = df_new[['Search_Rate', 'Subject_Race']]

df_new.to_csv('h2_search_rate_new.csv')


print(df_new)


     Search_Rate            Subject_Race
0       0.097464                hispanic
1       0.114965                   black
2       0.054440  asian/pacific islander
3       0.000000                 unknown
4       0.061980                   other
..           ...                     ...
103     0.100234                   black
104     0.016820  asian/pacific islander
105     0.004644                 unknown
106     0.018219                   other
107     0.044718                   white

[108 rows x 2 columns]


In [32]:
# save as csv
df_new.to_csv('h2_search_rate_new.csv')

In [37]:
fig = go.Figure(data=[go.Bar(x=df_new['Subject_Race'], y=df_new['Search_Rate'])])
fig.update_layout(title='DataFrame Values', xaxis_title='Column Heading', yaxis_title='Value')

# Show the plot
fig.show()

In [34]:
formula = 'Search_Rate ~ C(Subject_Race)' # Assuming 'Region' is the first column

lm = ols(formula, data=df_new).fit()
anova_table = anova_lm(lm)

print(anova_table)

                    df    sum_sq   mean_sq        F    PR(>F)
C(Subject_Race)    5.0  0.115881  0.023176  4.48449  0.000981
Residual         102.0  0.527144  0.005168      NaN       NaN


In [43]:
import pandas as pd
import os
import random

# List of CSV files
csv_files = ['file1.csv', 'file2.csv', 'file3.csv']  # Add more files as needed

# Percentage of data to read
percentage_to_read = 0.2  # Change this to the desired percentage

# Columns to keep
columns_to_keep = ['subject_race', 'search_conducted']

# Initialize an empty DataFrame to store the sampled data
sampled_data = pd.DataFrame()

# Initialize a dictionary to keep track of how many data points were read from each file
data_points_read = {file: 0 for file in all_files}

# Loop through each CSV file
for file in tqdm(all_files, desc="Reading CSV files", unit="file"):
    # Read a random sample of data from the CSV file
    df = pd.read_csv(file, usecols=columns_to_keep)
    num_rows = len(df)
    num_rows_to_read = int(percentage_to_read * num_rows)
    random_rows = random.sample(range(num_rows), num_rows_to_read)
    sampled_df = df.iloc[random_rows]
    
    # Increment the count of data points read from this file
    data_points_read[file] = num_rows_to_read
    
    # Append the sampled data to the DataFrame
    sampled_data = pd.concat([sampled_data, sampled_df], ignore_index=True)

# Permute the data
sampled_data = sampled_data.sample(frac=1).reset_index(drop=True)

# Store the sampled data in a new CSV file
sampled_data.to_csv('sampled_data_permuted.csv', index=False)

# Display the first few rows of the permuted sampled data
print(sampled_data.head())

# Display how many data points were read from each file
print("Number of data points read from each file:")
for file, count in data_points_read.items():
    print(f"{file}: {count}")


Columns (22) have mixed types. Specify dtype option on import or set low_memory=False.

Reading CSV files: 100%|██████████| 18/18 [00:08<00:00,  2.23file/s]


             subject_race search_conducted
0                   other            False
1                hispanic            False
2  asian/pacific islander            False
3                   white            False
4                   white            False
Number of data points read from each file:
./data_h2/mn_saint_paul_2020_04_01.csv: 135031
./data_h2/nc_winston-salem_2020_04_01.csv: 90512
./data_h2/nc_durham_2020_04_01.csv: 65204
./data_h2/ca_san_diego_2020_04_01.csv: 76605
./data_h2/nc_greensboro_2020_04_01.csv: 120006
./data_h2/ct_hartford_2020_04_01.csv: 3687
./data_h2/tx_austin_2020_04_01.csv: 96651
./data_h2/vt_burlington_2023_01_26.csv: 7369
./data_h2/nc_raleigh_2020_04_01.csv: 171280
./data_h2/ca_oakland_2020_04_01.csv: 26681
./data_h2/ky_louisville_2023_01_26.csv: 29312
./data_h2/ca_stockton_2020_04_01.csv: 8325
./data_h2/ri_statewide_2020_04_01.csv: 101936
./data_h2/pa_philadelphia_2020_04_01.csv: 373019
./data_h2/tx_san_antonio_2023_01_26.csv: 260220
./data_h2/ca_san_fra

In [1]:
import pandas as pd
import plotly.express as px

# Example DataFrame
data = {
    'category': ['A', 'A', 'B', 'B', 'C', 'C'],
    'value': [1, 2, 3, 4, 5, 6]
}

df = pd.DataFrame(data)

# Create violin plot using Plotly
fig = px.violin(df, y='value', x='category', box=True, points='all', hover_data=df.columns)
fig.update_layout(title='Violin Plot', xaxis_title='Category', yaxis_title='Value')
fig.show()
