In [1]:
import glob
import json
import os

import numpy as np
import pandas as pd

In [2]:
#Step-1: Get filenames from the directory for setting up the dropdown.

def city_name_from_path(path):
    """Return the city prefix encoded in a dataset file path.

    The directory part is removed with ``os.path.basename`` (which follows the
    running platform's path rules) instead of splitting on a hard-coded
    backslash, so the extraction works with whatever separator ``glob``
    returns. The city is the part of the file name before the first
    underscore; a name without an underscore is returned unchanged.
    """
    return os.path.basename(path).split('_')[0]


# Sorted so the order of cities does not depend on directory listing order.
FullFilenames = sorted(glob.glob("Dataset/*.csv"))
OnlyFilenames = [city_name_from_path(path) for path in FullFilenames]

In [5]:
#Step-2 define a processor for a file.
# We will be removing those trees from dataset whose count is less than or equal to 1.
def processFile(city, columns=None, data_dir="Dataset"):
    """Summarise the tree species recorded in one city's CSV file.

    The first file matching ``<data_dir>/<city>*.csv`` is read, rows with a
    null in any selected column are dropped, and the remaining rows are
    counted per ``scientific_name``. Species that occur only once are left
    out of the result.

    Parameters
    ----------
    city : str
        File-name prefix identifying the city's CSV file.
    columns : list of str, optional
        Columns to load. Defaults to the notebook-level ``IntrestingColumns``.
        Must include ``scientific_name``, ``common_name`` and ``state``.
    data_dir : str, optional
        Directory that holds the CSV files (default ``"Dataset"``).

    Returns
    -------
    list of dict or None
        One dict per species with keys ``scientific_name``, ``common_name``
        (list of distinct common names in order of first appearance),
        ``count`` and ``state`` (taken from the first remaining row), ordered
        as ``value_counts`` returns them. ``None`` when no row survives the
        null filtering.

    Raises
    ------
    FileNotFoundError
        If no CSV file matches the city prefix.
    """
    if columns is None:
        # Looked up at call time so the notebook-level setting is honoured.
        columns = IntrestingColumns

    pattern = data_dir + "/" + city + "*.csv"
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError("No CSV file matches " + repr(pattern))

    # Use the nrows parameter of read_csv to limit the data if it takes too long to process.
    data = pd.read_csv(matches[0], usecols=columns).dropna(how='any')
    if data.empty:
        return None

    state = data['state'].iloc[0]

    # Collect the distinct common names of every species in a single pass
    # instead of re-filtering the whole frame once per species.
    common_names_by_species = {
        name: group.unique().tolist()
        for name, group in data.groupby('scientific_name', sort=False)['common_name']
    }

    species_counts = data['scientific_name'].value_counts()
    return [
        {
            'scientific_name': scientific_name,
            'common_name': common_names_by_species[scientific_name],
            # Plain int keeps the record serialisable with the json module.
            'count': int(count),
            'state': state,
        }
        for scientific_name, count in species_counts.items()
        if count > 1
    ]

In [6]:
# Step-3: Clean the dataset of the given cities and save it as unified JSON data
# with the tree counts grouped by city. Then create the dropdown of cities for
# which we have data.

# For the sake of data integrity, rows which have null values are dropped.
# Set the city names you wish to get data for, or use ['All'] for every city.
City = ['All']

# Names of the columns to extract. Interesting columns are the ones for which
# the data will be visualised. If no row of a file has values in all of the
# interesting columns, that file is dropped.
IntrestingColumns = ['common_name','scientific_name','city','state']

process_all = len(City) == 1 and City[0] == 'All'
requested_cities = OnlyFilenames if process_all else City

tree_data_by_city = {}
# Iterate over a snapshot: OnlyFilenames is pruned inside the loop, and removing
# items from a list while iterating over it skips the element that follows
# each removed one.
for city in list(requested_cities):
    processedData = processFile(city)
    if processedData is not None:
        tree_data_by_city[city] = processedData
    elif city in OnlyFilenames:
        # Guarded so a requested name that is not in the list cannot raise ValueError.
        OnlyFilenames.remove(city)

tree_data_json = json.dumps(tree_data_by_city)

with open("tree_data.json", "w") as f:
    f.write(tree_data_json)

# Step-4: Create the JSON for the city dropdown. Only cities that actually have
# an entry in tree_data.json are listed, so the dropdown never offers a city
# without data.
cities = json.dumps({"cities": list(tree_data_by_city)})
with open("city_dropdown.json", "w") as f:
    f.write(cities)