In [1]:
import pandas as pd
import numpy as np
import glob
import json
import csv 

In [2]:
#Get filenames from the directory for setting up the dropdown.

def cityFromPath(path):
    """Return the city prefix of a dataset CSV path.

    The city is the part of the file name that precedes the first underscore.
    Forward slashes and backslashes are both treated as directory separators,
    so the result does not depend on the platform that produced the path, and
    underscores in directory names are ignored.
    """
    basename = path.replace('\\', '/').rsplit('/', 1)[-1]
    return basename.split('_', 1)[0]


# glob returns matches in arbitrary, platform-dependent order; sort so the
# dropdown is the same on every run.
FullFilenames = sorted(glob.glob("Dataset/*.csv"))
OnlyFilenames = [cityFromPath(path) for path in FullFilenames]
print(OnlyFilenames)

['Albuquerque', 'Anaheim', 'Arlington', 'Atlanta', 'Aurora', 'Austin', 'Baltimore', 'Boston', 'Buffalo', 'CapeCoral', 'ColoradoSprings', 'Columbus', 'Dallas', 'Denver', 'DesMoines', 'Detroit', 'Durham', 'Fresno', 'GardenGrove', 'GrandRapids', 'Greensboro', 'Honolulu', 'Houston', 'HuntingtonBeach', 'Indianapolis', 'Irvine', 'Jerseycity', 'Knoxville', 'LasVegas', 'LosAngeles', 'Louisville', 'Madison', 'Miami', 'Milwaukee', 'Minneapolis', 'Nashville', 'NewOrleans', 'NewYork', 'Oakland', 'OklahomaCity', 'Ontario', 'Orlando', 'OverlandPark', 'Phoenix', 'Pittsburgh', 'Plano', 'Portland', 'Providence', 'RanchoCucamonga', 'Richmond', 'Rochester', 'Sacramento', 'SanDiego', 'SanFrancisco', 'SanJose', 'SantaRosa', 'Seattle', 'SiouxFalls', 'StLouis', 'Stockton', 'Tampa', 'WashingtonDC', 'Worcester']


In [3]:
# Columns to extract from every city CSV. These "interesting" columns are the
# ones the visualizations are built from. If every row of any interesting column
# is null in a file, that data file is dropped.
IntrestingColumns = [
    'scientific_name',
    'common_name',
    'city',
    'state',
]
print(IntrestingColumns)

['scientific_name', 'common_name', 'city', 'state']


In [13]:
#Define a processor for the bar graphs (section 1).
# Species whose count does not exceed the threshold are left out so the visualization stays readable.
def processFile(city, min_count=50, columns=None, data_dir="Dataset"):
    """Summarise one city's tree inventory as a list of per-species records.

    Parameters
    ----------
    city : str
        File-name prefix of the city; the first file (in sorted order) matching
        ``<data_dir>/<city>*.csv`` is read.
    min_count : int, default 50
        Only species that occur strictly more than this many times are returned.
    columns : list of str, optional
        Columns to load. Defaults to the notebook-level ``IntrestingColumns``.
        Must include 'scientific_name', 'common_name', 'city' and 'state'.
    data_dir : str, default "Dataset"
        Directory that holds the city CSV files.

    Returns
    -------
    list of dict or None
        One dict per retained species with keys 'scientific_name',
        'common_name' (list of the distinct common names seen for that
        species), 'count', 'city' and 'state', in ``value_counts`` order
        (most frequent first). 'city' and 'state' are taken from the first
        remaining row. Returns None when no row survives the null filter;
        callers use that to drop the city.

    Raises
    ------
    FileNotFoundError
        If no CSV file matches the city prefix.
    """
    matches = sorted(glob.glob(data_dir + "/" + city + "*.csv"))
    if not matches:
        raise FileNotFoundError(
            "No CSV file found for city '" + str(city) + "' in '" + str(data_dir) + "'"
        )
    if columns is None:
        # Looked up at call time so edits to the notebook-level list are honoured.
        columns = IntrestingColumns

    # use the nrows parameter of read_csv to limit the data if processing takes too long
    data = pd.read_csv(matches[0], usecols=columns).dropna(how='any')
    if data.empty:
        return None

    # Occurrences per species, most frequent first.
    tree_counts = data['scientific_name'].value_counts().to_dict()

    # City and state are read from the first remaining row.
    city_info = data.iloc[0][['city', 'state']].to_dict()

    # One grouping pass instead of re-filtering the whole frame for every species.
    common_names_by_species = {
        name: group.unique().tolist()
        for name, group in data.groupby('scientific_name', sort=False)['common_name']
    }

    return [
        {
            'scientific_name': scientific_name,
            'common_name': common_names_by_species[scientific_name],
            'count': count,
            "city": city_info['city'],
            'state': city_info['state'],
        }
        for scientific_name, count in tree_counts.items()
        if count > min_count
    ]

In [9]:
# Define a preprocessor for heatmap data. It generates data for the given cities restricted to the
# tree species that are common to all of them.
def processHeatMapFile(cities, columns=None, data_dir="Dataset"):
    """Build heatmap rows for the species shared by every requested city.

    Each city's CSV is read once. Rows with a null in any loaded column are
    dropped. A species is kept only if it occurs in every requested city; a
    city with no usable rows contributes an empty species set, so in that
    case nothing is common and the result is empty.

    Parameters
    ----------
    cities : list of str
        File-name prefixes of the cities; for each, the first file (in sorted
        order) matching ``<data_dir>/<city>*.csv`` is read.
    columns : list of str, optional
        Columns to load. Defaults to the notebook-level ``IntrestingColumns``.
        Must include 'scientific_name', 'common_name', 'city' and 'state'.
    data_dir : str, default "Dataset"
        Directory that holds the city CSV files.

    Returns
    -------
    list of dict
        One dict per (city, shared species) with keys 'city',
        'scientific_name', 'count', 'common_name' (list of the distinct common
        names seen in that city) and 'state'. Rows follow the order of
        ``cities`` and, within a city, ``value_counts`` order.

    Raises
    ------
    FileNotFoundError
        If no CSV file matches one of the city prefixes.
    """
    if columns is None:
        # Looked up at call time so edits to the notebook-level list are honoured.
        columns = IntrestingColumns

    # city -> summary dict, or None when the city has no usable rows.
    summaries = {}
    for city in cities:
        if city in summaries:
            continue  # a repeated city does not need to be read again
        matches = sorted(glob.glob(data_dir + "/" + city + "*.csv"))
        if not matches:
            raise FileNotFoundError(
                "No CSV file found for city '" + str(city) + "' in '" + str(data_dir) + "'"
            )
        # use the nrows parameter of read_csv to limit the data if processing takes too long
        data = pd.read_csv(matches[0], usecols=columns).dropna(how='any')
        if data.empty:
            summaries[city] = None
            continue
        # Keep only the small per-city summary so the full frame can be released.
        summaries[city] = {
            'counts': data['scientific_name'].value_counts().to_dict(),
            'info': data.iloc[0][['city', 'state']].to_dict(),
            'names': {
                name: group.unique().tolist()
                for name, group in data.groupby('scientific_name', sort=False)['common_name']
            },
        }

    if not summaries:
        return []

    # Intersect the species sets once, after every city has been loaded.
    species_sets = [
        set(summary['counts']) if summary is not None else set()
        for summary in summaries.values()
    ]
    common_scientific_trees = set.intersection(*species_sets)

    city_tree_dataList = []
    for city in cities:
        summary = summaries[city]
        if summary is None:
            continue
        for scientific_name, count in summary['counts'].items():
            if scientific_name not in common_scientific_trees:
                continue
            city_tree_dataList.append({
                "city": summary['info']['city'],
                'scientific_name': scientific_name,
                'count': count,
                'common_name': summary['names'][scientific_name],
                'state': summary['info']['state'],
            })
    return city_tree_dataList
        

In [14]:
#Clean the dataset of the given cities and save it as one unified JSON document with tree counts
#grouped by city. Then create the dropdown list of cities.

# For the sake of data integrity, rows which have null values are dropped (done inside processFile).
# Set the city names you wish to get data for, or use ['All'] for every city.
City = ['All']

if(len(City)==1 and City[0] == 'All'):
    SelectedCities = list(OnlyFilenames)
else:
    SelectedCities = list(City)

tree_data_by_city = {}
CitiesWithoutData = []
for city in SelectedCities:
    processedData = processFile(city)
    if processedData is not None:
        tree_data_by_city[city] = processedData
    else:
        CitiesWithoutData.append(city)

# Drop the cities that produced no data from the dropdown list. The list is rebuilt
# after the loop: removing items from OnlyFilenames while iterating over it makes the
# loop skip the city that follows each removed one.
OnlyFilenames = [name for name in OnlyFilenames if name not in CitiesWithoutData]

tree_data_json = json.dumps(tree_data_by_city)

with open("tree_data.json", "w") as f:
    f.write(tree_data_json)

#Step-4: Create JSON for city dropdown using extracted filenames
cities = json.dumps({"cities": OnlyFilenames})
with open("city_dropdown.json", "w") as f:
    f.write(cities)

In [11]:
# Insert the names of the cities you want heatmap data for. Remember that the data is restricted to
# the trees common to all chosen cities, so it is possible that you get an empty file.
# Interesting fact:
# No tree is common among all cities
Cities = ["Austin","LosAngeles","WashingtonDC","Buffalo","Boston","Columbus"] # other options: "Denver","Houston","NewYork"
# Use Cities = ["All"] to process every city found in the Dataset directory.
csvHeaders = ['city','scientific_name','count','state','common_name']

if(len(Cities)==1 and Cities[0] == 'All'):
    HeatmapCities = OnlyFilenames
else:
    HeatmapCities = Cities

# Process first and open the output file afterwards, so a failure while reading the
# datasets does not leave a truncated, header-only heatmap.csv behind.
processedData = processHeatMapFile(HeatmapCities)

with open('heatmap.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames = csvHeaders)
    writer.writeheader()
    writer.writerows(processedData)
            
        

['Ailanthus altissima', 'Quercus rubra', 'Pyrus calleryana', 'Morus alba', 'Prunus cerasifera', 'Robinia pseudoacacia', 'Ulmus americana']
