In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
%%capture
!pip install requests

Collecting requests
  Using cached requests-2.28.1-py3-none-any.whl (62 kB)
Collecting urllib3<1.27,>=1.21.1
  Using cached urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
Collecting certifi>=2017.4.17
  Using cached certifi-2022.12.7-py3-none-any.whl (155 kB)
Collecting idna<4,>=2.5
  Using cached idna-3.4-py3-none-any.whl (61 kB)
Installing collected packages: urllib3, certifi, idna, requests
Successfully installed certifi-2022.12.7 idna-3.4 requests-2.28.1 urllib3-1.26.13


You should consider upgrading via the 'c:\python39\python.exe -m pip install --upgrade pip' command.


In [None]:
# Read multi_label_train.json (path relative to the notebook's working
# directory) into the DataFrame `df` used by the cells below.
df = pd.read_json("multi_label_train.json")

# Preview the first rows only instead of rendering the whole frame.
df.head()

In [2]:
# Read the v0.4 pickle into `df`; this rebinds `df`, replacing any frame that
# was bound to that name before this cell ran.
# NOTE(review): unpickling can execute arbitrary code, so only point this at a
# trusted file. The path is relative to the notebook's working directory and
# the cell raises FileNotFoundError when the file is not there.
df = pd.read_pickle("../data/multi_label_train_v0.4.pkl")

# Preview the first rows only instead of rendering the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/multi_label_train_v0.4.pkl'

TODO

- Incident vs place heatmap
- Incident vs incident
- Top 10 incident/place
- Subsample and maintain distribution
- Rename images according to key

In [None]:
# Show every row whose incident list or place list is exactly the string "unknown".
df.loc[df["incidents_list"].eq("unknown") | df["places_list"].eq("unknown")]

In [None]:
# Number of labels shown per chart; the titles are derived from the data so
# they can never disagree with what is actually plotted.
TOP_N = 25

# The *_list columns hold ", "-separated label strings: split them, explode to
# one label per row and tally with value_counts() (most frequent first).
# NOTE(review): the previous version discarded the first (most frequent) entry
# by position; this assumes that entry was the "unknown" placeholder and drops
# it by name instead, so a real label is never removed by accident — confirm.
incidents = (
    df["incidents_list"]
    .str.split(", ")
    .explode()
    .value_counts()
    .drop("unknown", errors="ignore")
    .head(TOP_N)
)
places = (
    df["places_list"]
    .str.split(", ")
    .explode()
    .value_counts()
    .drop("unknown", errors="ignore")
    .head(TOP_N)
)

# Horizontal bar charts side by side: incidents on the left, places on the right.
fig, ax = plt.subplots(1, 2, figsize=(22, 10))
sns.barplot(x=incidents.values, y=incidents.index, ax=ax[0], palette="Blues_d")
sns.barplot(x=places.values, y=places.index, ax=ax[1], palette="Blues_d")
ax[0].set_title(f"Top {len(incidents)} Incidents", fontsize=20)
ax[1].set_title(f"Top {len(places)} Places", fontsize=20)

# Larger tick labels and a grid on both panels for readability.
for panel in ax:
    panel.tick_params(labelsize=12)
    panel.grid()
plt.show()


In [None]:
# Per-row label tallies. Every entry of the "incidents" / "places" columns is
# passed through dict(); values equal to 1 are tallied in the "known" column
# and values equal to 0 in the "unknown" column.
df["Number of known incidents"] = df["incidents"].apply(
    lambda labels: sum(1 for flag in dict(labels).values() if flag == 1)
)
df["Number of unknown incidents"] = df["incidents"].apply(
    lambda labels: sum(1 for flag in dict(labels).values() if flag == 0)
)

df["Number of known places"] = df["places"].apply(
    lambda labels: sum(1 for flag in dict(labels).values() if flag == 1)
)
df["Number of unknown places"] = df["places"].apply(
    lambda labels: sum(1 for flag in dict(labels).values() if flag == 0)
)

# Dataset-wide totals of the four per-row tallies.
total_known_incidents = df["Number of known incidents"].sum()
total_unknown_incidents = df["Number of unknown incidents"].sum()
total_known_places = df["Number of known places"].sum()
total_unknown_places = df["Number of unknown places"].sum()

print("Total known incidents: ", total_known_incidents)
print("Total unknown incidents: ", total_unknown_incidents)

print("Total known places: ", total_known_places)
print("Total unknown places: ", total_unknown_places)

# Cell output: rows flagged downloadable but not a valid image, excluding rows
# whose image_id is the string "-1".
df.loc[
    df["valid_image"].eq(False)
    & df["downloadable"].eq(True)
    & df["image_id"].ne("-1")
]


In [None]:
# How many rows carry 0, 1, 2, ... positive labels. Reads the
# "Number of known incidents" / "Number of known places" columns of df, so
# those columns must already exist when this cell runs.
incident_label_counts = df["Number of known incidents"].value_counts()
place_label_counts = df["Number of known places"].value_counts()

# Turn the numeric index into x-axis labels: 0 -> "Unknown", n -> "n label(s)".
incident_label_counts.index = [
    "Unknown" if i == 0 else f"{i} label(s)" for i in incident_label_counts.index
]
place_label_counts.index = [
    "Unknown" if i == 0 else f"{i} label(s)" for i in place_label_counts.index
]

# Plot the number of images for each positive-label count, incidents on the
# left and places on the right.
fig, ax = plt.subplots(1, 2, figsize=(22, 10))
sns.barplot(x=incident_label_counts.index, y=incident_label_counts.values, ax=ax[0], palette="Blues_d")
sns.barplot(x=place_label_counts.index, y=place_label_counts.values, ax=ax[1], palette="Blues_d")
ax[0].set_title("Incidents positive-label distribution", fontsize=20)
ax[1].set_title("Places positive-label distribution", fontsize=20)

# Larger tick labels and a grid on both panels for readability.
for panel in ax:
    panel.tick_params(labelsize=12)
    panel.grid()
plt.show()

In [None]:
# Headline row counts. The explicit `== True` comparisons are kept on purpose
# so that any non-boolean or missing value is simply not counted.
all_images = len(df)
print("Total images: ", all_images)
downloadable_images = int((df["downloadable"] == True).sum())
print("Downloadable images: ", downloadable_images)
valid_images = int((df["valid_image"] == True).sum())
print("Valid images: ", valid_images)
# Rows with at least one known incident AND at least one known place; reads the
# "Number of known ..." columns, which must already exist on df.
not_unknown_images = int(
    ((df["Number of known incidents"] != 0) & (df["Number of known places"] != 0)).sum()
)
print("Images with known incidents and places: ", not_unknown_images)

x_labels = ["Total images", "Downloadable images", "Valid images", "Images with known incidents and places"]
y_values = [all_images, downloadable_images, valid_images, not_unknown_images]
# True division: floor division would truncate every bar to a whole thousand
# and draw any count below 1000 as zero.
y_values = [val / 1000 for val in y_values]

# Bar chart of the four image counts, expressed in thousands.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
sns.barplot(x=x_labels, y=y_values, ax=ax)
ax.set_title("Image counts (in thousands)", fontsize=20)
# Larger tick labels and a grid for readability.
ax.tick_params(labelsize=12)
ax.grid()
plt.show()