In [1]:
import json
from datetime import datetime

import pandas as pd
import numpy as np

# ufo_all.json

## Importing json file

In [2]:
with open("ufo_all.json") as f:
    data = json.load(f)

## Converting datetime

In [3]:
def convertingDatetime():
    convertion_errors = []
    for i, record in enumerate(data):
        #datetime_object = datetime.strptime(record["datetime"], "%d/%m/%Y %H:%M")
        if "24:00" in data[i]["datetime"]:
            data[i]["datetime"] = data[i]["datetime"].replace("24:00", "00:00")
        
        try:
            datetime_object = datetime.strptime(record["datetime"], "%d/%m/%Y %H:%M")
            record["year"] = datetime_object.year
        except:
            try:
                datetime_object = datetime.strptime(record["datetime"], "%m/%d/%Y %H:%M")
                record["year"] = datetime_object.year
            except:
                convertion_errors.append(i)
    return convertion_errors

## Dealing with errors

In [4]:
convertion_errors = convertingDatetime()
len(convertion_errors)

0

In [5]:
for i in convertion_errors:
    print("ID :",i)
    print(data[i]["datetime"])
    print()

## Creating new json file

In [6]:
with open("ufo_all_preprocessed.json", "w") as f:
    json.dump(data, f, indent=4)

# Creating freq_by_year file

In [3]:
list_of_years = [elem["year"] for elem in data]
min_year = min(list_of_years)
max_year = max(list_of_years)

freq_by_year = {}
for y in range(min_year,max_year+1):
    freq_by_year[y] = 0
    
for y in list_of_years:
    freq_by_year[y] += 1

KeyError: 'year'

In [8]:
liste_freq_by_year = np.empty((len(freq_by_year), 2), int)
for k in range(0,len(freq_by_year)):
    liste_freq_by_year[k,0] = min_year+k
    liste_freq_by_year[k,1] = freq_by_year[min_year+k]

DF_freq_by_year = pd.DataFrame(liste_freq_by_year, columns={"year", "freq"})
DF_freq_by_year.to_csv("ufo_all_freq_by_year_youri.csv")

In [9]:
with open("ufo_all_freq_by_year.json", "w") as f:
    json.dump(freq_by_year, f, sort_keys=True, indent=4)

# Creating freq_by_country file

In [4]:
freq_by_country = {}
for y in ['us', 'gb', 'ca', 'au', 'de', 'other']:
    freq_by_country[y] = 0
    
for elem in data:
    if elem["country"] != "":
        freq_by_country[elem["country"]] += 1
    else:
        freq_by_country['other'] += 1

In [5]:
with open("ufo_all_freq_by_country.json", "w") as f:
    json.dump(freq_by_country, f, sort_keys=True, indent=4)

In [6]:
# https://pythonspot.com/matplotlib-bar-chart/

import numpy as np
import matplotlib.pyplot as plt

print(list(freq_by_country.keys()))
print(list(freq_by_country.values()))
 
objects = list(freq_by_country.keys())
y_pos = np.arange(len(objects))
valueurs = list(freq_by_country.values())

plt.figure(figsize=(10,10))
plt.bar(y_pos, valueurs, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Nombre d\'observations')
plt.xlabel('Pays')
plt.title('Histogramme pour la fréquence par pays (dataset all)')
plt.savefig('Figures/freq_par_pays_all.jpg', quality=100)

['us', 'gb', 'ca', 'au', 'de', 'other']
[65114, 1905, 3000, 538, 105, 9670]


# Creating fred_by_us_state file

In [13]:
with open("us_states.json") as f:
    us_states = json.load(f)

In [14]:
freq_by_state = {}
for s in us_states.values():
    freq_by_state[s] = 0

for elem in [e for e in data if e["country"] == "us"]:
    try:
        freq_by_state[us_states[elem["state"].upper()]] += 1
    except:
        print(elem,"not in us_states_dict")

In [15]:
with open("us_states_geo.json") as f:
    us_states_geo = json.load(f)

In [16]:
for i, elem in enumerate(us_states_geo["features"]):
    us_states_geo["features"][i]["properties"]["frequency"] = freq_by_state[elem["properties"]["name"]]

In [17]:
with open("ufo_all_freq_by_us_state.json", "w") as f:
    json.dump(us_states_geo, f, sort_keys=False, indent=4)

In [18]:
freq_by_state

{'Alabama': 642,
 'Alaska': 319,
 'American Samoa': 0,
 'Arizona': 2414,
 'Arkansas': 588,
 'California': 8912,
 'Colorado': 1413,
 'Connecticut': 892,
 'Delaware': 166,
 'District of Columbia': 7,
 'Federated States Of Micronesia': 0,
 'Florida': 3835,
 'Georgia': 1255,
 'Guam': 0,
 'Hawaii': 262,
 'Idaho': 521,
 'Illinois': 2499,
 'Indiana': 1288,
 'Iowa': 678,
 'Kansas': 613,
 'Kentucky': 855,
 'Louisiana': 558,
 'Maine': 558,
 'Marshall Islands': 0,
 'Maryland': 837,
 'Massachusetts': 1256,
 'Michigan': 1836,
 'Minnesota': 1012,
 'Mississippi': 375,
 'Missouri': 1458,
 'Montana': 478,
 'Nebraska': 381,
 'Nevada': 803,
 'New Hampshire': 486,
 'New Jersey': 1255,
 'New Mexico': 720,
 'New York': 2980,
 'North Carolina': 1740,
 'North Dakota': 129,
 'Northern Mariana Islands': 0,
 'Ohio': 2275,
 'Oklahoma': 724,
 'Oregon': 1747,
 'Palau': 0,
 'Pennsylvania': 2366,
 'Puerto Rico': 25,
 'Rhode Island': 228,
 'South Carolina': 1003,
 'South Dakota': 183,
 'Tennessee': 1119,
 'Texas': 344

In [19]:
with open("us_states_geo.json") as f:
    us_states_geo = json.load(f)

In [20]:
list_states_geo = set([e["properties"]["name"] for e in us_states_geo["features"]])
list_states_dict = set(us_states.values())

print("dict not in geo")
for e in list_states_dict:
    if e not in list_states_geo:
        print(e)
    
print()
print("geo not in dict")
for e in list_states_geo:
    if e not in list_states_dict:
        print(e)

dict not in geo
American Samoa
Federated States Of Micronesia
Guam
Marshall Islands
Virgin Islands
Palau
Northern Mariana Islands

geo not in dict


In [None]:
us_states_geo