In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# df = pd.read_csv('../data/ufo_clean.csv', low_memory=False)
df = pd.read_csv('../data/processed/ufos_processed.csv', low_memory=False, on_bad_lines='skip')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79614 entries, 0 to 79613
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   datetime                   79614 non-null  object 
 1   city                       79614 non-null  object 
 2   shape                      77738 non-null  object 
 3   duration (seconds)         79614 non-null  object 
 4   duration (hours/min)       79614 non-null  object 
 5   comments                   79614 non-null  object 
 6   date posted                79614 non-null  object 
 7   latitude                   79614 non-null  float64
 8   longitude                  79614 non-null  float64
 9   comments_shapes            79614 non-null  object 
 10  calculated_duration        79614 non-null  object 
 11  duration_value             76071 non-null  float64
 12  duration_unit              79614 non-null  object 
 13  total_seconds              73394 non-null  flo

The key question: For a report `r` and a geographic valence `v` (measured in km), we need to be able to assign a score to `r` that communicates the following information:
- how many other reports `r'` are within `v` km of `r`?
- how many of those reports have a `datetime` that is within `v_t` of the `datetime` of `r`? (where `v_t` is a time valence measured in hours).
- how many of the reports that are within `v_t` of the `datetime` of `r` reported the same:
  -  `comments_shapes` (the shape of the UAP from the commentary in the report - e.g. "triangle", "cigar", "sphere", etc.)
  -  `comments_colors` (the color of the UAP from the commentary in the report - e.g. "red", "green", "blue", etc.)
  -  `calculated_duration` (measured in seconds)
  -  `number_of_objects` (the number of objects in the UAP) (this is a bit of a stretch, but it's worth a shot) for example, if `r` is a report that describes a single red sphere that was observed for 10 seconds, then `r'` is interesting and should be counted if it either describes a red sphere, a single red object, or a single object that was observed for 10 seconds (or any combination of those).

The first step is to create a function that will take a report `r` and a valence `v` and return a score that communicates the information above. We'll call this function `score_report`.

```python
def score_report(r, v):
    """Score a report `r` based on the number of other reports that are within `v` km of `r` and within `v_t` hours of the `datetime` of `r`."""
    report_score = 0 # initialize the score to 0
    # get the latitude and longitude of the report
    lat = r['latitude']
    lon = r['longitude']
    # get the datetime of the report
    dt = r['datetime']
    # get the comments_shapes, comments_colors, calculated_duration, and number_of_objects of the report
    comments_shapes = r['comments_shapes']
    comments_colors = r['comments_colors']
    calculated_duration = r['calculated_duration']
    number_of_objects = r['number_of_objects']
    # get the reports that are within `v` km of `r`
    nearby_reports = reports[reports.apply(lambda x: haversine((x['latitude'], x['longitude']), (lat, lon)) <= v, axis=1)]
    # get the reports that are within `v_t` hours of the `datetime` of `r`
    keeper_reports = nearby_reports[nearby_reports.apply(lambda x: abs((x['datetime'] - dt).total_seconds()) <= v_t, axis=1)]
    keepers = {} # initialize a dictionary to keep track of the reports that we want to keep
    # get the reports that have some combination of the same `comments_shapes`, `comments_colors`, `calculated_duration`, and `number_of_objects` as `r`
    combinations = list(itertools.product(comments_shapes, comments_colors, calculated_duration, number_of_objects))
    for combination in combinations:
        keep_set = keeper_reports[keeper_reports.apply(lambda x: combination[0] in x['comments_shapes'] and combination[1] in x['comments_colors'] and combination[2] in x['calculated_duration'] and combination[3] in x['number_of_objects'], axis=1)]
        # save the keep_set to the keepers dictionary as a key-value pair where the key is the combination and the value is the keep_set
        keepers[combination] = keep_set
    # add the number of reports in each keep_set to the report_score
    for keep_set in keepers.values():
        report_score += len(keep_set)
    return report_score, keepers
```


In [3]:
import itertools

def haversine(p1, p2):
    """Calculate the great circle distance between two points on the earth (specified in decimal degrees)"""
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [p1[1], p1[0], p2[1], p2[0]])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

def score_one_report(r, v, v_t, df):
    """Assign a float type score to a report `r` based on (1) the number of other reports that are within `v` km of `r` AND (2) within `v_t` hours of the `datetime` of `r`. If a report `r'` within `v` km of `r` contains any of the same `comments_shapes`, `comments_colors`, `calculated_duration`, and `number_of_objects` as `r`, then add 1 to the score and save `r'` to a dictionary of reports that we want to keep.
    To get the distance between two points, use the haversine function. To get the difference between two datetimes, use the `total_seconds` method.
    """

    report_score = 0 # initialize the score to 0
    # get the index of the current report
    report_index = r.name
    # get the latitude and longitude of the report
    lat = r['latitude']
    lon = r['longitude']
    # get the datetime of the report
    dt = r['datetime']
    # get the comments_shapes, comments_colors, calculated_duration, and number_of_objects of the report
    comments_shapes = r['comments_shapes']
    comments_colors = r['comments_colors']
    calculated_duration = r['calculated_duration']
    number_of_objects = r['number_of_objects']
    # get the indices of the reports that are within `v` km of `r`
    nearby_report_indices = np.where(distances[report_index] <= v)[0]
    # get the reports that are within `v` km of `r`
    nearby_reports = df.iloc[nearby_report_indices]
    # get the reports that are within `v_t` hours of the `datetime` of `r`
    keeper_reports = nearby_reports[nearby_reports.apply(lambda x: abs((x['datetime'] - dt).total_seconds()) <= v_t, axis=1)]
    keepers = {} # initialize a dictionary to keep track of the reports that we want to keep
    # get the reports that have some combination of the same `comments_shapes`, `comments_colors`, `calculated_duration`, and `number_of_objects` as `r`
    combinations = list(itertools.product(comments_shapes, comments_colors, calculated_duration, number_of_objects))
    print(len(combinations), ' combinations to check')
    for combination in combinations:
        keep_set = keeper_reports[keeper_reports.apply(lambda x: combination[0] in x['comments_shapes'] and combination[1] in x['comments_colors'] and combination[2] in x['calculated_duration'] and combination[3] in x['number_of_objects'], axis=1)]
        # save the keep_set to the keepers dictionary as a key-value pair where the key is the combination and the value is the keep_set
        keepers[combination] = keep_set
        # add the number of reports in the keep_set to the report_score
        report_score += len(keep_set)
    return report_score, keepers

def score_all_reports(df, v, v_t):
    """Score all of the reports in `df` based on the number of other reports that are within `v` km of `r` and within `v_t` hours of the `datetime` of `r`."""
    report_scores = [] # initialize a list to keep track of the report scores
    keepers = {} # initialize a dictionary to keep track of the reports that we want to keep
    for i, r in df.iterrows():
        report_score, keep_set = score_one_report(r, v, v_t, df, distances)
        report_scores.append(report_score)
        keepers[i] = keep_set
    return report_scores, keepers

In [4]:
score, keepers = score_all_reports(df, 10, 24*60*60) # score all of the reports in `df` based on the number of other reports that are within 10 km of `r` and within 24 hours of the `datetime` of `r`
print('The maximum score is', max(score)) # print the maximum score
print('The minimum score is', min(score)) # print the minimum score
print('The mean score is', np.mean(score)) # print the mean score
print('The median score is', np.median(score)) # print the median score
print(f'The number of reports with a score of 0 is {score.count(0)}') # print the number of reports with a score of 0
print(len(keepers), ' reports scored')


TypeError: score_one_report() missing 1 required positional argument: 'distances'

In [None]:
# cluster the reports by the grade level of the witness
df['grade_level'] = df['grade_level'].astype('category')
df['grade_level'] = df['grade_level'].cat.codes
df['grade_level'] = df['grade_level'].astype('category')


ModuleNotFoundError: No module named 'sklearn.datasets.samples_generator'