In [116]:
import nltk
import pandas as pd
import re
# read the data in a dataframe using Pandas
df = pd.read_csv("data/mufon_data.csv")
# the first row is the header row, use it to name the columns of the dataframe
df.columns = df.iloc[0]
# lowercase the column names and replace spaces with underscores, also remove non-alphanumeric characters from the column names
df.columns = [x.lower().replace(" ", "_").replace("/", "").replace(r"()","") for x in df.columns]
# drop the first row, which is now a duplicate of the header row
df = df.drop(df.index[0])
# separate city,state into two columns
df[["city", "state"]] = df["city,state"].str.split(",", expand=True)
# drop the city,state column
df = df.drop("city,state", axis=1)
# remove "Long Description of Sighting Report" from all text in the rows of the "description" column
df["long_description"] = df["long_description"].str.replace("Long Description of Sighting Report", "")
# show the first 5 rows
df.head()

Unnamed: 0,source,datetime_of_event,date_submitted,duration_(seconds),city,state,state2,country,latitude,longitude,long_description,shape,short_description,topic,attachments
1,MUFON,2010-08-14 9:15PM,3/13/21,,Bogalusa,LA,LOUISIANA,USA,,,Driving home at dark we’re on a new road that...,,it was a bright blue light flying over head,blinding,
2,MUFON,2021-02-22 10:22PM,3/12/21,,Sarasota,FL,FLORIDA,USA,,,I was taking pictures of the beautiful sunris...,,crystal Clear night. Looked up at Orion. Watch...,blinding,
3,MUFON,2020-07-26 7:25PM,3/6/21,,Hillsboro,TN,TENNESSEE,USA,,,I was driving on 285 South at 845 PM on my wa...,,Looked like Venus does in the evening but was ...,blinding,DE50931F02C441D3BF88C950D96C6149.jpeg trim.74A...
4,MUFON,1996-02-20 6:00PM,2/27/21,,Elizabethtown,KY,KENTUCKY,USA,,,My husband and I saw a red orb in the sky. It...,,Huge brilliant lights,blinding,
5,MUFON,2021-02-25 6:42AM,2/25/21,,Torrington,CT,CONNECTICUT,USA,,,Was driving home from doctor appt. on busy in...,,Done like with apparent windows.,blinding,7CD98D9DF4464E5EB8D138AC87703BCF.jpeg 4FD788F9...


In [117]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("data/mufon_data.csv")

def preprocess_data(df):
    
    # the first row is the header row, use it to name the columns of the dataframe
    df.columns = df.iloc[0]
    
    # lowercase the column names and replace spaces with underscores, also remove non-alphanumeric characters from the column names
    df.columns = [x.lower().replace(" ", "_").replace("/", "").replace(r"()","") for x in df.columns]

    # convert datetime_of_event column to datetime object
    df['datetime_of_event'] = pd.to_datetime(df['datetime_of_event'], errors='coerce')
    
    # remove 'long description..'
    df["long_description"] = df["long_description"].str.replace("Long Description of Sighting Report", "")

    # drop rows with missing datetime_of_event values
    df.dropna(subset=['datetime_of_event'], inplace=True)
    
    # fill missing values in duration_(seconds) column with 0
    df['duration_(seconds)'].fillna(0, inplace=True)
    
    # fill missing values in latitude and longitude columns with median values
    imputer = SimpleImputer(strategy='median')
    try:
        df[['latitude', 'longitude']] = imputer.fit_transform(df[['latitude', 'longitude']])
    except ValueError:
        pass
    # fill missing values in state column with 'unknown'
    df['state'].fillna('unknown', inplace=True)
    
    # fill missing values in shape column with 'unknown'
    df['shape'].fillna('unknown', inplace=True)
    
    # fill missing values in attachments column with 'unknown'
    df['attachments'].fillna('none', inplace=True)
    
    # if the value in latitude, longitude is np.NaN, or duration_(seconds) is 0, replace it with the string "pending_nuforc"
    df['latitude'] = np.where(df['latitude'].isnull(), np.NaN, df['latitude'])
    df['longitude'] = np.where(df['longitude'].isnull(), np.NaN, df['longitude'])
    df['duration_(seconds)'] = np.where(df['duration_(seconds)'] == 0, np.NaN, df['duration_(seconds)'])
    # separate city,state into two columns
    df[["city", "state"]] = df["city,state"].str.split(",", expand=True)
    # drop the city,state column
    df = df.drop("city,state", axis=1)
    # encode categorical columns
    categorical_cols = ['source', 'city', 'state', 'state2', 'country', 'shape', 'topic', 'attachments']
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])
    # drop duplicated columns
    df = df.drop(['state2'], axis=1)
    # save a copy of the dataframe
    df_copy = df.copy()
    # save as `data/mufon_data_preprocessed.csv`
    df_copy.to_csv('data/mufon_data_preprocessed.csv', index=False)
    return df


In [118]:
# preprocess the data
df = preprocess_data(df)
df.describe()

Unnamed: 0,source,duration_(seconds),city,state,country,shape,topic,attachments
count,413.0,0.0,413.0,413.0,413.0,413.0,413.0,413.0
mean,0.0,,178.622276,22.62954,0.0,0.0,4.002421,90.472155
std,0.0,,104.517536,15.032192,0.0,0.0,2.590591,28.793664
min,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,,90.0,8.0,0.0,0.0,2.0,97.0
50%,0.0,,181.0,22.0,0.0,0.0,4.0,103.0
75%,0.0,,270.0,36.0,0.0,0.0,6.0,103.0
max,0.0,,360.0,48.0,0.0,0.0,8.0,134.0


In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 1 to 416
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   source              413 non-null    int64         
 1   datetime_of_event   413 non-null    datetime64[ns]
 2   date_submitted      413 non-null    object        
 3   duration_(seconds)  0 non-null      float64       
 4   city                413 non-null    int64         
 5   state               413 non-null    int64         
 6   country             413 non-null    int64         
 7   latitude            0 non-null      object        
 8   longitude           0 non-null      object        
 9   long_description    413 non-null    object        
 10  shape               413 non-null    int64         
 11  short_description   413 non-null    object        
 12  topic               413 non-null    int64         
 13  attachments         413 non-null    int64         

In [120]:
# use np.where to replace all "pending_nuforc" values with np.NaN in the latitude, longitude, and duration_(seconds) columns
df['latitude'] = np.where(df['latitude'] == 'pending_nuforc', np.NaN, df['latitude'])
df['longitude'] = np.where(df['longitude'] == 'pending_nuforc', np.NaN, df['longitude'])
df['duration_(seconds)'] = np.where(df['duration_(seconds)'] == 'pending_nuforc', np.NaN, df['duration_(seconds)'])

# finally, for every column if there are any rows with "pending_nuforc" values, replace those values with np.NaN
df = df.replace('pending_nuforc', np.NaN)

In [121]:
# remove "Long Description of Sighting Report" from all text in the rows of the "description" column
df["long_description"] = df["long_description"].str.replace("Long Description of Sighting Report", "")
df.head()

Unnamed: 0,source,datetime_of_event,date_submitted,duration_(seconds),city,state,country,latitude,longitude,long_description,shape,short_description,topic,attachments
1,0,2010-08-14 21:15:00,3/13/21,,31,17,0,,,Driving home at dark we’re on a new road that...,0,it was a bright blue light flying over head,1,103
2,0,2021-02-22 22:22:00,3/12/21,,271,8,0,,,I was taking pictures of the beautiful sunris...,0,crystal Clear night. Looked up at Orion. Watch...,1,103
3,0,2020-07-26 19:25:00,3/6/21,,121,39,0,,,I was driving on 285 South at 845 PM on my wa...,0,Looked like Venus does in the evening but was ...,1,49
4,0,1996-02-20 18:00:00,2/27/21,,90,16,0,,,My husband and I saw a red orb in the sky. It...,0,Huge brilliant lights,1,103
5,0,2021-02-25 06:42:00,2/25/21,,300,6,0,,,Was driving home from doctor appt. on busy in...,0,Done like with apparent windows.,1,35


In [122]:
# convert the "date_submitted", and "datetime_of_event" columns to datetime objects
df["date_submitted"] = pd.to_datetime(df["date_submitted"])

In [123]:
# You can now proceed to the next step of your project which is to build a model to cluster the reports, considering them like points on a plane. The plane will have 3 dimensions: time, location (lat,long). The clusters will be the groups of reports that are similar to each other. This introduces another dimension to the plane: the shape of the UAPs. The clusters will be the groups of reports that are similar to each other after the text has been vectorized with TFIDF and the shape of the UAPs has been one-hot encoded.
# I want to build a preprocessing pipeline for the text in the reports using `pipeline` in `sklearn`
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from datetime import datetime
import numpy as np

df = pd.read_csv("data/mufon_data_preprocessed.csv")

def extract_time_features(x):
    try:
        return x['datetime_of_event'].dt.year.astype(float) + x['datetime_of_event'].dt.month.astype(float)/12 + x['datetime_of_event'].dt.day.astype(float)/365
    except Exception as e:
        print(e)
        return None

print(f'Pipeline - in Development')
print(f'------------------------')
print(f'Current df has columns: {df.columns}')

#create the text preprocessing pipeline
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), # vectorize the text using Tf-idf
    ])
print(f'Created text_pipeline')
print(f'------------------------')
print(f'Defining the transformers')
print(f'------------------------')
transformers=[
    ('text', text_pipeline, "long_description"), # apply the text pipeline to the long_description column
    ('encoder', OneHotEncoder(), ['shape']), # one-hot encode the shape column
    ('num', StandardScaler(), ['latitude', 'longitude', 'duration_(seconds)']), # scale the latitude, longitude, and duration_(seconds) columns
]
# create the preprocessor
preprocessor = ColumnTransformer(transformers=transformers)
# fit and transform the data using the preprocessor
X_transformed = preprocessor.fit_transform(df)
# Next step - build a model to cluster the reports, considering them like points on a plane. The plane will have 3 dimensions: time, location (lat,long). The clusters will be the groups of reports that are similar to each other. This introduces another dimension to the plane: the shape of the UAPs. The clusters will be the groups of reports that are similar to each other after the text has been vectorized with TFIDF and the shape of the UAPs has been one-hot encoded.


Pipeline - in Development
------------------------
Current df has columns: Index(['source', 'datetime_of_event', 'date_submitted', 'duration_(seconds)',
       'city', 'state', 'country', 'latitude', 'longitude', 'long_description',
       'shape', 'short_description', 'topic', 'attachments'],
      dtype='object')
Created text_pipeline
------------------------
Defining the transformers
------------------------


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [124]:
# Building The Model - KMeans
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import completeness_score
from sklearn.metrics import v_measure_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import completeness_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import v_measure_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import completeness_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import v_measure_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import completeness_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import v_measure_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import completeness_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import v_measure_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score

# create the model
kmeans = KMeans(n_clusters=5, random_state=0)
# fit the model
kmeans.fit(X_transformed)
# get the labels
labels = kmeans.labels_
# get the centroids
centroids = kmeans.cluster_centers_
# get the inertia
inertia = kmeans.inertia_
# get the silhouette score
silhouette = silhouette_score(X_transformed, labels)
# get the calinski_harabasz score
calinski_harabasz = calinski_harabasz_score(X_transformed, labels)
# get the davies_bouldin score
davies_bouldin = davies_bouldin_score(X_transformed, labels)
# get the adjusted_rand score
adjusted_rand = adjusted_rand_score(df['shape'], labels)
# get the adjusted_mutual_info score
adjusted_mutual_info = adjusted_mutual_info_score(df['shape'], labels)
# get the homogeneity score
homogeneity = homogeneity_score(df['shape'], labels)
# get the completeness score
completeness = completeness_score(df['shape'], labels)
# get the v_measure score
v_measure = v_measure_score(df['shape'], labels)
# get the fowlkes_mallows score
fowlkes_mallows = fowlkes_mallows_score(df['shape'], labels)
# get the silhouette_samples
silhouette_samples = silhouette_samples(X_transformed, labels)
# get the pairwise_distances
pairwise_distances = pairwise_distances(X_transformed, centroids)

# save kmeans model as pickle file in models folder
import pickle
pickle.dump(kmeans, open('models/kmeans.pkl', 'wb'))
# save the resulting scores to a csv file in the data folder
import pandas as pd
scores = pd.DataFrame({
    'silhouette': [silhouette],
    'calinski_harabasz': [calinski_harabasz],
    'davies_bouldin': [davies_bouldin],
    'adjusted_rand': [adjusted_rand],
    'adjusted_mutual_info': [adjusted_mutual_info],
    'homogeneity': [homogeneity],
    'completeness': [completeness],
    'v_measure': [v_measure],
    'fowlkes_mallows': [fowlkes_mallows],
})
scores.to_csv('data/kmeans_scores.csv', index=False)

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Building The Model - DBSCAN
from sklearn.cluster import DBSCAN

# create the model
dbscan = DBSCAN()
# fit the model
dbscan.fit(X_transformed)
# get the labels
labels = dbscan.labels_
# get the silhouette score
silhouette = silhouette_score(X_transformed, labels)
# get the calinski_harabasz score
calinski_harabasz = calinski_harabasz_score(X_transformed, labels)
# get the davies_bouldin score
davies_bouldin = davies_bouldin_score(X_transformed, labels)
# get the adjusted_rand score
adjusted_rand = adjusted_rand_score(df['shape'], labels)
# get the adjusted_mutual_info score
adjusted_mutual_info = adjusted_mutual_info_score(df['shape'], labels)
# get the homogeneity score
homogeneity = homogeneity_score(df['shape'], labels)
# get the completeness score
completeness = completeness_score(df['shape'], labels)
# get the v_measure score
v_measure = v_measure_score(df['shape'], labels)
# get the fowlkes_mallows score
fowlkes_mallows = fowlkes_mallows_score(df['shape'], labels)
# get the silhouette_samples
silhouette_samples = silhouette_samples(X_transformed, labels)

# save dbscan model as pickle file in models folder
import pickle
pickle.dump(dbscan, open('models/dbscan.pkl', 'wb'))
# save the resulting scores to a csv file in the data folder
import pandas as pd
scores = pd.DataFrame({
    'silhouette': [silhouette],
    'calinski_harabasz': [calinski_harabasz],
    'davies_bouldin': [davies_bouldin],
    'adjusted_rand': [adjusted_rand],
    'adjusted_mutual_info': [adjusted_mutual_info],
    'homogeneity': [homogeneity],
    'completeness': [completeness],
    'v_measure': [v_measure],
    'fowlkes_mallows': [fowlkes_mallows],
})
scores.to_csv('data/dbscan_scores.csv', index=False)

You can use the preprocessed data in the data/mufon_data_preprocessed.csv file for this step.



You can use libraries such as scikit-learn, and KMeans to cluster the data using different features like location, time, shape, and even the description. Also, you may consider using DBSCAN for density-based clustering and Affinity Propagation for data with many clusters.

