In [None]:
import nltk
import pandas as pd
import re
# read the data in a dataframe using Pandas
df = pd.read_csv("data/mufon_data.csv")
# the first row is the header row, use it to name the columns of the dataframe
df.columns = df.iloc[0]
# lowercase the column names and replace spaces with underscores, also remove non-alphanumeric characters from the column names
df.columns = [x.lower().replace(" ", "_").replace("/", "").replace(r"()","") for x in df.columns]
# drop the first row, which is now a duplicate of the header row
df = df.drop(df.index[0])
# separate city,state into two columns
df[["city", "state"]] = df["city,state"].str.split(",", expand=True)
# drop the city,state column
df = df.drop("city,state", axis=1)
# remove "Long Description of Sighting Report" from all text in the rows of the "description" column
df["long_description"] = df["long_description"].str.replace("Long Description of Sighting Report", "")
# show the first 5 rows
df.head()

In [None]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("data/mufon_data.csv")

def preprocess_data(df):
    
    # the first row is the header row, use it to name the columns of the dataframe
    df.columns = df.iloc[0]
    
    # lowercase the column names and replace spaces with underscores, also remove non-alphanumeric characters from the column names
    df.columns = [x.lower().replace(" ", "_").replace("/", "").replace(r"()","") for x in df.columns]

    # convert datetime_of_event column to datetime object
    df['datetime_of_event'] = pd.to_datetime(df['datetime_of_event'], errors='coerce')
    
    # remove 'long description..'
    df["long_description"] = df["long_description"].str.replace("Long Description of Sighting Report", "")

    # drop rows with missing datetime_of_event values
    df.dropna(subset=['datetime_of_event'], inplace=True)
    
    # fill missing values in duration_(seconds) column with 0
    df['duration_(seconds)'].fillna(0, inplace=True)
    
    # fill missing values in latitude and longitude columns with median values
    imputer = SimpleImputer(strategy='median')
    try:
        df[['latitude', 'longitude']] = imputer.fit_transform(df[['latitude', 'longitude']])
    except ValueError:
        pass
    # fill missing values in state column with 'unknown'
    df['state'].fillna('unknown', inplace=True)
    
    # fill missing values in shape column with 'unknown'
    df['shape'].fillna('unknown', inplace=True)
    
    # fill missing values in attachments column with 'unknown'
    df['attachments'].fillna('none', inplace=True)
    
    # if the value in latitude, longitude is np.NaN, or duration_(seconds) is 0, replace it with the string "pending_nuforc"
    df['latitude'] = np.where(df['latitude'].isnull(), 'pending_nuforc', df['latitude'])
    df['longitude'] = np.where(df['longitude'].isnull(), 'pending_nuforc', df['longitude'])
    df['duration_(seconds)'] = np.where(df['duration_(seconds)'] == 0, 'pending_nuforc', df['duration_(seconds)'])
    # separate city,state into two columns
    df[["city", "state"]] = df["city,state"].str.split(",", expand=True)
    # drop the city,state column
    df = df.drop("city,state", axis=1)
    # encode categorical columns
    categorical_cols = ['source', 'city', 'state', 'state2', 'country', 'shape', 'topic', 'attachments']
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])
    # drop duplicated columns
    df = df.drop(['state2'], axis=1)
    # save a copy of the dataframe
    df_copy = df.copy()
    # save as `data/mufon_data_preprocessed.csv`
    df_copy.to_csv('data/mufon_data_preprocessed.csv', index=False)
    return df


In [None]:
# plot the distribution of the datetime_of_event column
df['datetime_of_event'].hist()

In [None]:
# preprocess the data
df = preprocess_data(df)
df.head()

In [None]:
df.info()

In [None]:
# remove "Long Description of Sighting Report" from all text in the rows of the "description" column
df["long_description"] = df["long_description"].str.replace("Long Description of Sighting Report", "")
df.head()

In [None]:
# convert the "date_submitted", and "datetime_of_event" columns to datetime objects
df["date_submitted"] = pd.to_datetime(df["date_submitted"])

In [54]:
# You can now proceed to the next step of your project which is to build a model to cluster the reports, considering them like points on a plane. The plane will have 3 dimensions: time, location (lat,long). The clusters will be the groups of reports that are similar to each other. This introduces another dimension to the plane: the shape of the UAPs. The clusters will be the groups of reports that are similar to each other after the text has been vectorized with TFIDF and the shape of the UAPs has been one-hot encoded.
# I want to build a preprocessing pipeline for the text in the reports using `pipeline` in `sklearn`
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from datetime import datetime
import numpy as np

df = pd.read_csv("data/mufon_data_preprocessed.csv")

def extract_time_features(x):
    try:
        return x['datetime_of_event'].dt.year.astype(float) + x['datetime_of_event'].dt.month.astype(float)/12 + x['datetime_of_event'].dt.day.astype(float)/365
    except Exception as e:
        print(e)
        return None

print(f'Pipeline - in Development')
print(f'------------------------')
print(f'Current df has columns: {df.columns}')
# create a class to select columns from a dataframe
# why? because the column transformer requires a transformer class, not a function
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.cols]

#create the text preprocessing pipeline
text_pipeline = Pipeline([
    ('selector', ColumnSelector(cols=["long_description"])), # select the long_description column
    ('tfidf', TfidfVectorizer()), # vectorize the text using Tf-idf
    ])

transformers=[
    ('text', text_pipeline, "long_description"), # apply the text pipeline to the long_description column
    ('encoder', OneHotEncoder(), ['shape']), # one-hot encode the shape column
    ('num', StandardScaler(), ['latitude', 'longitude', 'duration_(seconds)']), # scale the latitude, longitude, and duration_(seconds) columns
]
#//     ('date', FunctionTransformer(extract_time_features, validate=False), ['datetime_of_event'])

# create the preprocessor
preprocessor = ColumnTransformer(transformers=transformers)

# fit and transform the data using the preprocessor
X_transformed = preprocessor.fit_transform(df)

# create a dataframe from the transformed data
X_transformed = pd.DataFrame(X_transformed)

X_transformed.head()

Pipeline - in Development
------------------------
Current df has columns: Index(['source', 'datetime_of_event', 'date_submitted', 'duration_(seconds)',
       'city', 'state', 'country', 'latitude', 'longitude', 'long_description',
       'shape', 'short_description', 'topic', 'attachments'],
      dtype='object')


KeyError: "None of [Index(['long_description'], dtype='object')] are in the [index]"

You can use the preprocessed data in the data/mufon_data_preprocessed.csv file for this step.



You can use libraries such as scikit-learn, and KMeans to cluster the data using different features like location, time, shape, and even the description. Also, you may consider using DBSCAN for density-based clustering and Affinity Propagation for data with many clusters.

