In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

In [2]:
ufo = pd.read_csv('ufo_sightings_clean.csv')
ufo.head(5)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,...,month,day,hour,minute,month_day,diff_weeks,diff_months,diff_years,season,time of day
0,1949-10-10 20:30:00,San Marcos,TX,USA,Cylinder,2700.0,45 minutes,This event took place in early fall around 194...,2004-04-27,29.883056,...,10.0,10.0,20.0,30.0,Oct 10,2846.0,655.0,55.0,Fall,Evening
1,1949-10-10 21:00:00,Lackland Afb,TX,USA,Light,7200.0,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,29.38421,...,10.0,10.0,21.0,0.0,Oct 10,2932.0,674.0,56.0,Fall,Night
2,1955-10-10 17:00:00,Chester (Uk/England),,GBR,Circle,20.0,20 seconds,Green/Orange circular disc over Chester&#44 En...,2008-01-21,53.2,...,10.0,10.0,17.0,0.0,Oct 10,2728.0,627.0,52.0,Fall,Evening
3,1956-10-10 21:00:00,Edna,TX,USA,Circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.978333,...,10.0,10.0,21.0,0.0,Oct 10,2466.0,567.0,47.0,Fall,Night
4,1960-10-10 20:00:00,Kaneohe,HI,USA,Light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.418056,...,10.0,10.0,20.0,0.0,Oct 10,2258.0,519.0,43.0,Fall,Evening


In [14]:
# Drop unused columns for ease of use

ufo_data = ufo.drop(columns = ['country', 'duration (seconds)', 'duration (hours/min)','time','month_day','diff_months',
                               'date','year','day','minute','diff_weeks','diff_years','comments', 'latitude', 'longitude',
                               'hour'])
ufo1 = ufo.drop(columns = ['datetime', 'country', 'duration (seconds)', 'duration (hours/min)','time','month_day','diff_months',
                          'diff_weeks','diff_years','comments', 'date posted', 'latitude', 'longitude', 'month', 'hour',
                          'date','year','day','minute'])
ufo_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79937 entries, 0 to 79936
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   datetime     79937 non-null  object 
 1   city         79937 non-null  object 
 2   state        74530 non-null  object 
 3   shape        78013 non-null  object 
 4   date posted  79937 non-null  object 
 5   month        79937 non-null  float64
 6   season       79937 non-null  object 
 7   time of day  79937 non-null  object 
dtypes: float64(1), object(7)
memory usage: 4.9+ MB


In [15]:
# Create a outcome column to add a 0 to shapes 'other' & 'unknown' and a 1 to all others
# Used this as the target in the regression

ufo1['outcome'] = '1'

for index, row in ufo1.iterrows():
    outcome = '1' 
    # if shape is null, other, or unknown then place 0 in outcome column
    if not row['shape'] or row['shape'] == 'Other' or row['shape'] == 'Unknown':
        outcome = '0' #replace '1' with '0' if criteria is met
        row['outcome'] = outcome

ufo1.head(20)

Unnamed: 0,city,state,shape,season,time of day,outcome
0,San Marcos,TX,Cylinder,Fall,Evening,1
1,Lackland Afb,TX,Light,Fall,Night,1
2,Chester (Uk/England),,Circle,Fall,Evening,1
3,Edna,TX,Circle,Fall,Night,1
4,Kaneohe,HI,Light,Fall,Evening,1
5,Bristol,TN,Sphere,Fall,Evening,1
6,Penarth (Uk/Wales),,Circle,Fall,Night,1
7,Norwalk,CT,Disk,Fall,Night,1
8,Pell City,AL,Disk,Fall,Evening,1
9,Live Oak,FL,Disk,Fall,Night,1


In [5]:
# Change outcome from string to numerical
#ufo1['outcome'] = ufo1['outcome'].apply(pd.to_numeric)

In [16]:
# Use label encoder to change ufo_data into same format as outcome column

from sklearn.preprocessing import LabelEncoder

ufo_data = ufo_data.apply(LabelEncoder().fit_transform)
ufo_data.head()

Unnamed: 0,datetime,city,state,shape,date posted,month,season,time of day
0,110,15355,57,8,142,9,0,1
1,111,9031,57,19,181,9,0,4
2,318,3150,67,4,198,9,0,1
3,358,4998,57,4,130,9,0,4
4,579,8457,13,19,131,9,0,1


In [17]:
# Assign X (data) and y (target)

X = ufo_data
y = ufo1['outcome']
print(X.shape, y.shape)

(79937, 8) (79937,)


In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify = y)

In [19]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 100000)
lr.fit(X_train, y_train)

In [20]:
print(f"Training Data Score: {lr.score(X_train, y_train)}")
print(f"Testing Data Score: {lr.score(X_test, y_test)}")

Training Data Score: 0.8505304243394716
Testing Data Score: 0.8520890668001001


In [28]:
y_pred = lr.predict(X_test)
print(f'First 20 Predictions: {y_pred[:100]}')

First 20 Predictions: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1']


In [29]:
pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True).head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1
