In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

In [2]:
ufo = pd.read_csv('ufo_sightings_clean.csv')
ufo.head(5)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,month,season,hour,period
0,1949-10-10 20:30:00,San Marcos,TX,US,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,2004-04-27,29.883056,-97.941111,10,Fall,20,Evening
1,1956-10-10 21:00:00,Edna,TX,US,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.978333,-96.645833,10,Fall,21,Night
2,1960-10-10 20:00:00,Kaneohe,HI,US,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.418056,-157.803611,10,Fall,20,Evening
3,1961-10-10 19:00:00,Bristol,TN,US,sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,2007-04-27,36.595,-82.188889,10,Fall,19,Evening
4,1965-10-10 23:45:00,Norwalk,CT,US,disk,1200.0,20 minutes,A bright orange color changing to reddish colo...,1999-10-02,41.1175,-73.408333,10,Fall,23,Night


In [3]:
# Drop unused columns for ease of use

ufo_data = ufo.drop(columns = ['country', 'duration (seconds)', 'duration (hours/min)',
                               'comments', 'latitude', 'longitude', 'hour'])
ufo1 = ufo.drop(columns = ['datetime', 'country', 'duration (seconds)', 'duration (hours/min)',
                          'comments', 'date posted', 'latitude', 'longitude', 'month', 'hour'])
ufo_data.head()

Unnamed: 0,datetime,city,state,shape,date posted,month,season,period
0,1949-10-10 20:30:00,San Marcos,TX,cylinder,2004-04-27,10,Fall,Evening
1,1956-10-10 21:00:00,Edna,TX,circle,2004-01-17,10,Fall,Night
2,1960-10-10 20:00:00,Kaneohe,HI,light,2004-01-22,10,Fall,Evening
3,1961-10-10 19:00:00,Bristol,TN,sphere,2007-04-27,10,Fall,Evening
4,1965-10-10 23:45:00,Norwalk,CT,disk,1999-10-02,10,Fall,Night


In [4]:
# Create a outcome column to add a 0 to shapes 'other' & 'unknown' and a 1 to all others
# Used this as the target in the regression

ufo1['outcome'] = '1'

for index, row in ufo1.iterrows():
    outcome = '1' 
    # print(row['shape'])
    if not row['shape'] or row['shape'] == 'other' or row['shape'] == 'unknown':
        outcome = '0' #replace '1' with '0' if criteria is met
        row['outcome'] = outcome

ufo1.head(11)

Unnamed: 0,city,state,shape,season,period,outcome
0,San Marcos,TX,cylinder,Fall,Evening,1
1,Edna,TX,circle,Fall,Night,1
2,Kaneohe,HI,light,Fall,Evening,1
3,Bristol,TN,sphere,Fall,Evening,1
4,Norwalk,CT,disk,Fall,Night,1
5,Pell City,AL,disk,Fall,Evening,1
6,Live Oak,FL,disk,Fall,Night,1
7,Hawthorne,CA,circle,Fall,Noon,1
8,Brevard,NC,fireball,Fall,Evening,1
9,Bellmore,NY,disk,Fall,Noon,1


In [5]:
# Change outcome from string to numerical
# ufo1['outcome'] = ufo1['outcome'].apply(pd.to_numeric)

In [6]:
# Use label encoder to change ufo_data into same format as outcome column

from sklearn.preprocessing import LabelEncoder

ufo_data = ufo_data.apply(LabelEncoder().fit_transform)
ufo_data.head()

Unnamed: 0,datetime,city,state,shape,date posted,month,season,period
0,83,8823,44,8,140,9,0,1
1,268,2890,44,4,128,9,0,4
2,455,4886,11,18,129,9,0,1
3,494,1153,43,24,190,9,0,1
4,811,7156,6,11,19,9,0,4


In [7]:
# Assign X (data) and y (target)

X = ufo_data
y = ufo1['outcome']
print(X.shape, y.shape)

(63561, 8) (63561,)


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify = y)

In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 100000)
lr.fit(X_train, y_train)

In [14]:
print(f"Training Data Score: {lr.score(X_train, y_train)}")
print(f"Testing Data Score: {lr.score(X_test, y_test)}")

Training Data Score: 0.8487518355359766
Testing Data Score: 0.8492228305330061


In [17]:
y_pred = lr.predict(X_test)
print(f'First 20 Predictions: {y_pred[:20]}')

First 20 Predictions: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '0' '1' '1'
 '1' '1']


In [12]:
pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True).head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1
