## Let's look at aliens

Source of data (Kaggle): https://www.kaggle.com/code/dhruvkalia/ufo-sightings-dbscan/data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

tempDF = pd.read_csv("Data/scrubbed.csv", low_memory=False) #make sure to use scrubbed

ufos_main = tempDF.rename({'shape': 'UFOtype', 'duration (seconds)': 'seconds', 'duration (hours/min)': 'hours_mins', 'date posted': 'posted', 'longitude ': 'longitude'}, axis=1)
ufos_main.head()

Unnamed: 0,datetime,city,state,country,UFOtype,seconds,hours_mins,comments,posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [3]:
ufos_main.shape

(80332, 11)

In [4]:
#getting rid of the time because datetime doesn't like how it's formatted
ufos_main['datetime'] = ufos_main['datetime'].str.split(' ').str[0]
ufos_main.head()

Unnamed: 0,datetime,city,state,country,UFOtype,seconds,hours_mins,comments,posted,latitude,longitude
0,10/10/1949,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [5]:
#changing dtypes
#I know there's one value that will mess with converting the lat column into a float, so I'm just getting rid of it
ufos_main.drop(ufos_main[ufos_main['latitude'] == '33q.200088'].index, inplace = True)

ufos_main.datetime=pd.to_datetime(ufos_main.datetime)
ufos_main['city'] = ufos_main['city'].astype('string')
ufos_main['state'] = ufos_main['state'].astype('string')
ufos_main['country'] = ufos_main['country'].astype('string')
ufos_main['UFOtype'] = ufos_main['UFOtype'].astype('string')
ufos_main['seconds'] = ufos_main['seconds'].astype('string')
ufos_main['hours_mins'] = ufos_main['hours_mins'].astype('string')
ufos_main['comments'] = ufos_main['comments'].astype('string')
ufos_main['posted'] = ufos_main['posted'].astype('string')
ufos_main['latitude'] = ufos_main['latitude'].astype(np.float64)

In [6]:
ufos_main.dtypes

datetime      datetime64[ns]
city                  string
state                 string
country               string
UFOtype               string
seconds               string
hours_mins            string
comments              string
posted                string
latitude             float64
longitude            float64
dtype: object

In [7]:
ufos_main.isna().sum()

datetime         0
city             0
state         5797
country       9669
UFOtype       1932
seconds          0
hours_mins       0
comments        15
posted           0
latitude         0
longitude        0
dtype: int64

In [8]:
#filling/dropping NAs
ufos_main.comments=ufos_main.comments.fillna('None or unknown')
ufos_main.UFOtype=ufos_main.UFOtype.fillna('None or unknown')
ufos_main.state=ufos_main.state.fillna('Not applicable')
ufos_main=ufos_main.dropna()
ufos_main.isna().sum()

datetime      0
city          0
state         0
country       0
UFOtype       0
seconds       0
hours_mins    0
comments      0
posted        0
latitude      0
longitude     0
dtype: int64

In [9]:
ufos_main.shape

(70662, 11)

In [10]:
caResults=ufos_main.loc[ufos_main['state'] == 'ca']

In [11]:
caResults.UFOtype.value_counts()

light              1829
circle              844
triangle            822
fireball            699
disk                630
sphere              617
other               616
unknown             564
oval                388
formation           294
changing            267
None or unknown     228
cigar               198
rectangle           150
chevron             150
flash               142
diamond             135
cylinder            120
egg                  86
teardrop             72
cross                31
cone                 28
flare                 1
round                 1
Name: UFOtype, dtype: Int64

In [151]:
locations = pd.DataFrame({'latitude':ufos_main.latitude,'longitude':ufos_main.longitude,'type':ufos_main.UFOtype})
locations.dtypes

latitude     float64
longitude    float64
type          string
dtype: object

In [152]:
locations.type = [0 if element == 'cylinder' else element for element in locations.type]
locations.type = [1 if element == 'light' else element for element in locations.type]
locations.type = [2 if element == 'circle' else element for element in locations.type]
locations.type = [3 if element == 'sphere' else element for element in locations.type]
locations.type = [4 if element == 'disk' else element for element in locations.type]
locations.type = [5 if element == 'fireball' else element for element in locations.type]
locations.type = [6 if element == 'oval' else element for element in locations.type]
locations.type = [7 if element == 'cigar' else element for element in locations.type]
locations.type = [8 if element == 'rectangle' else element for element in locations.type]
locations.type = [9 if element == 'chevron' else element for element in locations.type]
locations.type = [10 if element == 'triangle' else element for element in locations.type]
locations.type = [11 if element == 'formation' else element for element in locations.type]
locations.type = [12 if element == 'delta' else element for element in locations.type]
locations.type = [13 if element == 'changing' else element for element in locations.type]
locations.type = [14 if element == 'egg' else element for element in locations.type]
locations.type = [15 if element == 'diamond' else element for element in locations.type]
locations.type = [16 if element == 'flash' else element for element in locations.type]
locations.type = [17 if element == 'teardrop' else element for element in locations.type]
locations.type = [18 if element == 'cone' else element for element in locations.type]
locations.type = [19 if element == 'cross' else element for element in locations.type]
locations.type = [20 if element == 'pyramid' else element for element in locations.type]
locations.type = [21 if element == 'round' else element for element in locations.type]
locations.type = [22 if element == 'crescent' else element for element in locations.type]
locations.type = [23 if element == 'flare' else element for element in locations.type]
locations.type = [24 if element == 'hexagon' else element for element in locations.type]
locations.type = [25 if element == 'dome' else element for element in locations.type]
locations.type = [26 if element == 'changed' else element for element in locations.type]
locations.type = [27 if element == 'None or unknown' else element for element in locations.type]


locations = locations.dropna()
locations = locations[locations.type != 'other']
locations = locations[locations.type != 'unknown']
locations.reset_index(drop=True, inplace=True)
locations.to_csv('locations.csv')

In [153]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def knnTest(k, locations):
    x = locations[['latitude', 'longitude']]
    y = locations['type']
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, shuffle=True)
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    Y_pred = knn.predict(X_test)
    acc_knn = round(knn.score(X_test, y_test) * 100, 2)
    return acc_knn

In [154]:
locations['type'] = pd.to_numeric(locations['type'])

value_of_k = [-1] * 150
result_of_knn = [-1] * 150 
all_results = pd.DataFrame({'value_of_k':value_of_k,'result_of_knn':result_of_knn})

for i in range(1,150):
    all_results.loc[(i),"value_of_k"]=(i+3)
    all_results.loc[(i),"result_of_knn"]= knnTest(i, locations)
    
print(all_results)

     value_of_k  result_of_knn
0            -1          -1.00
1             4          12.41
2             5          15.55
3             6          17.02
4             7          17.56
..          ...            ...
145         148          23.52
146         149          23.57
147         150          23.56
148         151          23.53
149         152          23.53

[150 rows x 2 columns]


In [155]:
all_results = all_results.drop([all_results.index[0]])
all_results.to_csv('Data/knn_results.csv')

In [157]:
tempDF = ufos_main[['seconds', 'hours_mins']]
time = pd.DataFrame(columns = ['time'])
time_type = [0] * len(tempDF.hours_mins) #0 and 28 are out of bounds and will be easy to omit if not filled
ufo_type = [28] * len(tempDF.hours_mins) #these are temp lists that will go into temp df that will go into a larger df
tempDF['hours_mins'] = pd.Series(tempDF['hours_mins'], dtype="string") 

tempDF.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


seconds       string
hours_mins    string
dtype: object

In [158]:
print(tempDF.hours_mins) #this data is an absolute mess

0        45 minutes
2        20 seconds
3          1/2 hour
4        15 minutes
5         5 minutes
            ...    
80327    10 minutes
80328    20 minutes
80329          hour
80330     5 seconds
80331    17 minutes
Name: hours_mins, Length: 70662, dtype: string


In [159]:
secs = 0
mins = 0
hrs = 0
count = 0
i = "a"

for x in range(0,60000):#can say len(tempDF.hours_mins)
    i = tempDF.hours_mins[x]
    i = i.replace(".", "")
    i = i.replace("+", "")
    i = i.replace("<", "")
    i = i.replace("?", "")
    i = i.replace(">", "")
    i = i.replace("about", "")
    i = i.replace("a few", "5")
    i = i.replace("few", "5")
    i = i.replace("under", "")
    i = i.replace("several", "10")
    i = i.replace("more", "5")
    i = i.replace("couple", "2")
    i = i.replace("~", "")
    i = i.replace("~", "")
    i = i.replace("approx", "")
    i = i.replace("or", "-")#easy values to get rid of to start processing
    if "seconds" in i: #check if recorded in seconds(if substring in string)
        if(i[0].isdigit() == True and i[1].isdigit() == True): #if they are both digits record the value
            secs = int(i[0] + i[1])
            secs = secs*60 #going to record everything in minutes
            secs = round(secs,2)
            time_type[count] = secs #recording the time
            ufo_type[count] = locations.type[count] #recording the ufo
    elif "minute" in i:#check if recorded in minutes
        if(i[0].isdigit() == True and i[1].isdigit() == True):#if they are both digits record the value
            mins = int(i[0] + i[1]) #same as seconds, but not multiplying by 60
            mins = round(mins,2)
            time_type[count] = mins
            ufo_type[count] = locations.type[count]
    elif "hour" in i:#check if recorded in hours
        if(i[0].isdigit() == True and i[1].isdigit() == True):#if they are both digits record the value
            hrs = int(i[0] + i[1])#divide by zero error
            if(hrs != 0):
                hrs = hrs/60
            hrs = round(hrs,2)
            time_type[count] = hrs
            ufo_type[count] = locations.type[count]
    count+=1#count is different from i, count is place of tempdf, not everything is going in tempdf because some rows are not useful

df_length = 5000 #arbitrary, can change anytime, but for testing over and over leave at around 5k for run time
    
ufotypes = [-1] * df_length #just to fill the time df to be used for regression
durations = [-1] * df_length
timeOfDay = [-1] * df_length
ufo_time_data = pd.DataFrame({'ufos':ufotypes,'duration_of_sighting':durations, 'time_of_day':timeOfDay})

index = 0

for count in range(0,10000): #Remember to change if change df_length, but not every value is useful so cannot be max
     if(ufo_type[count] != 28 and time_type[count] != 0):#the easy values to get rid of 
        ufo_time_data.loc[index,"ufos"]=ufo_type[count] #filling 
        ufo_time_data.loc[index,"duration_of_sighting"]=time_type[count]
        hours = ufos_main.datetime[count][-5:-3]#for time of day in military time
        minutes = ufos_main.datetime[count][-2:]
        time = hours+minutes#string concat
        intTime = int(time)
        ufo_time_data.loc[index,"time_of_day"]=intTime
        index+=1

toOmit = 0 #will change depending on df_length so no need to change here
for i in ufo_time_data.time_of_day:
    if i == -1:
        toOmit+=1 #finds how many were not filled

toKeep = df_length-toOmit #will find the last instance of the good value and get rid of the rest
ufotypes = [-1] * toKeep
durations = [-1] * toKeep
timeOfDay = [-1] * toKeep
ufo_time_df = pd.DataFrame({'ufos':ufotypes,'duration_of_sighting':durations, 'time_of_day':timeOfDay})

for i in range(0,toKeep):
    ufo_time_df.loc[i,"ufos"]=ufo_time_data.ufos[i]
    ufo_time_df.loc[i,"duration_of_sighting"]=ufo_time_data.duration_of_sighting[i]
    ufo_time_df.loc[i,"time_of_day"]=ufo_time_data.time_of_day[i]
    
ufo_time_df.head()

KeyError: 1

In [None]:
for i in range(0,len(ufo_time_df['time_of_day'])):
    if(ufo_time_df.time_of_day[i] < 1200):
        ufo_time_df.time_of_day[i] = ufo_time_df.time_of_day[i] + 2400
for i in range(0,len(ufo_time_df['time_of_day'])):
    ufo_time_df.time_of_day[i] = ufo_time_df.time_of_day[i] - 1200


In [None]:
ufo_time_df.to_csv('Data/times_and_sightings.csv')

In [None]:
import seaborn as sns
states_us = ufos_main[ufos_main.country == "us"]["state"].value_counts().index
states_ratio = ufos_main[ufos_main.country == "us"]["state"].value_counts().values
states_us = [i.upper() for i in states_us]

us_state_sightings = pd.DataFrame({'states':states_us,'sightings':states_ratio})

In [None]:
us_state_sightings.to_csv('Data/states_sightings.csv')

In [None]:
ufos_main = ufos_main.dropna()
ufos_main.drop(ufos_main[ufos_main['UFOtype'] == 'other'].index, inplace = True)
ufos_main.drop(ufos_main[ufos_main['UFOtype'] == 'unknown'].index, inplace = True)

uniqueUFOs = ufos_main.UFOtype.unique()
count = [0] * len(uniqueUFOs) 
j = 0

for i in uniqueUFOs:
    count[j] = (ufos_main['UFOtype']==i).sum()
    j+=1

sightingCounts = pd.DataFrame({'ufo':uniqueUFOs,'count':count})
sightingCounts.to_csv('Data/sighting_counts.csv')