In [1]:
#By Chad Fike

#Decision trees can classify categories, and that is what we are doing here - a binary classification according to a condition. Decision trees also
#have the benefit of being less sensitive to outliers, which I felt this dataset would have considering its
#wide distribution. Decision trees are also better able to show which feature has more of an impact on the target,
#and I think I demonstrated that the pop feature had much more impact on the classification outcome than income (since sightings,
#one could assume, would be correlated with the population at large). Overall, I think the high accuracy of the model demonstrates
#that I made a good choice by using a decision tree classifier for this problem.
# import libraries

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the county-level UFO dataset from CSV into a pandas DataFrame.
# The path is relative, so the file is looked up in the current working directory.
ufo_data = pd.read_csv(filepath_or_buffer="county-data.csv")

In [3]:
# Preview the first rows to get familiar with the columns.
# DataFrame.head() returns the first n rows; n defaults to 5, so no argument is needed.
print(ufo_data.head())

   subregion          region     pop  income   ipaddr  ufo2010
0  abbeville  south carolina   25101   34670    30330        2
1     acadia       louisiana   61912   37970    38203        6
2   accomack        virginia   33341   41595    41338        2
3        ada           idaho  409061   55304  1035427       59
4      adair            iowa    7481   47623     3762        0


In [4]:
# Feature matrix (X): the model predicts from population and income only.
# The remaining columns (e.g. 'ipaddr', 'region') are deliberately left out so the
# model is restricted to these two inputs.
# NOTE(review): dropping extra columns is a feature-selection choice; it is not the
# same thing as "data leakage", which means target information ending up in the features.
X = ufo_data.loc[:, ["pop", "income"]]

In [5]:
# Binary target (y): label each row by whether its 'ufo2010' sightings count exceeds 333.
# np.where(cond, a, b) yields a where cond is True and b everywhere else, so:
#   ufo2010 >  333 -> 'Substantial UFO appearances'
#   ufo2010 <= 333 -> 'No Substantial UFO appearances'
# The labels are derived from the actual 'ufo2010' values, not from a model prediction.
y = np.where(
    ufo_data['ufo2010'].gt(333),
    'Substantial UFO appearances',
    'No Substantial UFO appearances',
)

In [6]:
# Hold out part of the data for evaluation.
# X_train/X_test are the feature rows and y_train/y_test the matching target labels.
# test_size=0.2 reserves 20% of the rows for testing; random_state=55 fixes the
# shuffle so the same split is produced on every run.
# NOTE(review): earlier notes on this cell cited a 0.25 split ratio and a seed of 25,
# which the code does not use -- confirm which values were intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=55,
)

In [7]:
# Decision tree classifier (the algorithm provided by sklearn.tree), otherwise
# default hyperparameters.
# random_state is fixed because scikit-learn randomly permutes the features at each
# split, so ties between equally good splits can resolve differently from run to
# run; seeding makes the fitted tree (and the reported accuracy) reproducible.
# 55 is the same seed value passed to train_test_split.
clf = DecisionTreeClassifier(random_state=55)

In [8]:
# Train the decision tree on the training split only.
# .fit() learns decision rules that map the features in X_train ('pop', 'income')
# to the binary labels in y_train (the classification built above).
clf.fit(X=X_train, y=y_train)


In [9]:
# Predict a label for every held-out row.
# X_test was set aside by train_test_split and never passed to .fit(), so this shows
# how well the model predicts on data it was not trained on.
y_pred = clf.predict(X=X_test)

In [10]:
# Overall accuracy across both classes: the fraction of test rows whose predicted
# label (y_pred) equals the true label (y_test).
# Example: a test record with 'ufo2010' = 55 has the true label
# 'No Substantial UFO appearances'; if y_pred gives that same label, it counts as correct.
# NOTE(review): accuracy alone can look high when one class is rare -- worth checking
# the class balance of y before reading too much into this number.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

0.9983739837398374


In [11]:
#I was skeptical of the high accuracy of my decision tree model, so I made an example predictor
#There are only 3 records in the csv where 'ufo2010' > 333
#So the model is essentially predicting based on population and, I think, perhaps slightly above-average income
#The three regions are maricopa, LA, and cook, which are also the most populated regions
#The model's prediction seems to flip around the ~4 million population mark for a region

# Build two single-row feature frames that differ only in population.
# EX 1) pop = 3 million, income = 55,000 -> expected: low UFO sightings
new_data1 = pd.DataFrame([{'pop': 3_000_000, 'income': 55_000}])

# EX 2) pop = 4 million, income = 55,000 -> expected: high UFO sightings
new_data2 = pd.DataFrame([{'pop': 4_000_000, 'income': 55_000}])

# Run each example row through the trained classifier.
prediction1, prediction2 = [clf.predict(frame) for frame in (new_data1, new_data2)]

# Show both predictions; the label array is appended directly after the text
# (no separator), matching the recorded output.
print(f'low population prediction{prediction1}')
print(f'high population prediction{prediction2}')

low population prediction['No Substantial UFO appearances']
high population prediction['Substantial UFO appearances']
