In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
ufo = pd.read_csv('./ufo_sightings_large.csv')

In [2]:
# Print the DataFrame info
print(ufo.info())

# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype(float)

# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])

# Check the column types
print(ufo.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            4935 non-null   object 
 1   city            4926 non-null   object 
 2   state           4516 non-null   object 
 3   country         4255 non-null   object 
 4   type            4776 non-null   object 
 5   seconds         4935 non-null   float64
 6   length_of_time  4792 non-null   object 
 7   desc            4932 non-null   object 
 8   recorded        4935 non-null   object 
 9   lat             4935 non-null   object 
 10  long            4935 non-null   float64
dtypes: float64(2), object(9)
memory usage: 424.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            4935 non-null   d

In [3]:
# Count the missing values in the length_of_time, state, and type columns, in that order
print(ufo[['length_of_time', 'state', 'type']].isna().sum())

# Drop rows where length_of_time, state, or type are missing
ufo_no_missing = ufo.dropna(subset=['length_of_time', 'state', 'type'])

# Print out the shape of the new dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


In [4]:
import re
def return_minutes(time_string):
    # Search for numbers in time_string
    num = re.search('\d+', time_string)
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo["length_of_time"] = ufo["length_of_time"].astype('str').apply(lambda x: x if 'minute' in x.lower() else None)
ufo = ufo.dropna(subset=['length_of_time'])
ufo["minutes"] = ufo["length_of_time"].astype('str').apply(return_minutes)

# Take a look at the head of both of the columns

print(ufo[["length_of_time", "minutes"]].head())

     length_of_time  minutes
3   about 5 minutes      5.0
5        10 minutes     10.0
8         2 minutes      2.0
9         2 minutes      2.0
10        5 minutes      5.0


In [5]:
# Check the variance of the seconds and minutes columns
print(ufo[["seconds", "minutes"]].var())

# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo["seconds"])

# Print out the variance of just the seconds_log column
print(ufo["seconds_log"].var())

seconds    425466.767742
minutes       118.377156
dtype: float64
1.135245689687893


In [6]:
# Use pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda x: 1 if x == "us" else 0)

# Print the number of unique type values
print(len(ufo["type"].unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo["type"])

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)

22


In [7]:
# Look at the first 5 rows of the date column
print(ufo["date"].head()) 

# Extract the month from the date column
ufo["month"] = pd.DatetimeIndex(ufo["date"]).month

# Extract the year from the date column
ufo["year"] = pd.DatetimeIndex(ufo["date"]).year

# Take a look at the head of all three columns
print(ufo[["date", "month", "year"]].head())

3    2002-11-21 05:45:00
5    2012-06-16 23:00:00
8    2013-06-09 00:00:00
9    2013-04-26 23:27:00
10   2013-09-13 20:30:00
Name: date, dtype: datetime64[ns]
                  date  month  year
3  2002-11-21 05:45:00     11  2002
5  2012-06-16 23:00:00      6  2012
8  2013-06-09 00:00:00      6  2013
9  2013-04-26 23:27:00      4  2013
10 2013-09-13 20:30:00      9  2013


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Take a look at the head of the desc field
print(ufo['desc'].head())

# Instantiate the tfidf vectorizer object
vec = TfidfVectorizer()

# Fit and transform desc using vec
desc_tfidf = vec.fit_transform(ufo['desc'])

# Look at the number of columns and rows

print(desc_tfidf.shape)

3     It was a large&#44 triangular shaped flying ob...
5     Dancing lights that would fly around and then ...
8     Brilliant orange light or chinese lantern at o...
9     Bright red light moving north to north west fr...
10    North-east moving south-west. First 7 or so li...
Name: desc, dtype: object
(2013, 3597)


In [9]:
# Add in the rest of the arguments
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    
    # Transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # Sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]

# Print out the weighted words
# print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3))

In [10]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Call the return_weights function and extend filter_list
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
        
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)

# Call the function to get the list of word indices


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# volunteer = ufo.dropna(subset=['category_desc'])

# Take the title text
title_text = ufo["desc"]
# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors

text_tfidf = tfidf_vec.fit_transform(title_text)

In [12]:
# Make a list of features to drop   
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]

# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)
vocab = {v: k for k, v in tfidf_vec.vocabulary_.items()}

# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)
vocab

{1748: 'it',
 3444: 'was',
 1840: 'large',
 155: '44',
 3282: 'triangular',
 2794: 'shaped',
 1376: 'flying',
 2248: 'object',
 951: 'dancing',
 1891: 'lights',
 3158: 'that',
 3552: 'would',
 1375: 'fly',
 414: 'around',
 358: 'and',
 3163: 'then',
 2028: 'merge',
 1723: 'into',
 2289: 'one',
 1884: 'light',
 631: 'brilliant',
 2305: 'orange',
 2301: 'or',
 753: 'chinese',
 1834: 'lantern',
 431: 'at',
 1871: 'less',
 3157: 'than',
 15: '1000',
 1421: 'ft',
 2130: 'moving',
 1149: 'east',
 3207: 'to',
 3467: 'west',
 289: 'across',
 2244: 'oakville',
 2292: 'ontario',
 2049: 'midnight',
 1781: 'june',
 266: '9th',
 95: '2013',
 623: 'bright',
 2597: 'red',
 2211: 'north',
 1418: 'from',
 3159: 'the',
 1611: 'horizon',
 3198: 'till',
 1046: 'disapeared',
 526: 'behind',
 803: 'clouds',
 2935: 'south',
 1331: 'first',
 2907: 'so',
 912: 'craft',
 1526: 'half',
 1109: 'dozen',
 3045: 'stragglers',
 3172: 'they',
 3465: 'were',
 3092: 'surely',
 2221: 'not',
 2449: 'planes',
 2208: 'nor',

In [13]:
ufo1 = ufo[['seconds_log', 'changing', 'chevron', 'cigar', 'circle', 'cone', 'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash', 'formation', 'light', 'other', 'oval', 'rectangle',
       'sphere', 'teardrop', 'triangle', 'unknown', 'month', 'year','country_enc']]
ufo1.dropna(how='any', inplace=True)
X = ufo1.drop(columns=['country_enc'], axis=1)
y = np.array(ufo1['country_enc'])
# Take a look at the features in the X set of data


print(X.columns)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
knn = KNeighborsClassifier()
# Split the X and y sets
# Take a look at the features in the X set of data
print(X.columns)

# Split the X and y sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit knn to the training sets
knn.fit(X_train, y_train)

# Print the score of knn on the test sets
print(knn.score(X_test, y_test))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Index(['seconds_log', 'changing', 'chevron', 'cigar', 'circle', 'cone',
       'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash',
       'formation', 'light', 'other', 'oval', 'rectangle', 'sphere',
       'teardrop', 'triangle', 'unknown', 'month', 'year'],
      dtype='object')
Index(['seconds_log', 'changing', 'chevron', 'cigar', 'circle', 'cone',
       'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash',
       'formation', 'light', 'other', 'oval', 'rectangle', 'sphere',
       'teardrop', 'triangle', 'unknown', 'month', 'year'],
      dtype='object')


AttributeError: 'Flags' object has no attribute 'c_contiguous'

In [15]:

import sklearn.naive_bayes as GaussianNB


nb=GaussianNB()
# Use the list of filtered words we created to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]

# Split the X and y sets using train_test_split, setting stratify=y 
X_train, X_test, y_train, y_test = train_test_split(filtered_text.toarray(), y, stratify=y, random_state=42)

# Fit nb to the training sets
nb.fit(X_train, y_train)

# Print the score of nb on the test sets
print(nb.score(X_test, y_test))

TypeError: 'module' object is not callable