# Preprocessing for Machine Learning in Python

## 1. Introduction to Data Preprocessing

In [11]:
import pandas as pd

volunteer = pd.read_csv('volunteer_opportunities.csv')
volunteer.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,...,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 05 2011,approved,,,,,,,,


In [12]:
volunteer['locality'].isna().sum()

70

Answer: 70

In [13]:
# Drop the Latitude and Longitude columns from volunteer
volunteer_cols = volunteer.drop(['Latitude', 'Longitude'], axis=1)

# Drop rows with missing category_desc values from volunteer_cols
volunteer_subset = volunteer_cols.dropna(subset=['category_desc'])

# Print out the shape of the subset
print(volunteer_subset.shape)

(617, 33)


In [14]:
volunteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   opportunity_id      665 non-null    int64  
 1   content_id          665 non-null    int64  
 2   vol_requests        665 non-null    int64  
 3   event_time          665 non-null    int64  
 4   title               665 non-null    object 
 5   hits                665 non-null    int64  
 6   summary             665 non-null    object 
 7   is_priority         62 non-null     object 
 8   category_id         617 non-null    float64
 9   category_desc       617 non-null    object 
 10  amsl                0 non-null      float64
 11  amsl_unit           0 non-null      float64
 12  org_title           665 non-null    object 
 13  org_content_id      665 non-null    int64  
 14  addresses_count     665 non-null    int64  
 15  locality            595 non-null    object 
 16  region  

Answer: Floats, Integers, and objects

In [15]:
# Print the head of the hits column
print(volunteer["hits"].head())

# Convert the hits column to type int
volunteer["hits"] = volunteer["hits"].astype('int')

# Look at the dtypes of the dataset
print(volunteer.dtypes)

0    737
1     22
2     62
3     14
4     31
Name: hits, dtype: int64
opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int32
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64


In [16]:
volunteer['category_desc'].value_counts()

Strengthening Communities    307
Helping Neighbors in Need    119
Education                     92
Health                        52
Environment                   32
Emergency Preparedness        15
Name: category_desc, dtype: int64

Answer: Environment and Emergency Preparedness

In [18]:
volunteer = volunteer.dropna(subset=['category_id', 'category_desc'])

In [19]:
from sklearn.model_selection import train_test_split

# Create a DataFrame with all columns except category_desc
X = volunteer.drop('category_desc', axis=1)

# Create a category_desc labels dataset
y = volunteer[['category_desc']]

# Use stratified sampling to split up the dataset according to the y dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Print the category_desc counts from y_train
print(y_train['category_desc'].value_counts())

Strengthening Communities    230
Helping Neighbors in Need     89
Education                     69
Health                        39
Environment                   24
Emergency Preparedness        11
Name: category_desc, dtype: int64


## 2. Standardizing Data

Answer: Your dataset is comprised of categorical data.

In [22]:
wine = pd.read_csv('wine_types.csv')
wine.head()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [24]:
X = wine[['Proline', 'Total phenols', 'Hue', 'Nonflavanoid phenols']]
y = wine['Type']

In [25]:
from sklearn.neighbors import KNeighborsClassifier

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

knn = KNeighborsClassifier()

# Fit the knn model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.6888888888888889


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [26]:
wine.var()

Type                                0.600679
Alcohol                             0.659062
Malic acid                          1.248015
Ash                                 0.075265
Alcalinity of ash                  11.152686
Magnesium                         203.989335
Total phenols                       0.391690
Flavanoids                          0.997719
Nonflavanoid phenols                0.015489
Proanthocyanins                     0.327595
Color intensity                     5.374449
Hue                                 0.052245
OD280/OD315 of diluted wines        0.504086
Proline                         99166.717355
dtype: float64

Answer: Proline

In [28]:
import numpy as np

# Print out the variance of the Proline column
print(wine['Proline'].var())

# Apply the log normalization function to the Proline column
wine['Proline_log'] = np.log(wine['Proline'])

# Check the variance of the normalized Proline column
print(wine['Proline_log'].var())

99166.71735542428
0.17231366191842018


In [29]:
wine.describe()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline,Proline_log
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,6.530303
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.415107
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,5.627621
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,6.215606
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,6.512486
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,6.892642
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,7.426549


Answer: The max of Ash is 3.23, the max of Alcalinity of ash is 30, and the max of Magnesium is 162.

In [30]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Create the scaler
scaler = StandardScaler()

# Subset the DataFrame you want to scale 
wine_subset = wine[['Ash', 'Alcalinity of ash', 'Magnesium']]

# Apply the scaler to wine_subset
wine_subset_scaled = scaler.fit_transform(wine_subset)

In [33]:
X = wine.drop(['Type'], axis=1)
y = wine['Type']

In [34]:
# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.7777777777777778


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Instantiate a StandardScaler
scaler = StandardScaler()

# Scale the training and test features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train_scaled, y_train)

# Score the model on the test data
print(knn.score(X_test_scaled, y_test))

0.9555555555555556


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## 3. Feature Engineering

Answer: 1 and 2

In [36]:
volunteer.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 05 2011,approved,,,,,,,,
5,5056,37426,15,0,Queens Stop 'N' Swap,135,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 12 2011,approved,,,,,,,,


Answer: 2, 3, and 4

In [38]:
hiking = pd.read_json('hiking.json')
hiking.head()

Unnamed: 0,Prop_ID,Name,Location,Park_Name,Length,Difficulty,Other_Details,Accessible,Limited_Access,lat,lon
0,B057,Salt Marsh Nature Trail,"Enter behind the Salt Marsh Nature Center, loc...",Marine Park,0.8 miles,,<p>The first half of this mile-long trail foll...,Y,N,,
1,B073,Lullwater,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,1.0 mile,Easy,Explore the Lullwater to see how nature thrive...,N,N,,
2,B073,Midwood,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.75 miles,Easy,Step back in time with a walk through Brooklyn...,N,N,,
3,B073,Peninsula,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Discover how the Peninsula has changed over th...,N,N,,
4,B073,Waterfall,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Trace the source of the Lake on the Waterfall ...,N,N,,


In [41]:
from sklearn.preprocessing import LabelEncoder

# Set up the LabelEncoder object
enc = LabelEncoder()

# Apply the encoding to the "Accessible" column
hiking["Accessible_enc"] = enc.fit_transform(hiking["Accessible"])

# Compare the two columns
hiking[["Accessible", "Accessible_enc"]].head()

Unnamed: 0,Accessible,Accessible_enc
0,Y,1
1,N,0
2,N,0
3,N,0
4,N,0


In [42]:
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer['category_desc'])

# Take a look at the encoded columns
category_enc.head()

Unnamed: 0,Education,Emergency Preparedness,Environment,Health,Helping Neighbors in Need,Strengthening Communities
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,1,0,0,0
5,0,0,1,0,0,0


In [44]:
running_times_5k = pd.DataFrame ({'name': {0: 'Sue', 1: 'Mark', 2: 'Sean', 3: 'Erin', 4: 'Jenny', 5: 'Russell'},
                                  'run1': {0: 20.1, 1: 16.5, 2: 23.5, 3: 21.7, 4: 25.8, 5: 30.9},
                                  'run2': {0: 18.5, 1: 17.1, 2: 25.1, 3: 21.1, 4: 27.1, 5: 29.6},
                                  'run3': {0: 19.6, 1: 16.9, 2: 25.2, 3: 20.9, 4: 26.1, 5: 31.4},
                                  'run4': {0: 20.3, 1: 17.6, 2: 24.6, 3: 22.1, 4: 26.7, 5: 30.4},
                                  'run5': {0: 18.3, 1: 17.3, 2: 23.9, 3: 22.2, 4: 26.9, 5: 29.9}})

running_times_5k

Unnamed: 0,name,run1,run2,run3,run4,run5
0,Sue,20.1,18.5,19.6,20.3,18.3
1,Mark,16.5,17.1,16.9,17.6,17.3
2,Sean,23.5,25.1,25.2,24.6,23.9
3,Erin,21.7,21.1,20.9,22.1,22.2
4,Jenny,25.8,27.1,26.1,26.7,26.9
5,Russell,30.9,29.6,31.4,30.4,29.9


In [45]:
# Use .loc to create a mean column
running_times_5k["mean"] = running_times_5k.loc[:, 'run1':'run5'].mean(axis=1)

# Take a look at the results
running_times_5k.head()

Unnamed: 0,name,run1,run2,run3,run4,run5,mean
0,Sue,20.1,18.5,19.6,20.3,18.3,19.36
1,Mark,16.5,17.1,16.9,17.6,17.3,17.08
2,Sean,23.5,25.1,25.2,24.6,23.9,24.46
3,Erin,21.7,21.1,20.9,22.1,22.2,21.6
4,Jenny,25.8,27.1,26.1,26.7,26.9,26.52


In [46]:
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer['start_date_date'])

# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer['start_date_converted'].dt.month

# Take a look at the converted and new month columns
volunteer[['start_date_converted', 'start_date_month']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volunteer["start_date_converted"] = pd.to_datetime(volunteer['start_date_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volunteer["start_date_month"] = volunteer['start_date_converted'].dt.month


Unnamed: 0,start_date_converted,start_date_month
1,2011-02-01,2
2,2011-01-29,1
3,2011-02-14,2
4,2011-02-05,2
5,2011-02-12,2


In [52]:
hiking = hiking.dropna(subset=['Length'])

In [53]:
import re

# Write a pattern to extract numbers and decimals
def return_mileage(length):
    
    # Search the text for matches
    mile = re.search("\d+\.\d+", length)
    
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
        
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking["Length"].apply(return_mileage)
hiking[["Length", "Length_num"]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hiking["Length_num"] = hiking["Length"].apply(return_mileage)


Unnamed: 0,Length,Length_num
0,0.8 miles,0.8
1,1.0 mile,1.0
2,0.75 miles,0.75
3,0.5 miles,0.5
4,0.5 miles,0.5


In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Take the title text
title_text = volunteer["title"]

# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)

In [56]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"]
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y, random_state=42)

# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy
print(nb.score(X_test, y_test))

0.5161290322580645


## 4. Selecting Features for Modeling

Answer: A text field that hasn't been turned into a tf/idf vector yet

In [57]:
hiking.head()

Unnamed: 0,Prop_ID,Name,Location,Park_Name,Length,Difficulty,Other_Details,Accessible,Limited_Access,lat,lon,Accessible_enc,Length_num
0,B057,Salt Marsh Nature Trail,"Enter behind the Salt Marsh Nature Center, loc...",Marine Park,0.8 miles,,<p>The first half of this mile-long trail foll...,Y,N,,,1,0.8
1,B073,Lullwater,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,1.0 mile,Easy,Explore the Lullwater to see how nature thrive...,N,N,,,0,1.0
2,B073,Midwood,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.75 miles,Easy,Step back in time with a walk through Brooklyn...,N,N,,,0,0.75
3,B073,Peninsula,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Discover how the Peninsula has changed over th...,N,N,,,0,0.5
4,B073,Waterfall,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Trace the source of the Lake on the Waterfall ...,N,N,,,0,0.5


Answer: All of the above.

In [66]:
volunteer = pd.read_csv('volunteer_opportunities.csv')
volunteer['vol_requests_lognorm'] = np.log(volunteer['vol_requests'])
volunteer['created_month'] = pd.to_datetime(volunteer['created_date']).dt.month
volunteer = pd.concat([volunteer, pd.get_dummies(volunteer['category_desc'])], axis=1)
volunteer = volunteer[['vol_requests', 'title', 'hits', 'category_desc', 'locality', 'region', 'postalcode', 'created_date', 
                       'vol_requests_lognorm', 'created_month', 'Education', 'Emergency Preparedness',  'Environment', 
                       'Health', 'Helping Neighbors in Need', 'Strengthening Communities']]

In [67]:
# Create a list of redundant column names to drop
to_drop = ["locality", "region", "vol_requests", "category_desc", "created_date"]

# Drop those columns from the dataset
volunteer_subset = volunteer.drop(to_drop, axis=1)

# Print out the head of volunteer_subset
volunteer_subset.head()

Unnamed: 0,title,hits,postalcode,vol_requests_lognorm,created_month,Education,Emergency Preparedness,Environment,Health,Helping Neighbors in Need,Strengthening Communities
0,Volunteers Needed For Rise Up & Stay Put! Home...,737,,3.912023,1,0,0,0,0,0,0
1,Web designer,22,10010.0,0.693147,1,0,0,0,0,0,1
2,Urban Adventures - Ice Skating at Lasker Rink,62,10026.0,2.995732,1,0,0,0,0,0,1
3,Fight global hunger and support women farmers ...,14,2114.0,6.214608,1,0,0,0,0,0,1
4,Stop 'N' Swap,31,10455.0,2.70805,1,0,0,1,0,0,0


In [76]:
wine = pd.read_csv('wine_types.csv')
wine = wine[['Flavanoids', 'Total phenols', 'Malic acid', 'OD280/OD315 of diluted wines', 'Hue']]

In [77]:
# Print out the column correlations of the wine dataset
wine.corr()

Unnamed: 0,Flavanoids,Total phenols,Malic acid,OD280/OD315 of diluted wines,Hue
Flavanoids,1.0,0.864564,-0.411007,0.787194,0.543479
Total phenols,0.864564,1.0,-0.335167,0.699949,0.433681
Malic acid,-0.411007,-0.335167,1.0,-0.36871,-0.561296
OD280/OD315 of diluted wines,0.787194,0.699949,-0.36871,1.0,0.565468
Hue,0.543479,0.433681,-0.561296,0.565468,1.0


In [78]:
# Drop that column from the DataFrame
wine = wine.drop('Flavanoids', axis=1)

wine.head()

Unnamed: 0,Total phenols,Malic acid,OD280/OD315 of diluted wines,Hue
0,2.8,1.71,3.92,1.04
1,2.65,1.78,3.4,1.05
2,2.8,2.36,3.17,1.03
3,3.85,1.95,3.45,0.86
4,2.8,2.59,2.93,1.04


In [112]:
volunteer = pd.read_csv('volunteer_opportunities.csv')
volunteer = volunteer[['category_desc', 'title']]
volunteer = volunteer.dropna(subset=['category_desc'])

In [113]:
vocab_list = [(1048, 'web'), (278, 'designer'), (1017, 'urban'), (38, 'adventures'), (490, 'ice'), (890, 'skating'), (90, 'at'), (559, 'lasker'), (832, 'rink'), (368, 'fight'), (423, 'global'), (487, 'hunger'), (68, 'and'), (944, 'support'), (1061, 'women'), (356, 'farmers'), (535, 'join'), (969, 'the'), (708, 'oxfam'), (27, 'action'), (240, 'corps'), (498, 'in'), (680, 'nyc'), (922, 'stop'), (947, 'swap'), (790, 'queens'), (911, 'staff'), (281, 'development'), (992, 'trainer'), (200, 'claro'), (145, 'brooklyn'), (1037, 'volunteer'), (93, 'attorney'), (221, 'community'), (455, 'health'), (43, 'advocates'), (942, 'supervise'), (189, 'children'), (466, 'highland'), (717, 'park'), (409, 'garden'), (1071, 'worldofmoney'), (696, 'org'), (1085, 'youth'), (60, 'amazing'), (791, 'race'), (789, 'qualified'), (133, 'board'), (620, 'member'), (860, 'seats'), (98, 'available'), (1083, 'young'), (33, 'adult'), (1006, 'tutor'), (1016, 'updated'), (11, '30'), (0, '11'), (513, 'insurance'), (199, 'claims'), (600, 'manager'), (979, 'timebanksnyc'), (432, 'great'), (340, 'exchange'), (205, 'clean'), (1015, 'up'), (81, 'asbury'), (171, 'cementary'), (918, 'staten'), (524, 'island'), (869, 'senior'), (194, 'citizen'), (392, 'friendly'), (1033, 'visitor'), (881, 'shop'), (1000, 'tree'), (161, 'care'), (1068, 'workshop'), (4, '20'), (646, 'movie'), (856, 'screener'), (380, 'for'), (870, 'seniors'), (355, 'farm'), (430, 'graphic'), (691, 'open'), (480, 'house'), (416, 'get'), (984, 'tools'), (980, 'to'), (806, 'recycling'), (1039, 'volunteers'), (660, 'needed'), (353, 'family'), (336, 'event'), (207, 'clerical'), (158, 'cancer'), (1041, 'walk'), (120, 'befitnyc'), (739, 'physical'), (30, 'activity'), (700, 'organizers'), (269, 'decision'), (266, 'day'), (5, '2011'), (661, 'needs'), (1084, 'your'), (459, 'help'), (405, 'gain'), (1021, 'valuable'), (245, 'counseling'), (344, 'experience'), (687, 'on'), (845, 'samaritans'), (9, '24'), (479, 'hour'), (255, 'crisis'), (478, 'hotline'), (457, 'heart'), (407, 'gallery'), (703, 'our'), (503, 'info'), (949, 'table'), (373, 'finding'), (471, 'homes'), (542, 'kids'), (1077, 'yiddish'), (903, 'speaking'), (472, 'homework'), (460, 'helper'), (892, 'skilled'), (800, 'rebuilding'), (982, 'together'), (468, 'home'), (818, 'repairs'), (438, 'greenteam'), (40, 'advetures'), (940, 'summer'), (931, 'streets'), (1005, 'tuesday'), (335, 'evenings'), (1060, 'with'), (612, 'masa'), (594, 'lunch'), (770, 'program'), (1018, 'us'), (706, 'outreach'), (618, 'meals'), (760, 'preparedness'), (222, 'compost'), (773, 'project'), (613, 'master'), (223, 'composter'), (178, 'certificate'), (249, 'course'), (318, 'emblemhealth'), (144, 'bronx'), (683, 'of'), (873, 'service'), (531, 'jcc'), (601, 'manhattan'), (418, 'girl'), (855, 'scout'), (872, 'series'), (296, 'dorot'), (838, 'rosh'), (452, 'hashanah'), (709, 'package'), (274, 'delivery'), (713, 'painting'), (511, 'instructor'), (530, 'jasa'), (464, 'hes'), (172, 'center'), (12, '3rd'), (70, 'annual'), (377, 'flyny'), (548, 'kite'), (366, 'festival'), (983, 'tomorrow'), (151, 'business'), (566, 'leaders'), (955, 'teach'), (110, 'basics'), (465, 'high'), (852, 'schoolers'), (410, 'gardening'), (397, 'ft'), (1004, 'tryon'), (910, 'st'), (610, 'martin'), (748, 'poetry'), (668, 'new'), (1079, 'york'), (216, 'college'), (424, 'goal'), (941, 'sunday'), (361, 'february'), (6, '2012'), (262, 'dance'), (8, '22nd'), (560, 'latino'), (604, 'march'), (2, '17'), (1013, 'university'), (848, 'saturday'), (1008, 'tutors'), (744, 'planet'), (485, 'human'), (602, 'mapping'), (420, 'give'), (1050, 'week'), (186, 'child'), (569, 'learn'), (796, 'read'), (926, 'storytelling'), (243, 'costume'), (597, 'making'), (912, 'stage'), (277, 'design'), (319, 'emergency'), (351, 'fair'), (17, '9th'), (1053, 'west'), (887, 'side'), (248, 'county'), (676, 'nutrition'), (314, 'educator'), (879, 'shape'), (306, 'east'), (13, '54st'), (801, 'rec'), (1046, 'water'), (45, 'aerobics'), (83, 'asser'), (573, 'levy'), (712, 'paint'), (57, 'alongside'), 
              (783, 'publicolor'), (936, 'students'), (536, 'jumpstart'), (797, 'readers'), (564, 'lead'), (252, 'crafts'), (408, 'games'), (348, 'face'), (751, 'popcorn'), (527, 'jackie'), (835, 'robinson'), (716, 'parent'), (375, 'fitness'), (916, 'starrett'), (197, 'city'), (585, 'line'), (263, 'dancer'), (615, 'math'), (587, 'literacy'), (114, 'be'), (209, 'climb'), (985, 'top'), (608, 'marketing'), (86, 'assistant'), (313, 'education'), (673, 'nonprofit'), (867, 'seeks'), (805, 'recruitment'), (626, 'mentors'), (810, 'register'), (92, 'attend'), (142, 'breakfast'), (701, 'orientation'), (529, 'january'), (272, 'deliver'), (1058, 'winter'), (1031, 'visit'), (65, 'an'), (525, 'isolated'), (342, 'exercise'), (213, 'coach'), (670, 'night'), (115, 'beach'), (180, 'change'), (77, 'art'), (772, 'programs'), (229, 'consumer'), (779, 'protection'), (562, 'law'), (589, 'liver'), (579, 'life'), (565, 'leader'), (901, 'soup'), (547, 'kitchen'), (307, 'eastern'), (534, 'john'), (650, 'muir'), (930, 'street'), (1024, 'vendor'), (641, 'monthly'), (959, 'team'), (367, 'fiesta'), (977, 'throgs'), (658, 'neck'), (224, 'computer'), (956, 'teacher'), (567, 'leadership'), (244, 'council'), (693, 'opportunity'), (231, 'conversation'), (461, 'helpers'), (427, 'grades'), (714, 'pantry'), (288, 'distribution'), (305, 'earth'), (960, 'tech'), (1049, 'website'), (692, 'opportunities'), (175, 'cents'), (19, 'ability'), (203, 'classroom'), (877, 'set'), (146, 'brush'), (545, 'kindness'), (999, 'transportation'), (58, 'alternatives'), (129, 'bike'), (1020, 'valet'), (1026, 'video'), (311, 'editing'), (767, 'professionals'), (921, 'stipend'), (49, 'after'), (851, 'school'), (624, 'mentor'), (666, 'networking'), (138, 'bowling'), (398, 'fun'), (449, 'harlem'), (555, 'lanes'), (866, 'seeking'), (1078, 'yoga'), (902, 'spanish'), (695, 'or'), (389, 'french'), (362, 'feed'), (488, 'hungry'), (1080, 'yorkers'), (14, '55'), (690, 'only'), (735, 'phone'), (106, 'bank'), (819, 'representative'), (795, 'reach'), (704, 'out'), (643, 'morris'), (458, 'heights'), (904, 'special'), (155, 'camp'), (946, 'susan'), (551, 'komen'), (259, 'cure'), (433, 'greater'), (47, 'affiliate'), (303, 'dumbo'), (79, 'arts'), (698, 'organizational'), (148, 'budget'), (639, 'money'), (596, 'makes'), (871, 'sense'), (994, 'training'), (889, 'site'), (1027, 'videographer'), (376, 'fly'), (152, 'by'), (970, 'theater'), (429, 'grant'), (1074, 'writer'), (745, 'planning'), (778, 'proposal'), (759, 'preparation'), (399, 'fund'), (793, 'raising'), (450, 'harm'), (808, 'reduction'), (35, 'adv'), (515, 'intern'), (875, 'serving'), (575, 'lgbt'), (34, 'adults'), (482, 'how'), (830, 'ride'), (130, 'bikes'), (821, 'research'), (401, 'fundraising'), (280, 'developement'), (233, 'cook'), (840, 'row'), (50, 'afterschool'), (630, 'middle'), (885, 'shower'), (400, 'fundraisers'), (526, 'it'), (519, 'interpreters'), (563, 'lawyers'), (446, 'haitian'), (18, 'abe'), (757, 'pre'), (412, 'ged'), (640, 'monitor'), (89, 'astoria'), (634, 'million'), (1001, 'trees'), (421, 'giveaway'), (290, 'do'), (1081, 'you'), (1044, 'want'), (595, 'make'), (283, 'difference'), (204, 'classwish'), (896, 'snow'), (883, 'shoveling'), (196, 'citizenship'), (761, 'press'), (586, 'list'), (781, 'public'), (813, 'relations'), (743, 'plan'), (829, 'review'), (394, 'friendship'), (753, 'positive'), (121, 'beginnings'), (546, 'kit'), (611, 'mary'), (803, 'recreation'), (291, 'does'), (697, 'organization'), (659, 'need'), (858, 'search'), (928, 'strategy'), (332, 'esl'), (46, 'affected'), (924, 'storm'), (995, 'transform'), (590, 'lives'), (933, 'strengthen'), (220, 'communities'), (119, 'become'), (302, 'driver'), (1025, 'veterans'), (191, 'chinese'), (997, 'translator'), (512, 'instructors'), (653, 'museum'), (621, 'membership'), (275, 'department'), (284, 'director'), (117, 'beautify'), (996, 'transitional'), (822, 'residence'), (470, 'homeless'), (623, 'men'), (953, 'tank'), (517, 'internship'), (774, 'projects'), (841, 'run'), (1056, 'wild'), (139, 'boys'), 
              (475, 'hope'), (419, 'girls'), (219, 'communications'), (792, 'raise'), (100, 'awareness'), (31, 'administrative'), (56, 'alliance'), (811, 'registrar'), (647, 'ms'), (1062, 'word'), (162, 'career'), (246, 'counselor'), (722, 'passover'), (304, 'early'), (188, 'childhood'), (149, 'build'), (747, 'plastic'), (137, 'bottle'), (857, 'sculpture'), (763, 'pride'), (523, 'is'), (538, 'just'), (76, 'around'), (238, 'corner'), (520, 'involved'), (675, 'now'), (390, 'fresh'), (53, 'air'), (957, 'teachers'), (372, 'find'), (729, 'perfect'), (533, 'job'), (684, 'office'), (1075, 'writing'), (264, 'data'), (326, 'entry'), (29, 'activism'), (738, 'photography'), (843, 'salesforce'), (265, 'database'), (261, 'customization'), (736, 'photo'), (333, 'essay'), (572, 'legal'), (42, 'advisor'), (467, 'hike'), (974, 'thon'), (236, 'coordinator'), (558, 'laser'), (950, 'tag'), (298, 'dowling'), (3, '175th'), (505, 'information'), (962, 'technology'), (352, 'fall'), (382, 'forest'), (826, 'restoration'), (541, 'kickoff'), (1002, 'trevor'), (582, 'lifeline'), (247, 'counselors'), (973, 'thomas'), (532, 'jefferson'), (614, 'materials'), (1076, 'year'), (386, 'founder'), (341, 'executive'), (453, 'haunted'), (557, 'lantern'), (989, 'tours'), (383, 'fort'), (986, 'totten'), (657, 'national'), (878, 'sexual'), (82, 'assault'), (689, 'online'), (993, 'trainers'), (48, 'african'), (63, 'american'), (210, 'clothing'), (301, 'drive'), (828, 'returning'), (865, 'seeds'), (939, 'success'), (746, 'plant'), (981, 'today'), (443, 'growth'), (1009, 'udec'), (328, 'enviromedia'), (636, 'mobile'), (606, 'maritime'), (102, 'bacchanal'), (742, 'pirates'), (365, 'fest'), (492, 'ikea'), (329, 'erie'), (111, 'basin'), (282, 'diabetes'), (88, 'association'), (364, 'feria'), (267, 'de'), (844, 'salud'), (664, 'nepali'), (105, 'bangla'), (784, 'punjabi'), (998, 'translators'), (674, 'not'), (769, 'profit'), (741, 'pioneer'), (159, 'capoeira'), (1023, 'various'), (752, 'positions'), (287, 'dispatcher'), (991, 'trainee'), (506, 'ing'), (603, 'marathon'), (388, 'free'), (593, 'love'), (135, 'books'), (268, 'dear'), (96, 'authors'), (52, 'aide'), (850, 'scheuer'), (627, 'merchandise'), (293, 'donate'), (943, 'supplies'), (360, 'feast'), (406, 'gala'), (112, 'battery'), (833, 'rise'), (919, 'stay'), (787, 'put'), (820, 'rescue'), (897, 'soccer'), (402, 'futsal'), (730, 'performing'), (36, 'advanced'), (202, 'classes'), (1070, 'world'), (854, 'science'), (1054, 'western'), (64, 'americorps'), (25, 'aces'), (310, 'economic'), (864, 'security'), (507, 'initiative'), (331, 'esi'), (633, 'mill'), (173, 'centers'), (631, 'midtown'), (1088, 'zumba'), (1030, 'vision'), (635, 'mission'), (66, 'analysis'), (552, 'lab'), (958, 'teaching'), (84, 'assist'), (827, 'resume'), (150, 'building'), (899, 'society'), (214, 'coaches'), (1040, 'vs'), (218, 'committee'), (842, 'russian'), (385, 'foster'), (170, 'celebration'), (616, 'may'), (7, '21th'), (688, 'one'), (711, 'pager'), (294, 'donation'), (489, 'hurricane'), (521, 'irene'), (354, 'far'), (836, 'rockaway'), (325, 'enjoy'), (1066, 'working'), (686, 'olympics'), (988, 'tournament'), (798, 'reading'), (719, 'partners'), (234, 'cooper'), (909, 'square'), (975, 'thrift'), (908, 'spring'), (166, 'case'), (599, 'management'), (404, 'fvcp'), (990, 'trail'), (254, 'crew'), (447, 'halloween'), (165, 'carnival'), (1042, 'walkathon'), (359, 'feasibility'), (67, 'analyst'), (749, 'police'), (868, 'seminar'), (1064, 'work'), (1035, 'visually'), (496, 'impaired'), (964, 'teens'), (972, 'this'), (322, 'energy'), (315, 'efficiency'), (321, 'end'), (859, 'season'), (156, 'campaign'), (123, 'benefits'), (802, 'reception'), (300, 'drill'), (237, 'copywriting'), (235, 'coord'), (454, 'have'), (725, 'penchant'), (55, 'all'), (971, 'things'), (1028, 'vintage'), (976, 'thriftshop'), (718, 'partner'), (726, 'pencil'), (720, 'partnership'), (710, 'packing'), (16, '8th'), (907, 'sports'), (346, 'expo'), (164, 'cares'), (184, 'cheerleaders'), (1045, 'wanted'), (445, 'habitat'), 
              (371, 'finance'), (215, 'coffee'), (324, 'english'), (755, 'practice'), (570, 'learners'), (456, 'healthy'), (28, 'active'), (978, 'time'), (122, 'benefit'), (73, 'april'), (357, 'fashion'), (929, 'strawberry'), (87, 'assistants'), (174, 'central'), (1087, 'zoo'), (1, '125th'), (127, 'bideawee'), (440, 'greeters'), (592, 'looking'), (799, 'real'), (495, 'impact'), (504, 'inform'), (728, 'people'), (756, 'practices'), (580, 'lifebeat'), (413, 'general'), (932, 'streetsquash'), (286, 'discovery'), (874, 'services'), (663, 'neighborhood'), (768, 'profiles'), (951, 'take'), (915, 'stand'), (51, 'against'), (1029, 'violence'), (345, 'expert'), (41, 'advice'), (537, 'june'), (849, 'schedule'), (258, 'crowdfunding'), (727, 'penny'), (451, 'harvest'), (434, 'green'), (185, 'chefs'), (677, 'nutritionists'), (379, 'foodies'), (625, 'mentoring'), (136, 'boom'), (669, 'newsletter'), (217, 'come'), (934, 'strides'), (1043, 'walks'), (187, 'childcare'), (898, 'social'), (619, 'media'), (422, 'giving'), (157, 'can'), (61, 'ambassador'), (10, '2nd'), (967, 'thanksgiving'), (363, 'feeding'), (662, 'needy'), (782, 'publicity'), (723, 'patient'), (163, 'caregiver'), (1032, 'visiting'), (469, 'homebound'), (358, 'fc'), (679, 'nyawc'), (384, 'forum'), (21, 'about'), (1038, 'volunteering'), (809, 'refreshments'), (847, 'sara'), (837, 'roosevelt'), (206, 'cleanup'), (116, 'beautification'), (337, 'events'), (69, 'animal'), (484, 'hudson'), (834, 'river'), (605, 'mariners'), (825, 'response'), (343, 'exhibit'), (20, 'aboard'), (584, 'lilac'), (208, 'client'), (1052, 'welcome'), (279, 'desk'), (685, 'older'), (574, 'lexington'), (251, 'craft'), (750, 'poll'), (1065, 'workers'), (518, 'interperters'), (24, 'accounting'), (85, 'assistance'), (477, 'hosting'), (776, 'promotion'), (1011, 'unicef'), (954, 'tap'), (814, 'release'), (270, 'dedication'), (771, 'programming'), (500, 'incarnation'), (295, 'donor'), (544, 'kieran'), (906, 'sponsorship'), (1069, 'workshops'), (118, 'because'), (338, 'every'), (276, 'deserves'), (179, 'chance'), (740, 'pin'), (273, 'delivered'), (886, 'shred'), (15, '5th'), (99, 'avenue'), (169, 'cdsc'), (917, 'starving'), (78, 'artist'), (884, 'show'), (948, 'system'), (396, 'front'), (880, 'share'), (553, 'lanch'), (935, 'student'), (463, 'hemophilia'), (577, 'liason'), (629, 'methodist'), (476, 'hospital'), (113, 'bay'), (831, 'ridge'), (124, 'benonhurst'), (75, 'area'), (900, 'sought'), (97, 'autistic'), (297, 'douglaston'), (788, 'qns'), (812, 'registration'), (32, 'administrator'), (153, 'call'), (426, 'governor'), (804, 'recruiter'), (786, 'purim'), (327, 'envelope'), (938, 'stuffing'), (528, 'jam'), (462, 'helpline'), (923, 'store'), (374, 'first'), (415, 'generation'), (1022, 'van'), (241, 'cortlandt'), (816, 'remembrance'), (945, 'survey'), (823, 'resonations'), (143, 'breast'), (323, 'engine'), (694, 'optimization'), (622, 'memorial'), (894, 'sloan'), (540, 'kettering'), (435, 'greenhouse'), (436, 'greening'), (227, 'concert'), (334, 'evacuation'), (824, 'resources'), (417, 'gift'), (126, 'bicycling'), (656, 'my'), (393, 'friends'), (473, 'honor'), (1051, 'weekend'), (731, 'person'), (651, 'mural'), (312, 'editor'), (732, 'personal'), (882, 'shopper'), (764, 'pro'), (134, 'bono'), (253, 'create'), (160, 'cards'), (920, 'step'), (672, 'non'), (780, 'provider'), (516, 'interns'), (645, 'motion'), (431, 'graphics'), (125, 'best'), (147, 'buddies'), (502, 'inern'), (103, 'back'), (588, 'little'), (242, 'cosmetologist'), (107, 'barber'), (1036, 'vocational'), (72, 'apartment'), (439, 'greeter'), (766, 'professional'), (1019, 'use'), (893, 'skills'), (702, 'others'), (369, 'figure'), (257, 'croton'), (190, 'chinatown'), (193, 'ci'), (758, 'prep'), (239, 'corporate'), (1063, 'wordpress'), (132, 'blog'), (510, 'instructer'), (807, 'red'), (474, 'hook'), (289, 'divert'), (966, 'textiles'), (395, 'from'), (554, 'landfill'), (437, 'greenmarket'), (965, 'textile'), (154, 'calling'), (195, 'citizens'), (497, 'improve'), (26, 'achievement'), 
              (721, 'passion'), (481, 'housing'), (1067, 'works'), (499, 'inc'), (441, 'group'), (299, 'drama'), (561, 'laundromats'), (320, 'employment'), (927, 'strategic'), (667, 'never'), (104, 'bad'), (391, 'friend'), (403, 'future'), (201, 'class'), (1059, 'wish'), (387, 'fpcj'), (1072, 'worship'), (1010, 'undergraduate'), (428, 'graduate'), (228, 'conference'), (1047, 'we'), (775, 'promote'), (550, 'knowledge'), (715, 'parade'), (74, 'archivist'), (425, 'google'), (44, 'adwords'), (493, 'imentor'), (642, 'more'), (598, 'male'), (632, 'miles'), (637, 'moms'), (183, 'charity'), (176, 'century'), (987, 'tour'), (198, 'civil'), (724, 'patrol'), (62, 'america'), (539, 'kept'), (862, 'secret'), (648, 'ms131'), (549, 'knitter'), (256, 'crochet'), (131, 'blankets'), (177, 'ceo'), (591, 'logo'), (1012, 'unique'), (1057, 'will'), (128, 'big'), (37, 'adventure'), (23, 'accountant'), (876, 'session'), (888, 'single'), (644, 'mothers'), (192, 'choice'), (895, 'smc'), (1055, 'wii'), (705, 'outdoor'), (671, 'nights'), (607, 'market'), (514, 'intake'), (638, 'monday'), (141, 'branding'), (140, 'brand'), (491, 'identity'), (649, 'mt'), (1086, 'zion'), (543, 'kidz'), (817, 'reorganize'), (578, 'library'), (378, 'food'), (91, 'athletic'), (568, 'league'), (655, 'musician'), (59, 'alzheimer'), (654, 'music'), (109, 'bash'), (765, 'proctor'), (952, 'taking'), (339, 'exams'), (777, 'promotional'), (733, 'personnel'), (95, 'august'), (891, 'skill'), (665, 'networker'), (309, 'ecological'), (785, 'puppet'), (501, 'income'), (414, 'generating'), (699, 'organizations'), (250, 'cpr'), (576, 'lgbtq'), (317, 'el'), (652, 'museo'), (271, 'del'), (108, 'barrio'), (628, 'met'), (330, 'escort'), (846, 'sand'), (167, 'castle'), (230, 'contest'), (853, 'schools'), (486, 'humanities'), (80, 'as'), (861, 'second'), (556, 'language'), (101, 'babies'), (963, 'teen'), (54, 'al'), (682, 'oerter'), (483, 'html'), (260, 'curriculum'), (737, 'photographer'), (863, 'secretary'), (754, 'pr'), (1073, 'would'), (583, 'like'), (225, 'computers'), (961, 'technical'), (442, 'grownyc'), (968, 'that'), (347, 'extraordinary'), (381, 'foreclosure'), (762, 'prevention'), (681, 'nylag'), (678, 'ny'), (226, 'concern'), (509, 'inspire'), (22, 'academic'), (1007, 'tutoring'), (794, 'rbi'), (71, 'anyone'), (211, 'cma'), (212, 'cms'), (232, 'conversion'), (308, 'eating'), (571, 'learning'), (181, 'chaperones'), (1034, 'visits'), (411, 'gear'), (1014, 'unlimited'), (581, 'lifeguard'), (350, 'facilitators'), (1003, 'troop'), (839, 'route'), (609, 'marshall'), (508, 'inmotion'), (925, 'story'), (913, 'stair'), (292, 'domestic'), (168, 'catskills'), (815, 'relief'), (316, 'effort'), (94, 'audience'), (734, 'pharmacy'), (444, 'guide'), (707, 'overnight'), (494, 'immediate'), (285, 'dirty'), (448, 'hands'), (349, 'facilitator'), (905, 'specialist'), (182, 'chapter'), (914, 'stamps'), (522, 'iridescent'), (937, 'studio'), (39, 'advertising'), (370, 'filmmakers'), (617, 'mayor'), (1082, 'youcantoo')]

vocab = {k:v for k,v in vocab_list}

In [114]:
# Add in the rest of the arguments
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    
    # Transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # Sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]

# Print out the weighted words
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3))

[189, 942, 466]


In [115]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Call the return_weights function and extend filter_list
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
        
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)

# Call the function to get the list of word indices
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, 3)

# Filter the columns in text_tfidf to only those in filtered_words
filtered_text = text_tfidf[:, list(filtered_words)]

In [116]:
y = volunteer['category_desc']

# Split the dataset according to the class distribution of category_desc
X_train, X_test, y_train, y_test = train_test_split(filtered_text.toarray(), y, stratify=y, random_state=42)

# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy
print(nb.score(X_test, y_test))

0.5161290322580645


In [122]:
wine = pd.read_csv('wine_types.csv')

In [123]:
from sklearn.decomposition import PCA

# Instantiate a PCA object
pca = PCA()

# Define the features and labels from the wine dataset
X = wine.drop("Type", axis=1)
y = wine["Type"]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Apply PCA to the wine dataset X vector
pca_X_train = pca.fit_transform(X_train)
pca_X_test = pca.transform(X_test)

# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)

[9.97795009e-01 2.02071827e-03 9.88350594e-05 5.66222566e-05
 1.26161135e-05 8.93235789e-06 3.13856866e-06 1.57406401e-06
 1.15918860e-06 7.49332354e-07 3.70332305e-07 1.94185373e-07
 8.08440051e-08]


In [124]:
# Fit knn to the training data
knn.fit(pca_X_train, y_train)

# Score knn on the test data and print it out
print(knn.score(pca_X_test, y_test))

0.7777777777777778


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## 5. Putting It All Together

In [203]:
ufo = pd.read_csv('ufo_sightings_large.csv')
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,11/21/2002 05:45,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,8/19/2010 12:55,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


In [204]:
# Print the DataFrame info
print(ufo.info())

# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype('float')

# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])

# Check the column types
print(ufo.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            4935 non-null   object 
 1   city            4926 non-null   object 
 2   state           4516 non-null   object 
 3   country         4255 non-null   object 
 4   type            4776 non-null   object 
 5   seconds         4935 non-null   float64
 6   length_of_time  4792 non-null   object 
 7   desc            4932 non-null   object 
 8   recorded        4935 non-null   object 
 9   lat             4935 non-null   object 
 10  long            4935 non-null   float64
dtypes: float64(2), object(9)
memory usage: 424.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            4935 non-null   d

In [205]:
# Count the missing values in the length_of_time, state, and type columns, in that order
print(ufo[['length_of_time', 'state', 'type']].isna().sum())

# Drop rows where length_of_time, state, or type are missing
ufo_no_missing = ufo.dropna(subset=['length_of_time', 'state', 'type'])

# Print out the shape of the new dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


In [206]:
ufo = ufo_no_missing.dropna(subset=['length_of_time'])

In [207]:
def return_minutes(time_string):

    # Search for numbers in time_string
    num = re.search("\d+", time_string)
    
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo["length_of_time"].apply(return_minutes)

# Take a look at the head of both of the columns
ufo[['length_of_time', 'minutes']].head()

Unnamed: 0,length_of_time,minutes
0,2 weeks,2.0
1,30sec.,30.0
3,about 5 minutes,5.0
4,2,2.0
5,10 minutes,10.0


In [208]:
ufo = ufo.dropna(subset=['minutes'])
ufo = ufo[ufo['seconds'] > 0]

In [209]:
# Check the variance of the seconds and minutes columns
print(ufo[['seconds', 'minutes']].var())

# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo['seconds'])

# Print out the variance of just the seconds_log column
print(ufo['seconds_log'].var())

seconds    3.198739e+09
minutes    2.411617e+02
dtype: float64
4.639277974219807


In [210]:
# Use pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda x: 1 if x == 'us' else 0)

# Print the number of unique type values
print(len(ufo['type'].unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo['type'])

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)

21


In [211]:
# Look at the first 5 rows of the date column
print(ufo['date'].head())

# Extract the month from the date column
ufo["month"] = ufo["date"].dt.month

# Extract the year from the date column
ufo["year"] = ufo["date"].dt.year

# Take a look at the head of all three columns
print(ufo[['date', 'month', 'year']].head())

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
3   2002-11-21 05:45:00
5   2012-06-16 23:00:00
6   2009-07-12 21:30:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
3 2002-11-21 05:45:00     11  2002
5 2012-06-16 23:00:00      6  2012
6 2009-07-12 21:30:00      7  2009


In [212]:
# Take a look at the head of the desc field
print(ufo['desc'].head())

# Instantiate the tfidf vectorizer object
vec = TfidfVectorizer()

# Fit and transform desc using vec
desc_tfidf = vec.fit_transform(ufo['desc'])

# Look at the number of columns and rows
print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
3    It was a large&#44 triangular shaped flying ob...
5    Dancing lights that would fly around and then ...
6    A minor amber color trail&#44 (from where we w...
Name: desc, dtype: object
(3926, 5428)


In [213]:
vocab_list = [(1664, 'it'), (3275, 'was'), (1744, 'large'), (147, '44'), (3123, 'triangular'), (2657, 'shaped'), (1320, 'flying'), (2134, 'object'), (910, 'dancing'), (1794, 'lights'), (3002, 'that'), (3379, 'would'), (1319, 'fly'), (395, 'around'), (340, 'and'), (3007, 'then'), (1923, 'merge'), (1645, 'into'), (2173, 'one'), (1787, 'light'), (604, 'brilliant'), (2188, 'orange'), (2184, 'or'), (718, 'chinese'), (1738, 'lantern'), (412, 'at'), (1774, 'less'), (3001, 'than'), (15, '1000'), (1363, 'ft'), (2021, 'moving'), (1102, 'east'), (3050, 'to'), (3298, 'west'), (273, 'across'), (2130, 'oakville'), (2176, 'ontario'), (1942, 'midnight'), (1690, 'june'), (251, '9th'), (92, '2013'), (596, 'bright'), (2472, 'red'), (2097, 'north'), (1360, 'from'), (3003, 'the'), (1539, 'horizon'), (3041, 'till'), (1003, 'disapeared'), (502, 'behind'), (766, 'clouds'), (2793, 'south'), (1276, 'first'), (2766, 'so'), (873, 'craft'), (1462, 'half'), (1063, 'dozen'), (2899, 'stragglers'), (3015, 'they'), (3296, 'were'), (2943, 'surely'), (2107, 'not'), (2330, 'planes'), (2094, 'nor'), (449, 'ball'), (2157, 'of'), (2751, 'slowly'), (2872, 'stationary'), (2031, 'multicolored'), (738, 'circular'), (1926, 'met'), (637, 'by'), (351, 'another'), (3315, 'which'), (2435, 'raised'), (1915, 'meet'), (3057, 'too'), (1025, 'displayed'), (3153, 'ufo'), (1421, 'going'), (2737, 'sky'), (3206, 'uso'), (3280, 'watched'), (3202, 'us'), (1331, 'for'), (864, 'couple'), (1963, 'minutes'), (1326, 'follows'), (3085, 'train'), (3077, 'tracks'), (1607, 'in'), (3350, 'winter'), (46, '1931'), (2586, 'saw'), (1859, 'machine'), (401, 'as'), (2524, 'riding'), (2171, 'on'), (1543, 'horse'), (1065, 'draw'), (2265, 'pasture'), (13, '10'), (2343, 'pm'), (23, '12'), (249, '99'), (2602, 'scottsdale'), (386, 'arizona'), (2778, 'something'), (1482, 'have'), (2067, 'never'), (2623, 'seen'), (498, 'before'), (3097, 'traveling'), (259, 'above'), (1414, 'glow'), (691, 'central'), (1987, 'montana'), (1928, 'metalic'), (2822, 'sphere'), (2551, 'rotating'), (1240, 'fast'), (2271, 'pattern'), (3287, 'we'), (1076, 'driving'), (3069, 'town'), (3011, 'there'), (1656, 'is'), (1424, 'golf'), (865, 'course'), (1530, 'holes'), (760, 'close'), (2535, 'road'), (3312, 'when'), (313, 'all'), (2954, 'suuden'), (2262, 'passing'), (3137, 'turned'), (2158, 'off'), (474, 'bayou'), (558, 'blvd'), (2177, 'onto'), (1420, 'godwinson'), (337, 'an'), (1209, 'extremely'), (1061, 'down'), (3355, 'with'), (3081, 'trail'), (2757, 'smok'), (1337, 'formation'), (167, '44counted'), (31, '15'), (2198, 'orbs'), (3190, 'until'), (3219, 'vanished'), (628, 'bursts'), (782, 'color'), (811, 'concentrated'), (384, 'area'), (764, 'cloud'), (486, 'beautiful'), (2708, 'silver'), (784, 'colored'), (2583, 'saucer'), (258, 'about'), (2729, 'size'), (2553, 'round'), (42, '18'), (3311, 'wheeler'), (3141, 'turquoise'), (2293, 'perimeter'), (750, 'clearly'), (125, '3997'), (8, '05'), (225, '64'), (4, '02'), (0, '00'), (1808, 'little'), (881, 'creek'), (1723, 'ky'), (1572, 'humming'), (2085, 'noise'), (1549, 'house'), (2654, 'shaking'), (2754, 'small'), (1914, 'medium'), (2730, 'sized'), (536, 'black'), (2471, 'rectangular'), (2647, 'several'), (3390, 'years'), (3104, 'tree'), (1800, 'line'), (2622, 'seemingly'), (627, 'burning'), (3194, 'up'), (2966, 'tail'), (1032, 'dissipated'), (3067, 'towards'), (1141, 'end'), (2698, 'sight'), (524, 'big'), (1966, 'miss'), (187, '44orange'), (2475, 'redish'), (2136, 'objects'), (1312, 'floating'), (1533, 'home'), (2643, 'sets'), (3147, 'two'), (2940, 'sunset'), (2485, 'remained'), (3316, 'while'), (2211, 'other'), (2013, 'moved'), (2800, 'southwest'), (2875, 'stayed'), (3126, 'tripled'), (2725, 'sitting'), (671, 'car'), (1831, 'looking'), (2715, 'singal'), (1147, 'engine'), (2329, 'plane'), (2932, 'suddenly'), (1014, 'disc'), (3295, 'went'), (2263, 'past'), (3240, 'very'), (3319, 'white'), (703, 'chased'), (219, '52'), (2054, 'near'), (3380, 'wright'), (2274, 'patterson'), (291, 'air'), (1332, 'force'), (466, 'base'), (2373, 'power'), (2218, 'out'), (652, 'came'), (2620, 'seemed'), (733, 'circle'), (793, 'come'), (442, 'back'), (2421, 'quickly'), (1695, 'just'), (3218, 'vanishe'), (3022, 'thirteen'), (2038, 'my'), (1437, 'grandmother'), (2000, 'mother'), (1483, 'having'), (1593, 'ice'), (880, 'cream'), (2224, 'outside'), (990, 'diner'), (205, '45'), (214, '50'), (947, 'degrees'), (1418, 'glows'), (699, 'changing'), (979, 'different'), (289, 'again'), (3393, 'yellow'), (1556, 'hovering'), (962, 'description'), (2367, 'possible'), (2700, 'sighting'), (2886, 'still'), (2377, 'present'), (2857, 'star'), (1796, 'like'), (3052, 'together'), (2571, 'same'), (2816, 'speed'), (987, 'dimmed'), (1007, 'disappeared'), (1289, 'flare'), (713, 'chevron'), (2705, 'silent'), (2229, 'over'), (108, '30'), (1954, 'min'), (682, 'caught'), (3246, 'video'), (2980, 'tape'), (1270, 'fireball'), (286, 'after'), (1252, 'few'), (1445, 'greenish'), (970, 'diamond'), (1279, 'five'), (2755, 'smaller'), (929, 'daytime'), (2076, 'night'), (964, 'desert'), (1555, 'hovered'), (2814, 'sped'), (438, 'away'), (1969, 'mississauga'), (658, 'canada'), (368, 'appeared'), (477, 'be'), (1517, 'high'), (2368, 'possibly'), (357, 'any'), (1098, 'earthling'), (1861, 'made'), (1665, 'item'), (1612, 'incredible'), (2900, 'straight'), (2084, 'no'), (2788, 'sound'), (1297, 'flat'), (549, 'blinking'), (1729, 'lake'), (2210, 'oswego'), (2425, 'quot'), (1365, 'full'), (2685, 'shortly'), (21, '11'), (35, '16'), (2936, 'sunday'), (943, 'defied'), (282, 'aerodynamiccs'), (1719, 'know'), (554, 'blue'), (2410, 'purple'), (1219, 'faded'), (909, 'danced'), (885, 'cricle'), (2529, 'rise'), (1011, 'disapper'), (2512, 'retiring'), (1430, 'got'), (494, 'bed'), (2544, 'room'), (911, 'dark'), (3337, 'window'), (2744, 'slightly'), (2179, 'open'), (1489, 'head'), (1770, 'left'), (1378, 'gateway'), (298, 'airport'), (1924, 'merged'), (352, 'anouther'), (489, 'became'), (3040, 'tight'), (197, '44straight'), (374, 'approx'), (124, '39'), (2656, 'shape'), (1630, 'intense'), (1092, 'each'), (770, 'cntr'), (191, '44rest'), (1324, 'followed'), (2144, 'observer'), (2889, 'stoppe'), (616, 'brownwood'), (2999, 'texas'), (117, '33'), (1900, 'mass'), (2832, 'sporadically'), (1611, 'inconsistent'), (2791, 'sounds'), (3284, 'waves'), (106, '29'), (151, '442008'), (242, '8pm'), (181, '44my'), (3402, 'yr'), (2168, 'old'), (1398, 'girls'), (407, 'asked'), (3308, 'what'), (1598, 'if'), (2679, 'shooting'), (3053, 'told'), (3005, 'them'), (2018, 'moves'), (504, 'being'), (1020, 'disk'), (1254, 'field'), (232, '70'), (133, '39s'), (2258, 'pass'), (2335, 'play'), (702, 'chase'), (366, 'appear'), (324, 'altitude'), (1088, 'during'), (749, 'clear'), (926, 'day'), (179, '44make'), (1005, 'disappear'), (1465, 'hammond'), (1600, 'illinois'), (2960, 'sylvania'), (1907, 'mccord'), (1691, 'junior'), (2219, 'outer'), (2803, 'space'), (78, '20'), (1829, 'looked'), (2862, 'stars'), (1292, 'flash'), (599, 'brightlights'), (336, 'amp'), (3292, 'weird'), (2088, 'noisies'), (2902, 'strange'), (1476, 'happennings'), (3268, 'walking'), (1817, 'local'), (457, 'bar'), (1896, 'mars'), (490, 'because'), (657, 'can'), (3293, 'well'), (2525, 'right'), (2117, 'now'), (2403, 'pulsating'), (520, 'bethel'), (2250, 'park'), (2236, 'pa'), (721, 'christmas'), (1174, 'eve'), (2572, 'san'), (356, 'antonio'), (653, 'camera'), (945, 'definitely'), (857, 'could'), (1597, 'idetify'), (3023, 'this'), (2200, 'ordinary'), (1306, 'flies'), (2706, 'silently'), (1176, 'evening'), (2340, 'please'), (2506, 'respect'), (2386, 'privacy'), (697, 'changed'), (998, 'directions'), (1196, 'experience'), (3184, 'unkown'), (3230, 'vegas'), (856, 'couch'), (2604, 'screen'), (1053, 'door'), (1475, 'happened'), (2617, 'see'), (296, 'airplane'), (632, 'but'), (762, 'closer'), (2194, 'orb'), (463, 'barnes'), (2446, 'rd'), (3269, 'wallingford'), (2225, 'oval'), (555, 'blueish'), (3412, 'zig'), (3409, 'zaging'), (1028, 'dissapears'), (2898, 'strage'), (2163, 'ohio'), (2533, 'river'), (2746, 'slow'), (2813, 'spectacular'), (3117, 'triangle'), (2619, 'seem'), (2605, 'se'), (973, 'did'), (2012, 'move'), (1822, 'long'), (1105, 'eastern'), (2439, 'random'), (2405, 'pulse'), (292, 'aircraft'), (2381, 'pressure'), (3283, 'wave'), (290, 'ahead'), (1444, 'green'), (2922, 'strobing'), (1086, 'duration'), (2120, 'nuforc'), (2108, 'note'), (2717, 'sirius'), (2277, 'pd'), (508, 'bell'), (1026, 'dissapear'), (2458, 'reappear'), (1985, 'monmouth'), (863, 'county'), (2068, 'new'), (1675, 'jersey'), (1886, 'maple'), (1455, 'grove'), (364, 'apparent'), (1273, 'firey'), (574, 'bottom'), (2305, 'photos'), (670, 'captured'), (1080, 'drops'), (1751, 'late'), (287, 'afternoon'), (2436, 'raleigh'), (2052, 'nc'), (377, 'april'), (243, '8th'), (2216, 'our'), (1838, 'lost'), (1996, 'morphed'), (1008, 'disappearing'), (2836, 'spotted'), (790, 'columbus'), (2024, 'ms'), (1948, 'military'), (293, 'aircrafts'), (369, 'appearing'), (1646, 'investigate'), (1447, 'grey'), (1354, 'friend'), (1239, 'fashion'), (2697, 'sideways'), (994, 'direct'), (1366, 'future'), (445, 'backyard'), (2767, 'soaring'), (1295, 'flashing'), (1165, 'erratically'), (1846, 'low'), (98, '22'), (1494, 'headlights'), (690, 'centered'), (101, '25'), (25, '13'), (140, '40'), (2844, 'squared'), (3241, 'vessel'), (1860, 'macon'), (1370, 'ga'), (2244, 'paper'), (1685, 'journal'), (2867, 'stated'), (1989, 'month'), (2578, 'satalites'), (3256, 'visible'), (137, '39winking'), 
              (3154, 'ufos'), (700, 'charleston'), (2592, 'sc'), (2909, 'streaming'), (2823, 'spheres'), (955, 'des'), (1978, 'moines'), (1653, 'iowa'), (453, 'balls'), (1338, 'formations'), (1677, 'jets'), (3066, 'toward'), (1702, 'kennedy'), (689, 'center'), (3249, 'viewed'), (775, 'cocoa'), (1281, 'fl'), (1417, 'glowing'), (476, 'bday'), (2256, 'party'), (1125, 'else'), (328, 'am'), (2498, 'reporting'), (1457, 'gulf'), (406, 'ashtabula'), (2719, 'sister'), (612, 'brother'), (1757, 'law'), (1478, 'hard'), (1198, 'explain'), (3031, 'three'), (2928, 'suburb'), (1269, 'fire'), (510, 'beloit'), (3324, 'wi'), (570, 'border'), (2278, 'pea'), (2344, 'pod'), (1345, 'four'), (1790, 'lighted'), (2281, 'peas'), (3059, 'top'), (3360, 'witnessed'), (3044, 'times'), (3110, 'tremendeous'), (1164, 'erratic'), (2014, 'moveme'), (3095, 'travel'), (2100, 'northern'), (1314, 'florida'), (2408, 'pulsing'), (2686, 'shot'), (2538, 'rocket'), (3185, 'unlike'), (1936, 'mi'), (2789, 'soundless'), (2727, 'six'), (3025, 'those'), (467, 'baseball'), (2920, 'strobe'), (1768, 'lebanon'), (893, 'ct'), (1586, 'i4'), (1192, 'exit'), (29, '14'), (2824, 'spherical'), (1463, 'halo'), (380, 'arc'), (2231, 'overhead'), (2487, 'remember'), (2240, 'pale'), (743, 'city'), (1087, 'durham'), (1325, 'following'), (996, 'direction'), (1806, 'lite'), (3034, 'through'), (2143, 'observed'), (593, 'brie'), (1232, 'far'), (1413, 'glod'), (2111, 'nothig'), (3398, 'you'), (2027, 'much'), (2074, 'nice'), (1922, 'mercey'), (1545, 'hot'), (2840, 'springs'), (2827, 'spinning'), (2773, 'solid'), (2674, 'shinny'), (3092, 'transparent'), (2833, 'spot'), (1492, 'heading'), (1566, 'huge'), (2189, 'orangeish'), (1340, 'forming'), (898, 'curved'), (1460, 'had'), (2781, 'somewhere'), (688, 'cen'), (36, '160'), (1993, 'more'), (3150, 'type'), (1339, 'formed'), (2615, 'seconds'), (3286, 'way'), (1791, 'lighthouse'), (1043, 'does'), (729, 'cigar'), (1910, 'me'), (1120, 'electric'), (1616, 'indian'), (673, 'carolina'), (2260, 'passenger'), (2609, 'seat'), (844, 'conversion'), (3214, 'van'), (1981, 'mom'), (2694, 'siblings'), (408, 'asleep'), (1134, 'emitted'), (927, 'daylight'), (2566, 'rush'), (1547, 'hour'), (1441, 'gray'), (439, 'awesome'), (3396, 'yet'), (882, 'creepy'), (2831, 'spooky'), (2669, 'shined'), (2671, 'shiniest'), (722, 'chrome'), (1182, 'ever'), (805, 'completely'), (2422, 'quiet'), (2699, 'sighted'), (1746, 'las'), (2496, 'report'), (1179, 'event'), (3294, 'wells'), (198, '44texas'), (3358, 'without'), (109, '300'), (141, '400'), (1245, 'feet'), (2299, 'phoenix'), (441, 'az'), (2186, 'orage'), (1405, 'glenville'), (3384, 'wv'), (1557, 'hovers'), (1552, 'houston'), (3148, 'tx'), (1227, 'fall'), (60, '1973'), (2834, 'spotlight'), (1971, 'mist'), (3199, 'upward'), (2002, 'motion'), (1967, 'missile'), (1754, 'launch'), (200, '44then'), (2187, 'orang'), (3281, 'watching'), (2010, 'mountain'), (1231, 'fanwood'), (2081, 'nj'), (2638, 'series'), (2801, 'southwestern'), (2985, 'teardrop'), (3282, 'water'), (1805, 'lit'), (1301, 'flew'), (573, 'bothell'), (3261, 'wa'), (1929, 'metallic'), (1042, 'dodge'), (2911, 'street'), (1206, 'exprsway'), (2170, 'omaha'), (2053, 'ne'), (1487, 'hbccufo'), (659, 'canadian'), (974, 'didn'), (134, '39t'), (2946, 'surrounded'), (2064, 'neon'), (1553, 'hover'), (914, 'dart'), (1902, 'massive'), (785, 'colorful'), (1624, 'instantly'), (1520, 'highway'), (112, '31'), (1617, 'indiana'), (957, 'descend'), (3106, 'trees'), (1521, 'hill'), (999, 'directly'), (3236, 'vernon'), (1415, 'glowball'), (1361, 'front'), (496, 'bedroom'), (1247, 'felt'), (633, 'butterfly'), (2628, 'sensation'), (712, 'chest'), (3000, 'th'), (901, 'cylinder'), (414, 'atlantic'), (2152, 'ocean'), (774, 'coastal'), (1529, 'hobe'), (1937, 'miami'), (310, 'alien'), (2339, 'playing'), (1343, 'forth'), (1117, 'egg'), (5, '03'), (1724, 'l7'), (1548, 'hours'), (3221, 'vanishing'), (1767, 'leaving'), (560, 'bobbing'), (3183, 'unknown'), (3144, 'twinkling'), (849, 'corners'), (3168, 'underneath'), (393, 'arou'), (3017, 'thin'), (2321, 'pinkish'), (2528, 'rings'), (1069, 'drifting'), (2026, 'mt'), (3210, 'va'), (2371, 'potomac'), (1096, 'early'), (50, '1963'), (51, '1964'), (2490, 'reno'), (2913, 'streets'), (3130, 'trying'), (1265, 'find'), (1546, 'hotel'), (2114, 'noticed'), (2312, 'pie'), (1472, 'hanging'), (739, 'circus'), (2251, 'parki'), (57, '197'), (1490, 'headed'), (3373, 'work'), (1538, 'horizion'), (2309, 'pics'), (938, 'decide'), (3400, 'yourself'), (564, 'bolingbrook'), (1904, 'may'), (41, '17th'), (81, '2001'), (1912, 'meadows'), (1771, 'length'), (1303, 'flickered'), (2494, 'replace'), (2923, 'strong'), (2238, 'pacific'), (1799, 'lincoln'), (2606, 'sea'), (1459, 'gypsy'), (478, 'beach'), (796, 'coming'), (3100, 'travels'), (1962, 'minute'), (404, 'ascends'), (1408, 'gliding'), (1877, 'manhattan'), (991, 'dinner'), (2775, 'some'), (858, 'couldn'), (503, 'beileve'), (1836, 'los'), (342, 'angeles'), (1888, 'march'), (83, '2004'), (1241, 'faster'), (359, 'anything'), (2029, 'multi'), (787, 'coloured'), (541, 'blasts'), (2667, 'shifting'), (2957, 'swaying'), (93, '2055'), (1561, 'hrs'), (1114, 'edt'), (2716, 'single'), (3111, 'tremendously'), (1203, 'explodes'), (276, 'activity'), (1700, 'keller'), (2580, 'satellites'), (410, 'associated'), (708, 'chemtrails'), (3096, 'traveled'), (3192, 'unusual'), (1880, 'manner'), (2345, 'point'), (3410, 'zags'), (1834, 'loops'), (1959, 'minnesotas'), (3371, 'woods'), (2056, 'nearly'), (1157, 'equilateral'), (848, 'corner'), (3063, 'total'), (2707, 'sillouette'), (2693, 'si'), (1601, 'illuminated'), (1021, 'disks'), (2011, 'mountains'), (1284, 'flame'), (2098, 'northeast'), (1819, 'location'), (1385, 'georgia'), (1208, 'extreme'), (1304, 'flickering'), (3132, 'tubular'), (950, 'delta'), (572, 'both'), (934, 'debris'), (2701, 'sightings'), (1714, 'kingstown'), (2521, 'ri'), (2155, 'odd'), (1778, 'lewisville'), (2563, 'running'), (1760, 'leader'), (2565, 'rural'), (2997, 'tests'), (2771, 'soil'), (1364, 'fuel'), (686, 'cell'), (3222, 'vapor'), (1047, 'dogs'), (461, 'barking'), (1842, 'loudly'), (1287, 'flanked'), (2174, 'ones'), (2721, 'sited'), (2314, 'pier'), (2276, 'pawleys'), (1659, 'island'), (491, 'become'), (420, 'auburn'), (1244, 'federal'), (1851, 'luminous'), (2325, 'pittsburgh'), (3389, 'year'), (1036, 'distinct'), (2045, 'naked'), (1210, 'eye'), (3405, 'yuma'), (2015, 'movement'), (3347, 'winn'), (1040, 'dixie'), (2252, 'parking'), (1839, 'lot'), (1825, 'longwood'), (3317, 'whippany'), (1998, 'morris'), (1747, 'last'), (2760, 'smoking'), (731, 'cigarette'), (327, 'always'), (1041, 'do'), (2493, 'repeated'), (1140, 'encounter'), (65, '1980'), (483, 'beardstown'), (818, 'confirmed'), (2049, 'nature'), (3079, 'traditional'), (1336, 'format'), (2359, 'port'), (645, 'california'), (3229, 'veers'), (3220, 'vanishes'), (1885, 'many'), (1056, 'dots'), (2413, 'put'), (2222, 'outlined'), (3099, 'travelling'), (2675, 'shiny'), (1006, 'disappeards'), (982, 'dim'), (322, 'also'), (888, 'crossed'), (1442, 'great'), (1034, 'distances'), (2683, 'short'), (335, 'amount'), (3043, 'time'), (2459, 'reappeare'), (735, 'circles'), (2829, 'splitting'), (2722, 'siting'), (3215, 'vancouver'), (1485, 'haze'), (248, '96'), (405, 'ashland'), (2202, 'oregon'), (72, '1996'), (1797, 'lima'), (1927, 'metal'), (862, 'country'), (2866, 'state'), (332, 'amber'), (1765, 'least'), (224, '6000'), (233, '7000'), (262, 'absolutely'), (1280, 'fixed'), (2044, 'nailed'), (2326, 'place'), (475, 'bb'), (1504, 'held'), (391, 'arms'), (110, '30am'), (3027, 'thought'), (499, 'began'), (530, 'binoculars'), (1578, 'husband'), (497, 'been'), (2618, 'seeing'), (3302, 'western'), (231, '6pm'), (1183, 'every'), (1580, 'hw'), (1814, 'lo'), (2507, 'rest'), (1293, 'flashed'), (2516, 'revealed'), (121, '35ish'), (3328, 'wife'), (2681, 'shopping'), (446, 'bag'), (741, 'ciruclar'), (1079, 'dropping'), (1291, 'flares'), (1423, 'golden'), (511, 'below'), (3386, 'wyoming'), (1257, 'figeting'), (2509, 'resturant'), (647, 'called'), (227, '66'), (14, '100'), (2695, 'side'), (1033, 'distance'), (1872, 'making'), (2758, 'smoke'), (2890, 'stopped'), (3264, 'waiting'), (74, '1999'), (875, 'crafts'), (3116, 'triange'), (2629, 'separate'), (3357, 'within'), (2994, 'ten'), (562, 'body'), (2786, 'sort'), (317, 'almost'), (581, 'bouy'), (2175, 'only'), (1199, 'explainable'), (930, 'dayton'), (763, 'closing'), (2443, 'rapidly'), (797, 'commercial'), (3054, 'tomah'), (3351, 'wisconsin'), (2769, 'softly'), (969, 'diameter'), (3102, 'traversed'), (154, '44800'), (965, 'detectable'), (1832, 'looks'), (814, 'condor'), (887, 'cross'), (532, 'bird'), (1077, 'drone'), (1094, 'eagle'), (2415, 'qu'), (2828, 'split'), (1258, 'fighter'), (1676, 'jet'), (1636, 'intercepted'), (2614, 'second'), (3062, 'toronto'), (2885, 'steps'), (361, 'apartment'), (3103, 'travling'), (3414, 'zigzagging'), (2182, 'opposite'), (600, 'brightly'), (1639, 'intermittently'), (2091, 'non'), (2955, 'sw'), (952, 'denver'), (2738, 'skyline'), (2518, 'rex'), (859, 'couldnt'), (506, 'believe'), (3018, 'thing'), (786, 'colors'), (171, '44gold'), (1330, 'football'), (1426, 'gondola'), (2925, 'structure'), (3080, 'traffic'), (822, 'congestion'), (447, 'bakersfield'), (639, 'ca'), (1671, 'january'), (67, '1983'), (1425, 'golfing'), (3291, 'weekend'), (2040, 'myrtle'), (422, 'august'), (85, '2006'), (3354, 'witc'), (2390, 'pronged'), (223, '60'), (1777, 'level'), (2577, 'sat'), (834, 'contemplated'), (1558, 'how'), 
              (2600, 'school'), (1752, 'later'), (1256, 'fiery'), (1166, 'erraticly'), (2028, 'mufon'), (783, 'colorado'), (3361, 'witnesses'), (634, 'buzz'), (38, '17'), (3093, 'transport'), (3211, 'vail'), (815, 'cone'), (2917, 'strip'), (1913, 'medford'), (2372, 'potterville'), (236, '7pm'), (96, '21'), (1816, 'lobed'), (3301, 'westerly'), (1761, 'leading'), (2963, 'tacoma'), (1660, 'isle'), (715, 'child'), (1560, 'however'), (1516, 'hig'), (2468, 'rectangle'), (2003, 'motionless'), (1394, 'gilbert'), (1309, 'flipping'), (1588, 'i5freeway'), (1218, 'fade'), (2455, 'reapear'), (2327, 'places'), (1905, 'maybe'), (103, '26'), (868, 'cousins'), (448, 'balcony'), (479, 'beachfront'), (435, 'avon'), (2797, 'southern'), (1362, 'fruitville'), (2575, 'sarasota'), (1097, 'earth'), (256, 'abnormal'), (1673, 'jefferson'), (2587, 'say'), (1513, 'here'), (3395, 'yes'), (383, 'are'), (606, 'bristol'), (3049, 'tn'), (1263, 'film'), (3322, 'who'), (1721, 'knows'), (1574, 'hundreds'), (2285, 'people'), (825, 'considered'), (427, 'authentic'), (1480, 'has'), (966, 'determined'), (1527, 'hoax'), (3260, 'visual'), (3248, 'view'), (1947, 'miles'), (2515, 'returned'), (3345, 'wingless'), (3131, 'tube'), (2290, 'perfectly'), (1311, 'floated'), (2703, 'signal'), (563, 'boise'), (1594, 'id'), (2642, 'set'), (1818, 'located'), (654, 'campbells'), (1074, 'drive'), (2670, 'shiney'), (1737, 'lane'), (2096, 'normal'), (1541, 'horizontal'), (3042, 'tilting'), (1735, 'landed'), (1925, 'mesa'), (3326, 'width'), (1090, 'dusk'), (1178, 'evenly'), (323, 'alternated'), (522, 'between'), (1650, 'invisible'), (3098, 'travelled'), (3084, 'trails'), (3120, 'triangles'), (184, '44objects'), (915, 'darted'), (3406, 'zag'), (568, 'boomerang'), (1294, 'flashes'), (2696, 'sides'), (480, 'beam'), (2672, 'shining'), (3065, 'touching'), (1450, 'ground'), (1655, 'irregular'), (1994, 'morning'), (2673, 'shinning'), (3353, 'wispy'), (2197, 'orbiting'), (607, 'broad'), (3359, 'witness'), (1017, 'discovers'), (1893, 'marks'), (904, 'cylindrical'), (2474, 'reddish'), (827, 'consistent'), (443, 'background'), (2731, 'sk'), (622, 'bunch'), (737, 'circlular'), (960, 'descends'), (3161, 'uncertain'), (3314, 'whether'), (2639, 'serious'), (2318, 'pinal'), (165, '44circular'), (190, '44red'), (1221, 'fading'), (174, '44huge'), (195, '44silent'), (1961, 'mintues'), (2853, 'stalled'), (297, 'airplanes'), (2658, 'shapes'), (876, 'cranston'), (2462, 'reappears'), (976, 'diff'), (20, '10th'), (76, '1pm'), (1419, 'go'), (2935, 'sun'), (3335, 'wilsonville'), (176, '44il'), (64, '1979'), (1720, 'known'), (1667, 'its'), (2430, 'radioactive'), (3278, 'waste'), (1084, 'dump'), (2646, 'seventies'), (2676, 'ship'), (2557, 'row'), (745, 'classic'), (1050, 'dome'), (129, '39dancing'), (3234, 'ventura'), (2541, 'role'), (2161, 'offshore'), (2033, 'multiple'), (980, 'differing'), (1644, 'intervention'), (2907, 'streaking'), (1271, 'fireballs'), (732, 'cincinnati'), (2162, 'oh'), (773, 'coast'), (846, 'copper'), (971, 'diamonds'), (3299, 'westbound'), (3149, 'tyler'), (1138, 'en'), (693, 'cette'), (345, 'ann'), (1093, 'eacute'), (1672, 'je'), (951, 'demeur'), (430, 'avec'), (1983, 'mon'), (845, 'copain'), (2398, 'puis'), (126, '39avais'), (3072, 'trac'), (3158, 'un'), (692, 'cercle'), (933, 'de'), (2279, 'peace'), (1845, 'love'), (1686, 'jours'), (428, 'avant'), (878, 'crayons'), (1251, 'feutre'), (1696, 'juste'), (2245, 'par'), (3014, 'these'), (1537, 'hoovering'), (1242, 'father'), (2782, 'son'), (611, 'brookyln'), (2125, 'ny'), (58, '1970'), (590, 'breaks'), (3401, 'ypsilanti'), (1991, 'moon'), (977, 'differen'), (2799, 'southwards'), (2545, 'rose'), (2720, 'site'), (1881, 'manuever'), (2209, 'orlando'), (698, 'changes'), (2360, 'portal'), (379, 'aptos'), (2289, 'perfect'), (1661, 'isosceles'), (804, 'complete'), (3180, 'unison'), (2124, 'nw'), (3138, 'turning'), (2684, 'shorted'), (769, 'cluster'), (1637, 'interchanging'), (362, 'apeared'), (3037, 'thrust'), (1031, 'dissappeared'), (2460, 'reappeared'), (809, 'con'), (2971, 'taking'), (1044, 'dog'), (2855, 'standing'), (940, 'deck'), (1215, 'faces'), (1082, 'due'), (1525, 'his'), (937, 'decently'), (2492, 'repeat'), (2688, 'show'), (80, '2000'), (756, 'cloaked'), (808, 'components'), (1640, 'international'), (2645, 'seven'), (3019, 'things'), (1135, 'emitting'), (1841, 'loud'), (2690, 'shrieking'), (2192, 'orangey'), (2320, 'pink'), (1622, 'inside'), (119, '33right'), (2295, 'persistent'), (381, 'arced'), (556, 'bluish'), (1651, 'involved'), (2904, 'strangley'), (53, '1966'), (239, '8211'), (537, 'blackfoot'), (3213, 'valley'), (2227, 'ovando'), (1237, 'farmland'), (3399, 'young'), (3365, 'woman'), (3266, 'walk'), (1933, 'metro'), (2267, 'path'), (484, 'bearing'), (319, 'along'), (1802, 'lines'), (1175, 'even'), (3368, 'woode'), (372, 'approached'), (1865, 'main'), (2180, 'opened'), (2338, 'played'), (2206, 'original'), (2310, 'picture'), (3378, 'worth'), (3372, 'words'), (148, '44000'), (3032, 'threw'), (2540, 'rocks'), (2234, 'ovoid'), (3394, 'yellowish'), (2037, 'mutiney'), (473, 'bay'), (1573, 'hundred'), (1322, 'folks'), (685, 'celebrating'), (213, '4th'), (1688, 'july'), (3177, 'union'), (906, 'dad'), (1716, 'kite'), (2484, 'relized'), (547, 'blinked'), (2636, 'sequence'), (193, '44saw'), (2916, 'string'), (2199, 'order'), (1807, 'litghts'), (2843, 'square'), (1710, 'kiltered'), (1318, 'fluttering'), (373, 'approaching'), (2795, 'southeast'), (2480, 'region'), (1236, 'farmington'), (544, 'blimp'), (3276, 'washington'), (2610, 'seattle'), (1863, 'madrona'), (1454, 'groups'), (73, '1998'), (3273, 'wanted'), (2379, 'press'), (2119, 'nuclear'), (2448, 'reactor'), (1507, 'helicopters'), (2870, 'static'), (2440, 'randomly'), (94, '20min'), (2172, 'once'), (1081, 'drove'), (440, 'awoken'), (669, 'capsule'), (1734, 'land'), (2968, 'take'), (2508, 'restaurant'), (2093, 'noon'), (836, 'continued'), (2451, 'ready'), (2399, 'pull'), (1075, 'driveway'), (1428, 'good'), (1035, 'distant'), (1118, 'eight'), (1643, 'intervals'), (2034, 'murfreesboro'), (450, 'ballls'), (2733, 'skies'), (1559, 'howad'), (3217, 'vanish'), (867, 'cousin'), (1512, 'her'), (2976, 'tall'), (2319, 'pine'), (748, 'cleaning'), (1277, 'fish'), (2411, 'purpule'), (1687, 'ju'), (1222, 'faint'), (2444, 'rate'), (2625, 'seethrough'), (1124, 'elongated'), (2284, 'pentagon'), (2351, 'poles'), (2947, 'surrounding'), (1972, 'misterious'), (1532, 'hollywood'), (91, '2012'), (2929, 'suburban'), (1957, 'minneapolis'), (250, '9pm'), (2441, 'range'), (2714, 'since'), (460, 'barely'), (928, 'days'), (2450, 'reading'), (1375, 'garden'), (3387, 'yard'), (802, 'compared'), (253, 'abilene'), (1267, 'finished'), (2500, 'rerun'), (1390, 'ghost'), (1575, 'hunters'), (1742, 'laptop'), (3207, 'usual'), (1429, 'google'), (894, 'cube'), (2888, 'stop'), (1078, 'dropped'), (1376, 'gas'), (2871, 'station'), (953, 'departed'), (2191, 'oranges'), (1184, 'exact'), (360, 'apart'), (1931, 'meteors'), (2447, 're'), (1152, 'entering'), (3174, 'unexplained'), (2336, 'playa'), (948, 'del'), (2519, 'rey'), (2952, 'suspicious'), (1486, 'hazy'), (1939, 'michigan'), (546, 'blink'), (1427, 'gone'), (3157, 'umo'), (70, '1988'), (2153, 'oct'), (1260, 'figures'), (621, 'buildings'), (2526, 'rigid'), (281, 'aerial'), (272, 'acrobatics'), (2743, 'slight'), (2953, 'sutle'), (3167, 'under'), (2819, 'spencerport'), (3385, 'wx'), (684, 'cavu'), (648, 'calm'), (139, '3rd'), (626, 'burned'), (1307, 'flight'), (958, 'descended'), (2873, 'stationery'), (3026, 'though'), (3235, 'venus'), (3421, 'zooming'), (635, 'buzzing'), (2086, 'noiseless'), (344, 'angles'), (424, 'aura'), (2878, 'steady'), (1409, 'glimmering'), (861, 'counties'), (1941, 'middle'), (2397, 'puget'), (2102, 'northward'), (2783, 'sonora'), (1916, 'meets'), (2212, 'others'), (1010, 'disappears'), (390, 'armada'), (454, 'baltimore'), (514, 'beltway'), (2148, 'observing'), (2881, 'steelers'), (1372, 'game'), (2286, 'peoria'), (2055, 'nearby'), (2363, 'ports'), (2103, 'northwest'), (87, '2008'), (355, 'anthony'), (1374, 'gap'), (1119, 'el'), (2257, 'paso'), (2, '00pm'), (1980, 'moline'), (1228, 'falling'), (1099, 'earths'), (417, 'atomsphere'), (1736, 'landing'), (1970, 'missouri'), (3257, 'vision'), (2601, 'scope'), (2558, 'rows'), (525, 'bigger'), (598, 'brighter'), (2633, 'seperate'), (1569, 'hum'), (696, 'change'), (2860, 'starlike'), (1067, 'dribbled'), (2915, 'stright'), (852, 'corona'), (3152, 'uah'), (1576, 'huntsville'), (301, 'al'), (2023, 'mph'), (2072, 'next'), (667, 'cape'), (777, 'cod'), (1341, 'forms'), (2118, 'nowhere'), (2776, 'somehow'), (2942, 'sure'), (3313, 'where'), (3115, 'triang'), (2748, 'slowely'), (917, 'darts'), (2637, 'sequentially'), (382, 'arched'), (3411, 'zenith'), (2110, 'notheastern'), (1965, 'mirror'), (905, 'cylndrical'), (1062, 'downtown'), (2794, 'southbound'), (925, 'davie'), (131, '39nt'), (1887, 'marble'), (3251, 'viewpoint'), (2348, 'points'), (1491, 'heades'), (2846, 'sse'), (3107, 'treetop'), (2488, 'remote'), (681, 'catskill'), (2661, 'sharp'), (343, 'angle'), (456, 'banos'), (1581, 'hwy'), (104, '27'), (34, '15am'), (2423, 'quietly'), (398, 'arrow'), (526, 'biggest'), (1142, 'ended'), (1698, 'keep'), (936, 'decended'), (402, 'ascended'), (1335, 'form'), (1019, 'dishpan'), (1400, 'glacier'), (2280, 'peak'), (2483, 'releasing'), (3047, 'tiny'), (1811, 'littleton'), (2895, 'storms'), (3223, 'various'), (821, 'conformations'), (3306, 'westward'), (1137, 'empty'), (2842, 'spurts'), 
              (694, 'chain'), (1155, 'equally'), (2805, 'spaced'), (830, 'constant'), (676, 'cascades'), (1745, 'larger'), (1982, 'moment'), (2133, 'obj'), (1662, 'iss'), (2331, 'planet'), (2547, 'roseville'), (211, '48066'), (210, '48'), (1169, 'est'), (2926, 'student'), (1054, 'doors'), (392, 'arnold'), (1975, 'mo'), (1706, 'kettleman'), (772, 'coalinga'), (2596, 'scarry'), (2933, 'summer'), (2660, 'shapped'), (2543, 'rooftops'), (1506, 'helicopter'), (2342, 'plus'), (1467, 'hampton'), (99, '23'), (3058, 'took'), (3420, 'zoomed'), (1510, 'help'), (1641, 'interrupted'), (7, '04'), (3254, 'visable'), (1159, 'eratic'), (501, 'behavior'), (1753, 'lauderdale'), (1170, 'estimated'), (1579, 'huvering'), (177, '44jerked'), (199, '44than'), (3021, 'third'), (2438, 'rancho'), (895, 'cucamonga'), (2138, 'oblong'), (2463, 'rear'), (209, '460'), (720, 'christiansburg'), (539, 'blacksburg'), (2287, 'peppers'), (1250, 'ferry'), (583, 'box'), (997, 'directional'), (1106, 'eastery'), (2077, 'nights'), (3105, 'treeline'), (2608, 'season'), (62, '1975'), (668, 'capitan'), (215, '500'), (1946, 'mile'), (1642, 'interstate'), (981, 'digital'), (649, 'camara'), (3205, 'use'), (2593, 'scan'), (2739, 'skys'), (1990, 'months'), (2112, 'nothing'), (1657, 'isaw'), (2337, 'playback'), (1150, 'enlarging'), (1435, 'graham'), (674, 'caroline'), (235, '75'), (620, 'building'), (631, 'busy'), (130, '39m'), (3381, 'writing'), (120, '35'), (1960, 'mins'), (370, 'appears'), (1386, 'get'), (2581, 'satillite'), (1296, 'flashlight'), (1682, 'joined'), (752, 'clifton'), (2083, 'nne'), (2761, 'smoky'), (1974, 'mnts'), (1456, 'guelph'), (138, '3bright'), (3122, 'trianglular'), (2847, 'st'), (680, 'catharines'), (1027, 'dissapeared'), (3036, 'thru'), (2080, 'nite'), (2993, 'temecula'), (164, '44ca'), (2183, 'ops'), (972, 'dice'), (740, 'cirlcle'), (758, 'clockface'), (1988, 'montauk'), (2835, 'spotlighting'), (2664, 'shed'), (158, '44at'), (851, 'cornwall'), (2548, 'rotate'), (1161, 'erractic'), (3225, 'varying'), (2818, 'speeds'), (2009, 'mount'), (1758, 'lbert'), (1055, 'dot'), (3242, 'vey'), (1514, 'hi'), (1212, 'faa'), (3208, 'usually'), (2763, 'smoothly'), (2051, 'navigation'), (1857, 'ma'), (3330, 'will'), (3140, 'turns'), (270, 'accross'), (2635, 'september'), (90, '2011'), (1317, 'fluorescent'), (3253, 'virginia'), (932, 'dc'), (2691, 'shut'), (1246, 'fell'), (1329, 'footage'), (2568, 'sacramento'), (2884, 'stepped'), (1681, 'job'), (589, 'break'), (3267, 'walked'), (3129, 'trunk'), (1216, 'facing'), (2975, 'talking'), (3407, 'zagged'), (2564, 'runway'), (1197, 'experienced'), (122, '360'), (2531, 'rising'), (1323, 'follow'), (1201, 'explanations'), (2937, 'sunny'), (602, 'brihgte'), (1712, 'kinda'), (2467, 'recktangular'), (2864, 'started'), (2427, 'rad'), (1143, 'ends'), (266, 'accelerates'), (423, 'augusta'), (2810, 'sparkling'), (839, 'contrai'), (3064, 'totally'), (3179, 'unique'), (2383, 'previously'), (114, '31st'), (220, '55'), (1113, 'edina'), (3203, 'usa'), (2868, 'staten'), (183, '44ny'), (2598, 'scattered'), (1352, 'fresno'), (1592, 'i95'), (2522, 'richmond'), (2298, 'phillipsburg'), (1107, 'easton'), (2453, 'realized'), (268, 'accompanied'), (2941, 'super'), (1801, 'linear'), (3305, 'westside'), (2207, 'originally'), (2735, 'skipped'), (791, 'columns'), (172, '44green'), (153, '447'), (246, '95'), (162, '44boone'), (2196, 'orbital'), (2190, 'orangelights'), (1422, 'gold'), (2996, 'tessellated'), (2520, 'rhomboidal'), (2243, 'panes'), (1288, 'flap'), (2412, 'pursuit'), (3228, 'vector'), (3173, 'unexplainably'), (3303, 'westminster'), (1908, 'md'), (1875, 'man'), (436, 'awakened'), (2380, 'pressed'), (3004, 'their'), (1618, 'indicator'), (2313, 'pieces'), (1692, 'junk'), (2477, 'reentering'), (2924, 'strs'), (843, 'converging'), (3198, 'upwad'), (493, 'becoming'), (1453, 'grouping'), (452, 'balloons'), (278, 'adult'), (1248, 'female'), (535, 'bizarre'), (2499, 'reports'), (2353, 'polished'), (326, 'aluminum'), (2105, 'nose'), (419, 'attitude'), (1051, 'domed'), (1496, 'heard'), (1163, 'erradically'), (2848, 'stabilized'), (1668, 'itself'), (1605, 'images'), (2552, 'roughly'), (2203, 'organge'), (734, 'circled'), (1786, 'ligh'), (3175, 'unidentified'), (3388, 'yards'), (2261, 'passes'), (1809, 'littlefield'), (2495, 'replacement'), (3143, 'twice'), (2366, 'positioned'), (2874, 'stay'), (2486, 'remaining'), (3, '01'), (10, '07'), (263, 'abt'), (1535, 'honey'), (2332, 'planetary'), (2401, 'pullman'), (1389, 'gettysburg'), (1810, 'littlestown'), (1523, 'him'), (3200, 'upwards'), (884, 'crew'), (2624, 'sees'), (2385, 'prior'), (52, '1965'), (538, 'blackout'), (280, 'advanced'), (2273, 'patterns'), (1316, 'fluid'), (1570, 'human'), (1469, 'handle'), (2796, 'southeastern'), (1321, 'foley'), (302, 'alabama'), (2723, 'sitings'), (132, '39re'), (1940, 'mid'), (3133, 'tucson'), (550, 'blinks'), (1149, 'enjoying'), (623, 'burger'), (309, 'ali'), (1620, 'inn'), (240, '8230'), (156, '44and'), (3416, 'zipping'), (1870, 'make'), (1404, 'glendale'), (1282, 'fl200'), (1898, 'masking'), (2239, 'pacing'), (295, 'airliner'), (2908, 'streaks'), (3074, 'tracers'), (2678, 'shoot'), (2747, 'slowed'), (1705, 'kept'), (1205, 'expose'), (2792, 'source'), (2934, 'summverville'), (2511, 'retired'), (2927, 'submariner'), (3172, 'unexplainable'), (2479, 'reflective'), (2887, 'stood'), (2549, 'rotated'), (2001, 'motio'), (307, 'alexandria'), (761, 'closely'), (1451, 'group'), (2078, 'nighttime'), (1629, 'intelligent'), (437, 'aware'), (2653, 'shaft'), (3078, 'tractor'), (3109, 'trek'), (723, 'chrysler'), (2611, 'sebring'), (2523, 'ridge'), (426, 'austin'), (3087, 'trajectory'), (1357, 'frightening'), (146, '43'), (2226, 'ovals'), (2016, 'movements'), (371, 'approach'), (2402, 'pulsated'), (3415, 'zipped'), (3289, 'wednesday'), (2375, 'preparing'), (2969, 'taken'), (3127, 'truck'), (3340, 'winds'), (2597, 'scary'), (2726, 'situation'), (1563, 'hudson'), (2376, 'presence'), (1678, 'jetted'), (2288, 'perceived'), (2768, 'soft'), (3134, 'tues'), (2680, 'shoots'), (1613, 'incredibly'), (828, 'const'), (3232, 'vehicles'), (1350, 'freeway'), (1733, 'lamps'), (714, 'chicago'), (1599, 'il'), (3197, 'upright'), (288, 'aftewr'), (462, 'barn'), (1355, 'friends'), (128, '39clock'), (1728, 'lafever'), (1826, 'loo'), (1855, 'lycoming'), (1874, 'mall'), (3332, 'williamsport'), (2283, 'pennsylvania'), (1866, 'maine'), (1431, 'goto'), (2464, 'reason'), (1781, 'li'), (1654, 'iridescent'), (2087, 'noises'), (575, 'boulder'), (1571, 'humanoid'), (2977, 'tampa'), (421, 'aug'), (89, '2010'), (1095, 'earliest'), (2249, 'paralyzing'), (1782, 'liberty'), (482, 'beams'), (726, 'church'), (2641, 'services'), (261, 'abruptly'), (3038, 'thunderstorm'), (609, 'bronx'), (3397, 'york'), (1568, 'hulman'), (1626, 'institute'), (2986, 'technology'), (3307, 'wetlands'), (2048, 'national'), (2514, 'return'), (2382, 'previous'), (1764, 'leaped'), (1072, 'drips'), (1917, 'memorial'), (2640, 'service'), (2264, 'pastor'), (1589, 'i70e'), (771, 'co'), (1780, 'lg'), (877, 'crashed'), (314, 'allegedly'), (285, 'affected'), (918, 'dashboard'), (1180, 'events'), (2630, 'separated'), (2983, 'teal'), (3238, 'vertical'), (1499, 'heavy'), (1471, 'hangglider'), (2879, 'stealth'), (2650, 'sha'), (789, 'columbia'), (2644, 'setting'), (334, 'amended'), (920, 'date'), (2949, 'suspect'), (2358, 'porch'), (1921, 'mentioned'), (1253, 'fiance'), (1488, 'he'), (2569, 'said'), (1243, 'feb'), (646, 'call'), (907, 'dallas'), (1830, 'lookes'), (3121, 'triangluar'), (2753, 'slowy'), (3270, 'walmart'), (664, 'canyon'), (1812, 'lived'), (1381, 'general'), (433, 'aviation'), (3362, 'witnessing'), (829, 'constan'), (1964, 'mir'), (311, 'aligned'), (1718, 'knew'), (3277, 'wasn'), (1997, 'morphing'), (870, 'covered'), (244, '90'), (376, 'approximately'), (3318, 'whit'), (3142, 'twenty'), (2223, 'outlining'), (2585, 'savannah'), (1112, 'edges'), (2292, 'perhaps'), (2275, 'paused'), (992, 'dipped'), (2349, 'polar'), (2195, 'orbit'), (2692, 'shuttle'), (3101, 'traverse'), (150, '442001'), (1882, 'manueverablilty'), (3024, 'thornhill'), (897, 'cured'), (2599, 'scepticisim'), (1083, 'dull'), (3329, 'wildest'), (136, '39ve'), (1699, 'keeping'), (1167, 'escort'), (1827, 'look'), (2470, 'rectanglular'), (2246, 'parachute'), (3124, 'tried'), (2764, 'snap'), (855, 'cou'), (2141, 'obserbed'), (3196, 'upper'), (2417, 'quadrant'), (3341, 'windshield'), (444, 'backing'), (2787, 'sough'), (1108, 'eastward'), (2626, 'self'), (169, '44daughter'), (1278, 'fishing'), (767, 'cloudy'), (1992, 'moonlight'), (869, 'cover'), (180, '44mind'), (507, 'believer'), (267, 'accidentally'), (2302, 'photographed'), (842, 'converged'), (2851, 'staight'), (736, 'circling'), (1412, 'globes'), (2303, 'photographic'), (350, 'anomaly'), (3325, 'wide'), (1658, 'ish'), (161, '44blue'), (2215, 'ou'), (1012, 'disappered'), (1585, 'i35'), (2724, 'sititng'), (832, 'constellation'), (1116, 'effortlessly'), (207, '45min'), (1542, 'horizontally'), (3239, 'vertically'), (212, '495'), (247, '95s'), (2315, 'pilot'), (1022, 'dispatched'), (2574, 'santa'), (458, 'barbara'), (2759, 'smokey'), (1707, 'kfc'), (1057, 'doughter'), (1743, 'lar'), (2075, 'niceville'), (1889, 'marina'), (1018, 'discovery'), (43, '180'), (946, 'degree'), (3136, 'turn'), (455, 'band'), (354, 'anteloe'), (2988, 'tehachapi'), (19, '10pm'), (1766, 'leavin'), (2317, 'pin'), (3083, 'trailing'), (1059, 'douhgnut'), (2131, 'ob'), (2135, 'object0'), (226, '65'), (3245, 'vicinity'), 
              (61, '1974'), (2702, 'sign'), (2663, 'she'), (168, '44craft'), (16, '100ft'), (1085, 'dumps'), (2316, 'pima'), (1955, 'mine'), (963, 'desendes'), (988, 'dimming'), (597, 'brightening'), (2534, 'riverside'), (2607, 'searching'), (3336, 'wind'), (847, 'copters'), (2903, 'strangely'), (1840, 'lots'), (3285, 'waxhaw'), (3419, 'zody'), (954, 'dept'), (2893, 'store'), (661, 'canoga'), (451, 'balloon'), (975, 'diego'), (88, '2009'), (1290, 'flared'), (415, 'atleast'), (3028, 'thousand'), (509, 'belmont'), (1185, 'exactly'), (551, 'blocked'), (1275, 'firing'), (2241, 'palm'), (942, 'deep'), (1160, 'erie'), (902, 'cylinders'), (348, 'anoka'), (1958, 'minnesota'), (778, 'cohesive'), (3181, 'unit'), (2065, 'net'), (1448, 'grid'), (3370, 'woodinville'), (1577, 'hurry'), (2821, 'spher'), (935, 'december'), (221, '58pm'), (1272, 'fireworks'), (1052, 'don'), (1071, 'drink'), (2820, 'spere'), (3344, 'winged'), (1315, 'flowing'), (1432, 'government'), (1234, 'farmer'), (1595, 'idaho'), (2627, 'semi'), (578, 'bound'), (1683, 'jordan'), (2201, 'ore'), (418, 'atp'), (1918, 'memphis'), (2145, 'observers'), (316, 'almond'), (1211, 'eyes'), (601, 'brightness'), (2394, 'proximity'), (638, 'c130'), (2918, 'stripes'), (3247, 'videotaping'), (2576, 'saratoga'), (3186, 'unlit'), (178, '44lit'), (819, 'confirming'), (2369, 'post'), (1358, 'frisbee'), (329, 'amazed'), (587, 'bradenton'), (1590, 'i80'), (485, 'beatrice'), (2057, 'nebraska'), (1848, 'lumenescent'), (707, 'chemtrail'), (916, 'darting'), (333, 'ambiant'), (378, 'aprox'), (1850, 'luminescent'), (1792, 'lighting'), (2378, 'president'), (630, 'bush'), (2815, 'speech'), (2741, 'sleek'), (3118, 'triangled'), (2362, 'portland'), (2165, 'okanagan'), (2817, 'speeding'), (2655, 'shap'), (557, 'blurred'), (1342, 'fort'), (1949, 'mill'), (701, 'charlotte'), (1725, 'la'), (1213, 'face'), (2449, 'read'), (1603, 'illumination'), (2713, 'simultaneously'), (203, '44white'), (2931, 'sudden'), (2454, 'really'), (3071, 'tr'), (2041, 'myself'), (613, 'brothers'), (2089, 'noisless'), (3068, 'tower'), (1704, 'kentucky'), (229, '68'), (237, '80'), (577, 'bounced'), (204, '44with'), (961, 'describe'), (2387, 'probe'), (1813, 'living'), (2204, 'organism'), (1200, 'explanation'), (2476, 'redlands'), (1911, 'mead'), (640, 'cabazon'), (2631, 'separates'), (792, 'combine'), (1584, 'i294'), (2129, 'oaklawn'), (2921, 'strobed'), (1871, 'makes'), (3119, 'triangler'), (603, 'brillant'), (1748, 'lasting'), (1380, 'gave'), (717, 'chills'), (2807, 'span'), (2892, 'stops'), (2510, 'resumes'), (1518, 'higher'), (1634, 'intensly'), (2030, 'multicolor'), (2742, 'sleeping'), (1873, 'male'), (1647, 'investigated'), (3164, 'unconfirmed'), (3112, 'tri'), (1190, 'exhibited'), (349, 'anomalous'), (194, '44shinny'), (234, '737'), (874, 'crafted'), (1943, 'midtown'), (413, 'atlanta'), (3162, 'uncle'), (55, '1968'), (2914, 'striations'), (1793, 'lightning'), (566, 'bolts'), (1132, 'eminating'), (102, '25mph'), (2962, 'tablet'), (985, 'dimley'), (32, '150'), (1327, 'fomation'), (2559, 'rr'), (2099, 'northeastern'), (2166, 'oklahoma'), (986, 'dimly'), (1226, 'fairly'), (1388, 'getting'), (1171, 'eubank'), (588, 'brake'), (365, 'apparently'), (3231, 'vehicle'), (1004, 'disapp'), (2880, 'steel'), (1395, 'girder'), (1536, 'hoovered'), (1564, 'hudsonville'), (1515, 'hides'), (2005, 'motivation'), (1609, 'included'), (919, 'database'), (1440, 'graveyard'), (1920, 'mendon'), (2527, 'ring'), (394, 'aroun'), (3165, 'uncontrollable'), (2116, 'november'), (218, '50pm'), (1173, 'eureka'), (3051, 'tobacco'), (1214, 'faceing'), (1436, 'grandmesa'), (1869, 'major'), (2008, 'motorists'), (3139, 'turnpike'), (1534, 'homes'), (2352, 'police'), (2046, 'name'), (2429, 'radiating'), (1153, 'entire'), (487, 'beavercreek'), (1935, 'mexico'), (2047, 'naples'), (1162, 'erracticly'), (1666, 'itermittant'), (3413, 'zigzag'), (2004, 'motions'), (989, 'dims'), (1145, 'enfield'), (751, 'cleveland'), (2334, 'plasma'), (3338, 'windowi'), (2945, 'surpr'), (523, 'beyond'), (665, 'capabilities'), (1879, 'manmade'), (993, 'dipper'), (2178, 'ooltewah'), (1399, 'gives'), (931, 'dazzling'), (2613, 'seco'), (3271, 'walton'), (33, '1500'), (2682, 'shoreline'), (608, 'broke'), (3113, 'tria'), (2704, 'silence'), (3265, 'wake'), (400, 'arvada'), (69, '1986'), (2882, 'steelville'), (59, '1971'), (1230, 'family'), (1225, 'fairfield'), (3212, 'vallejo'), (2254, 'partially'), (396, 'arranged'), (2950, 'suspected'), (1689, 'jumpers'), (1493, 'headlight'), (2555, 'roused'), (650, 'cambria'), (1333, 'forest'), (2497, 'reported'), (683, 'caused'), (1189, 'exercise'), (1741, 'lapse'), (580, 'bouse'), (1479, 'harquahala'), (1049, 'doing'), (2990, 'teleporting'), (531, 'birch'), (1401, 'glance'), (2017, 'moven'), (2433, 'rainbow'), (411, 'asu'), (1823, 'longer'), (2428, 'radiant'), (1565, 'hue'), (1391, 'giant'), (2323, 'pitch'), (2536, 'roar'), (429, 'ave'), (1731, 'lakewood'), (1359, 'frist'), (1308, 'fling'), (655, 'campfire'), (879, 'crazy'), (2230, 'overflight'), (1701, 'kendall'), (1795, 'ligts'), (2780, 'sometimes'), (2876, 'stays'), (79, '200'), (3391, 'yelled'), (3163, 'uncomperhensable'), (666, 'capablities'), (1759, 'lead'), (2311, 'pictures'), (1503, 'heiskell'), (2849, 'stadium'), (2790, 'soundlessly'), (1111, 'edge'), (2668, 'shimmering'), (3363, 'wobbled'), (3155, 'uknown'), (624, 'burgettstown'), (188, '44pa'), (2059, 'needle'), (2582, 'saturday'), (118, '33pm'), (2101, 'northest'), (1123, 'elmwood'), (618, 'buffalo'), (1285, 'flames'), (2062, 'neighbors'), (2217, 'ours'), (339, 'anchorage'), (304, 'alaska'), (967, 'detroit'), (3342, 'windsor'), (678, 'castle'), (1458, 'gunnison'), (2785, 'soon'), (3012, 'thereafter'), (2228, 'ove'), (488, 'beaverton'), (3255, 'visibility'), (1944, 'midvale'), (2570, 'salt'), (3209, 'utah'), (2296, 'personal'), (2982, 'taylorville'), (2501, 'research'), (2560, 'rt'), (45, '190'), (1773, 'leominster'), (518, 'bernardino'), (1030, 'dissapered'), (1999, 'most'), (1104, 'easterly'), (2762, 'smooth'), (548, 'blinkers'), (201, '44two'), (3383, 'wsw'), (1168, 'ese'), (2811, 'sparland'), (2856, 'standstill'), (1844, 'louisville'), (710, 'chesapeake'), (2347, 'pointed'), (896, 'cull'), (2634, 'seperately'), (2595, 'scarey'), (3060, 'tops'), (1403, 'gleaming'), (1951, 'milroy'), (2556, 'route'), (116, '322'), (318, 'alone'), (2150, 'occasions'), (636, 'bwi'), (3020, 'think'), (1181, 'eventually'), (2752, 'slows'), (274, 'acted'), (375, 'approximate'), (2905, 'streak'), (416, 'atmosphere'), (2850, 'staggered'), (1847, 'lower'), (3297, 'wesleyan'), (3182, 'university'), (949, 'delaware'), (2469, 'rectangler'), (2891, 'stopping'), (1906, 'maze'), (941, 'decoration'), (464, 'barrel'), (2322, 'pinpoint'), (1300, 'fleet'), (823, 'connected'), (2998, 'tethers'), (1310, 'float'), (3375, 'working'), (26, '130'), (779, 'cold'), (730, 'cigarett'), (2958, 'swiftly'), (1540, 'horizonal'), (1238, 'fas'), (533, 'birds'), (1852, 'lunar'), (1110, 'eclipse'), (2838, 'spring'), (54, '1967'), (2432, 'railroad'), (3075, 'track'), (1531, 'hollandale'), (1973, 'mn'), (2901, 'strang'), (2826, 'spining'), (1619, 'initially'), (1397, 'girlfriend'), (1524, 'hinckley'), (641, 'cabin'), (2354, 'polution'), (1608, 'incident'), (2974, 'talked'), (3366, 'wondered'), (1631, 'intensely'), (1446, 'grew'), (1115, 'eerie'), (100, '24'), (1798, 'limbs'), (2255, 'partly'), (810, 'concealed'), (2979, 'tandem'), (754, 'climbed'), (2550, 'rotates'), (3159, 'unaffected'), (2948, 'surroundings'), (512, 'belpre'), (206, '45714'), (1373, 'gandy'), (592, 'bridge'), (135, '39tl'), (299, 'airpt'), (2050, 'navarre'), (2711, 'similar'), (1060, 'dover'), (284, 'afb'), (143, '41'), (727, 'chute'), (3039, 'tiger'), (660, 'canal'), (321, 'alpine'), (765, 'cloudless'), (30, '141'), (1582, 'hyw'), (3033, 'throug'), (367, 'appearances'), (3250, 'viewing'), (1856, 'm42'), (2991, 'telescope'), (338, 'anamolous'), (2106, 'noss'), (331, 'amazing'), (841, 'conventional'), (2779, 'sometime'), (315, 'allemands'), (2020, 'movies'), (2750, 'slowing'), (2961, 'synchronized'), (2221, 'outline'), (2482, 'relatively'), (3061, 'topsail'), (3086, 'traingular'), (2364, 'pos'), (2333, 'planets'), (1068, 'drifted'), (1130, 'emerge'), (142, '408'), (2841, 'spur'), (3392, 'yelling'), (2588, 'saying'), (2400, 'pulled'), (1064, 'drapes'), (68, '1985'), (1930, 'meteor'), (2689, 'shower'), (2039, 'mylar'), (1717, 'kites'), (173, '44headlights'), (2809, 'sparking'), (1410, 'glittering'), (3321, 'whittier'), (2573, 'sanikiluaq'), (2123, 'nunavut'), (18, '101'), (2930, 'such'), (724, 'chubby'), (900, 'cylender'), (1632, 'intensified'), (1502, 'heights'), (2728, 'sixties'), (3108, 'treetops'), (2297, 'philadelphia'), (908, 'dance'), (1878, 'manitoba'), (2839, 'springfield'), (1551, 'housing'), (1481, 'hatteras'), (1328, 'foot'), (1938, 'miamisburg'), (2772, 'soldiers'), (1302, 'flicker'), (1625, 'instead'), (2740, 'slanted'), (984, 'diminishing'), (1776, 'lethbridge'), (306, 'alberta'), (2965, 'tahoe'), (1127, 'emanated'), (1509, 'hell'), (1351, 'fresh'), (1730, 'lakes'), (1708, 'kids'), (24, '12th'), (1313, 'floor'), (813, 'condo'), (1148, 'enjoy'), (1016, 'discoloration'), (540, 'blacktop'), (363, 'apopka'), (1037, 'distinctly'), (3091, 'transluscent'), (1129, 'ember'), (3029, 'thousands'), (2384, 'pri'), (115, '32'), (39, '170'), (644, 'calaveras'), (3135, 'tumbling'), (569, 'booster'), (3216, 'vandenburge'), (2242, 'palms'), (1890, 'marine'), (1259, 'figure'), (505, 'belgrade'), 
              (71, '1992'), (186, '44or'), (1498, 'heat'), (806, 'complety'), (2424, 'quincy'), (84, '2005'), (629, 'burton'), (1353, 'friday'), (2154, 'october'), (86, '2007'), (1023, 'disperse'), (1968, 'missing'), (1474, 'happend'), (2079, 'nine'), (3376, 'works'), (3204, 'usaf'), (1713, 'kingman'), (1274, 'firie'), (1703, 'kennesaw'), (2554, 'rounded'), (3048, 'tips'), (3343, 'wing'), (1679, 'jetties'), (2765, 'snow'), (2894, 'storm'), (1349, 'freaky'), (1156, 'equidistant'), (889, 'crossing'), (2784, 'sons'), (552, 'blocks'), (2539, 'rockford'), (160, '44blinked'), (663, 'cant'), (1202, 'explane'), (1402, 'glaring'), (1382, 'generated'), (2006, 'motor'), (565, 'bolted'), (872, 'crack'), (3170, 'unearthly'), (576, 'bounce'), (2391, 'property'), (2590, 'says'), (619, 'buglike'), (2912, 'streetlight'), (1526, 'hit'), (2537, 'rock'), (1649, 'investigators'), (3056, 'tonight'), (2291, 'performing'), (3191, 'unusal'), (1884, 'manuvers'), (3364, 'woke'), (471, 'bathed'), (755, 'clinton'), (3070, 'township'), (1934, 'metroparkway'), (397, 'arrived'), (2594, 'scared'), (1500, 'heck'), (2648, 'sf'), (1001, 'disapear'), (2649, 'sh'), (3090, 'transform'), (2416, 'quad'), (719, 'chopper'), (824, 'consecutive'), (1606, 'impossible'), (1876, 'maneuvers'), (757, 'cloaking'), (469, 'bass'), (1477, 'harbor'), (1864, 'magnitude'), (1867, 'maintianed'), (1154, 'equal'), (2632, 'separati'), (2121, 'number'), (2812, 'specific'), (75, '19th'), (22, '11am'), (2567, 'ruth'), (472, 'battle'), (529, 'biking'), (432, 'average'), (2481, 'regrouped'), (1984, 'monday'), (709, 'cherry'), (107, '2nd'), (2837, 'spotting'), (2865, 'starts'), (3076, 'tracking'), (2270, 'patten'), (2247, 'parade'), (347, 'annex'), (2392, 'prospect'), (610, 'brooklyn'), (1775, 'let'), (2115, 'noticing'), (747, 'clayton'), (1177, 'evenings'), (1674, 'jellyfish'), (675, 'carrier'), (2414, 'pyramid'), (1897, 'marysville'), (3331, 'willamette'), (711, 'chesepeake'), (2938, 'sunrise'), (3193, 'unusually'), (3195, 'upon'), (2142, 'observation'), (241, '85'), (1903, 'mauldin'), (3145, 'twirling'), (617, 'buena'), (265, 'accelerated'), (2489, 'rendezvous'), (1151, 'enormous'), (2269, 'patrol'), (871, 'covina'), (3156, 'ultra'), (2687, 'should'), (1815, 'loading'), (3187, 'unloading'), (1058, 'douglas'), (2967, 'tails'), (886, 'criss'), (582, 'bowling'), (3279, 'watch'), (425, 'aurora'), (2662, 'shawnee'), (1722, 'ks'), (2589, 'saylorsburg'), (794, 'comes'), (2970, 'takes'), (1843, 'louis'), (959, 'descending'), (672, 'carmel'), (2213, 'otherwise'), (1186, 'exception'), (1693, 'jupiter'), (890, 'cruising'), (1268, 'firballs'), (1788, 'lightbulb'), (3417, 'zips'), (1334, 'forked'), (1977, 'moderate'), (1623, 'instantaneous'), (1377, 'gaseous'), (571, 'borealis'), (2151, 'occurred'), (515, 'bend'), (1858, 'mach'), (2395, 'pst'), (271, 'acelerating'), (1986, 'monrovia'), (1286, 'flaming'), (105, '28'), (2164, 'oject'), (853, 'cosmic'), (2959, 'switched'), (1784, 'lifted'), (2127, 'oak'), (40, '172nd'), (3334, 'wilson'), (1371, 'gabriel'), (643, 'cajon'), (2060, 'neighbor'), (883, 'crete'), (3176, 'unimaginable'), (44, '19'), (2169, 'older'), (409, 'assend'), (260, 'abright'), (2095, 'norhwest'), (826, 'consisted'), (817, 'configuration'), (465, 'barrie'), (1207, 'extra'), (2995, 'terrestrial'), (1144, 'energy'), (1024, 'display'), (2214, 'ottawa'), (1379, 'gatineau'), (2104, 'norwalk'), (170, '44glowing'), (2579, 'satellite'), (1633, 'intensity'), (2092, 'none'), (1684, 'jose'), (795, 'comet'), (2259, 'passed'), (403, 'ascending'), (1131, 'emerged'), (1347, 'francisco'), (2712, 'simple'), (1194, 'expanded'), (2896, 'story'), (2542, 'roof'), (1501, 'height'), (2651, 'shadow'), (2061, 'neighborhood'), (1348, 'franklin'), (3262, 'wached'), (519, 'bernie'), (1188, 'executing'), (1383, 'geometric'), (1976, 'model'), (3030, 'thr'), (1803, 'link'), (3125, 'trinagle'), (3346, 'wings'), (3189, 'unsusal'), (820, 'confirms'), (3237, 'verona'), (983, 'dimed'), (385, 'argos'), (1868, 'mainville'), (1899, 'mason'), (579, 'bourbonnais'), (706, 'chautauqua'), (1604, 'ilm'), (159, '44before'), (3046, 'tinted'), (49, '1958'), (1122, 'eleven'), (1396, 'girl'), (1511, 'hemet'), (1038, 'distortion'), (3016, 'thier'), (330, 'amazement'), (3304, 'weston'), (2473, 'redding'), (1522, 'hillsboro'), (3290, 'week'), (3088, 'tranlucent'), (2734, 'skin'), (1126, 'elyria'), (2457, 'reappe'), (2981, 'taped'), (3243, 'vibrating'), (840, 'contrail'), (1255, 'fields'), (801, 'companions'), (1002, 'disapeard'), (238, '800'), (1583, 'i20'), (2877, 'steadily'), (325, 'altitudes'), (591, 'brentwood'), (1452, 'grouped'), (3114, 'triagular'), (2137, 'objest'), (1121, 'electricity'), (798, 'common'), (1591, 'i805'), (66, '1981'), (222, '5pm'), (1835, 'loose'), (860, 'counter'), (759, 'clockwise'), (2324, 'pitched'), (1046, 'doggy'), (2181, 'operated'), (2235, 'own'), (1368, 'fying'), (3009, 'thens'), (1100, 'eases'), (585, 'boyton'), (202, '44which'), (3404, 'yukon'), (2718, 'sirun'), (2910, 'streams'), (27, '135'), (2147, 'observes'), (2035, 'muskoka'), (854, 'cottagers'), (2513, 'retreats'), (3377, 'world'), (3244, 'vibration'), (2561, 'rudder'), (1195, 'expect'), (1505, 'heli'), (1711, 'kind'), (1638, 'interesting'), (2156, 'odor'), (677, 'case'), (1411, 'glob'), (1128, 'emanating'), (387, 'arkansas'), (228, '67'), (1891, 'marker'), (1628, 'intelligence'), (1635, 'intercept'), (2113, 'notice'), (2389, 'process'), (1740, 'laps'), (2205, 'origin'), (1952, 'milton'), (2304, 'photographs'), (269, 'account'), (1039, 'disturbing'), (679, 'catch'), (725, 'chunks'), (614, 'brown'), (1979, 'mold'), (264, 'acad'), (1433, 'grad'), (561, 'bobs'), (12, '09'), (2562, 'rungs'), (1726, 'ladder'), (517, 'bent'), (559, 'boarder'), (1995, 'morph'), (2984, 'tear'), (1956, 'mineola'), (2109, 'noted'), (2503, 'residence'), (2621, 'seeming'), (1567, 'hugging'), (642, 'cabos'), (1789, 'lighte'), (2442, 'rapid'), (1101, 'easily'), (1009, 'disappearred'), (1820, 'locations'), (1932, 'meters'), (2964, 'tag'), (2616, 'sedona'), (1953, 'milwaukee'), (1089, 'durning'), (1136, 'emitts'), (816, 'conected'), (2883, 'steep'), (753, 'climb'), (2858, 'stared'), (2122, 'numerous'), (3349, 'winnipeg'), (2248, 'parallel'), (1139, 'encircling'), (149, '4410000'), (2798, 'southward'), (1550, 'houses'), (799, 'commute'), (2478, 'reflecting'), (1229, 'falls'), (542, 'blazed'), (3082, 'trailed'), (1369, 'fyling'), (831, 'constantly'), (312, 'alignment'), (275, 'action'), (3263, 'wading'), (2356, 'pool'), (2677, 'ships'), (1950, 'million'), (3259, 'visitors'), (216, '50am'), (3367, 'woodbridge'), (2806, 'spaceship'), (3339, 'windows'), (1015, 'discernible'), (2301, 'photo'), (812, 'conclusive'), (3408, 'zagging'), (788, 'colours'), (2770, 'sohn'), (3226, 'vassar'), (2185, 'ora'), (1443, 'greece'), (1439, 'grants'), (2082, 'nm'), (553, 'blood'), (2897, 'strafe'), (2409, 'pure'), (3045, 'tinged'), (341, 'angel'), (3166, 'uncovered'), (1406, 'glide'), (1894, 'marlborough'), (1652, 'inward'), (1892, 'markings'), (2863, 'start'), (1013, 'disapwar'), (2504, 'residential'), (1862, 'madison'), (705, 'chattanooga'), (3233, 'venice'), (2193, 'orangish'), (152, '444'), (95, '20pm'), (346, 'annapolis'), (3224, 'varnville'), (1392, 'gigantic'), (17, '100kts'), (3356, 'withfour'), (768, 'cloverleaf'), (1193, 'exiting'), (155, '44a'), (923, 'daughter'), (528, 'bike'), (1709, 'kildonan'), (922, 'daugh'), (1614, 'independent'), (2233, 'overwhelme'), (3369, 'wooded'), (434, 'avoided'), (1384, 'george'), (2978, 'tan'), (2944, 'surface'), (2612, 'secluded'), (2437, 'ranch'), (389, 'arlington'), (1750, 'lat'), (892, 'cst'), (254, 'ability'), (2951, 'suspend'), (3006, 'themselves'), (2370, 'posted'), (255, 'able'), (2419, 'queens'), (1283, 'fla'), (3010, 'theory'), (2361, 'porthole'), (656, 'campsite'), (320, 'alongside'), (492, 'becomes'), (2665, 'shell'), (3146, 'twitching'), (2869, 'stati'), (837, 'contoocook'), (2073, 'nh'), (3252, 'village'), (1785, 'lifts'), (3073, 'trace'), (468, 'basketball'), (2374, 'practice'), (2308, 'pick'), (9, '06'), (2069, 'newfoundland'), (1554, 'hoverd'), (1945, 'might'), (2208, 'orion'), (513, 'belt'), (1610, 'incomprhensible'), (2859, 'starlight'), (2465, 'recently'), (2132, 'obect'), (1416, 'glowed'), (2852, 'staionary'), (1464, 'halt'), (995, 'directio'), (1694, 'jus'), (1070, 'drifts'), (2090, 'noislessly'), (1220, 'fades'), (2774, 'som'), (2845, 'squid'), (3327, 'wierd'), (1739, 'lanterns'), (1755, 'launched'), (279, 'adults'), (1438, 'granite'), (185, '44on'), (1833, 'loop'), (2989, 'telephone'), (2350, 'pole'), (595, 'briefly'), (3013, 'thermal'), (1663, 'issaquah'), (56, '1969'), (2709, 'silvery'), (833, 'containing'), (2019, 'movie'), (2546, 'rosebud'), (1233, 'farm'), (144, '413'), (1393, 'gila'), (1596, 'identify'), (1680, 'jig'), (1669, 'jag'), (1849, 'luminescence'), (728, 'cicular'), (1146, 'engaged'), (1045, 'dogfight'), (744, 'clarksville'), (1602, 'illuminating'), (127, '39ball'), (3169, 'underside'), (3333, 'wilmington'), (1299, 'flattened'), (1224, 'fairfax'), (2365, 'position'), (1824, 'longmont'), (3300, 'wester'), (850, 'cornfield'), (2407, 'pulses'), (2066, 'nevada'), (1587, 'i580'), (2128, 'oakland'), (1484, 'hayward'), (1763, 'leandro'), (277, 'actually'), (2973, 'talk'), (2992, 'tell'), (358, 'anyone'), (3323, 'why'), (2532, 'risk'), (1562, 'hu'), (3352, 'wish'), (3382, 'written'), (3272, 'want'), (3374, 'worked'), (2159, 'office'), (807, 'complex'), (605, 'bringing'), (3094, 'trash'), 
              (662, 'cans'), (1670, 'jan'), (2854, 'standard'), (1109, 'eating'), (1853, 'lunch'), (781, 'collor'), (944, 'defies'), (2307, 'physics'), (2070, 'newport'), (1762, 'leads'), (1837, 'loss'), (2160, 'officers'), (2042, 'mysterious'), (1158, 'equinox'), (77, '1st'), (1621, 'insects'), (470, 'bat'), (2445, 'ray'), (2268, 'patio'), (866, 'court'), (495, 'bedford'), (803, 'compelled'), (2491, 'rep'), (500, 'begin'), (835, 'continue'), (97, '2145'), (1466, 'hampshire'), (3348, 'winnepasauke'), (1497, 'heart'), (1191, 'exhibiting'), (2802, 'spa'), (1821, 'lone'), (2043, 'mystery'), (3188, 'unorderly'), (3418, 'zizzaged'), (912, 'darkened'), (1029, 'dissapeered'), (2294, 'period'), (305, 'albany'), (2282, 'peculiar'), (2659, 'shapeshifting'), (2987, 'teenagers'), (2906, 'streaked'), (303, 'alarming'), (2652, 'shadowey'), (516, 'beneath'), (1000, 'disa'), (2956, 'swarmed'), (1901, 'massachusetts'), (1783, 'lie'), (913, 'darkness'), (2406, 'pulsed'), (2530, 'rises'), (924, 'daughters'), (1235, 'farmhouse'), (2584, 'saucers'), (1407, 'glided'), (899, 'cycle'), (111, '30ftlength'), (217, '50feet'), (2420, 'question'), (1172, 'eugene'), (1772, 'lentil'), (1091, 'ea'), (704, 'chasing'), (716, 'chillicothe'), (1261, 'fill'), (2149, 'occasionally'), (252, 'abduction'), (1473, 'happen'), (481, 'beamed'), (388, 'arlee'), (543, 'bliking'), (48, '1953'), (6, '0300'), (534, 'birmingham'), (903, 'cylindical'), (1508, 'helium'), (1262, 'filled'), (3274, 'warm'), (2036, 'must'), (3201, 'urbana'), (157, '44around'), (1, '00am'), (175, '44i'), (1715, 'kitchen'), (166, '44circuler'), (1749, 'lasts'), (1519, 'highly'), (594, 'brief'), (2505, 'residue'), (742, 'cirular'), (2022, 'movingtoward'), (3008, 'thenhorizontally'), (2272, 'patterned'), (584, 'boyfriend'), (3055, 'tone'), (2388, 'proce'), (208, '45pm'), (687, 'cemetary'), (1756, 'laure'), (2341, 'plume'), (47, '1951'), (1449, 'grosse'), (2346, 'pointe'), (2232, 'overlooking'), (1804, 'linked'), (1223, 'fairchild'), (294, 'airforce'), (1854, 'lustrous'), (625, 'burien'), (3151, 'tysons'), (1895, 'marriot'), (1217, 'factory'), (545, 'blinding'), (978, 'difference'), (1828, 'looke'), (776, 'coconut'), (2393, 'prove'), (2058, 'need'), (1048, 'doi'), (800, 'commuting'), (521, 'bethesda'), (1461, 'hagerstown'), (2710, 'simi'), (2517, 'reviewing'), (2418, 'quarter'), (3160, 'unbelievable'), (2357, 'pop'), (1919, 'mena'), (300, 'akansas'), (2452, 'realeases'), (2756, 'smells'), (2126, 'nyc'), (245, '911'), (113, '311'), (1187, 'exchange'), (615, 'brownish'), (2032, 'multipe'), (2466, 'recived'), (2300, 'phone'), (353, 'answered'), (3309, 'whats'), (399, 'arrowhead'), (2139, 'oblonged'), (2736, 'skipping'), (2502, 'resembles'), (1387, 'gets'), (2025, 'mst'), (2426, 'rachel'), (123, '375'), (1468, 'han'), (3227, 've'), (3089, 'transco'), (1204, 'exploding'), (2808, 'sparatically'), (1697, 'kalih'), (695, 'chandler'), (2355, 'pomona'), (1367, 'fwy'), (63, '1978'), (2830, 'spokane'), (2167, 'olathe'), (28, '135th'), (1627, 'instructed'), (2396, 'pu'), (2253, 'parkland'), (1495, 'heads'), (2749, 'slower'), (2237, 'pace'), (308, 'algona'), (37, '167'), (2591, 'sb'), (2146, 'observery'), (921, 'datetime'), (746, 'classical'), (956, 'desacends'), (1133, 'emits'), (1779, 'lexington'), (459, 'barbourville'), (2071, 'newton'), (651, 'camden'), (1266, 'fine'), (3288, 'weaverville'), (2939, 'sunroof'), (2861, 'starring'), (2777, 'somethi'), (2063, 'neighbours'), (283, 'afar'), (3320, 'whitish'), (1615, 'independently'), (2431, 'raid'), (196, '44slow'), (3403, 'yucaipa'), (2220, 'outerspace'), (2434, 'rainer'), (82, '2003'), (3171, 'unexpected'), (2461, 'reappearing'), (3035, 'throughout'), (1727, 'lafayette'), (2603, 'scottsville'), (2804, 'spacecraft'), (1732, 'laminating'), (586, 'bozeman'), (1264, 'finally'), (1769, 'led'), (1883, 'manuevers'), (838, 'contorting'), (163, '44bright'), (189, '44rectangular'), (2328, 'plain'), (527, 'bight'), (2456, 'reapearing'), (2825, 'spiltting'), (1344, 'fortunate'), (567, 'bonneville'), (1298, 'flats'), (1648, 'investigator'), (2404, 'pulsations'), (2007, 'motorcycle'), (1544, 'hospital'), (3310, 'whatsoever'), (1470, 'hang'), (1356, 'friendship'), (1528, 'hobbs'), (2266, 'patches'), (1103, 'easter'), (2666, 'sheridan'), (1909, 'mdt'), (11, '08'), (230, '696'), (1249, 'ferndale'), (1066, 'dream'), (3128, 'true'), (2919, 'strob'), (780, 'college'), (1346, 'fourth'), (2972, 'talahasee'), (1434, 'gradually'), (1305, 'flickers'), (3258, 'visiting'), (145, '41st'), (431, 'avenue'), (182, '44myself'), (939, 'decided'), (2732, 'ski'), (2306, 'physical'), (257, 'abnormalities'), (2745, 'sligtly'), (2140, 'obscurred'), (3178, 'uniontown'), (968, 'diam'), (192, '44round'), (891, 'cruses'), (1073, 'driv')]

vocab = {k:v for k,v in vocab_list}

def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices if i in vocab and i in zipped})
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index if i in original_vocab]

In [215]:
import warnings
warnings.filterwarnings('ignore')

# Make a list of features to drop
to_drop = ufo[['city', 'state', 'country', 'lat', 'long', 'date', 'recorded', 'length_of_time', 'desc', 'seconds', 'minutes']]

# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)

# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)

In [221]:
ufo = ufo_dropped.drop(['type'], axis=1)

In [222]:
X = ufo.drop('country_enc', axis=1)
y = ufo['country_enc']

In [224]:
# Take a look at the features in the X set of data
print(X.columns)

# Split the X and y sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit knn to the training sets
knn.fit(X_train, y_train)

# Print the score of knn on the test sets
print(knn.score(X_test, y_test))

Index(['seconds_log', 'changing', 'chevron', 'cigar', 'circle', 'cone',
       'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash',
       'formation', 'light', 'other', 'oval', 'rectangle', 'sphere',
       'teardrop', 'triangle', 'unknown', 'month', 'year'],
      dtype='object')
0.8523421588594705


In [225]:
# Use the list of filtered words we created to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]

# Split the X and y sets using train_test_split, setting stratify=y 
X_train, X_test, y_train, y_test = train_test_split(filtered_text.toarray(), y, stratify=y, random_state=42)

# Fit nb to the training sets
nb.fit(X_train, y_train)

# Print the score of nb on the test sets
print(nb.score(X_test, y_test))

0.6283095723014257
