In [4]:
import pandas as pd
import numpy as np
import re

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


# ---- Read every raw input file up front ----
volunteer_org = pd.read_csv("./data/volunteer_opportunities.csv")
wine = pd.read_csv("./data/wine_types.csv")
hiking = pd.read_json("./data/hiking.json")
ufo = pd.read_csv("./data/ufo_sightings_large.csv")

# ---- Frames derived from the raw inputs ----
# Wine: four feature columns plus the "Type" column as the target
X = wine[
    [
        "Proline",
        "Total phenols",
        "Hue",
        "Nonflavanoid phenols",
    ]
]
y = wine["Type"]

# Volunteer opportunities: keep only the columns listed here
volunteer = volunteer_org[
    [
        "vol_requests",
        "title",
        "hits",
        "category_desc",
        "locality",
        "region",
        "postalcode",
        "created_date",
    ]
]

# ---- Small hand-written table: five 5k times for each of six runners ----
running_times_5k = pd.DataFrame(
    data={
        "name": ["Sue", "Mark", "Sean", "Erin", "Jenny", "Russell"],
        "run1": [20.1, 16.5, 23.5, 21.7, 25.8, 30.9],
        "run2": [18.5, 17.1, 25.1, 21.1, 27.1, 29.6],
        "run3": [19.6, 16.9, 25.2, 20.9, 26.1, 31.4],
        "run4": [20.3, 17.6, 24.6, 22.1, 26.7, 30.4],
        "run5": [18.3, 17.3, 23.9, 22.2, 26.9, 29.9],
    }
)

# Last expression of the cell: display the first UFO rows
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,11/21/2002 05:45,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,8/19/2010 12:55,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


In [5]:
# Summarise the raw frame. DataFrame.info() writes its report to stdout by
# itself and returns None, so it is called bare: wrapping it in print() only
# appends a stray "None" line after the report.
ufo.info()

# Make sure the seconds column is stored as float
ufo["seconds"] = ufo["seconds"].astype(float)

# Parse the date strings into datetime values
ufo["date"] = pd.to_datetime(ufo["date"])

# Summarise again to confirm the column types after the conversions
ufo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            4935 non-null   object 
 1   city            4926 non-null   object 
 2   state           4516 non-null   object 
 3   country         4255 non-null   object 
 4   type            4776 non-null   object 
 5   seconds         4935 non-null   float64
 6   length_of_time  4792 non-null   object 
 7   desc            4932 non-null   object 
 8   recorded        4935 non-null   object 
 9   lat             4935 non-null   object 
 10  long            4935 non-null   float64
dtypes: float64(2), object(9)
memory usage: 424.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            4935 non-null   d

In [8]:
# Per-column count of missing values, in the order length_of_time, state, type
na_counts = ufo[["length_of_time", "state", "type"]].isnull().sum()
print(na_counts)

# Drop every row that has a missing value in ANY column.
# NOTE(review): dropna() without `subset=` is not limited to the three columns
# counted above, so rows missing e.g. country or city are removed as well.
# Confirm that is intended before narrowing it with `subset=[...]`.
ufo_no_missing = ufo.dropna(axis=0, how="any")

# (rows, columns) left after the drop
print(ufo_no_missing.shape)

In [19]:
# Remove the rows whose length_of_time text contains a colon.
# - regex=False matches ":" as a literal substring instead of compiling it as
#   a regular expression.
# - .copy() makes the filtered result an independent frame; new columns are
#   assigned to ufo_no_missing in later cells, and assigning to a boolean-filtered
#   slice is what raises pandas' SettingWithCopyWarning.
ufo_no_missing = ufo_no_missing[
    ~ufo_no_missing["length_of_time"].str.contains(":", regex=False)
].copy()

In [22]:
def return_minutes(time_string):
    """Return the first run of digits found in ``time_string`` as an ``int``.

    Only the leading number is extracted; the unit word that follows it
    ("weeks", "sec.", "minutes", ...) is not interpreted, so the result is a
    true minute count only when the text itself is expressed in minutes.

    Parameters
    ----------
    time_string : str
        Free-text duration, e.g. ``"about 5 minutes"``.

    Returns
    -------
    int or None
        The first integer in the text, or ``None`` when the input is not a
        string (for example a missing value) or contains no digits.
    """
    # Missing cells reach .apply() as float NaN / None; re.search() would raise
    # TypeError on them, so treat any non-string as "no number found".
    if not isinstance(time_string, str):
        return None

    # Search for the first group of consecutive digits
    num = re.search(r"\d+", time_string)
    return int(num.group(0)) if num is not None else None
        
# Run return_minutes over every length_of_time entry and store the result as "minutes"
duration_text = ufo_no_missing["length_of_time"]
ufo_no_missing["minutes"] = duration_text.apply(return_minutes)

# Show the raw text next to the extracted number for the first rows
preview_cols = ["length_of_time", "minutes"]
print(ufo_no_missing[preview_cols].head())

    length_of_time  minutes
0          2 weeks      2.0
1           30sec.     30.0
3  about 5 minutes      5.0
4                2      2.0
5       10 minutes     10.0


In [26]:
# Keep only sightings whose duration in seconds is non-zero: the seconds column
# is log-transformed in the next cell and np.log(0) is -inf.
# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
ufo_no_missing = ufo_no_missing[ufo_no_missing["seconds"] != 0].copy()

In [27]:
# Compare the spread of the two duration columns
duration_variances = ufo_no_missing[["seconds", "minutes"]].var()
print(duration_variances)

# Natural-log transform of seconds, kept as a new column
seconds_column = ufo_no_missing["seconds"]
ufo_no_missing["seconds_log"] = np.log(seconds_column)

# Variance of the transformed column on its own
seconds_log_variance = ufo_no_missing["seconds_log"].var()
print(seconds_log_variance)

seconds    3.623519e+06
minutes    1.006533e+03
dtype: float64
1.2657316625460762


In [28]:
# Binary flag: 1 when country is exactly "us", 0 for anything else
ufo_no_missing["country_enc"] = ufo_no_missing["country"].apply(
    lambda country: int(country == "us")
)

# How many distinct values the type column holds
distinct_types = pd.unique(ufo_no_missing["type"])
print(len(distinct_types))

# One indicator column per type value
type_set = pd.get_dummies(ufo_no_missing["type"])

# Append the indicator columns to the right of the existing ones
ufo_no_missing = pd.concat([ufo_no_missing, type_set], axis="columns")

21


In [30]:
# Look at the first 5 rows of the date column
print(ufo_no_missing["date"].head())

# Extract month and year through the vectorised .dt accessor rather than
# calling a Python lambda once per row; the date column holds datetime
# values, so .dt.month / .dt.year read the components for the whole column.
ufo_no_missing["month"] = ufo_no_missing["date"].dt.month
ufo_no_missing["year"] = ufo_no_missing["date"].dt.year

# Take a look at the head of all three columns
print(ufo_no_missing[["date", "month", "year"]].head())

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
3   2002-11-21 05:45:00
4   2010-08-19 12:55:00
5   2012-06-16 23:00:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010
5 2012-06-16 23:00:00      6  2012


In [31]:
# Preview the free-text descriptions
desc_text = ufo_no_missing["desc"]
print(desc_text.head())

# Build a tf-idf vectorizer with its default settings
vec = TfidfVectorizer()

# Learn the vocabulary from desc and produce its tf-idf matrix in one call
desc_tfidf = vec.fit_transform(desc_text)

# Shape of the resulting matrix: (rows, columns)
print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
3    It was a large&#44 triangular shaped flying ob...
4       A white spinning disc in the shape of an oval.
5    Dancing lights that would fly around and then ...
Name: desc, dtype: object
(3820, 5319)


In [37]:
# Disabled cell: everything between the triple quotes is a string literal, so
# the two helper definitions below are NOT executed and neither name is bound.
# As written in the text:
#   - return_weights pairs .indices with .data for vector[vector_index],
#     relabels the values through `vocab`, sorts them in descending order and
#     returns original_vocab[...] for the first top_n labels.
#   - words_to_filter calls return_weights for every i in
#     range(vector.shape[0]) and returns the set of everything collected.
# NOTE(review): presumably `vector` is a scipy sparse tf-idf matrix and
# `original_vocab` a word -> column-index mapping -- confirm before re-enabling.
'''
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]

def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    
    for i in range(0, vector.shape[0]):
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
        
    return set(filter_list)
'''

In [38]:
'''
vocab = {1664: 'it',  3275: 'was', 1744: 'large', 147: '44', 3123: 'triangular', 2657: 'shaped', 1320: 'flying', 2134: 'object', 910: 'dancing', 1794: 'lights', 3002: 'that', 3379: 'would', 1319: 'fly', 395: 'around', 340: 'and', 3007: 'then', 1923: 'merge', 1645: 'into', 2173: 'one', 1787: 'light', 604: 'brilliant', 2188: 'orange', 2184: 'or', 718: 'chinese', 1738: 'lantern', 412: 'at', 1774: 'less', 3001: 'than', 15: '1000', 1363: 'ft', 2021: 'moving', 1102: 'east', 3050: 'to', 3298: 'west', 273: 'across', 2130: 'oakville', 2176: 'ontario', 1942: 'midnight', 1690: 'june', 251: '9th', 92: '2013', 596: 'bright', 2472: 'red', 2097: 'north', 1360: 'from', 3003: 'the', 1539: 'horizon', 3041: 'till', 1003: 'disapeared', 502: 'behind', 766: 'clouds', 2793: 'south', 1276: 'first', 2766: 'so', 873: 'craft', 1462: 'half', 1063: 'dozen', 2899: 'stragglers', 3015: 'they', 3296: 'were', 2943: 'surely', 2107: 'not', 2330: 'planes', 2094: 'nor', 449: 'ball', 2157: 'of', 2751: 'slowly', 2872: 'stationary', 2031: 'multicolored', 738: 'circular', 1926: 'met', 637: 'by', 351: 'another', 3315: 'which', 2435: 'raised', 1915: 'meet', 3057: 'too', 1025: 'displayed', 3153: 'ufo', 1421: 'going', 2737: 'sky', 3206: 'uso', 3280: 'watched', 3202: 'us', 1331: 'for', 864: 'couple', 1963: 'minutes', 1326: 'follows', 3085: 'train', 3077: 'tracks', 1607: 'in', 3350: 'winter', 46: '1931', 2586: 'saw', 1859: 'machine', 401: 'as', 2524: 'riding', 2171: 'on', 1543: 'horse', 1065: 'draw', 2265: 'pasture', 13: '10', 2343: 'pm', 23: '12', 249: '99', 2602: 'scottsdale', 386: 'arizona', 2778: 'something', 1482: 'have', 2067: 'never', 2623: 'seen', 498: 'before', 3097: 'traveling', 259: 'above', 1414: 'glow', 691: 'central', 1987: 'montana', 1928: 'metalic', 2822: 'sphere', 2551: 'rotating', 1240: 'fast', 2271: 'pattern', 3287: 'we', 1076: 'driving', 3069: 'town', 3011: 'there', 1656: 'is', 1424: 'golf', 865: 'course', 1530: 'holes', 760: 'close', 2535: 'road', 3312: 'when', 313: 'all', 2954: 'suuden', 
2262: 'passing', 3137: 'turned', 2158: 'off', 474: 'bayou', 558: 'blvd', 2177: 'onto', 1420: 'godwinson', 337: 'an', 1209: 'extremely', 1061: 'down', 3355: 'with', 3081: 'trail', 2757: 'smok', 1337: 'formation', 167: '44counted', 31: '15', 2198: 'orbs', 3190: 'until', 3219: 'vanished', 628: 'bursts', 782: 'color', 811: 'concentrated', 384: 'area', 764: 'cloud', 486: 'beautiful', 2708: 'silver', 784: 'colored', 2583: 'saucer', 258: 'about', 2729: 'size', 2553: 'round', 42: '18', 3311: 'wheeler', 3141: 'turquoise', 2293: 'perimeter', 750: 'clearly', 125: '3997', 8: '05', 225: '64', 4: '02', 0: '00', 1808: 'little', 881: 'creek', 1723: 'ky', 1572: 'humming', 2085: 'noise', 1549: 'house', 2654: 'shaking', 2754: 'small', 1914: 'medium', 2730: 'sized', 536: 'black', 2471: 'rectangular', 2647: 'several', 3390: 'years', 3104: 'tree', 1800: 'line', 2622: 'seemingly', 627: 'burning', 3194: 'up', 2966: 'tail', 1032: 'dissipated', 3067: 'towards', 1141: 'end', 2698: 'sight', 524: 'big', 1966: 'miss', 187: '44orange', 2475: 'redish', 2136: 'objects', 1312: 'floating', 1533: 'home', 2643: 'sets', 3147: 'two', 2940: 'sunset', 2485: 'remained', 3316: 'while', 2211: 'other', 2013: 'moved', 2800: 'southwest', 2875: 'stayed', 3126: 'tripled', 2725: 'sitting', 671: 'car', 1831: 'looking', 2715: 'singal', 1147: 'engine', 2329: 'plane', 2932: 'suddenly', 1014: 'disc', 3295: 'went', 2263: 'past', 3240: 'very', 3319: 'white', 703: 'chased', 219: '52', 2054: 'near', 3380: 'wright', 2274: 'patterson', 291: 'air', 1332: 'force', 466: 'base', 2373: 'power', 2218: 'out', 652: 'came', 2620: 'seemed', 733: 'circle', 793: 'come', 442: 'back', 2421: 'quickly', 1695: 'just', 3218: 'vanishe', 3022: 'thirteen', 2038: 'my', 1437: 'grandmother', 2000: 'mother', 1483: 'having', 1593: 'ice', 880: 'cream', 2224: 'outside', 990: 'diner', 205: '45', 214: '50', 947: 'degrees', 1418: 'glows', 699: 'changing', 979: 'different', 289: 'again', 3393: 'yellow', 1556: 'hovering', 962: 'description', 2367: 
'possible', 2700: 'sighting', 2886: 'still', 2377: 'present', 2857: 'star', 1796: 'like', 3052: 'together', 2571: 'same', 2816: 'speed', 987: 'dimmed', 1007: 'disappeared', 1289: 'flare', 713: 'chevron', 2705: 'silent', 2229: 'over', 108: '30', 1954: 'min', 682: 'caught', 3246: 'video', 2980: 'tape', 1270: 'fireball', 286: 'after', 1252: 'few', 1445: 'greenish', 970: 'diamond', 1279: 'five', 2755: 'smaller', 929: 'daytime', 2076: 'night', 964: 'desert', 1555: 'hovered', 2814: 'sped', 438: 'away', 1969: 'mississauga', 658: 'canada', 368: 'appeared', 477: 'be', 1517: 'high', 2368: 'possibly', 357: 'any', 1098: 'earthling', 1861: 'made', 1665: 'item', 1612: 'incredible', 2900: 'straight', 2084: 'no', 2788: 'sound', 1297: 'flat', 549: 'blinking', 1729: 'lake', 2210: 'oswego', 2425: 'quot', 1365: 'full', 2685: 'shortly', 21: '11', 35: '16', 2936: 'sunday', 943: 'defied', 282: 'aerodynamiccs', 1719: 'know', 554: 'blue', 2410: 'purple', 1219: 'faded', 909: 'danced', 885: 'cricle', 2529: 'rise', 1011: 'disapper', 2512: 'retiring', 1430: 'got', 494: 'bed', 2544: 'room', 911: 'dark', 3337: 'window', 2744: 'slightly', 2179: 'open', 1489: 'head', 1770: 'left', 1378: 'gateway', 298: 'airport', 1924: 'merged', 352: 'anouther', 489: 'became', 3040: 'tight', 197: '44straight', 374: 'approx', 124: '39', 2656: 'shape', 1630: 'intense', 1092: 'each', 770: 'cntr', 191: '44rest', 1324: 'followed', 2144: 'observer', 2889: 'stoppe', 616: 'brownwood', 2999: 'texas', 117: '33', 1900: 'mass', 2832: 'sporadically', 1611: 'inconsistent', 2791: 'sounds', 3284: 'waves', 106: '29', 151: '442008', 242: '8pm', 181: '44my', 3402: 'yr', 2168: 'old', 1398: 'girls', 407: 'asked', 3308: 'what', 1598: 'if', 2679: 'shooting', 3053: 'told', 3005: 'them', 2018: 'moves', 504: 'being', 1020: 'disk', 1254: 'field', 232: '70', 133: '39s', 2258: 'pass', 2335: 'play', 702: 'chase', 366: 'appear', 324: 'altitude', 1088: 'during', 749: 'clear', 926: 'day', 179: '44make', 1005: 'disappear', 1465: 'hammond', 1600: 
'illinois', 2960: 'sylvania', 1907: 'mccord', 1691: 'junior', 2219: 'outer', 2803: 'space', 78: '20', 1829: 'looked', 2862: 'stars', 1292: 'flash', 599: 'brightlights', 336: 'amp', 3292: 'weird', 2088: 'noisies', 2902: 'strange', 1476: 'happennings', 3268: 'walking', 1817: 'local', 457: 'bar', 1896: 'mars', 490: 'because', 657: 'can', 3293: 'well', 2525: 'right', 2117: 'now', 2403: 'pulsating', 520: 'bethel', 2250: 'park', 2236: 'pa', 721: 'christmas', 1174: 'eve', 2572: 'san', 356: 'antonio', 653: 'camera', 945: 'definitely', 857: 'could', 1597: 'idetify', 3023: 'this', 2200: 'ordinary', 1306: 'flies', 2706: 'silently', 1176: 'evening', 2340: 'please', 2506: 'respect', 2386: 'privacy', 697: 'changed', 998: 'directions', 1196: 'experience', 3184: 'unkown', 3230: 'vegas', 856: 'couch', 2604: 'screen', 1053: 'door', 1475: 'happened', 2617: 'see', 296: 'airplane', 632: 'but', 762: 'closer', 2194: 'orb', 463: 'barnes', 2446: 'rd', 3269: 'wallingford', 2225: 'oval', 555: 'blueish', 3412: 'zig', 3409: 'zaging', 1028: 'dissapears', 2898: 'strage', 2163: 'ohio', 2533: 'river', 2746: 'slow', 2813: 'spectacular', 3117: 'triangle', 2619: 'seem', 2605: 'se', 973: 'did', 2012: 'move', 1822: 'long', 1105: 'eastern', 2439: 'random', 2405: 'pulse', 292: 'aircraft', 2381: 'pressure', 3283: 'wave', 290: 'ahead', 1444: 'green', 2922: 'strobing', 1086: 'duration', 2120: 'nuforc', 2108: 'note', 2717: 'sirius', 2277: 'pd', 508: 'bell', 1026: 'dissapear', 2458: 'reappear', 1985: 'monmouth', 863: 'county', 2068: 'new', 1675: 'jersey', 1886: 'maple', 1455: 'grove', 364: 'apparent', 1273: 'firey', 574: 'bottom', 2305: 'photos', 670: 'captured', 1080: 'drops', 1751: 'late', 287: 'afternoon', 2436: 'raleigh', 2052: 'nc', 377: 'april', 243: '8th', 2216: 'our', 1838: 'lost', 1996: 'morphed', 1008: 'disappearing', 2836: 'spotted', 790: 'columbus', 2024: 'ms', 1948: 'military', 293: 'aircrafts', 369: 'appearing', 1646: 'investigate', 1447: 'grey', 1354: 'friend', 1239: 'fashion', 2697: 
'sideways', 994: 'direct', 1366: 'future', 445: 'backyard', 2767: 'soaring', 1295: 'flashing', 1165: 'erratically', 1846: 'low', 98: '22', 1494: 'headlights', 690: 'centered', 101: '25', 25: '13', 140: '40', 2844: 'squared', 3241: 'vessel', 1860: 'macon', 1370: 'ga', 2244: 'paper', 1685: 'journal', 2867: 'stated', 1989: 'month', 2578: 'satalites', 3256: 'visible', 137: '39winking', 3154: 'ufos', 700: 'charleston', 2592: 'sc', 2909: 'streaming', 2823: 'spheres', 955: 'des', 1978: 'moines', 1653: 'iowa', 453: 'balls', 1338: 'formations', 1677: 'jets', 3066: 'toward', 1702: 'kennedy', 689: 'center', 3249: 'viewed', 775: 'cocoa', 1281: 'fl', 1417: 'glowing', 476: 'bday', 2256: 'party', 1125: 'else', 328: 'am', 2498: 'reporting', 1457: 'gulf', 406: 'ashtabula', 2719: 'sister', 612: 'brother', 1757: 'law', 1478: 'hard', 1198: 'explain', 3031: 'three', 2928: 'suburb', 1269: 'fire', 510: 'beloit', 3324: 'wi', 570: 'border', 2278: 'pea', 2344: 'pod', 1345: 'four', 1790: 'lighted', 2281: 'peas', 3059: 'top', 3360: 'witnessed', 3044: 'times', 3110: 'tremendeous', 1164: 'erratic', 2014: 'moveme', 3095: 'travel', 2100: 'northern', 1314: 'florida', 2408: 'pulsing', 2686: 'shot', 2538: 'rocket', 3185: 'unlike', 1936: 'mi', 2789: 'soundless', 2727: 'six', 3025: 'those', 467: 'baseball', 2920: 'strobe', 1768: 'lebanon', 893: 'ct', 1586: 'i4', 1192: 'exit', 29: '14', 2824: 'spherical', 1463: 'halo', 380: 'arc', 2231: 'overhead', 2487: 'remember', 2240: 'pale', 743: 'city', 1087: 'durham', 1325: 'following', 996: 'direction', 1806: 'lite', 3034: 'through', 2143: 'observed', 593: 'brie', 1232: 'far', 1413: 'glod', 2111: 'nothig', 3398: 'you', 2027: 'much', 2074: 'nice', 1922: 'mercey', 1545: 'hot', 2840: 'springs', 2827: 'spinning', 2773: 'solid', 2674: 'shinny', 3092: 'transparent', 2833: 'spot', 1492: 'heading', 1566: 'huge', 2189: 'orangeish', 1340: 'forming', 898: 'curved', 1460: 'had', 2781: 'somewhere', 688: 'cen', 36: '160', 1993: 'more', 3150: 'type', 1339: 'formed', 2615: 
'seconds', 3286: 'way', 1791: 'lighthouse', 1043: 'does', 729: 'cigar', 1910: 'me', 1120: 'electric', 1616: 'indian', 673: 'carolina', 2260: 'passenger', 2609: 'seat', 844: 'conversion', 3214: 'van', 1981: 'mom', 2694: 'siblings', 408: 'asleep', 1134: 'emitted', 927: 'daylight', 2566: 'rush', 1547: 'hour', 1441: 'gray', 439: 'awesome', 3396: 'yet', 882: 'creepy', 2831: 'spooky', 2669: 'shined', 2671: 'shiniest', 722: 'chrome', 1182: 'ever', 805: 'completely', 2422: 'quiet', 2699: 'sighted', 1746: 'las', 2496: 'report', 1179: 'event', 3294: 'wells', 198: '44texas', 3358: 'without', 109: '300', 141: '400', 1245: 'feet', 2299: 'phoenix', 441: 'az', 2186: 'orage', 1405: 'glenville', 3384: 'wv', 1557: 'hovers', 1552: 'houston', 3148: 'tx', 1227: 'fall', 60: '1973', 2834: 'spotlight', 1971: 'mist', 3199: 'upward', 2002: 'motion', 1967: 'missile', 1754: 'launch', 200: '44then', 2187: 'orang', 3281: 'watching', 2010: 'mountain', 1231: 'fanwood', 2081: 'nj', 2638: 'series', 2801: 'southwestern', 2985: 'teardrop', 3282: 'water', 1805: 'lit', 1301: 'flew', 573: 'bothell', 3261: 'wa', 1929: 'metallic', 1042: 'dodge', 2911: 'street', 1206: 'exprsway', 2170: 'omaha', 2053: 'ne', 1487: 'hbccufo', 659: 'canadian', 974: 'didn', 134: '39t', 2946: 'surrounded', 2064: 'neon', 1553: 'hover', 914: 'dart', 1902: 'massive', 785: 'colorful', 1624: 'instantly', 1520: 'highway', 112: '31', 1617: 'indiana', 957: 'descend', 3106: 'trees', 1521: 'hill', 999: 'directly', 3236: 'vernon', 1415: 'glowball', 1361: 'front', 496: 'bedroom', 1247: 'felt', 633: 'butterfly', 2628: 'sensation', 712: 'chest', 3000: 'th', 901: 'cylinder', 414: 'atlantic', 2152: 'ocean', 774: 'coastal', 1529: 'hobe', 1937: 'miami', 310: 'alien', 2339: 'playing', 1343: 'forth', 1117: 'egg', 5: '03', 1724: 'l7', 1548: 'hours', 3221: 'vanishing', 1767: 'leaving', 560: 'bobbing', 3183: 'unknown', 3144: 'twinkling', 849: 'corners', 3168: 'underneath', 393: 'arou', 3017: 'thin', 2321: 'pinkish', 2528: 'rings', 1069: 'drifting', 
2026: 'mt', 3210: 'va', 2371: 'potomac', 1096: 'early', 50: '1963', 51: '1964', 2490: 'reno', 2913: 'streets', 3130: 'trying', 1265: 'find', 1546: 'hotel', 2114: 'noticed', 2312: 'pie', 1472: 'hanging', 739: 'circus', 2251: 'parki', 57: '197', 1490: 'headed', 3373: 'work', 1538: 'horizion', 2309: 'pics', 938: 'decide', 3400: 'yourself', 564: 'bolingbrook', 1904: 'may', 41: '17th', 81: '2001', 1912: 'meadows', 1771: 'length', 1303: 'flickered', 2494: 'replace', 2923: 'strong', 2238: 'pacific', 1799: 'lincoln', 2606: 'sea', 1459: 'gypsy', 478: 'beach', 796: 'coming', 3100: 'travels', 1962: 'minute', 404: 'ascends', 1408: 'gliding', 1877: 'manhattan', 991: 'dinner', 2775: 'some', 858: 'couldn', 503: 'beileve', 1836: 'los', 342: 'angeles', 1888: 'march', 83: '2004', 1241: 'faster', 359: 'anything', 2029: 'multi', 787: 'coloured', 541: 'blasts', 2667: 'shifting', 2957: 'swaying', 93: '2055', 1561: 'hrs', 1114: 'edt', 2716: 'single', 3111: 'tremendously', 1203: 'explodes', 276: 'activity', 1700: 'keller', 2580: 'satellites', 410: 'associated', 708: 'chemtrails', 3096: 'traveled', 3192: 'unusual', 1880: 'manner', 2345: 'point', 3410: 'zags', 1834: 'loops', 1959: 'minnesotas', 3371: 'woods', 2056: 'nearly', 1157: 'equilateral', 848: 'corner', 3063: 'total', 2707: 'sillouette', 2693: 'si', 1601: 'illuminated', 1021: 'disks', 2011: 'mountains', 1284: 'flame', 2098: 'northeast', 1819: 'location', 1385: 'georgia', 1208: 'extreme', 1304: 'flickering', 3132: 'tubular', 950: 'delta', 572: 'both', 934: 'debris', 2701: 'sightings', 1714: 'kingstown', 2521: 'ri', 2155: 'odd', 1778: 'lewisville', 2563: 'running', 1760: 'leader', 2565: 'rural', 2997: 'tests', 2771: 'soil', 1364: 'fuel', 686: 'cell', 3222: 'vapor', 1047: 'dogs', 461: 'barking', 1842: 'loudly', 1287: 'flanked', 2174: 'ones', 2721: 'sited', 2314: 'pier', 2276: 'pawleys', 1659: 'island', 491: 'become', 420: 'auburn', 1244: 'federal', 1851: 'luminous', 2325: 'pittsburgh', 3389: 'year', 1036: 'distinct', 2045: 'naked', 
1210: 'eye', 3405: 'yuma', 2015: 'movement', 3347: 'winn', 1040: 'dixie', 2252: 'parking', 1839: 'lot', 1825: 'longwood', 3317: 'whippany', 1998: 'morris', 1747: 'last', 2760: 'smoking', 731: 'cigarette', 327: 'always', 1041: 'do', 2493: 'repeated', 1140: 'encounter', 65: '1980', 483: 'beardstown', 818: 'confirmed', 2049: 'nature', 3079: 'traditional', 1336: 'format', 2359: 'port', 645: 'california', 3229: 'veers', 3220: 'vanishes', 1885: 'many', 1056: 'dots', 2413: 'put', 2222: 'outlined', 3099: 'travelling', 2675: 'shiny', 1006: 'disappeards', 982: 'dim', 322: 'also', 888: 'crossed', 1442: 'great', 1034: 'distances', 2683: 'short', 335: 'amount', 3043: 'time', 2459: 'reappeare', 735: 'circles', 2829: 'splitting', 2722: 'siting', 3215: 'vancouver', 1485: 'haze', 248: '96', 405: 'ashland', 2202: 'oregon', 72: '1996', 1797: 'lima', 1927: 'metal', 862: 'country', 2866: 'state', 332: 'amber', 1765: 'least', 224: '6000', 233: '7000', 262: 'absolutely', 1280: 'fixed', 2044: 'nailed', 2326: 'place', 475: 'bb', 1504: 'held', 391: 'arms', 110: '30am', 3027: 'thought', 499: 'began', 530: 'binoculars', 1578: 'husband', 497: 'been', 2618: 'seeing', 3302: 'western', 231: '6pm', 1183: 'every', 1580: 'hw', 1814: 'lo', 2507: 'rest', 1293: 'flashed', 2516: 'revealed', 121: '35ish', 3328: 'wife', 2681: 'shopping', 446: 'bag', 741: 'ciruclar', 1079: 'dropping', 1291: 'flares', 1423: 'golden'}
'''

In [40]:
# Disabled cell: the statements below sit inside a triple-quoted string, so
# nothing is executed; the cell only evaluates to (and echoes) that string.
# NOTE(review): if re-enabled, this needs `words_to_filter` and `vocab` to be
# defined as real objects; in the visible part of this notebook both appear
# only inside string literals. Also confirm that the indices in `vocab` match
# the columns of `desc_tfidf` before using them together.
'''
# Make a list of features to drop
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]

# Drop those features
ufo_dropped = ufo_no_missing.drop(to_drop, axis=1)

# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)
'''

'\n# Make a list of features to drop\nto_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]\n\n# Drop those features\nufo_dropped = ufo_no_missing.drop(to_drop, axis=1)\n\n# Let\'s also filter some words out of the text vector we created\nfiltered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)\n'

In [41]:
# Disabled cell: the statements below sit inside a triple-quoted string, so
# nothing is executed; the cell only evaluates to (and echoes) that string.
# NOTE(review): `knn` is not created in the visible cells (KNeighborsClassifier
# is only imported), and the X / y visible in this notebook are built from the
# wine data in the first cell -- confirm which features and labels are meant
# before re-enabling.
'''
# Take a look at the features in the X set of data
print(X.columns)

# Split the X and y sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit knn to the training sets
knn.fit(X_train, y_train)

# Print the score of knn on the test sets
print(knn.score(X_test, y_test))
'''

'\n# Take a look at the features in the X set of data\nprint(X.columns)\n\n# Split the X and y sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)\n\n# Fit knn to the training sets\nknn.fit(X_train, y_train)\n\n# Print the score of knn on the test sets\nprint(knn.score(X_test, y_test))\n'

In [42]:
# Disabled cell: the statements below sit inside a triple-quoted string, so
# nothing is executed; the cell only evaluates to (and echoes) that string.
# NOTE(review): `nb` and `filtered_words` are not created by any executed cell
# visible in this notebook (GaussianNB is only imported), and `y` would need
# one label per row of `desc_tfidf` -- confirm all three before re-enabling.
'''
# Use the list of filtered words we created to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]

# Split the X and y sets using train_test_split, setting stratify=y 
X_train, X_test, y_train, y_test = train_test_split(filtered_text.toarray(), y, stratify=y, random_state=42)

# Fit nb to the training sets
nb.fit(X_train, y_train)

# Print the score of nb on the test sets
print(nb.score(X_test, y_test))
'''

'\n# Use the list of filtered words we created to filter the text vector\nfiltered_text = desc_tfidf[:, list(filtered_words)]\n\n# Split the X and y sets using train_test_split, setting stratify=y \nX_train, X_test, y_train, y_test = train_test_split(filtered_text.toarray(), y, stratify=y, random_state=42)\n\n# Fit nb to the training sets\nnb.fit(X_train, y_train)\n\n# Print the score of nb on the test sets\nprint(nb.score(X_test, y_test))\n'