[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dgdelisss/Blue2_HW6_UFO_Text/blob/master/TextMining.ipynb)

**Project Description:**
We will analyze data on reported incidents of UFO sightings. Utilizing data collected by an organization dedicated to this topic, we will apply topic clustering techniques to identify commonalities among these sightings and interpret the results to provide a summary of the major themes of these reports. After clustering among the full dataset, we will then focus on comparing UFO sightings in California, Arizona, and Nevada again using clustering to investigate their similarities and differences.  

**Analysis: **
We will perform topic clustering on the text column from our dataset to identify major topics of discussion. We will then use this clustering to analyze any commonalities or anomalies based on descriptors of UFO shape, size, etc. We’ll start with a cluster analysis of the full dataset, and then narrow the focus to comparing sightings exclusively in California, Nevada, and Arizona.

**Deliverables: **
We will provide the following deliverables at the end of the project:
A dataset containing reports of UFO sightings
A set of insights derived from the dataset
A short in-class presentation of our findings, discussions of their meaning, and general “lessons learned” from our project. 


# Packages and Installations:

In [1]:
#installs any packages not available by default
!pip install gensim
!pip install wordcloud
%time

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/27/a4/d10c0acc8528d838cda5eede0ee9c784caa598dbf40bd0911ff8d067a7eb/gensim-3.6.0-cp36-cp36m-manylinux1_x86_64.whl (23.6MB)
[K    100% |████████████████████████████████| 23.6MB 1.7MB/s 
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting boto>=2.32 (from smart-open>=1.2.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 15.1MB/s 
[?25hCollecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
[?25l  Downlo

In [2]:
#importing packages neeeded for Text Analysis
import pandas as pd
import numpy as np
import nltk
import sklearn
import gensim
import re
import string
import wordcloud
import os
%time

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.54 µs


In [3]:
##Specific Text Mining Features from SKLEARN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
#Other specific useful packages
from wordcloud import WordCloud
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import matplotlib.pyplot as plt
import matplotlib as mpl
%time

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 9.78 µs


In [4]:
#Downloading features from nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
%time

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 9.78 µs


# User Defined Functions:

In [0]:
#Flatten Function (This will collapse a list of lists into just one list)
flatten = lambda l: [item for sublist in l for item in sublist]

In [0]:
#Stringer

def Stringer(list):
  new_list = []
  for i in list:
    new = str(i)
    new_list.append(new)
  return(new_list)

In [0]:
#Term Vector Function
def Term_Vectors(doc):
  punc = re.compile( '[%s]' % re.escape( string.punctuation ) )
  term_vec = [ ]

  for d in doc:
      d = str(d)
      d = d.lower()
      d = punc.sub( '', d )
      term_vec.append( nltk.word_tokenize( d ) )

  return(term_vec)
     

In [0]:
#Stop Word Function
def Stop_Word(term_vec, stop_words = nltk.corpus.stopwords.words( 'english' )):

  for i in range( 0, len( term_vec ) ):
      
      term_list = [ ]

      for term in term_vec[i]:
          if term not in stop_words:
              term_list.append( term )

      term_vec[i] = term_list

  return(term_vec)

In [0]:
#Porter Stem Function

def Porter_Stem(term_vec):
  porter = nltk.stem.porter.PorterStemmer()

  for i in range( 0, len( term_vec ) ):
    for j in range( 0, len( term_vec[ i ] ) ):
      term_vec[ i ][ j ] = porter.stem( term_vec[ i ][ j ] )

  return(term_vec)


In [0]:
#Lemmatizer Function
def lemmatizer(term_vec):
  for i in range( 0, len( term_vec ) ):
    for j in range( 0, len( term_vec[ i ] ) ):
      try: pos = str(wn.synsets(j)[0].pos())
      except: pos = "n"
      term_vec[i][j] = str(WordNetLemmatizer().lemmatize(term_vec[i][j],pos))
  return(term_vec)
      


In [0]:
##Basic Word Cloud Function

def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color='white',
        max_words=50,
        max_font_size=40, 
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()


# Initial Data Importation and Cleaning:

In [0]:
#imports ufo dataset from our data.world repo
ufoset = pd.read_csv('https://query.data.world/s/t5l7slkbhurybmuxkfgncobbaknf7i')

In [13]:
#subsets data by selected states, removes every column but State and Text
states = ["CA","NV","AR","NM", "NC"]
subset_ufoset = ufoset.loc[ufoset['state'].isin(states)]

encounters = subset_ufoset[['text','state']]

#New datasets for each state
CA_encounters = encounters.loc[ufoset['state'] == "CA"]
NV_encounters = encounters.loc[ufoset['state'] == "NV"]
AR_encounters = encounters.loc[ufoset['state'] == "AR"]
NM_encounters = encounters.loc[ufoset['state'] == "NM"]
NC_encounters = encounters.loc[ufoset['state'] == "NC"]

#Word Vectors
All_States = ufoset['text'].values.tolist()
SelectStates_vect = encounters['text'].values.tolist()
CA_vect = CA_encounters['text'].values.tolist()
NV_vect = NV_encounters['text'].values.tolist()
AR_vect = AR_encounters['text'].values.tolist()
NM_vect = NM_encounters['text'].values.tolist()
NC_vect = NC_encounters['text'].values.tolist()

print("Lists created.")
%time

Lists created.
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.3 µs


# Begin Text Processing with Term Vectors, Stopwords, and Stemming:

In [14]:
#Creates Term Vectors for all word vectors

All_term = Term_Vectors(All_States)
SelectStates_term = Term_Vectors(SelectStates_vect)
CA_term = Term_Vectors(CA_vect)
NV_term = Term_Vectors(NV_vect)
AR_term =Term_Vectors(AR_vect)
NM_term =Term_Vectors(NM_vect)
NC_term =Term_Vectors(NC_vect)

print("Term Vectors  Complete.")
%time

Term Vectors  Complete.
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10.3 µs


In [15]:
stopword = nltk.corpus.stopwords.words('english')
custom_words = ['summary','SUMMARY']
stopword += custom_words

print("Stop Words Created.")
%time

Stop Words Created.
CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 13.4 µs


In [16]:
#Stop Word filter for all Vectors
All_stop = Stop_Word(All_term,stopword)
SelectStates_stop = Stop_Word(SelectStates_term,stopword)
CA_stop = Stop_Word(CA_term,stopword)
NV_stop = Stop_Word(NV_term,stopword)
AR_stop = Stop_Word(AR_term,stopword)
NM_stop = Stop_Word(NM_term,stopword)
NC_stop = Stop_Word(NC_term,stopword)

print("Stop Words filter Applied to Term Vectors.")
%time

Stop Words filter Applied to Term Vectors.
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.78 µs


In [17]:
#Lemmatizing for All Vectors
#Results look way cleaner than porter stemming

All_lem = lemmatizer(All_stop)
SelectStates_lem = lemmatizer(SelectStates_stop)
CA_lem = lemmatizer(CA_stop)
NV_lem = lemmatizer(NV_stop)
AR_lem = lemmatizer(AR_stop)
NM_lem = lemmatizer(NM_stop)
NC_lem = lemmatizer(NC_stop)

print("Lemmatization Complete.")
%time

Lemmatization Complete.
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10 µs


In [18]:
#Will probably need to refilter the vectors after stemming - not sure how much filter terms are needed yet
nextfilter = ["'","-","look","saw","like","seen","see","could","would","also","got","said","seem","go","well","even"]

All_filt = Stop_Word(All_lem,nextfilter)
SelectStates_filt = Stop_Word(SelectStates_lem,nextfilter)
CA_filt = Stop_Word(CA_lem,nextfilter)
NV_filt = Stop_Word(NV_lem,nextfilter)
AR_filt = Stop_Word(AR_lem,nextfilter)
NM_filt = Stop_Word(NM_lem,nextfilter)
NC_filt = Stop_Word(NC_lem,nextfilter)

print("Text Filtering Complete")
%time

Text Filtering Complete
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.54 µs


In [19]:
#Setting Up Vocab Lists

Select_vdict = {'index': SelectStates_filt,'word': SelectStates_term}
Select_vocab = pd.DataFrame(Select_vdict)
Select_vocab = Select_vocab.set_index('index')

print('there are ' + str(Select_vocab.shape[0]) + ' items in Select_vocab')

print("Vocab Vectors Complete")
%time

there are 18947 items in Select_vocab
Vocab Vectors Complete
CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 14.3 µs


18947

#tfidf Vectorization & K-Means Clustering

In [20]:
All_tfidf = TfidfVectorizer(All_filt, decode_error = "replace")
SelectStates_tfidf = TfidfVectorizer(SelectStates_filt, decode_error = "replace")
CA_tfidf = TfidfVectorizer(CA_filt, decode_error = "replace")
NV_tfidf = TfidfVectorizer(NV_filt, decode_error = "replace")
AR_tfidf = TfidfVectorizer(AR_filt, decode_error = "replace")
NM_tfidf = TfidfVectorizer(NM_filt, decode_error = "replace")
NC_tfidf = TfidfVectorizer(NC_filt, decode_error = "replace")

print("Tfidf Vectors Complete.")
%time

Tfidf Vectors Complete.
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10 µs


In [21]:
##Document Similarity Matrices

#All_matrix = All_tfidf.fit_transform(ufoset['text'].values.astype('U'))
SelectStates_matrix = SelectStates_tfidf.fit_transform(encounters['text'].values.astype('U'))
CA_matrix = CA_tfidf.fit_transform(CA_encounters['text'].values.astype('U'))
CA_matrix = CA_tfidf.fit_transform(CA_encounters['text'].values.astype('U'))
NV_matrix = NV_tfidf.fit_transform(NV_encounters['text'].values.astype('U'))
AR_matrix = AR_tfidf.fit_transform(AR_encounters['text'].values.astype('U'))
NM_matrix = NM_tfidf.fit_transform(NM_encounters['text'].values.astype('U'))
NC_matrix = NC_tfidf.fit_transform(NC_encounters['text'].values.astype('U'))

print("Similarity Matrices Complete.")
%time

Similarity Matrices Complete.
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.54 µs


In [22]:
#Get term names
#All_terms = All_tfidf.get_feature_names()
select_terms = SelectStates_tfidf.get_feature_names()
CA_terms = CA_tfidf.get_feature_names()
NV_terms = NV_tfidf.get_feature_names()
AR_terms = AR_tfidf.get_feature_names()
NM_terms = NM_tfidf.get_feature_names()
NC_terms = NC_tfidf.get_feature_names()

print("Term Names Complete.")
%time

Term Names Complete.
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.06 µs


In [23]:
#Pairwise Similaritiy Distances Calculation

#All_dist = 1 - cosine_similarity(All_matrix)
SelectStates_dist = 1 - cosine_similarity(SelectStates_matrix)
CA_dist = 1 - cosine_similarity(CA_matrix)
NV_dist = 1 - cosine_similarity(NV_matrix)
AR_dist = 1 - cosine_similarity(AR_matrix)
NM_dist = 1 - cosine_similarity(NM_matrix)
NC_dist = 1 - cosine_similarity(NC_matrix)

print("Pairwise Complete Distances Calculated")
%time

Pairwise Complete Distances Calculated
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10.5 µs


In [24]:
## KMeans Clustering with n = 5

#All_kmeans = KMeans(n_clusters=5,random_state =0).fit(All_matrix)
SelectStates_kmeans = KMeans(n_clusters=5,random_state =0).fit(SelectStates_matrix)
CA_kmeans = KMeans(n_clusters=5,random_state =0).fit(CA_matrix)
NV_kmeans = KMeans(n_clusters=5,random_state =0).fit(NV_matrix)
AR_kmeans = KMeans(n_clusters=5,random_state =0).fit(AR_matrix)
NM_kmeans = KMeans(n_clusters=5,random_state =0).fit(NM_matrix)
NC_kmeans = KMeans(n_clusters=5,random_state =0).fit(NC_matrix)

print("K-Means Clustering Complete")
%time

K-Means Clustering Complete
CPU times: user 14 µs, sys: 2 µs, total: 16 µs
Wall time: 14.8 µs


In [25]:
#Get Cluster Labels

#All_States_clusters = All_kmeans.labels_.tolist()
SelectStates_clusters = SelectStates_kmeans.labels_.tolist()
CA_clusters = CA_kmeans.labels_.tolist()
NV_clusters = NV_kmeans.labels_.tolist()
AR_clusters = AR_kmeans.labels_.tolist()
NM_clusters = NM_kmeans.labels_.tolist()
NC_clusters = NC_kmeans.labels_.tolist()

print("Cluster Labels Complete.")
%time

Cluster Labels Complete.
CPU times: user 15 µs, sys: 2 µs, total: 17 µs
Wall time: 23.6 µs


In [0]:
Select_State = {'index': SelectStates_clusters,'clusters': SelectStates_clusters, 'State': encounters['state'], "Text":encounters['text'] }
Select_Frame = pd.DataFrame(Select_State)
Select_Frame = Select_Frame.set_index('index')

In [27]:
Select_Frame['clusters'].value_counts() #number of 'encounters' per cluster (clusters from 0 to 4)

4    5440
0    4842
3    4522
1    2714
2    1429
Name: clusters, dtype: int64

In [38]:
order_centroids[1,:5]

array([36817, 36933,  4053, 40388, 37357])

In [0]:
SelectStates_clusters[36817]

In [45]:
Select_vocab.items()

<generator object DataFrame.iteritems at 0x7ff52ce57fc0>

In [32]:
from __future__ import print_function

print("Top terms per cluster:")
print("")
#sort cluster centers by proximity to centroid
order_centroids = SelectStates_kmeans.cluster_centers_.argsort()[:, ::-1] 

for i in range(5):
    print("Cluster words:", i, ":", end='')
    
    for ind in order_centroids[i, :5]: #replace 6 with n words per cluster
        
        #print(Select_vocab.iloc[select_terms[ind]], end =",")
        #print(list(Select_vocab.keys())[list(Select_vocab.values()).index(select_terms[ind])],end =",")
        print(select_terms[ind], end = ",")
        
print("")
print("")

Top terms per cluster:

Cluster words: 0 :the,and,to,was,it,Cluster words: 1 :the,they,and,were,to,Cluster words: 2 :provides,information,elects,anonymous,remain,Cluster words: 3 :it,the,was,and,to,Cluster words: 4 :the,and,it,in,to,



In [0]:
#Multidimension Scaling

import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

MDS()

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(SelectStates_dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]
print()
print()

# Experimental Code for Figureing out Next Steps:

In [43]:
#Flattening List of Lists of Each State - Might be useful for State Comparisons
All_flat = flatten(All_filt)
CA_flat = flatten(CA_filt)
NV_flat = flatten(NV_filt)
AR_flat = flatten(AR_filt)
NM_flat = flatten(NM_filt)

print("Flattened...")
%time

Flattened...
CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 8.82 µs


In [0]:
#Creates a list of lists of our 4 states 
States = [CA_flat,NV_flat,AR_flat,NM_flat]
%time

In [46]:
#Basic Exploration of Word Counts
Counter(All_flat).most_common(50)
%time

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.3 µs


In [0]:
All_kmeans.shape()