**Project Description:**
We will analyze data on reported incidents of UFO sightings. Utilizing data collected by an organization dedicated to this topic, we will apply topic clustering techniques to identify commonalities among these sightings and interpret the results to provide a summary of the major themes of these reports. After clustering the full dataset, we will then focus on comparing UFO sightings in California, Arizona, and Nevada, again using clustering, to investigate their similarities and differences.  

**Analysis:**
We will perform topic clustering on the text column from our dataset to identify major topics of discussion. We will then use this clustering to analyze any commonalities or anomalies based on descriptors of UFO shape, size, etc. We’ll start with a cluster analysis of the full dataset, and then narrow the focus to comparing sightings exclusively in California, Nevada, and Arizona.

**Deliverables:**
We will provide the following deliverables at the end of the project:
A dataset containing reports of UFO sightings
A set of insights derived from the dataset
A short in-class presentation of our findings, discussions of their meaning, and general “lessons learned” from our project. 


# Packages and Installations:

In [74]:
#installs any packages not available by default
!pip install gensim
!pip install wordcloud
%time



You are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


Wall time: 0 ns


You are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [75]:
#importing packages needed for Text Analysis
import pandas as pd
import numpy as np
import nltk
import sklearn
import gensim
import re
import string
import wordcloud
import os
import pylab as pl
import requests
import random


%time

Wall time: 0 ns


In [76]:
##Specific Text Mining Features from SKLEARN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
#Other specific useful packages
from wordcloud import WordCloud
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
from os import path
from PIL import Image
from io import BytesIO
import scipy.stats as scs
from scipy import sparse
from scipy.stats.distributions import chi2
from scipy import stats
from sklearn import metrics
from scipy.spatial.distance import cdist

%time

Wall time: 0 ns


In [77]:
#Downloading features from nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
%time

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Wall time: 0 ns


# User Defined Functions:

In [78]:
#Flatten Function (This will collapse a list of lists into just one list)
def flatten(l):
    """Collapse one level of nesting: an iterable of iterables becomes one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [79]:
#Flattens an Entire list into a large string
def pancake(list):
    """Join every item of *list* into one string, each item preceded by a space.

    Items are converted with ``str``. The result starts with a leading space
    whenever *list* is non-empty and is ``""`` when it is empty.
    """
    # NOTE: the parameter keeps its original name (which shadows the builtin)
    # so existing keyword callers are unaffected.
    # str.join assembles the result in one pass instead of re-copying the
    # growing string on every iteration.
    return "".join(" " + str(item) for item in list)

In [80]:
#Unicoder

def Unicoder(list):
    """Return a new list holding ``str(item)`` for every item of *list*.

    Python 3 ``str`` objects are already Unicode text, so no explicit encoding
    step is needed. The former ``new.encode('utf-8')`` call discarded its
    result and therefore never affected the returned values; it is removed.
    """
    return [str(item) for item in list]


In [81]:
#Stringer

def Stringer(list):
    """Return a new list with each item of *list* converted via ``str``."""
    return [str(entry) for entry in list]

In [82]:
#StringOnly

def StringOnly(list):
    """Return a new list keeping only the items of *list* that are ``str`` instances."""
    return [entry for entry in list if isinstance(entry, str)]

In [83]:
#Term Vector Function
def Term_Vectors(doc):
    """Tokenize every entry of *doc* into a list of terms.

    Each entry is converted with ``str``, lower-cased, stripped of every
    character in ``string.punctuation`` and split with ``nltk.word_tokenize``.
    Returns one token list per entry, in input order.
    """
    strip_punct = re.compile('[%s]' % re.escape(string.punctuation))
    return [
        nltk.word_tokenize(strip_punct.sub('', str(entry).lower()))
        for entry in doc
    ]
     

In [84]:
#Stop Word Function
def Stop_Word(term_vec, stop_words = nltk.corpus.stopwords.words( 'english' )):
    """Remove stop words from every token list in *term_vec*.

    Parameters
    ----------
    term_vec : list of list
        One token list per document. Each entry is replaced, in place, by a
        new list that keeps only the tokens not found in *stop_words*.
    stop_words : iterable
        Tokens to drop. Defaults to NLTK's English stop-word list, which is
        evaluated once when the function is defined.

    Returns
    -------
    list of list
        The same *term_vec* object that was passed in.
    """
    # Build the lookup set once: membership tests against a list rescan the
    # whole stop-word list for every single token.
    blocked = set(stop_words)

    for i, terms in enumerate(term_vec):
        term_vec[i] = [term for term in terms if term not in blocked]

    return term_vec

In [85]:
#Porter Stem Function - Lemmatizer was better

def Porter_Stem(term_vec):
    """Replace every token in *term_vec* with its Porter stem.

    The inner token lists are overwritten element by element, so the caller's
    data is modified in place; the same *term_vec* object is returned.
    """
    stemmer = nltk.stem.porter.PorterStemmer()

    for tokens in term_vec:
        for position, token in enumerate(tokens):
            tokens[position] = stemmer.stem(token)

    return term_vec


In [86]:
#Lemmatizer Function
def lemmatizer(term_vec):
    """Lemmatize every token of every document in *term_vec* with WordNet.

    The part of speech handed to the lemmatizer is that of the token's first
    WordNet synset; tokens with no synset are treated as nouns ("n").

    The inner token lists are overwritten element by element, so the caller's
    data is modified in place; the same *term_vec* object is returned.
    """
    wordnet_lemmatizer = WordNetLemmatizer()  # one instance serves every token

    for tokens in term_vec:
        for j, token in enumerate(tokens):
            # Look up the token itself. The previous code passed the integer
            # index j to wn.synsets, which always failed, and the bare except
            # silently forced the noun fallback for every word.
            synsets = wn.synsets(token)
            pos = str(synsets[0].pos()) if synsets else "n"
            tokens[j] = str(wordnet_lemmatizer.lemmatize(token, pos))

    return term_vec
      


In [87]:
##Basic Word Cloud Function

def show_wordcloud(data, title = None, mask = None, max_words = 500):
    """Render a word cloud for *data* and display it with matplotlib.

    data      -- source text; it is passed through ``str()`` before generation.
    title     -- optional figure title; nothing is drawn when it is falsy.
    mask      -- forwarded unchanged to ``WordCloud(mask=...)``.
    max_words -- forwarded unchanged to ``WordCloud(max_words=...)``.

    Returns None; the only effect is the displayed figure.
    """
    
    cloud = WordCloud(
        background_color='white',
        max_words=max_words,
        max_font_size=50, 
        scale=3,
        mask = mask,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    # Figure number 1 is requested explicitly on every call.
    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        # NOTE(review): top=2.3 lies outside the usual 0-1 figure fraction - confirm it is intended.
        fig.subplots_adjust(top=2.3)
 
    plt.imshow(cloud)
    plt.show()

In [88]:
##Color Dictionary Function Based on Cluster #s

def colordict(num):
    """Return a {cluster index: hex colour} mapping sized for *num* clusters.

    num <= 7 yields an 8-colour palette, num == 8 a 9-colour palette and
    num >= 9 a 12-colour palette. Any other value yields an empty dict.
    """
    if num <= 7:
        palette = (
            "#1b9e77", "#d95f02", "#7570b3", "#e7298a",
            "#66a61e", "#e6ab02", "#a6761d", "#666666",
        )
    elif num == 8:
        palette = (
            "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00",
            "#ffff33", "#a65628", "#f781bf", "#999999",
        )
    elif num >= 9:
        palette = (
            "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c",
            "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#ffff99", "#b15928",
        )
    else:
        palette = ()

    # Keys are the consecutive positions 0, 1, 2, ... of the chosen palette.
    return dict(enumerate(palette))


In [89]:
def ChiSquareChart(obs,exp):
    """Tabulate the per-cell chi-square contributions ``(obs - exp)**2 / exp``.

    Works column by column over ``obs.columns``. Every value is stored as a
    string formatted with ``"{:0.3e}"``. The returned frame's column axis
    carries the same name as the column axis of *obs*.
    """
    result = pd.DataFrame()

    for col in obs.columns:
        contribution = ((obs[col] - exp[col]) ** 2) / exp[col]
        result[col] = ["{:0.3e}".format(value) for value in contribution]

    return result.rename_axis(obs.axes[1].name, axis="columns")


In [90]:
def Matrix_PValue(chimatrix,dof):
    """Convert a table of chi-square values into upper-tail p-values.

    Parameters
    ----------
    chimatrix : pandas.DataFrame
        Chi-square values, either numeric or numeric strings (for example the
        ``"{:0.3e}"`` strings produced by ChiSquareChart).
    dof : int or float
        Degrees of freedom of the chi-square distribution.

    Returns
    -------
    pandas.DataFrame
        Same row index, columns and column-axis name as *chimatrix*; every
        cell holds ``P(X >= value)`` for a chi-square variable with *dof*
        degrees of freedom.
    """
    # Read from the argument: the earlier version referenced an undefined
    # global (Chi_Frame) and assigned an undefined name (new_row), so it could
    # only raise NameError.
    df = pd.DataFrame(index=chimatrix.index)

    for col in chimatrix.columns:
        values = np.asarray(pd.to_numeric(chimatrix[col]), dtype=float)
        # A p-value is the survival function (1 - CDF), not the density.
        df[col] = stats.chi2.sf(values, dof)

    df = df.rename_axis(chimatrix.axes[1].name, axis="columns")
    return(df)


# Initial Data Importation and Cleaning:

In [91]:
#imports ufo dataset from our data.world repo
ufoset = pd.read_csv('https://query.data.world/s/t5l7slkbhurybmuxkfgncobbaknf7i')
%time

Wall time: 0 ns


In [92]:
#subsets data by selected states, removes every column but State and Text

## ALTER FOR DIFFERENT STATES HERE ##
# NOTE(review): the project description names California, Arizona and Nevada.
# "AR" is the postal code for Arkansas (Arizona is "AZ"), and "NM"/"NC" are
# not mentioned there - confirm this list is the intended selection.
states = ["CA","NV","AR","NM", "NC"]


# Keep only the rows whose 'state' value is one of the codes listed above
subset_ufoset = ufoset.loc[ufoset['state'].isin(states)]

encounters = subset_ufoset[['text','state']]

#Word Vectors
# Two parallel plain lists in row order: report text and its state code
SelectStates_Xvect = encounters['text'].values.tolist()
SelectStates_Svect = encounters['state'].values.tolist()

print("Lists created.")
%time

Lists created.
Wall time: 0 ns


In [93]:
# generates index for each item in the corpora (in this case it's just rank) and I'll use this for scoring later
# One positional rank (0..N-1) per report, in corpus order
ranks = list(range(len(SelectStates_Xvect)))

print("Rank Index Created")
%time


Rank Index Created
Wall time: 0 ns


# Begin Text Processing with Term Vectors, Stopwords, and Stemming:

In [94]:
#Creates Term Vectors for all word vectors

SelectStates_term = Term_Vectors(SelectStates_Xvect)

print("Term Vectors  Complete.")
%time

Term Vectors  Complete.
Wall time: 0 ns


In [95]:
# NLTK's English stop words plus corpus-specific filler terms
stopword = nltk.corpus.stopwords.words('english')
# NOTE(review): Term_Vectors lower-cases every document and strips all
# string.punctuation characters before tokenizing, so 'SUMMARY', "'" and "-"
# presumably never match a token - confirm they can be dropped.
custom_words = ['summary','SUMMARY',"'","-","saw", "like", "see", "could", "looked", "seen", "foot", "would","nuforc"]
stopword += custom_words

print("Stop Words Created.")
%time

Stop Words Created.
Wall time: 0 ns


In [96]:
#Stop Word filter for all Vectors

SelectStates_stop = Stop_Word(SelectStates_term,stopword)

print("Stop Words filter Applied to Term Vectors.")
%time

Stop Words filter Applied to Term Vectors.
Wall time: 0 ns


In [97]:
#Lemmatizing for All Vectors
#Results look way cleaner than porter stemming

# lemmatizer() overwrites the token lists it receives and returns that same
# object, so give it a copy of each document. Otherwise SelectStates_stop and
# SelectStates_lem are one and the same list, and the later token -> lemma
# vocabulary table pairs every lemma with itself.
SelectStates_lem = lemmatizer([list(tokens) for tokens in SelectStates_stop])

print("Lemmatization Complete.")
%time

Lemmatization Complete.
Wall time: 0 ns


In [98]:
allwords_tokens = flatten(SelectStates_stop)
allwords_stemmed = flatten(SelectStates_lem)

vocab_frame = pd.DataFrame({'words': allwords_tokens}, index = allwords_stemmed)

print("Vocab Vector Complete")
%time

Vocab Vector Complete
Wall time: 0 ns


# tfidf Vectorization & K-Means Clustering

In [99]:
#TFIDF
# The documents are supplied later through fit_transform(). The vectorizer's
# first parameter is `input` (a mode string such as 'content'), not the corpus,
# so the token lists must not be passed to the constructor.
SelectStates_tfidf = TfidfVectorizer(decode_error = "replace", max_features = 200000, max_df = 0.90, min_df = 0.10)



print("Tfidf Vectors Complete.")
%time


Tfidf Vectors Complete.
Wall time: 0 ns


In [100]:
##TF-IDF Document-Term Matrix (input to the similarity computation)

#Turns every document (a list of tokens) into a single string via str()
SelectStates_Uni = Unicoder(SelectStates_lem)

#Fits the TF-IDF vectorizer and builds the document-term matrix; no similarity is computed here
SelectStates_matrix = SelectStates_tfidf.fit_transform(SelectStates_Uni)


print("Similarity Matrices Complete.")
%time


Similarity Matrices Complete.
Wall time: 0 ns


In [101]:
#Get term names

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(SelectStates_tfidf, "get_feature_names_out"):
    select_terms = SelectStates_tfidf.get_feature_names_out()
else:
    select_terms = SelectStates_tfidf.get_feature_names()

print("Term Names Complete.")
%time

Term Names Complete.
Wall time: 0 ns


In [102]:
chunk_size = round(2500/len(states)) 
matrix_len = SelectStates_matrix.shape[0] # Not sparse numpy.ndarray

def similarity_cosine_by_chunk(start, end):
    """Cosine similarity of rows ``start:end`` of SelectStates_matrix against all of its rows.

    *end* is capped at ``matrix_len`` before slicing.
    """
    stop = min(end, matrix_len)
    rows = SelectStates_matrix[start:stop]
    return cosine_similarity(X=rows, Y=SelectStates_matrix) # scikit-learn function


# Gather the row chunks and stack them once at the end. Calling np.concatenate
# inside the loop re-copies every previously computed row on each iteration.
cosine_chunks = []
for chunk_start in range(0, matrix_len, chunk_size):
    cosine_similarity_chunk = similarity_cosine_by_chunk(chunk_start, chunk_start+chunk_size)
    cosine_chunks.append(cosine_similarity_chunk)

SelectStates_cosine = np.concatenate(cosine_chunks, axis=0)

print("Cosine Similarity Calculated")
#%time

Cosine Similarity Calculated


In [103]:
##Cosine Similarity Matrix  -- Old

#SelectStates_dist = 1 - cosine_similarity(SelectStates_matrix)

In [104]:
##Creates Inverse Similarity Distance Matrix

SelectStates_dist = 1 - SelectStates_cosine

print("Inverse Similaritiy Distance Matrix Calculated")
#%time

Inverse Similaritiy Distance Matrix Calculated


In [None]:
## KMeans Clustering

##ALTER FOR DIFFERENT CLUSTERING HERE##
num_clusters = 4

SelectStates_kmeans = KMeans(num_clusters,random_state =0).fit(SelectStates_matrix)

print("K-Means Clustering Complete")
%time

In [None]:
#Get Cluster Labels

SelectStates_clusters = SelectStates_kmeans.labels_.tolist()

print("Cluster Labels Complete.")
%time

In [None]:
##Creates UFO Dataframe

UFO_dict = { 'state': SelectStates_Svect, 'rank': ranks, 'text': SelectStates_Xvect, 'cluster': SelectStates_clusters}

UFO_frame = pd.DataFrame(UFO_dict, index = [SelectStates_clusters] , columns = ['rank', 'state', 'cluster'])

%time

In [None]:
## Document/Cluster Breakdown
print(UFO_frame['cluster'].value_counts())

%time

In [None]:
##Most Common 15 words per cluster 

common_words = SelectStates_kmeans.cluster_centers_.argsort()[:,-1:-16:-1]

for num, centroid in enumerate(common_words):
    cluster_sum = str(num) + ' : ' + ', '.join(select_terms[word] for word in centroid)
    print(cluster_sum)
    #cluster_names[cn] = cluster_sum
    #cn += 1
    
%time

# Visualizations

In [None]:
##Plotting Dictionaries


#set up colors per clusters using a dict
cluster_colors = colordict(num_clusters)

#set up cluster names (5 most common)
common_words = SelectStates_kmeans.cluster_centers_.argsort()[:,-1:-6:-1]

# Legend label per cluster, keyed by cluster id: "<id> : term, term, ..."
cluster_names = {}

for num, centroid in enumerate(common_words):
    cluster_sum = str(num) + ' : ' + ', '.join(select_terms[word] for word in centroid)
    print(cluster_sum)
    # enumerate() already supplies the cluster id, so no separate counter is kept
    cluster_names[num] = cluster_sum


custom_clusters = {0 : 'one moving bright object second appeared',
                1 : 'witness provides information remain totally anonymous',
                2 : 'object moving high speed flying direction',
                3 : 'large light craft shaped aircraft sound',
                4 : 'bright light object moving around'}


#Uncomment Below to apply custom names:

#cluster_names = custom_clusters

%time

In [None]:
#PCA 2 components

pca = PCA(n_components=2)

pos = pca.fit_transform(SelectStates_dist)

xs, ys = pos[:, 0], pos[:, 1]

%time

In [None]:
#PCA 3 components

pca = PCA(n_components=3)

pos = pca.fit_transform(SelectStates_dist)

x_s, y_s, z_s = pos[:, 0], pos[:, 1],pos[:, 2]

%time

In [None]:
#create data frame that has the 3-component PCA coordinates plus the cluster labels and states
df = pd.DataFrame(dict(x=x_s, y=y_s, z=z_s, label=SelectStates_clusters, state=encounters['state'])) 

#Exporting Final x,y,z dataframe
filepath = "~/Documents/GitHub/Blue2_HW6_UFO_Text/"   #Update as needed
file = filepath + "ufo_plot_data.csv"

df.to_csv(file)

#group by cluster
groups = df.groupby('label')

In [None]:
## State/Cluster Analysis Plot
from scipy.stats.distributions import chi2

# Observed contingency table: one row per state, one column per cluster label
StateChart = pd.crosstab(df.state,df.label).rename_axis('state').rename_axis("clusters", axis="columns")
print("OBSERVED VALUES")
print(StateChart)
print("")

# NOTE: this rebinds the name chi2 (imported at the top of this cell) to the
# first value returned by chi2_contingency.
chi2, p, dof, ex = scs.chi2_contingency(StateChart)

print("")
print("EXPECTED VALUES")
# Expected values rounded to one decimal, labelled with the observed table's row index
Exp_Frame=pd.DataFrame(np.round(ex,1),index =StateChart.index[0:]).rename_axis("clusters", axis="columns")
print(Exp_Frame)
print("")
print("ChiSqr:",round(chi2,3)," P value:", format(p, '.3e'))
print("")

In [None]:
## 2D PLOT

#some ipython magic to show the matplotlib plots inline
%matplotlib inline 

# set up plot
fig, ax = plt.subplots(figsize=(30, 20)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

# Axis styling does not depend on the cluster being drawn, so it is applied
# once here instead of being repeated on every loop iteration.
ax.set_aspect('auto')
ax.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)
ax.tick_params(
    axis='y',          # changes apply to the y-axis
    which='both',      # both major and minor ticks are affected
    left=False,        # ticks along the left edge are off
    top=False,         # ticks along the top edge are off
    labelleft=False)

#iterate through groups to layer the plot
#the cluster_names and cluster_colors dicts are looked up by the group key to get the label/color
for name, group in groups:
    ax.plot(group.x, group.y, marker="o", linestyle='', ms=5,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')

ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the state title
#for i in range(len(df)):
#    ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['state'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)

%time

In [None]:
## 3D PLOT

#Jupyter plot options
%matplotlib notebook
#%matplotlib inline 

# set up plot
fig = plt.figure(figsize=(10,10))
#ax = Axes3D(fig)
ax = fig.add_subplot(111, projection='3d')

# Axis styling does not depend on the cluster being drawn, so it is applied
# once here instead of being repeated on every loop iteration.
ax.set_aspect('auto')
ax.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)
ax.tick_params(
    axis='y',          # changes apply to the y-axis
    which='both',      # both major and minor ticks are affected
    left=False,        # ticks along the left edge are off
    top=False,         # ticks along the top edge are off
    labelleft=False)
ax.tick_params(
    axis='z',          # changes apply to the z-axis
    which='both',      # both major and minor ticks are affected
    left=False,        # ticks along the left edge are off
    top=False,         # ticks along the top edge are off
    labelleft=False)

#iterate through groups to layer the plot
#the cluster_names and cluster_colors dicts are looked up by the group key to get the label/color
for name, group in groups:
    ax.plot(group.x, group.y, group.z, marker="o", linestyle='', ms=3,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')

ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y,z position with the label as the state
#for i in range(len(df)):
#    ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['z'], df.iloc[i]['state'], size=8)  

for angle in range(0, 360):
    ax.view_init(30, angle)
    plt.draw()
    plt.pause(.001)
    
%time

In [None]:
##WORD CLOUDS
    
flat_vect = flatten(SelectStates_lem)
string_vect = StringOnly(flat_vect)

#long_string = pancake(string_vect)

long_string = " ".join(str(x) for x in string_vect)

#Basic
show_wordcloud(long_string)

#Masked
response = requests.get("https://raw.githubusercontent.com/dgdelisss/UFO_Sightings_TextMining/master/ufo_mask.png")
img = Image.open(BytesIO(response.content))
img_mask = np.array(img)

show_wordcloud(long_string, mask=img_mask,max_words=200)

%time