In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD, NMF
import nltk      
from nltk.stem import WordNetLemmatizer  

class LemmaTokenizer(object):
    """Callable that splits a document into WordNet-lemmatized tokens.

    Instances are callable, so one can be supplied wherever a
    ``tokenizer(doc) -> list of tokens`` callable is expected.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of the string ``doc`` as a list."""
        # The bare name `word_tokenize` is never imported in this notebook, so
        # the original raised NameError on first use. Go through the `nltk`
        # module instead, which the imports cell does bring into scope.
        return [self.wnl.lemmatize(t) for t in nltk.word_tokenize(doc)]
  


In [19]:
#NLP function
def nlp(doc, stopwords='english', n_topics=10, function=TfidfVectorizer):
    """Vectorize a corpus and fit an NMF topic model on the result.

    Parameters
    ----------
    doc : iterable of str
        Raw documents, passed unchanged to the vectorizer's ``fit_transform``.
    stopwords : str, collection of str, or None
        Forwarded as the vectorizer's ``stop_words`` argument. Sets and
        frozensets are converted to a list first (see below). Note that inside
        this function the name shadows the ``nltk.corpus.stopwords`` import.
    n_topics : int
        Number of NMF components to fit.
    function : callable
        Vectorizer class, instantiated as ``function(stop_words=...)``.

    Returns
    -------
    tuple
        ``(nmf, doc_cnt, feature_words)``: the fitted NMF model, the matrix
        returned by ``fit_transform``, and the vocabulary as a list.
    """
    # Newer scikit-learn releases validate `stop_words` and accept only
    # 'english', a list, or None. `ENGLISH_STOP_WORDS.union(...)` yields a
    # frozenset, so normalise set-like input to a list.
    if isinstance(stopwords, (set, frozenset)):
        stopwords = list(stopwords)

    # To lemmatize while vectorizing, also pass tokenizer=LemmaTokenizer().
    vectorizer = function(stop_words=stopwords)
    doc_cnt = vectorizer.fit_transform(doc)

    # `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement when present and fall back on old releases.
    # The result is always a plain list, as callers received before.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_words = list(vectorizer.get_feature_names_out())
    else:
        feature_words = vectorizer.get_feature_names()

    # Fixed random_state keeps the factorisation reproducible across runs.
    nmf = NMF(n_components=n_topics, random_state=1).fit(doc_cnt)

    return nmf, doc_cnt, feature_words

#print top words for each topic:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    For each row of ``model.components_`` this prints a ``Topic #k:`` header
    (k counts from 1) followed by one line holding the ``n_top_words`` entries
    of ``feature_names`` with the largest weights, heaviest first.
    """
    for topic_number, weights in enumerate(model.components_, 1):
        print("Topic #%d:" % topic_number)
        heaviest_first = weights.argsort()[::-1]
        top_words = [feature_names[pos] for pos in heaviest_first[:n_top_words]]
        print(" ".join(top_words))




In [4]:
# Load the UFO sighting reports and drop every row that has a missing value.
df2 = pd.read_csv("ufo_data.csv").dropna()

# Take the third column by position. `.ix` was deprecated in pandas 0.20 and
# removed in 1.0; `.iloc` is its positional equivalent.
# NOTE(review): presumably this is the "User Comments" text column. Confirm
# against the CSV, or select it by name.
content = df2.iloc[:, 2].values
print(content.shape)

(102172,)


In [5]:
# Preview the first rows of the cleaned frame to check which columns were loaded.
df2.head()

Unnamed: 0,_id,State,User Comments
0,S133978,NM,Light seen over mountain's east of Camp McGreg...
1,S133977,BC,Light in sky stationary. Not a airplane or an...
2,S133974,TX,"Flying saucer descends, possibly lands in Nort..."
3,S133964,AZ,"While letting my dog out, a very bright white ..."
4,S133962,NE,A fire ball was moving in the atmosphere while...


In [None]:
# Build a lemmatized copy of every report: lowercase, strip '.' and ',',
# tokenize, lemmatize each token, then re-join the tokens with single spaces.
# Uses the WordNetLemmatizer imported at the top of the notebook rather than
# reaching it through the incidental `nltk.wordnet` attribute.
lemma = WordNetLemmatizer()
doc_lem = []

for doc in content:
    tokens = nltk.word_tokenize(doc.lower().replace(".", "").replace(",", ""))
    # No str() around each lemma: it is already text, and under Python 2
    # str() raises UnicodeEncodeError when a unicode token has non-ASCII chars.
    # Optional stop-word filter: `if w not in stopwords.words('english')`.
    doc_lem.append(' '.join(lemma.lemmatize(w) for w in tokens))

In [15]:

# Corpus-specific filler words added on top of scikit-learn's English list.
# Materialised as a list because newer scikit-learn releases only accept
# 'english', a list, or None for `stop_words`.
stop_words = list(ENGLISH_STOP_WORDS.union(['sky','looked','like','noticed','did','know','saw','said','look','got','went','east','south','west','north','ufo']))

#Count Vectorization
# `nlp` names its topic-count parameter `n_topics`. The earlier `n_comp=5`
# keyword does not exist in its signature and raised TypeError on a fresh run.
model, doc_term_mat, feat_words = nlp(doc=content, stopwords=stop_words, n_topics=5, function=CountVectorizer)
print_top_words(model, feat_words, 10)




Topic #1:
lights red white orange flashing triangle formation shaped moving triangular
Topic #2:
light white red green blue ball appeared flashing moved disappeared
Topic #3:
object shaped appeared white large flying moving red speed observed
Topic #4:
just seen time craft night moving thought looking right house
Topic #5:
bright moving orange white star fast slowly disappeared objects slow


In [20]:
# TF-IDF vectorization: five NMF topics fitted on TF-IDF weights, then the
# ten heaviest words of each topic are printed.
model, doc_term_mat, feat_words = nlp(
    doc=content,
    stopwords=stop_words,
    n_topics=5,
    function=TfidfVectorizer,
)
print_top_words(model, feature_names=feat_words, n_top_words=10)

Topic #1:
lights red triangle flashing formation white green blue shape hovering
Topic #2:
light bright white green blue red ball flash disappeared moved
Topic #3:
object shaped flying craft large seen triangular hovering low cigar
Topic #4:
orange glowing fireball orbs ball orb objects sphere balls reddish
Topic #5:
moving fast slow slowly star objects high speed stars night


In [None]:
# Link a popular topic word back to the comments that weight it most heavily.
feat_words = np.array(feat_words)
matches = np.flatnonzero(feat_words == 'green')
if matches.size == 0:
    raise ValueError("'green' is not in the fitted vocabulary")
index = matches[0]

# Extract only that word's column. The earlier version densified the whole
# document-term matrix and arg-sorted it without ever using `index`, so the
# result was rows of the matrix rather than document positions.
index_col = doc_term_mat[:, index].toarray().ravel()

# Positions of the five documents with the largest weight, heaviest first.
index_cols = np.argsort(index_col)[-5:][::-1]
print(index_cols)

# Matrix rows line up with `content`, which was the corpus handed to `nlp`.
top_5_list = [content[i] for i in index_cols]

top_5_list

In [168]:
#lemmatizer test: run the lemmatization steps on the first report only and
# print the raw text followed by the lemmatized result.
lemma = WordNetLemmatizer()
doc_tokens = []

# A one-element slice replaces the loop-and-break and is a no-op when
# `content` is empty.
for doc in content[:1]:
    tokens = nltk.word_tokenize(doc.lower().replace(".", "").replace(",", ""))
    print(doc)
    # No str() around each lemma: under Python 2 it raises UnicodeEncodeError
    # when a unicode token contains non-ASCII characters.
    doc_tokens.append(' '.join(lemma.lemmatize(w) for w in tokens))
    print(doc_tokens)

Light seen over mountain's east of Camp McGregor.  It hovered in one spot.  It looked like a helicopter light at first.  But it was way to bright.  Then it went to a higher height.  Stayed there for a while.  Then disappeared.


((NUFORC Note:  Report appears to us to be consistent with the sighting of Venus.  PD))
["light seen over mountain 's east of camp mcgregor it hovered in one spot it looked like a helicopter light at first but it wa way to bright then it went to a higher height stayed there for a while then disappeared ( ( nuforc note : report appears to u to be consistent with the sighting of venus pd ) )"]
