# Term Frequency - Inverse Document Frequency (TF-IDF)

In [37]:
from collections import defaultdict
import csv
import sys
csv.field_size_limit(sys.maxsize)

data = "/Users/skhederian/restaurant-health/format_reviews.csv"

#Combine reviews with the same violation count into the same document
reviews = defaultdict(list)
with open(data, "r") as review_file:
    reader = csv.reader(review_file, delimiter=',')
    next(reader)
    for row in reader:
        reviews[row[3]].append(row[4])
        
for violations, string_agg in reviews.items():
    reviews[violations] = "".join(string_agg)

#Append documents to corpus
corpus = []

for id, review in reviews.items():
    corpus.append(review)

In [50]:
#Import vectorizer with unigrams and 2 or 3-word phrases
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = .25, stop_words = 'english')

In [51]:
#Fit to our corpus
tfidf_matrix =  tf.fit_transform(corpus)
feature_names = tf.get_feature_names() 

In [52]:
#63 documents with 18,934,205 unique features (min_df=0) or 23,388 features (min_df=.25)
tfidf_matrix

<63x23388 sparse matrix of type '<class 'numpy.float64'>'
	with 471677 stored elements in Compressed Sparse Row format>

In [48]:
dense = tfidf_matrix.todense()
len(dense[0].tolist()[0])

18934205

In [43]:
review = dense[0].tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(review)), review) if pair[1] > 0]

In [44]:
len(phrase_scores)

6097385

In [46]:
#features from the first document with the highest scores
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]:
   print('{0: <20} {1}'.format(phrase, score))

good                 0.27642277580659774
food                 0.2751819664463293
place                0.2627337531021071
great                0.19200194876901963
like                 0.18933972577643401
just                 0.17149422005262743
really               0.14411414126020586
service              0.14001383115339852
time                 0.1297836945976781
ve                   0.11036372515014335
bar                  0.1101007121618974
restaurant           0.10430854776831745
menu                 0.0993997095618625
got                  0.09593945648757572
little               0.09273507588494441
don                  0.09259440940696709
nice                 0.09092918032390895
ordered              0.09087724156779775
chicken              0.08909425444152475
night                0.08872113360609017


Coding reference from http://www.markhneedham.com/blog/2015/02/15/pythonscikit-learn-calculating-tfidf-on-how-i-met-your-mother-transcripts/