In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [32]:
from lsa import *

In [2]:
import os.path
import pandas as pd

In [3]:
df = pd.read_csv('tmp.csv')

### https://medium.com/@adi_enasoaie/easy-lsi-pipeline-using-scikit-learn-a073f2484408

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer, SnowballStemmer
from nltk.stem import LancasterStemmer
from sklearn.decomposition import TruncatedSVD


In [64]:
x_train

0       Patrick Sullivan (Jeffrey Dean Morgan) is look...
1       On Thanksgiving Day, 1983, student Marty Pasca...
2       Spencer Davenport and his sister Katherine mus...
3       Jamal Walker (Martin Lawrence) is an everyday ...
4       The people of Harford Road are firmly divided ...
                              ...                        
2050    Beavis and Butt-head introduced the film by ex...
2051    Sinbad and his pirate crew attempt to steal th...
2052    Within the kingdom of Emir, in the town of Fre...
2053    The film is set in New America in the year 209...
2054    A 20-year-old man named Joseph "Jody" Summers ...
Name: content, Length: 2055, dtype: object

In [45]:
class Tokenizer(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'\w+')
        self.stemmer = SnowballStemmer('english')
    def __call__(self, doc):
        return [self.stemmer.stem(token) 
                for token in self.tok.tokenize(doc)]

In [67]:
vectorizer = TfidfVectorizer(tokenizer=Tokenizer(),
                             stop_words='english', 
                             use_idf=True, 
                             smooth_idf=True,
                                )

In [7]:
svd_model = TruncatedSVD(n_components=500, 
                         algorithm='randomized',
                         n_iter=10, random_state=42)

In [9]:
df = df.reindex()

In [10]:
x_train = df['content']

In [11]:
from sklearn.pipeline import Pipeline
svd_transformer = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

svd_matrix = svd_transformer.fit_transform(x_train)

  'stop_words.' % sorted(inconsistent))


In [12]:
df[df.title.str.contains('Star Trek')]

Unnamed: 0,Release Year,title,Origin/Ethnicity,author,Cast,Genre,link,content,movies
420,1996,Star Trek: First Contact,American,Jonathan Frakes,"Patrick Stewart, Brent Spiner, LeVar Burton, J...",science fiction,https://en.wikipedia.org/wiki/Star_Trek:_First...,It is the 24th century. Captain Jean Luc Picar...,3
536,1998,Star Trek: Insurrection,American,Jonathan Frakes,"Patrick Stewart, F. Murray Abraham, Donna Murphy",science fiction,https://en.wikipedia.org/wiki/Star_Trek:_Insur...,Lieutenant Commander Data (Brent Spiner) is te...,3
894,2002,Star Trek Nemesis,American,Stuart Baird,"Patrick Stewart, Jonathan Frakes, Brent Spiner...",science fiction,https://en.wikipedia.org/wiki/Star_Trek_Nemesis,"On Romulus, members of the Romulan Imperial Se...",3
1761,1994,Star Trek Generations,American,David Carson,"Patrick Stewart, William Shatner",science fiction,https://en.wikipedia.org/wiki/Star_Trek_Genera...,"In the year 2293, retired Captain James T. Kir...",1


In [13]:
q = x_train.loc[420]

In [24]:
query_vector = svd_transformer.transform([q])

In [25]:
query_vector.shape

(1, 500)

In [26]:
from sklearn.metrics import pairwise_distances
distance_matrix = pairwise_distances(query_vector, 
                                     svd_matrix, 
                                     metric='cosine', 
                                     n_jobs=-1)

In [27]:
df.shape

(2055, 10)

In [28]:
distance_matrix.shape

(1, 2055)

In [29]:
df['dist'] = distance_matrix[0]

In [30]:
df.sort_values('dist')

Unnamed: 0,Release Year,title,Origin/Ethnicity,author,Cast,Genre,link,content,movies,dist
420,1996,Star Trek: First Contact,American,Jonathan Frakes,"Patrick Stewart, Brent Spiner, LeVar Burton, J...",science fiction,https://en.wikipedia.org/wiki/Star_Trek:_First...,It is the 24th century. Captain Jean Luc Picar...,3,0.000000
894,2002,Star Trek Nemesis,American,Stuart Baird,"Patrick Stewart, Jonathan Frakes, Brent Spiner...",science fiction,https://en.wikipedia.org/wiki/Star_Trek_Nemesis,"On Romulus, members of the Romulan Imperial Se...",3,0.149749
536,1998,Star Trek: Insurrection,American,Jonathan Frakes,"Patrick Stewart, F. Murray Abraham, Donna Murphy",science fiction,https://en.wikipedia.org/wiki/Star_Trek:_Insur...,Lieutenant Commander Data (Brent Spiner) is te...,3,0.173106
1761,1994,Star Trek Generations,American,David Carson,"Patrick Stewart, William Shatner",science fiction,https://en.wikipedia.org/wiki/Star_Trek_Genera...,"In the year 2293, retired Captain James T. Kir...",1,0.180354
1226,2016,Independence Day: Resurgence,American,Roland Emmerich,Liam Hemsworth\r\nMaika Monroe\r\nJeff Goldblum,science fiction,https://en.wikipedia.org/wiki/Independence_Day...,Twenty years after the devastating alien invas...,4,0.624323
...,...,...,...,...,...,...,...,...,...,...
1698,1991,Road to Ruin,American,Charlotte Brandon,"Peter Weller, Carey Lowell",comedy,https://en.wikipedia.org/wiki/Road_to_Ruin_(19...,Peter Weller plays a wealthy American business...,1,0.998459
267,1993,The Young Americans,British,Danny Cannon,"Harvey Keitel, Iain Glen",crime drama,https://en.wikipedia.org/wiki/The_Young_Americ...,Harvey Keitel plays an American cop who travel...,2,1.000289
892,1993,Excessive Force,American,Jon Hess,"Thomas Ian Griffith, Lance Henriksen",action,https://en.wikipedia.org/wiki/Excessive_Force_...,Terry McCain (Thomas Ian Griffith) is a police...,1,1.002482
884,2017,Donald Cried,American,Kristopher Avedisian,Kristopher Avedisian (director/screenplay); Kr...,comedy,https://en.wikipedia.org/wiki/Donald_Cried,A Manhattanite banker returns to his hometown ...,1,1.004935


### Vectorizer output

In [87]:
out = vectorizer.fit_transform(x_train)

In [88]:
pd.DataFrame(out.A, columns=vectorizer.get_feature_names())['god']

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
2050    0.000000
2051    0.018991
2052    0.000000
2053    0.000000
2054    0.000000
Name: god, Length: 2055, dtype: float64

NameError: name 'df_reduced' is not defined