In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from scipy import stats
# Cause plots to be displayed in the notebook:
%pylab inline
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import regex as re
import spacy

2023-04-24 10:19:55.870624: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=10)

In [5]:
lda

<gensim.models.ldamodel.LdaModel at 0x165ae2450>

In [8]:
D1 = 'I want to watch a movie this weekend.'
D2 =  'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.'
D3 =  'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.'
D4 =  'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!'
D5 =  'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.'

In [9]:
# combining all the documents into a list:

corpus = [D1, D2, D3, D4, D5]

In [10]:
# the complete corpus as below:

corpus

['I want to watch a movie this weekend.',
 'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.',
 'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.',
 'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!',
 'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.']

In [11]:
# Apply Preprocessing on the Corpus using nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# stop loss words 
stop = set(stopwords.words('english'))

# punctuation 
exclude = set(string.punctuation) 

# lemmatization
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    return normalized

# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus]   

In [8]:
clean_corpus

[['want', 'watch', 'movie', 'weekend'],
 ['went',
  'shopping',
  'yesterday',
  'new',
  'zealand',
  'world',
  'test',
  'championship',
  'beating',
  'india',
  'eight',
  'wicket',
  'southampton'],
 ['don’t',
  'watch',
  'cricket',
  'netflix',
  'amazon',
  'prime',
  'good',
  'movie',
  'watch'],
 ['movie',
  'nice',
  'way',
  'chill',
  'however',
  'time',
  'would',
  'like',
  'paint',
  'read',
  'good',
  'book',
  'it’s',
  'long'],
 ['blueberry',
  'milkshake',
  'good',
  'try',
  'reading',
  'dr',
  'joe',
  'dispenza’s',
  'book',
  'work',
  'gamechanger',
  'book',
  'helped',
  'learn',
  'much',
  'thought',
  'impact',
  'biology',
  'rewire',
  'brain']]

In [16]:
nlp = spacy.load('en_core_web_md')

In [32]:
def convert_text(text):
    '''
    Use techniques learned in previous labs. Remove StopWords, Punctuation, Lemmatize, lowercase etc.
    '''
    doc = nlp(text)
    # convert all tokens to lowercase
    # doc_lower = [token.lower_ for token in doc]
  
    # Removes StopWords, Punctuation and Lemmatize inline 
    text=' '.join(['' if (t.is_stop |  t.is_punct )else t.lemma_.lower() for t in doc]) 
    
    text = re.sub(r'\n', ' ', text)  # remove newline
    text = re.sub(r'\s+', ' ', text)  # clean up spacing
    
    return text

In [33]:
ctext=[convert_text(doc) for doc in corpus]
[doc.split() for doc in ctext] 

[['want', 'watch', 'movie', 'weekend'],
 ['go',
  'shop',
  'yesterday',
  'new',
  'zealand',
  'win',
  'world',
  'test',
  'championship',
  'beat',
  'india',
  'wicket',
  'southampton'],
 ['watch', 'cricket', 'netflix', 'amazon', 'prime', 'good', 'movie', 'watch'],
 ['movie',
  'nice',
  'way',
  'chill',
  'time',
  'like',
  'paint',
  'read',
  'good',
  'book',
  'long'],
 ['blueberry',
  'milkshake',
  'good',
  'try',
  'read',
  'dr.',
  'joe',
  'dispenza',
  'book',
  'work',
  'game',
  'changer',
  'book',
  'help',
  'learn',
  'thought',
  'impact',
  'biology',
  'rewire',
  'brain']]