In [1]:
# load the packages
import json  
import re
from collections import Counter
import copy
import string
import time
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize  
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from textblob import TextBlob
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from nltk.tag import StanfordNERTagger
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE

from sklearn.metrics import classification_report

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()


# Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)


%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

SEED = 42

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiaying/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jiaying/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jiaying/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jiaying/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
df = pd.read_csv('desc_clean.csv', lineterminator='\n')

In [3]:
df.description.fillna('', inplace = True)

In [4]:
# count vectorizer
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(df.description)
print(dtm_tf.shape)

(120746, 46433)


In [5]:
dtm_tf_df = pd.DataFrame(dtm_tf.todense(), columns=tf_vectorizer.get_feature_names())

In [11]:
s = dtm_tf_df.sum().sort_values(ascending = False)
common_words = s[:100].index

In [12]:
common_words

Index(['video', 'music', 'new', 'twitter', 'videos', 'facebook', 'channel',
       'subscribe', 'instagram', 'youtube', 'like', 'news', 'watch',
       'official', 'live', 'follow', 'just', 'make', 'love', 'use', 'late',
       'know', 'time', 'world', 'dont', 'best', 'website', 'night', 'latest',
       'episode', 'play', 'day', 'life', 'jimmy', 'production', 'want',
       'available', 'check', 'entertainment', 'free', 'people', 'film',
       'links', 'song', 'episodes', 'movie', 'social', 'exclusive', 'way',
       'star', 'voice', 'got', 'box', 'google', 'album', 'today', 'click',
       'series', 'code', 'game', 'media', 'thanks', 'food', 'producer',
       'family', 'content', 'link', 'real', 'tumblr', 'merch', 'james', 'home',
       'things', 'try', 'original', 'nbc', 'look', 'kimmel', 'visit', 'sports',
       'favorite', 'think', 'big', 'cbs', 'season', 'youre', 'director',
       'makeup', 'stream', 'performing', 'come', 'tmz', 'spotify', 'talk',
       'story', 'support', 

In [19]:
customized_stopwords = stop_words.union(set(common_words))

In [25]:
# tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(strip_accents = 'unicode',
                                    stop_words = customized_stopwords,
                                    lowercase = True,
                                    token_pattern = r'\b[a-zA-Z]{3,}\b',
                                    max_df = 0.5, 
                                    min_df = 10)
dtm_tfidf = tfidf_vectorizer.fit_transform(df.description)
print(dtm_tfidf.shape)

(120746, 46492)


In [26]:
# lda
lda_tf = LatentDirichletAllocation(n_components=20, random_state=SEED)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=SEED)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [28]:
# topic modeling
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)