In [6]:
import re
import os
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Logging for gensim (optional): timestamped records, and only messages at
# ERROR level or above are emitted.
import logging
import warnings

logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [8]:
# Base data directory, given relative to the notebook's working directory.
DATA_DIR = "../Data"

# Sub-directories of DATA_DIR; names and folder names are paired by position.
(TWEETS_PATH,
 TREND_PATH,
 SAVE_PATH,
 STATS_PATH,
 TOPICS_PATH) = (
    os.path.join(DATA_DIR, subdir)
    for subdir in ('tweets', 'trends', 'save', 'stats', 'topics')
)

In [4]:
# Stop words: NLTK's English list followed by a few extra tokens.
# Note: 'from' is already part of the NLTK English list, so it ends up listed twice.
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use', 'rt']

In [9]:
# Load the saved train/test tables; the first CSV row supplies the column
# names and 'trend_date' is parsed as a datetime column.
dfs_train = pd.read_csv(os.path.join(SAVE_PATH, "lda_train_data"), header=0, parse_dates=['trend_date'])
dfs_test = pd.read_csv(os.path.join(SAVE_PATH, "lda_test_data"), header=0, parse_dates=['trend_date'])

# Keep only the 'trend' and 'text' columns and drop rows where either is missing.
# dropna() returns a new frame, so nothing is mutated in place and re-running
# the cell always starts from dfs_train.
dfsLDA = dfs_train.loc[:, ["trend", "text"]].dropna()

# One row per trend: all of that trend's texts joined into a single
# comma-separated string; reset_index() turns 'trend' back into a column.
trend_doc = dfsLDA.groupby(['trend'])['text'].apply(','.join).reset_index()

In [11]:
# Load the category table; the first CSV row supplies the column names.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests
# the file was written together with its index - consider index_col=0 after
# confirming nothing downstream relies on that column.
categories_path = os.path.join(DATA_DIR, 'categories')
target_doc = pd.read_csv(categories_path, header=0)

# Preview the first two rows.
target_doc.head(2)

Unnamed: 0,Category ID,Category Name
0,0,Art & Design
1,1,Books


In [18]:
# Pretty-print the joined text of the row labelled 5 as a sample document.
pprint(trend_doc.loc[5, 'text'])

('day  afghanistan vs westindies  to win correct predictions  ,afg vs wi dream '
 'team today match world cup  afghanistan vs west indies dream tips,rt  today '
 'its afghanistans final encounter the tournament might not have gone how '
 'theyd hoped but theyre still heroes to their fans ,rt  fact the best teams '
 'will make it to next round in about the right order    good for cricket if  '
 'beats  not easy we will end at no  better than eected good for  paks pivotal '
 'moment ur disgraceful fall agsinst,while and have emerged as well liked '
 'captains their tactics and strategy have been questionable in various games '
 'today is a good chance for them to get it right lots of eectations from '
 'others like rashid pooran hope hetmyer too')


In [19]:
# Echo the stop-word list as the cell's output.
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each