In [1]:
#Activity 01
import langdetect
import matplotlib.pyplot
import nltk
import numpy
import pandas
import pyLDAvis
import pyLDAvis.sklearn
import regex
import sklearn

In [2]:
path = 'latimeshealth.txt'
df = pandas.read_csv(path, sep="|", header=None)
df.columns = ["id", "datetime", "tweettext"]

In [3]:
def dataframe_quick_look(df, nrows):
    print("SHAPE:\n{shape}\n".format(shape=df.shape))
    print("COLUMN NAMES:\n{names}\n".format(names=df.columns))
    print("HEAD:\n{head}\n".format(head=df.head(nrows)))
dataframe_quick_look(df, nrows=2)

SHAPE:
(4171, 3)

COLUMN NAMES:
Index(['id', 'datetime', 'tweettext'], dtype='object')

HEAD:
                   id                        datetime  \
0  576760256031682561  Sat Mar 14 15:02:15 +0000 2015   
1  576715414811471872  Sat Mar 14 12:04:04 +0000 2015   

                                           tweettext  
0  Five new running shoes that aim to go the extr...  
1  Gym Rat: Disq class at Crunch is intense worko...  



In [4]:
raw = df['tweettext'].tolist()
print("HEADLINES:\n{lines}\n".format(lines=raw[:5]))
print("LENGTH:\n{length}\n".format(length=len(raw)))

HEADLINES:
['Five new running shoes that aim to go the extra mile http://lat.ms/1ELp3wU', 'Gym Rat: Disq class at Crunch is intense workout on pulley system http://lat.ms/1EKOFdr', 'Noshing through thousands of ideas at Natural Products Expo West http://lat.ms/1EHqywg', 'Natural Products Expo also explores beauty, supplements and more http://lat.ms/1EHqyfE', 'Free Fitness Weekends in South Bay beach cities aim to spark activity http://lat.ms/1EH3SMC']

LENGTH:
4171



In [7]:
def do_language_identifying(txt):
    try:
        the_language = langdetect.detect(txt)
    except:
        the_language = 'none'
    return the_language


def do_lemmatizing(wrd):
    out = nltk.corpus.wordnet.morphy(wrd)
    return (wrd if out is None else out)


def do_tweet_cleaning(txt):
# identify language of tweet
# return null if language not english
    lg = do_language_identifying(txt)
    if lg != 'en':
        return None

# split the string on whitespace
    out = txt.split(' ')

# identify screen names
# replace with SCREENNAME
    out = ['SCREENNAME' if i.startswith('@') else i for i in out]

# identify urls
# replace with URL
    out = ['URL' if bool(regex.search('http[s]?://', i)) else i for i in out]

# remove all punctuation
    out = [regex.sub('[^\\w\\s]|\n', '', i) for i in out]

# make all non-keywords lowercase
    keys = ['SCREENNAME', 'URL']
    out = [i.lower() if i not in keys else i for i in out]

# remove keywords
    out = [i for i in out if i not in keys]

# remove stopwords
    list_stop_words = nltk.corpus.stopwords.words('english')
    list_stop_words = [    regex.sub('[^\\w\\s]', '', i) for i in list_stop_words]

    out = [i for i in out if i not in list_stop_words]

# lemmatizing
    out = [do_lemmatizing(i) for i in out]

# keep words 4 or more characters long
    out = [i for i in out if len(i) >= 5]

    return out


In [8]:
clean = list(map(do_tweet_cleaning, raw))

In [9]:
clean = list(filter(None.__ne__, clean))
print("HEADLINES:\n{lines}\n".format(lines=clean[:5]))
print("LENGTH:\n{length}\n".format(length=len(clean)))

HEADLINES:
[['running', 'shoes', 'extra'], ['class', 'crunch', 'intense', 'workout', 'pulley', 'system'], ['thousand', 'natural', 'product'], ['natural', 'product', 'explore', 'beauty', 'supplement'], ['fitness', 'weekend', 'south', 'beach', 'spark', 'activity']]

LENGTH:
4093



In [10]:
clean_sentences = [" ".join(i) for i in clean]
print(clean_sentences[0:10])



In [11]:
#Activity 02
number_words = 10
number_docs = 10
number_features = 1000

In [12]:
vectorizer1 = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word",
    max_df=0.95, 
    min_df=10, 
    max_features=number_features
)
clean_vec1 = vectorizer1.fit_transform(clean_sentences)
print(clean_vec1[0])

feature_names_vec1 = vectorizer1.get_feature_names()

  (0, 320)	1


In [13]:
def perplexity_by_ntopic(data, ntopics):
    output_dict = {
        "Number Of Topics": [], 
        "Perplexity Score": []
    }
    for t in ntopics:
        lda = sklearn.decomposition.LatentDirichletAllocation(
            n_components=t,
            learning_method="online",
            random_state=0
        )
        lda.fit(data)
        output_dict["Number Of Topics"].append(t)
        output_dict["Perplexity Score"].append(lda.perplexity(data))
    output_df = pandas.DataFrame(output_dict)
    index_min_perplexity = output_df["Perplexity Score"].idxmin()
    output_num_topics = output_df.loc[
        index_min_perplexity,  # index
        "Number Of Topics"  # column
    ]
    return (output_df, output_num_topics)
df_perplexity, optimal_num_topics = perplexity_by_ntopic(
    clean_vec1, 
    ntopics=[i for i in range(1, 21) if i % 2 == 0]
)
print(df_perplexity)


   Number Of Topics  Perplexity Score
0                 2        349.004885
1                 4        404.137619
2                 6        440.677441
3                 8        464.222793
4                10        478.094739
5                12        493.116250
6                14        506.144776
7                16        524.674504
8                18        530.975575
9                20        535.461393


In [14]:
lda = sklearn.decomposition.LatentDirichletAllocation(
    n_components=optimal_num_topics,
    learning_method="online",
    random_state=0
)
lda.fit(clean_vec1)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=2, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [15]:
def get_topics(mod, vec, names, docs, ndocs, nwords):
    # word to topic matrix
    W = mod.components_
    W_norm = W / W.sum(axis=1)[:, numpy.newaxis]
    # topic to document matrix
    H = mod.transform(vec)
    W_dict = {}
    H_dict = {}
    for tpc_idx, tpc_val in enumerate(W_norm):
        topic = "Topic{}".format(tpc_idx)
        # formatting w
        W_indices = tpc_val.argsort()[::-1][:nwords]
        W_names_values = [
            (round(tpc_val[j], 4), names[j]) 
            for j in W_indices
        ]
        W_dict[topic] = W_names_values
        # formatting h
        H_indices = H[:, tpc_idx].argsort()[::-1][:ndocs]
        H_names_values = [
        (round(H[:, tpc_idx][j], 4), docs[j]) 
            for j in H_indices
        ]
        H_dict[topic] = H_names_values
    W_df = pandas.DataFrame(
        W_dict, 
        index=["Word" + str(i) for i in range(nwords)]
    )
    H_df = pandas.DataFrame(
        H_dict,
        index=["Doc" + str(i) for i in range(ndocs)]
    )
    return (W_df, H_df)

W_df, H_df = get_topics(
    mod=lda,
    vec=clean_vec1,
    names=feature_names_vec1,
    docs=raw,
    ndocs=number_docs, 
    nwords=number_words
)

print(W_df)


                     Topic0              Topic1
Word0      (0.0417, latfit)     (0.0817, study)
Word1      (0.0336, health)    (0.0306, cancer)
Word2      (0.0242, people)   (0.0212, patient)
Word3       (0.0203, could)     (0.0172, death)
Word4       (0.0192, brain)    (0.017, obesity)
Word5   (0.018, researcher)    (0.0168, doctor)
Word6       (0.0176, woman)     (0.0166, heart)
Word7       (0.016, report)   (0.0148, disease)
Word8  (0.0143, california)    (0.0144, weight)
Word9   (0.0125, scientist)  (0.0115, research)


In [16]:
print(H_df)

                                                 Topic0  \
Doc0  (0.9443, Want your legs to look good in those ...   
Doc1  (0.9442, 11% of hospital patients got care the...   
Doc2  (0.9373, Spend time with dad this Father’s Day...   
Doc3  (0.9373, Hve fun! That's an order. It's import...   
Doc4  (0.9372, Need a new challenge for your ab work...   
Doc5  (0.9368, ZMapp goes 18-for-18 in treating monk...   
Doc6  (0.9367, Anti-vaccination activists target hig...   
Doc7  (0.9337, RT @latimesscience: @xprize pulled th...   
Doc8  (0.9285, About 75% of homeless people smoke, a...   
Doc9  (0.9284, Yogi crunches can give you flat abs a...   

                                                 Topic1  
Doc0  (0.9498, Computer problems are delaying nursin...  
Doc1  (0.9457, Trans fats? DONE. Will the @US_FDA go...  
Doc2  (0.9414, Supplements to boost "low T" increase...  
Doc3  (0.9372, Study: The 2009 H1N1 "swine flu" pand...  
Doc4  (0.9363, Doctors often delay vaccines for youn...  
Do

In [17]:
lda_plot = pyLDAvis.sklearn.prepare(lda, clean_vec1, vectorizer1, R=10)
pyLDAvis.display(lda_plot)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [18]:
#ACTIVITY 03
vectorizer2 = sklearn.feature_extraction.text.TfidfVectorizer(
    analyzer="word",
    max_df=0.5, 
    min_df=20, 
    max_features=number_features,
    smooth_idf=False
)
clean_vec2 = vectorizer2.fit_transform(clean_sentences)
print(clean_vec2[0])

feature_names_vec2 = vectorizer2.get_feature_names()




In [19]:
nmf = sklearn.decomposition.NMF(
    n_components=optimal_num_topics,
    init="nndsvda",
    solver="mu",
    beta_loss="frobenius",
    random_state=0, 
    alpha=0.1, 
    l1_ratio=0.5
)
nmf.fit(clean_vec2)


NMF(alpha=0.1, beta_loss='frobenius', init='nndsvda', l1_ratio=0.5,
  max_iter=200, n_components=2, random_state=0, shuffle=False, solver='mu',
  tol=0.0001, verbose=0)

In [20]:
W_df, H_df = get_topics(
    mod=nmf,
    vec=clean_vec2,
    names=feature_names_vec2,
    docs=raw,
    ndocs=number_docs, 
    nwords=number_words
)

print(W_df)

                  Topic0                Topic1
Word0    (0.3794, study)      (0.5955, latfit)
Word1   (0.0256, cancer)       (0.0487, steps)
Word2   (0.0207, people)       (0.0446, today)
Word3  (0.0183, obesity)    (0.0402, exercise)
Word4    (0.0183, brain)  (0.0273, healthtips)
Word5   (0.0182, health)     (0.0258, workout)
Word6  (0.0175, suggest)     (0.0203, getting)
Word7   (0.0167, weight)     (0.0192, fitness)
Word8    (0.0152, woman)       (0.0143, great)
Word9     (0.013, death)     (0.0131, morning)
