### Relevant Resources Used: https://predictivehacks.com/topic-modelling-with-nmf-in-python/

# Import Libraries

In [82]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import NMF

# Read and Clean Dataset

In [47]:
df = pd.read_csv('../data/test_data.csv')
print('unique topic words on testing data', df.kindle2.unique())

unique topic words on testing data ['kindle2' 'aig' 'jquery' 'twitter' 'obama' 'nike' 'lebron' 'iphone app'
 'visa' 'fredwilson' '"booz allen"' '40d' 'google' 'itchy' 'stanford'
 'lyx' 'Danny Gokey' 'sleep' 'san francisco' 'star trek'
 'Malcolm Gladwell' 'espn' '"twitter api"' 'yahoo' 'scrapbooking'
 'wolfram alpha' 'weka' '50d' 'lambda calculus' 'east palo alto' 'lakers'
 'north korea' 'pelosi' 'bailout' 'insects' 'mcdonalds' 'exam' 'cheney'
 'republican' 'twitter api' 'jquery book' 'goodby silverstein' 'wieden'
 'g2' 'googleio' 'viral marketing' '"night at the museum"' 'gm'
 'time warner' 'china' 'surgery' 'dentist' 'baseball' 'sony' 'safeway'
 'eating' 'warren buffet' 'notre dame school' 'federer' '"naive bayes"'
 'car warranty call' 'at&t' 'wave sandbox' 'bing' 'summize' 'world cup'
 'world cup 2010' 'fred wilson' 'indian election' 'india election'
 'comcast' 'shoreline amphitheatre' 'mashable' 'hitler' 'yankees'
 'driving' 'visa card' 'Bobby Flay' 'latex' 'iran' 'aapl']


In [17]:
def remove_at(text):
    # A username can only contain alphanumeric characters (letters A-Z, numbers 0-9) with the exception of underscores,
    # as noted above. Check to make sure your desired username doesn't contain any symbols, dashes, or spaces.
    pattern = r'@([A-Za-z0-9_])+'
    # Replace all occurrences of @username with an empty string
    # https://towardsdatascience.com/topic-modeling-and-sentiment-analysis-on-twitter-data-using-spark-a145bfcc433
    text = re.sub(pattern, '', text)
    pattern = r'http\S+'
    text = re.sub(pattern, '', text)
    pattern = r'bit.ly/\S+'
    # replace all links with empty string
    text = re.sub(pattern, '', text)
    pattern = r'#([A-Za-z]+[A-Za-z0-9-_]+)'
    # replace all hashtags with empty string
    text = re.sub(pattern, '', text)
    return text

# DataFrame Organization

In [18]:
df.columns = ['Polarity', 'ID', 'Date', 'Topic', 'User', 'Text']
df['Text'] = df['Text'].apply(remove_at)

In [19]:
df.head()

Unnamed: 0,Polarity,ID,Date,Topic,User,Text
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the ...it fucking rock..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,You'll love your Kindle2. I've had mine for a...
3,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,Fair enough. But i have the Kindle2 and I th...
4,4,8,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,no. it is too big. I'm quite happy with the K...


In [40]:
X = df['Text'][:500]

# 1) tf-idf Vectorization

In [48]:
vectorizer = TfidfVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

## NMF Model

In [42]:
model = NMF(n_components=10, random_state=1)
model.fit(X_vectorized)

NMF(n_components=10, random_state=1)

### Create DataFrame displaying each topic (component) along with its corresponding factorization matrix based on each feature name (words minus stop words based on tf-idf vectorization)

In [55]:
components_features_matrix = pd.DataFrame(model.components_, columns=vectorizer.get_feature_names())
components_features_matrix

Unnamed: 0,00,000,10,100,1000,12,15mp,16,16209,17,...,york,youtube,yr,yuan,yummmmmy,zealots,zero,zomg,zoom,zydrunas
0,0.0,0.000334,0.060965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.045642,0.000649,0.0,0.0,0.000552,0.0,0.0,0.0
1,0.0,0.000568,0.020619,0.0,0.000317,0.0,0.0,0.000735,0.015048,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001692,0.0,0.0,0.0
2,0.013405,0.003114,0.001532,0.020555,0.000146,0.013405,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.004277,0.00012,0.000329,0.004277,...,0.04189,0.0,0.0,3e-05,0.0,0.0,0.003183,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001417,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.006055,0.009733,0.008719,0.0,0.0,0.005341,0.003675,0.0,0.005341,...,0.006504,0.009484,0.0,0.007779,0.0,0.0,0.0,0.0,0.008273,0.0
6,0.0,0.0,0.0,0.0,0.072837,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000993,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028355,0.086741
8,0.0,0.003648,0.001603,0.004748,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001179,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.012442,0.00926,0.0,0.0,0.002801,0.0,0.0,0.002801,...,0.0,0.002308,0.0,0.001046,0.003723,0.010034,0.002172,0.030807,0.012222,0.0


## Print Each Topic's Most Important Words Based on NMF

In [67]:
for component in range(components_features_matrix.shape[0]):
    print("Topic {}'s top 15 words:".format(component))
    words = components_features_matrix.iloc[component].nlargest(15)
    print(words)

Topic 0's top 15 words:
night       1.336949
museum      1.244221
watching    0.343952
saw         0.268381
loved       0.258900
good        0.228320
movie       0.221372
pretty      0.204842
trek        0.196196
star        0.192996
awesome     0.189122
movies      0.188471
new         0.182788
went        0.156321
old         0.134661
Name: 0, dtype: float64
Topic 1's top 15 words:
time        0.932259
warner      0.801095
cable       0.334539
internet    0.232703
suck        0.151193
rt          0.106580
damn        0.105330
phone       0.104910
problems    0.100967
hd          0.098793
line        0.096825
slogan      0.092082
day         0.090521
worst       0.088042
epic        0.081569
Name: 1, dtype: float64
Topic 2's top 15 words:
twitter    1.021786
api        0.947841
playing    0.246436
testing    0.234403
use        0.155396
hello      0.144731
curl       0.117297
java       0.117297
loves      0.100893
curses     0.098083
limit      0.098083
remote     0.087445
update    

# 2) And now we repeat with Count Vectorization

In [70]:
vectorizer = CountVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

In [71]:
model = NMF(n_components=10, random_state=1)
model.fit(X_vectorized)

NMF(n_components=10, random_state=1)

In [74]:
components_features_matrix = pd.DataFrame(model.components_, columns=vectorizer.get_feature_names())

In [73]:
for component in range(components_features_matrix.shape[0]):
    print("Topic {}'s top 15 words:".format(component))
    words = components_features_matrix.iloc[component].nlargest(15)
    print(words)

Topic 0's top 15 words:
time        2.555006
warner      2.096993
cable       0.674406
internet    0.403957
watch       0.208149
phone       0.175457
hd          0.166548
suck        0.165790
damn        0.161135
worst       0.159091
slogan      0.155378
rt          0.154266
today       0.153406
want        0.145353
com         0.141308
Name: 0, dtype: float64
Topic 1's top 15 words:
night       2.588311
museum      2.042813
star        0.445398
trek        0.439411
movie       0.421398
saw         0.417217
awesome     0.270109
old         0.261911
pretty      0.232758
went        0.232325
seeing      0.230646
going       0.228613
movies      0.220252
today       0.210538
watching    0.200677
Name: 1, dtype: float64
Topic 2's top 15 words:
love       2.646885
kindle2    0.369236
50d        0.288313
40d        0.274481
place      0.242141
getting    0.198128
model      0.196092
ooooh      0.196092
got        0.186729
lol        0.181222
ve         0.179898
canon      0.146639
jquery    

# 3) tf-idf with regularization
### regularation terms (W and H), factorization matrices) are multiplied with a constant, l1 penality is applied as well

In [77]:
vectorizer = TfidfVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

In [79]:
model = NMF(n_components=10, random_state=1, alpha=0.1, l1_ratio=0.1, shuffle=True)
model.fit(X_vectorized)

NMF(alpha=0.1, l1_ratio=0.1, n_components=10, random_state=1, shuffle=True)

In [80]:
components_features_matrix = pd.DataFrame(model.components_, columns=vectorizer.get_feature_names())

In [81]:
for component in range(components_features_matrix.shape[0]):
    print("Topic {}'s top 15 words:".format(component))
    words = components_features_matrix.iloc[component].nlargest(15)
    print(words)

Topic 0's top 15 words:
night       1.013857
museum      0.947391
watching    0.257489
saw         0.200455
loved       0.195407
good        0.179517
movie       0.164187
pretty      0.151902
trek        0.147900
star        0.145104
awesome     0.139071
new         0.138826
movies      0.137963
went        0.114509
giggling    0.098210
Name: 0, dtype: float64
Topic 1's top 15 words:
time        1.013948
warner      0.872050
cable       0.362346
internet    0.250236
suck        0.161887
rt          0.111968
damn        0.110561
phone       0.109403
problems    0.106761
hd          0.103824
line        0.102327
slogan      0.097575
worst       0.091124
day         0.089710
epic        0.085929
Name: 1, dtype: float64
Topic 2's top 15 words:
twitter    1.012066
api        0.941841
playing    0.240927
testing    0.231024
use        0.147759
hello      0.141075
curl       0.113186
java       0.113186
loves      0.095805
curses     0.093674
limit      0.093674
remote     0.082977
update    

# 4) Count Vectorization with regularization

In [90]:
vectorizer = CountVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

In [91]:
model = NMF(n_components=10, random_state=1, alpha=0.1, l1_ratio=0.1, shuffle=True)
model.fit(X_vectorized)

NMF(alpha=0.1, l1_ratio=0.1, n_components=10, random_state=1, shuffle=True)

In [92]:
components_features_matrix = pd.DataFrame(model.components_, columns=vectorizer.get_feature_names())

In [93]:
for component in range(components_features_matrix.shape[0]):
    print("Topic {}'s top 15 words:".format(component))
    words = components_features_matrix.iloc[component].nlargest(15)
    print(words)

Topic 0's top 15 words:
time        2.289942
warner      1.879547
cable       0.603994
internet    0.361816
watch       0.186067
phone       0.156519
hd          0.148666
suck        0.147885
damn        0.143513
worst       0.141829
rt          0.140019
slogan      0.138804
today       0.137195
want        0.129482
com         0.125577
Name: 0, dtype: float64
Topic 1's top 15 words:
night       2.132800
museum      1.684180
star        0.365595
trek        0.360868
movie       0.346587
saw         0.343294
awesome     0.220899
old         0.214398
pretty      0.190938
went        0.190493
seeing      0.189211
going       0.186195
movies      0.180739
today       0.171784
watching    0.164036
Name: 1, dtype: float64
Topic 2's top 15 words:
love       2.569669
kindle2    0.356354
50d        0.279409
40d        0.265280
place      0.233999
getting    0.191362
model      0.189466
ooooh      0.189466
got        0.179888
lol        0.173431
ve         0.173266
canon      0.142471
jquery    