### Relevant Resources Used: https://predictivehacks.com/topic-modelling-with-nmf-in-python/

# Import Libraries

In [10]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Read and Clean Dataset

In [3]:
df = pd.read_csv('../data/test_data.csv')
df.head()

Unnamed: 0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,"@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
3,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
4,4,8,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,@richardebaker no. it is too big. I'm quite ha...


In [4]:
def remove_at(text):
    # A username can only contain alphanumeric characters (letters A-Z, numbers 0-9) with the exception of underscores,
    # as noted above. Check to make sure your desired username doesn't contain any symbols, dashes, or spaces.
    pattern = r'@([A-Za-z0-9_])+'
    # Replace all occurrences of @username with an empty string
    # https://towardsdatascience.com/topic-modeling-and-sentiment-analysis-on-twitter-data-using-spark-a145bfcc433
    text = re.sub(pattern, '', text)
    pattern = r'http\S+'
    text = re.sub(pattern, '', text)
    pattern = r'bit.ly/\S+'
    # replace all links with empty string
    text = re.sub(pattern, '', text)
    pattern = r'#([A-Za-z]+[A-Za-z0-9-_]+)'
    # replace all hashtags with empty string
    text = re.sub(pattern, '', text)
    return text

# DataFrame Organization

In [5]:
df.columns = ['Polarity', 'ID', 'Date', 'Topic', 'User', 'Text']
df['Text'] = df['Text'].apply(remove_at)

In [7]:
df.head()

Unnamed: 0,Polarity,ID,Date,Topic,User,Text
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the ...it fucking rock..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,You'll love your Kindle2. I've had mine for a...
3,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,Fair enough. But i have the Kindle2 and I th...
4,4,8,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,no. it is too big. I'm quite happy with the K...


In [8]:
X = df['Text']

# tf-idf Vectorization

In [9]:
vec = TfidfVectorizer(stop_words='english')
X_vectorized = vec.fit_transform(X)

# NMF Model

In [37]:
model = NMF(n_components=5, random_state=1)
model.fit(X_vectorized)

NMF(n_components=5, random_state=1)

### Create DataFrame displaying each topic (component) along with its corresponding factorization matrix based on each feature name (words minus stop words based on tf-idf vectorization)

In [40]:
components_features_df = pd.DataFrame(model.components_, columns=vec.get_feature_names())
components_features_df

Unnamed: 0,00,000,10,100,1000,12,15mp,16,16209,17,...,york,youtube,yr,yuan,yummmmmy,zealots,zero,zomg,zoom,zydrunas
0,0.0,0.000264,0.057891,0.0,0.001848,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.041623,0.000268,0.0,4e-06,0.00046,0.0,0.0,0.000738
1,0.0,0.000801,0.02128,0.0,0.001902,0.0,0.0,0.000772,0.014889,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001699,0.0,0.0,0.0
2,0.013301,0.003277,0.001716,0.020638,9.2e-05,0.013301,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.002469,0.008118,0.009357,0.002416,0.0,0.007148,0.002181,0.0,0.007148,...,0.025443,0.005372,0.0,0.005012,0.001049,0.002974,0.002685,0.006872,0.021651,0.024249
4,0.0,0.000168,0.0,0.0,0.0,0.0,0.0,0.001061,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000138,0.0,0.0,0.0,0.0,0.0


# Print Each Topic's Most Important Words Based on NMF

In [47]:
for component in range(components_features_df.shape[0]):
    print("Topic {}'s top 15 words:".format(component))
    print(components_features_df.iloc[component].nlargest(15))
    print('')

Topic 0's top 10 words:
night       1.233609
museum      1.143133
watching    0.324769
saw         0.250566
loved       0.236510
good        0.234996
movie       0.201797
pretty      0.190274
movies      0.179441
awesome     0.171557
new         0.162093
went        0.155189
trek        0.155017
star        0.152226
old         0.122307
Name: 0, dtype: float64

Topic 1's top 10 words:
time        0.928073
warner      0.796050
cable       0.332121
internet    0.232462
suck        0.153189
phone       0.117274
rt          0.115941
damn        0.115315
problems    0.099587
hd          0.098604
line        0.097334
slogan      0.092571
worst       0.087115
day         0.087058
amp         0.086223
Name: 1, dtype: float64

Topic 2's top 10 words:
twitter    1.014102
api        0.941420
playing    0.245158
testing    0.232526
use        0.153817
hello      0.143566
curl       0.116388
java       0.116388
loves      0.100040
curses     0.097291
limit      0.097291
remote     0.086756
update  