In [21]:
import pandas as pd
import wordcloud
from wordcloud import WordCloud
from wordcloud import STOPWORDS

import matplotlib as plt

In [148]:
full_dataset = pd.read_csv(r'C:/Users/mupsi/Desktop/crowd_curl_onlyme/NotGit/full_dataset_28jun.csv', dtype='string', index_col=0)

In [159]:
# generate a word cloud to identify potential topics, iterate stopwords for better insights
cloud_text = ','.join(list(full_dataset['Text'].values))

cloud_stop_words = ['curl','used', 'think', 'got','know','curls','curly','good','leave','want','thing', 'x200b','doesn','advice','head','going','hair', 'product', '2b','2c','3a','3b','3c','4a','4b','4c','lot','seem','really','aren', 'couldn', 'didn', 'doesn', 'don', 'hadn', 'hasn', 'haven', 'isn', 'let', 'll', 'mustn', 're', 'shan', 'shouldn', 've', 'wasn', 'weren', 'won', 'wouldn']+list(STOPWORDS)
wordcloud = WordCloud(background_color="black", contour_color='steelblue',stopwords = cloud_stop_words)

wordcloud.generate(cloud_text)
wordcloud.to_image()

KeyError: 'Text'

In [32]:
# potential topics may be: porosity, dryness/frizz, moisture, co-wash

In [168]:
list(full_dataset.columns)

['Link',
 'Raw_Text',
 'Hairtype',
 'Processed_Text',
 'lemmatized_txt',
 'lemmatized_for_tfidf_stopped']

# Begin topic modeling with ScikitLearn

In [48]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
import numpy as np
# following https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/ tutorial

In [169]:
# instantiate vectorizer, vectorize lemmatized text - no n-grams, as this confuses the LDA
# initiate with default stopwords and see what happens
vectorizer = CountVectorizer(analyzer='word', min_df=10, strip_accents = 'unicode', max_features=2500)
vectorized_txt = vectorizer.fit_transform(full_dataset['Processed_Text'])
data_dense = vectorized_txt.todense()
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  1.989769995855781 %


In [167]:
# try LDA on 5 topics (components)
lda_model = LatentDirichletAllocation(n_components=5, random_state=100)
lda_output = lda_model.fit_transform(vectorized_txt)
print(lda_model)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [51]:
# looking for high LL & low perplexity
print("Log Likelihood: ", lda_model.score(vectorized_txt))
print("Perplexity: ", lda_model.perplexity(vectorized_txt))

Log Likelihood:  -14143674.879629878
Perplexity:  965.5464892913528


In [132]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=25):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=25)        

# Topic-Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,...,Word 15,Word 16,Word 17,Word 18,Word 19,Word 20,Word 21,Word 22,Word 23,Word 24
Topic 0,hair,day,like,wash,product,curl,look,dry,feel,really,...,also,time,wave,low,make,even,tried,use,help,work
Topic 1,hair,dry,gel,conditioner,curl,scrunch,routine,wash,use,air,...,air dry,shower,plop,hand,minute,apply,moisture,product,rinse,porosity
Topic 2,hair,curl,type,like,http,look,pattern,curl pattern,top,head,...,side,hair type,one,curl type,look like,product,www,imgur,also,http imgur
Topic 3,hair,product,conditioner,shampoo,curl,use,moisture,oil,gel,leave,...,shea moisture,cream,coconut,porosity,low,cg,devacurl,shampoo conditioner,one,really
Topic 4,hair,curly,curl,cut,year,curly hair,like,straight,know,time,...,product,method,girl,long,help,started,cg,got,look,month


In [210]:
# some stopwords escaped - iterate until confounding stopwords have been removed
# can't remove based on POS tagging because nouns (shampoo), verbs (wash), and adjectives (frizzy) might all be important
import nltk
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['1a','2a','2b','2c','3a','3b','3c','4a','4b','4c','loc','lco','stumbling','upon','kknow',\
                'bringing','www','com','dont','get', 'amp', 'look','ww','x200b','[imgur]\w+', 'curly','hair','curl','curls','say','really', \
                  'little','lot','[http]\w+','http','like','think','look like','http imgur','go','also','back',\
                  'find','good','need','thanks','well','still','upside','might','tried','imgur','use','using',\
                  'anyone','line','much','used','would','one','side','got','always','put','even','every','la',\
                  'head','ago','product','conditioner','shampoo','youtube','people','water','way','going'\
                 'end','feel','could'])
len(stopwords),

(258,)

In [251]:
custom_stopwords = stopwords # new list so you can reset the original if necessary
vectorizer = CountVectorizer(analyzer='word', stop_words = custom_stopwords, strip_accents = 'unicode')
vectorized_txt = vectorizer.fit_transform(full_dataset['lemmatized_txt'])

In [53]:
# could be some coherent topics here but better to see if there are better models

In [227]:
list(full_dataset.columns)

['Link',
 'Raw_Text',
 'Hairtype',
 'Processed_Text',
 'lemmatized_txt',
 'lemmatized_for_tfidf_stopped']

In [217]:
# Gridsearch for the best model - this takes a bit of time

search_params = {'n_components': [5,8,10,11,12], 'learning_decay':[.5, .7, .9],  'random_state':[100]}
lda = LatentDirichletAllocation()
model = GridSearchCV(lda, param_grid = search_params)
model.fit(vectorized_txt)

GridSearchCV(cv=None, error_score=nan,
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 tota

In [218]:
# best model?
best_lda_model = model.best_estimator_
model.best_params_

{'learning_decay': 0.5, 'n_components': 5, 'random_state': 100}

In [230]:
print("Log Likelihood: ", best_lda_model.score(vectorized_txt))
print("Perplexity: ", best_lda_model.perplexity(vectorized_txt))

Log Likelihood:  -13738315.706025582
Perplexity:  1710.9103302043777


In [231]:
# coherent topics for that model?
show_topics(vectorizer, best_lda_model, n_words = 30)

[array(['gel', 'dry', 'leave', 'scrunch', 'cream', 'day', 'wet', 'frizz',
        'porosity', 'mousse', 'low', 'routine', 'hold', 'hand', 'fine',
        'wave', 'work', 'oil', 'air', 'moisture', 'plop', 'diffuse',
        'root', 'apply', 'help', 'amount', 'cantu', 'try', 'wash',
        'devacurl'], dtype='<U232'),
 array(['wash', 'dry', 'day', 'routine', 'shower', 'week', 'comb', 'brush',
        'moisture', 'wet', 'time', 'air', 'leave', 'shea', 'gel', 'minute',
        'scrunch', 'let', 'rinse', 'condition', 'coconut', 'plop', 'oil',
        'help', 'scalp', 'towel', 'finger', 'detangle', 'co', 'morning'],
       dtype='<U232'),
 array(['cut', 'straight', 'know', 'wavy', 'wave', 'day', 'time', 'year',
        'type', 'want', 'help', 'long', 'length', 'make', 'top', 'layer',
        'pattern', 'dry', 'never', 'end', 'cg', 'work', 'right', 'frizzy',
        'method', 'since', 'see', 'started', 'maybe', 'pretty'],
       dtype='<U232'),
 array(['cg', 'moisture', 'low', 'protein', 'wa

In [None]:
# First impressions of topic possibilities:
# T0: Washing & Moisturizing Routines
# T1: Drying & Detangling
# T2: Growing Advice
# T3: Curlygirl & Devacurl
# T4: Moisturizing Treatments and Masks

###### What happens with 8 topics?

In [234]:
custom_stopwords = stopwords # new list so you can reset the original if necessary
vectorizer = CountVectorizer(analyzer='word', stop_words = custom_stopwords, strip_accents = 'unicode')
vectorized_txt = vectorizer.fit_transform(full_dataset['lemmatized_txt'])

In [235]:
# FENCED IN RE-RUN WITH 8 TOPICS:
lda_model = LatentDirichletAllocation(n_components=8, random_state=100)
lda_output = lda_model.fit_transform(vectorized_txt)
print(lda_model)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=8, n_jobs=None,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [236]:
show_topics(vectorizer, lda_model, n_words=20)

[array(['gel', 'cream', 'leave', 'oil', 'moisture', 'coconut', 'shea',
        'porosity', 'cantu', 'work', 'mousse', 'frizz', 'fine', 'hold',
        'heavy', 'low', 'love', 'try', 'kinky', 'styling'], dtype='<U232'),
 array(['type', 'jpg', 'help', 'post', 'reddit', 'see', 'different',
        'know', 'gt', 'picture', 'pattern', 'black', 'comment', 'chart',
        'someone', 'wavy', 'redd', 'naturallycurly', 'texture', 'sure'],
       dtype='<U232'),
 array(['straight', 'year', 'know', 'wavy', 'time', 'wave', 'top', 'want',
        'cut', 'long', 'layer', 'help', 'pattern', 'length', 'never',
        'day', 'make', 'since', 'started', 'type'], dtype='<U232'),
 array(['cg', 'moisture', 'shea', 'free', 'sulfate', 'silicone', 'scalp',
        'year', 'month', 'color', 'wash', 'coconut', 'devacurl', 'started',
        'help', 'week', 'clarifying', 'looking', 'know', 'love'],
       dtype='<U232'),
 array(['porosity', 'protein', 'low', 'oil', 'moisture', 'high', 'deep',
        'treatment

In [None]:
# Topics:
# 0 Styling, Moisturizing, Detangling Kinky/Frizzy Hair
# 1 Advice for Textured Hair
# 2 Cutting/Growing
# 3 Curlygirl/Devacurl/Sulfate-free
# 4 Moisture/Masks
# 5 Daily Routines
# 6 Styling/Volume
# 7 Drying Routines

#This makes more intuitive sense than 5 topics but let's see:

In [239]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [240]:
pyLDAvis.sklearn.prepare(lda_model, vectorized_txt, vectorizer, mds ='tsne') # 8 topics

In [241]:
pyLDAvis.sklearn.prepare(best_lda_model, vectorized_txt, vectorizer, mds ='tsne') # 5 topics

In [None]:
# 5 topics may have a better log/perplexity score, but 8 makes more intuitive sense

In [278]:
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38608 entries, 0 to 38607
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Link                          38608 non-null  string
 1   Raw_Text                      38608 non-null  string
 2   Hairtype                      38608 non-null  string
 3   Processed_Text                38608 non-null  string
 4   lemmatized_txt                38608 non-null  string
 5   lemmatized_for_tfidf_stopped  38608 non-null  string
dtypes: string(6)
memory usage: 3.3+ MB


In [250]:
# Assign top topic to each document - each topic gets a column
topic_values = lda_model.transform(vectorized_txt)
topic_values.shape

(38608, 8)

In [319]:
def topic_values_to_df(model = lda_model, df = full_dataset):
    topic_values= model.transform(vectorized_txt)
    top_df_fn = pd.DataFrame((np.argpartition(-topic_values, 4, axis=1)[:, :4]), columns = ['First_Topic','Second_Topic','Third_Topic','Fourth_Topic'])
    return top_df_fn
    # df_top_topics = pd.concat([df, top_df_fn], axis=1)
   # return df_top_topics

In [331]:
topic_values_results = topic_values_to_df()

In [321]:
topic_values_results.head()

Unnamed: 0,First_Topic,Second_Topic,Third_Topic,Fourth_Topic
0,1,5,6,2
1,1,0,3,4
2,1,5,4,2
3,5,7,0,2
4,6,7,2,0


In [332]:
# fix axis problems for pd.concat:
topic_values_results.reset_index(drop=True, inplace=True)

In [338]:
full_dropped = full_dataset

In [339]:
df_topics = pd.concat([topic_values_results, full_dropped], axis=1)
df_topics.head()

Unnamed: 0,First_Topic,Second_Topic,Third_Topic,Fourth_Topic,Link,Raw_Text,Hairtype,Processed_Text,lemmatized_txt,lemmatized_for_tfidf_stopped
0,1,5,6,2,/r/curlyhair/comments/8pjwgr/help_devacurl_alt...,I haven't stumbled upon the loc/lco method tha...,Dreadlocks,stumbled upon loc lco method tha...,stumbled upon loc lco method thanks bringing u...,stumbled upon loc lco method thanks bringing n...
1,1,0,3,4,/r/curlyhair/comments/8q3nsi/curly_trouble/e0g...,"To me, you look like a 3c but it's much more i...",Dreadlocks,me look like 3c much importa...,me look like 3c much important know porosity t...,look like 3c much important know porosity http...
2,1,5,4,2,/r/curlyhair/comments/8q3nsi/curly_trouble/e0g...,"Thanks! So, I kknow about the LOC/LCO but I am...",Dreadlocks,thanks so kknow loc lco sure ...,thanks so kknow loc lco sure suppose follow tr...,thanks kknow loc lco sure suppose follow try p...
3,5,7,0,2,/r/curlyhair/comments/8r1qlk/the_weekly_no_que...,I'm struggling with my hair so hard lately! B...,Dreadlocks,i m struggling hair hard lately ...,i m struggling hair hard lately background wen...,struggling hair hard lately background went cg...
4,6,7,2,0,/r/curlyhair/comments/8s23x7/would_a_collar_bo...,the top 4-5” of my head are more wavy and what...,Dreadlocks,top 4 5 head wavy what s rin...,top 4 5 head wavy what s ringlet cut hair woul...,top 4 5 head wavy ringlet cut hair would wavy ...


In [341]:
# Replace topic numbers with words:
topic_names_for_replacement = {0: 'Styling/Moisturizing Kinky Hair', 1: 'Advice for Textured Hair', 2: 'Cutting/Growing', \
                        3: 'Curlygirl/Devacurl/Sulfate-Free', 4: 'Moisture/Masks', 5: 'Daily Routines',\
                        6: 'Styling/Volume', 7:'Drying Routines' }

# Reassign:
df_topics['First_Topic_Name'] = df_topics['First_Topic'].map(topic_names_for_replacement)
df_topics['Second_Topic_Name'] = df_topics['Second_Topic'].map(topic_names_for_replacement)
df_topics['Third_Topic_Name'] = df_topics['Third_Topic'].map(topic_names_for_replacement)
df_topics['Fourth_Topic_Name'] = df_topics['Fourth_Topic'].map(topic_names_for_replacement)

In [342]:
df_topics.head()

Unnamed: 0,First_Topic,Second_Topic,Third_Topic,Fourth_Topic,Link,Raw_Text,Hairtype,Processed_Text,lemmatized_txt,lemmatized_for_tfidf_stopped,First_Topic_Name,Second_Topic_Name,Third_Topic_Name,Fourth_Topic_Name
0,1,5,6,2,/r/curlyhair/comments/8pjwgr/help_devacurl_alt...,I haven't stumbled upon the loc/lco method tha...,Dreadlocks,stumbled upon loc lco method tha...,stumbled upon loc lco method thanks bringing u...,stumbled upon loc lco method thanks bringing n...,Advice for Textured Hair,Daily Routines,Styling/Volume,Cutting/Growing
1,1,0,3,4,/r/curlyhair/comments/8q3nsi/curly_trouble/e0g...,"To me, you look like a 3c but it's much more i...",Dreadlocks,me look like 3c much importa...,me look like 3c much important know porosity t...,look like 3c much important know porosity http...,Advice for Textured Hair,Styling/Moisturizing Kinky Hair,Curlygirl/Devacurl/Sulfate-Free,Moisture/Masks
2,1,5,4,2,/r/curlyhair/comments/8q3nsi/curly_trouble/e0g...,"Thanks! So, I kknow about the LOC/LCO but I am...",Dreadlocks,thanks so kknow loc lco sure ...,thanks so kknow loc lco sure suppose follow tr...,thanks kknow loc lco sure suppose follow try p...,Advice for Textured Hair,Daily Routines,Moisture/Masks,Cutting/Growing
3,5,7,0,2,/r/curlyhair/comments/8r1qlk/the_weekly_no_que...,I'm struggling with my hair so hard lately! B...,Dreadlocks,i m struggling hair hard lately ...,i m struggling hair hard lately background wen...,struggling hair hard lately background went cg...,Daily Routines,Drying Routines,Styling/Moisturizing Kinky Hair,Cutting/Growing
4,6,7,2,0,/r/curlyhair/comments/8s23x7/would_a_collar_bo...,the top 4-5” of my head are more wavy and what...,Dreadlocks,top 4 5 head wavy what s rin...,top 4 5 head wavy what s ringlet cut hair woul...,top 4 5 head wavy ringlet cut hair would wavy ...,Styling/Volume,Drying Routines,Cutting/Growing,Styling/Moisturizing Kinky Hair


In [344]:
df_topics.to_csv(r'~final_dataset_for_streamlit.csv')