In [None]:
import nltk
import pandas as pd
import numpy as np
import re
import string
from newspaper import Article
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

In [None]:
# Fetch the source article and keep its extracted body text as the raw corpus.
article_url = "https://time.com/4793331/instagram-social-media-mental-health/"
article = Article(article_url)
article.download()  # must run before parse()
article.parse()
corpus = article.text  # raw text; normalised by clean() in the next cell

In [None]:
def clean(text):
    """Normalise raw text before bag-of-words vectorisation.

    Steps, in order: lowercase, turn line breaks into spaces, drop a small
    set of stray characters, strip ASCII punctuation, and remove every token
    that contains a digit.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens can
        leave consecutive or trailing spaces behind.
    """
    text = text.lower()
    # Replace line breaks with a space instead of deleting them; deleting them
    # fuses the last word of one line with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    # NOTE(review): this character class looks mis-encoded (it may once have
    # listed curly quotes / ellipsis). It is kept as found and removes the
    # literal characters '.', '*', '?', 'â' and '£' -- confirm the intent.
    text = re.sub(r'[.*?\â£]', '', text)
    # Strip every ASCII punctuation character.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Drop any token containing at least one digit (e.g. "2nd", "2017").
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# The vectorizers below are fitted on a list of documents, so the single
# cleaned article is wrapped in a one-element list.
cleaned_text = clean(corpus)
clean_corpus = [cleaned_text]
clean_corpus

In [None]:
# Build the document-term matrix: one row per document in clean_corpus,
# one column per unigram that is not an English stop word.
cv = CountVectorizer(ngram_range=(1, 1), stop_words='english')
data_cv = cv.fit_transform(clean_corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when present and fall back for older installations.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()

data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
count = data_dtm.index
count.name = 'count'

print(feature_names)

# Last expression of the cell so the frame is actually rendered.
data_dtm.head()

In [None]:
# Correlating text to colors that affect mood.
# Each colour has a negative and a positive word list; the lists are used
# below to select matching term columns from the document-term matrix.
# Some words appear under more than one colour (e.g. 'conflict', 'security').
purple_neg = ['mystery', 'moodiness', 'boredom', 'confusion', 'disconnection']
purple_pos = ['connection', 'wisdom', 'spirituality', 'royalty', 'nobility', 'luxury', 'ambition', 'wealth', 'awaken']

blue_neg = ['coldness', 'masculinity', 'disgust', 'conflict', 'aggression']
# 'tranquility' was previously misspelled 'iranquility' and could never match.
blue_pos = ['intuition', 'imagination', 'tranquility', 'security', 'integrity', 'peace', 'loyalty', 'faith', 'intelligence', 'spiritual']

teal_neg = ['femininity', 'hostility', 'conflict', 'aggression', 'insecurity']
# 'expression' was listed twice; a repeated item would select its column twice.
teal_pos = ['communication', 'expression', 'spiritual', 'healing', 'protection', 'sophisticated', 'cleanse']

green_neg = ['envy', 'jealousy', 'guilt', 'fear', 'judgmental', 'unforgiving', 'anxiety']
green_pos = ['compassion', 'trust', 'freshness', 'environment', 'new', 'money', 'fertile', 'health', 'grounded', 'reconnecting', 'balanced']

yellow_neg = ['irresponsible', 'instability', 'grief', 'addiction', 'insecurity', 'depression']
yellow_pos = ['confident', 'bright', 'sunny', 'energetic', 'warm', 'happy', 'perky', 'joy', 'intellect']

orange_neg = ['ignorance', 'sluggishness', 'shame', 'compulsiveness', 'loneliness', 'dependence']
orange_pos = ['courage', 'confidence', 'friendliness', 'success', 'creativity', 'openness', 'sexual']

red_neg = ['anger', 'unsafe', 'warned', 'anxious', 'volatile', 'hopelessness']
red_pos = ['love', 'passion', 'energy', 'power', 'strength', 'heat', 'desire', 'safe', 'instinctive', 'security', 'liberating']


def _lexicon_columns(words):
    """Return the columns of ``data_dtm`` whose labels are listed in ``words``.

    Words that are not column labels of ``data_dtm`` are simply left out.
    """
    return data_dtm.filter(items=words)


# One sub-frame per colour and polarity, holding the counts of the lexicon
# words that actually occur in the article vocabulary.
purple_neg_dtm = _lexicon_columns(purple_neg)
purple_pos_dtm = _lexicon_columns(purple_pos)

blue_neg_dtm = _lexicon_columns(blue_neg)
blue_pos_dtm = _lexicon_columns(blue_pos)

teal_neg_dtm = _lexicon_columns(teal_neg)
teal_pos_dtm = _lexicon_columns(teal_pos)

green_neg_dtm = _lexicon_columns(green_neg)
green_pos_dtm = _lexicon_columns(green_pos)

yellow_neg_dtm = _lexicon_columns(yellow_neg)
yellow_pos_dtm = _lexicon_columns(yellow_pos)

orange_neg_dtm = _lexicon_columns(orange_neg)
orange_pos_dtm = _lexicon_columns(orange_pos)

red_neg_dtm = _lexicon_columns(red_neg)
red_pos_dtm = _lexicon_columns(red_pos)

In [None]:
# Pie chart of per-colour scores. Each label doubles as the slice colour.
# NOTE(review): the scores are hard-coded rather than derived from the
# *_dtm frames, and 'blue' has lexicons above but no slice here -- confirm.
# NOTE(review): the title says "Medium" while the fetched URL is time.com.
colors = ['purple', 'teal', 'green', 'yellow', 'orange', 'red']
scores = [4, 2, 33, 1, 1, 1]
sliceColors = ['pink', 'chocolate', 'ivory', 'yellow']  # not used by the plot below

fig, ax = plt.subplots()
ax.pie(scores, labels=colors, colors=colors, autopct='%1.1f%%')
ax.set_title('Medium Article Color Scoring')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Shared TF-IDF re-weighter applied to raw counts by the theme functions below.
transformer = TfidfTransformer()

# Separate count vectorizers for single words and for two-word phrases,
# both ignoring English stop words.
cv_unigram = CountVectorizer(stop_words='english', ngram_range=(1, 1))
cv_bigram = CountVectorizer(stop_words='english', ngram_range=(2, 2))

def _top_tfidf_terms(vectorizer, data, top_n):
    """Rank ``vectorizer``'s terms by mean TF-IDF weight over ``data``.

    Fits ``vectorizer`` on ``data``, re-weights the counts with the
    module-level ``transformer``, averages each term's weight across
    documents and returns the ``top_n`` heaviest terms.
    """
    counts = vectorizer.fit_transform(data)
    tfidf = transformer.fit_transform(counts)
    weights = np.asarray(tfidf.mean(axis=0)).ravel().tolist()
    # get_feature_names() was removed in scikit-learn 1.2; fall back only
    # when the replacement is unavailable (older installations).
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out().tolist()
    else:
        terms = vectorizer.get_feature_names()
    weights_df = pd.DataFrame({'term': terms, 'weight': weights})
    return weights_df.sort_values(by='weight', ascending=False).head(top_n)


def unigram_themes(data, top_n=7):
    """Return the ``top_n`` single-word terms of ``data`` by mean TF-IDF weight.

    Parameters
    ----------
    data : iterable of str
        Documents to analyse; ``cv_unigram`` is re-fitted on them.
    top_n : int, default 7
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``weight``, sorted by descending weight.
    """
    return _top_tfidf_terms(cv_unigram, data, top_n)


def bigram_themes(data, top_n=7):
    """Return the ``top_n`` two-word terms of ``data`` by mean TF-IDF weight.

    Parameters
    ----------
    data : iterable of str
        Documents to analyse; ``cv_bigram`` is re-fitted on them.
    top_n : int, default 7
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``weight``, sorted by descending weight.
    """
    return _top_tfidf_terms(cv_bigram, data, top_n)


# Show the highest-weighted bigrams; call unigram_themes(clean_corpus)
# instead to rank single words.
top_bigrams = bigram_themes(clean_corpus)
top_bigrams

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
# LDA settings: number of topics to fit and number of top words shown per topic.
n_topics, n_words = 2, 4

# Fresh unigram vectorizer for the topic model (rebinds the earlier `cv`).
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))

def print_LDA_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, one weight row per topic.
    count_vectorizer : object
        The fitted vectorizer whose vocabulary matches the columns of
        ``model.components_``.
    n_top_words : int
        Number of words to print per topic, heaviest first.
    """
    # Read the vocabulary from the vectorizer that was passed in, not from a
    # global. get_feature_names() was removed in scikit-learn 1.2, so prefer
    # its replacement and fall back only on older installations.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed tail slice yields the indices of
        # the n_top_words largest weights, largest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join(words[i] for i in top_indices))
        

# Fit the topic model on the unigram counts and print each topic's top words.
# NOTE(review): clean_corpus is built earlier as a one-element list, so the
# model is fitted on a single document -- confirm that is intended.
data_cv = cv.fit_transform(clean_corpus)
# Fixed random_state so the printed topics are reproducible across runs.
lda = LDA(n_components=n_topics, n_jobs=-1, random_state=0)
lda.fit(data_cv)
print_LDA_topics(lda, cv, n_words)